In [None]:
# Pin PyPDF2: this notebook uses the 1.x reader API (PdfFileReader, numPages,
# getPage, extractText), which newer major releases no longer provide.
!pip install pypdf2==1.26.0

Collecting pypdf2
[?25l  Downloading https://files.pythonhosted.org/packages/b4/01/68fcc0d43daf4c6bdbc6b33cc3f77bda531c86b174cac56ef0ffdb96faab/PyPDF2-1.26.0.tar.gz (77kB)
[K     |████▎                           | 10kB 21.8MB/s eta 0:00:01[K     |████████▌                       | 20kB 16.8MB/s eta 0:00:01[K     |████████████▊                   | 30kB 14.9MB/s eta 0:00:01[K     |█████████████████               | 40kB 13.9MB/s eta 0:00:01[K     |█████████████████████▏          | 51kB 7.6MB/s eta 0:00:01[K     |█████████████████████████▍      | 61kB 8.9MB/s eta 0:00:01[K     |█████████████████████████████▋  | 71kB 8.4MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 5.5MB/s 
[?25hBuilding wheels for collected packages: pypdf2
  Building wheel for pypdf2 (setup.py) ... [?25l[?25hdone
  Created wheel for pypdf2: filename=PyPDF2-1.26.0-cp37-none-any.whl size=61102 sha256=dfc19e938d2bbbee665e376ef5358165fb868ad2b70f0518f077d075778fa7b7
  Stored in directory: /

In [None]:
import PyPDF2

In [None]:
import csv
import datetime
import numpy
import scipy
import scipy.sparse
import numpy as np

In [None]:
def _build_patient_mapping(patients):
  """
 Create a mapping of the patient identifier to an index
 """
  pmap = {}
  for ii in range(len(patients)):
    pmap[patients[ii]] = ii
  return patmap
def _build_feature_mapping(features):
  """
 Create a mapping of the feature name to an index
 """
  fmap = {}
  for ii in range(len(features)):
    fmap[features[ii]] = ii
  return fmap
t0 = datetime.datetime.strptime('01/01/1900', "%m/%d/%Y")

def _map_time_to_dayId(time):
  """
 Convert datetime into an integer day offset from some base date (01-01-1900)
 to facilitate date difference computations.
 """
  t = datetime.datetime.strptime(time, "%m/%d/%Y")
  d = t - t0
  return d.days

In [None]:
def loadPatientCohortDataFile(fileName):
  """
  Read a patient cohort PDF and build the per-patient lookup tables.

  The text extracted from each page is split on newlines and consumed three
  lines at a time: patientId, label, operationalDate. Dates are parsed by
  _map_time_to_dayId, i.e. they must be in mm/dd/yyyy format.
  label is 0 for controls and 1 for cases
  operationalDate for cases is the diagnosis date
  operationalDate for controls is the diagnosis date for the matching control

  Generate 3 mappings
  pidMap: patientId -> integer (offset, in order of appearance)
  labelMap: patientId -> label
  dateIdMap: patientId -> operationalDateId

  Raises ValueError if an id/label is not an integer or a date does not
  match the expected format.
  """
  pidMap = {}
  labelMap = {}
  dateIdMap = {}
  n = 0  # next row offset to hand out

  # Keep the file open while pages are extracted (the reader works on the
  # open stream); `with` guarantees it is closed even if parsing fails.
  with open(fileName, 'rb') as pdf:
    pdfReader = PyPDF2.PdfFileReader(pdf)
    for i in range(pdfReader.numPages):
      lines = pdfReader.getPage(i).extractText().split("\n")
      j = 0
      # Consume only complete (id, label, date) triples. The previous bound
      # (j < len(lines) - 1) could index past the end of `lines` when a page
      # ended with a partial record.
      while j + 2 < len(lines):
        pid = int(lines[j])
        pidMap[pid] = n
        labelMap[pid] = int(lines[j + 1])
        dateIdMap[pid] = _map_time_to_dayId(lines[j + 2])
        n += 1
        j += 3

  # Return results
  return pidMap, labelMap, dateIdMap

In [None]:
def loadPatientDataFile(fileName):
  """
  Read longitudinal patient data from a PDF and convert it to numeric form.

  The text extracted from each page is split on newlines and consumed four
  lines at a time: patientId, date, featureName, featureValue. Dates are
  parsed by _map_time_to_dayId (mm/dd/yyyy); patientId and featureValue are
  parsed with int(), so a non-integer token raises ValueError.

  Returns (data, feature_map):
  data: integer array of shape (numRecords, 4) with columns
        patientId, dateId, featureId, featureValue
  feature_map: dictionary mapping of featureName -> featureId
  """
  records = []
  # Keep the file open while pages are extracted (the reader works on the
  # open stream); `with` guarantees it is closed even if parsing fails.
  with open(fileName, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    for i in range(pdfReader.numPages):
      lines = pdfReader.getPage(i).extractText().split("\n")
      j = 0
      # Consume only complete 4-line records; the previous bound could index
      # past the end of `lines` on a partial trailing record.
      while j + 3 < len(lines):
        records.append((
          int(lines[j]),
          _map_time_to_dayId(lines[j + 1]),
          lines[j + 2],
          int(lines[j + 3]),
        ))
        j += 4

  # Build the mapping once, after all pages are read (it used to be rebuilt
  # inside the page loop and was undefined for a PDF with no pages). Sorting
  # the names makes the feature ids reproducible between runs, which plain
  # set iteration order is not.
  feature_map = _build_feature_mapping(sorted({name for _, _, name, _ in records}))

  data = [[pid, dayId, feature_map[name], value] for pid, dayId, name, value in records]
  # reshape keeps the result two-dimensional even when no record was read.
  return np.asarray(data, dtype=int).reshape(-1, 4), feature_map
  

In [None]:
def computeFeature_boolean(A, pw, ow, pidMap, featureMap, opDateMap):
  """
  Flag, for every patient/feature pair, whether A holds at least one positive
  value dated inside the observation window.

  A: patientId dateId featureId featureValue
  pw: prediction window
  ow: observation window
  pidMap: dictionary mapping of patientId -> row offset
  featureMap : dictionary mapping of featureName -> column offset
  opDateMap: dictionary mapping of patientId -> operationalDateId

  A record counts when its date lies strictly between (opDate - ow - pw) and
  (opDate - pw). Returns a CSC sparse matrix of shape
  (len(pidMap), len(featureMap)) holding 1.0 where the flag is set.
  """
  numPatients = len(pidMap)
  numFeatures = len(featureMap)
  flags = scipy.sparse.lil_matrix((numPatients, numFeatures))

  for row in range(A.shape[0]):
    # Records of patients outside the cohort are ignored.
    if A[row, 0] not in pidMap:
      continue

    rowOffset = pidMap[int(A[row, 0])]
    colOffset = int(A[row, 2])
    opDate = opDateMap[int(A[row, 0])]
    value = A[row, 3]

    windowStart = opDate - ow - pw
    windowEnd = opDate - pw
    insideWindow = (A[row, 1] > windowStart) and (A[row, 1] < windowEnd)
    if insideWindow and rowOffset >= 0 and value > 0:
      flags[rowOffset, colOffset] = 1.0

  return flags.tocsc()



In [None]:
def computeFeature_count(A, pw, ow, pidMap, featureMap, opDateMap):
  """
  Count, for every patient/feature pair, the records in A with a positive
  value dated inside the observation window.

  A: patientId dateId featureId featureValue
  pw: prediction window
  ow: observation window
  pidMap: dictionary mapping of patientId -> row offset
  featureMap : dictionary mapping of featureName -> column offset
  opDateMap: dictionary mapping of patientId -> operationalDateId

  A record counts when its date lies strictly between (opDate - ow - pw) and
  (opDate - pw). Returns a CSC sparse matrix of shape
  (len(pidMap), len(featureMap)) holding the counts.
  """
  numPatients = len(pidMap)
  numFeatures = len(featureMap)
  counts = scipy.sparse.lil_matrix((numPatients, numFeatures))

  for row in range(A.shape[0]):
    # Records of patients outside the cohort are ignored.
    if A[row, 0] not in pidMap:
      continue

    rowOffset = pidMap[A[row, 0]]
    colOffset = int(A[row, 2])
    opDate = opDateMap[int(A[row, 0])]
    value = A[row, 3]

    windowStart = opDate - ow - pw
    windowEnd = opDate - pw
    insideWindow = (A[row, 1] > windowStart) and (A[row, 1] < windowEnd)
    if insideWindow and rowOffset >= 0 and value > 0:
      counts[rowOffset, colOffset] += 1.0

  return counts.tocsc()


In [None]:
def computeFeature_mean(A, pw, ow, pidMap, featureMap, opDateMap):
  """
 Compute the mean of all the data in A specified by the prediction window, obse
rvation window, and operational date
 A: patientId dateId featureId featureValue
 pw: prediction window
 ow: observation window
 pidMap: dictionary mapping of patientId -> row offset
 featureMap : dictionary mapping of featureName -> column offset
 opDateMap: dictionary mapping of patientId -> operationalDateId
 """
  v = A[:, 2]
  uv = numpy.unique(v)
  nuv = len(uv)
  min_v = numpy.zeros((nuv, 1))
  max_v = numpy.zeros((nuv, 1))
  for ii in range(nuv):
    fv = int(uv[ii])
    idx = numpy.nonzero(v.astype(int) == fv)[0]
    vv = A[idx, 3]
    svv = numpy.sort(vv)
    nsvv = numpy.size(svv)
    min_v[ii] = svv[int(numpy.floor(nsvv * 0.1))]
    max_v[ii] = svv[int(numpy.floor(nsvv * 0.9))]
    min_v[ii] = max(0, min_v[ii])
  features = uv

  np = len(pidMap)
  nf = len(featureMap)
  M = scipy.sparse.lil_matrix((np, nf))
  Mn = scipy.sparse.lil_matrix((np, nf))
  for ii in range(A.shape[0]):
    if A[ii, 0] in pidMap:
      pid = pidMap[int(A[ii, 0])]
      fid = int(A[ii, 2])
      opd = opDateMap[int(A[ii, 0])]
      if (A[ii, 1] > (opd - ow - pw)) and (A[ii, 1] < (opd - pw)):
        if (A[ii, 3] <= max_v[fid]) and (A[ii, 3] >= min_v[fid]):
          if pid >= 0:
            Mn[pid, fid] += 1
            delta = A[ii, 3] - M[pid, fid]
            M[pid, fid] += delta / Mn[pid, fid]

# Populate missing values
  for i in range(np):
    for j in range(nf):
      if Mn[i,j] == 0:
        M[i,j] = numpy.nan

  M = M.tocsc()
  return M


In [None]:
def constructFeatures(cohortFile, dataFile, outputFile, predictionWindow=365, observationWindow=730, aggregationMethod="boolean"):
  """
  Compute features for the patients listed in the cohortFile using the
  longitudinal data in dataFile, and write them to outputFile as CSV.

  Use the specified predictionWindow and observationWindow values to
  determine what patient data to use.
  Use the specified aggregation method (boolean, count, mean) to generate the
  summary value of the features.

  The CSV has a header row (patientId, label, then the feature names in
  column order) followed by one row per cohort patient in row-offset order.
  The shape of the feature matrix is printed.

  Raises ValueError if aggregationMethod is not one of the supported names.
  """
  aggregators = {
    'mean': computeFeature_mean,
    'count': computeFeature_count,
    'boolean': computeFeature_boolean,
  }
  # Fail before any file is read or written. Previously an unknown method
  # printed a message and then crashed on `None.shape` after the output file
  # had already been created.
  if aggregationMethod not in aggregators:
    raise ValueError('Aggregation Method Unknown: %s' % (aggregationMethod,))

  # Load the data
  pidMap, labelMap, opDateMap = loadPatientCohortDataFile(cohortFile)
  data, featureMap = loadPatientDataFile(dataFile)

  # Compute the feature vector matrix
  M = aggregators[aggregationMethod](data, predictionWindow, observationWindow, pidMap, featureMap, opDateMap)

  header = ['patientId', 'label']
  header.extend(sorted(featureMap, key=featureMap.get))

  # Get the patient id list, ordered by row offset
  pidList = sorted(pidMap, key=pidMap.get)

  print (M.shape)
  # One dense conversion is far cheaper than indexing a CSC matrix cell by cell.
  dense = M.toarray()

  # newline='' is what the csv module expects for files it writes; `with`
  # makes sure the file is flushed and closed.
  with open(outputFile, 'w', newline='') as outfile:
    outwriter = csv.writer(outfile)
    outwriter.writerow(header)
    # Write out one row for each patient
    for n in range(dense.shape[0]):
      pid = pidList[n]
      outwriter.writerow([pid, labelMap[pid]] + dense[n].tolist())
  

In [None]:
# Diagnosis records -> boolean features (aggregationMethod="boolean").
# NOTE(review): the "daignosis" spelling in the input path is kept as-is;
# presumably it matches the uploaded file name -- verify.
cohortFile = "/content/patient_cohort.pdf"
dataFile = "/content/daignosis_record.pdf"
diagnosisOutputFile = "/content/" + 'output_diagnosis.csv'
constructFeatures(
  cohortFile,
  dataFile,
  diagnosisOutputFile,
  predictionWindow=365,
  observationWindow=730,
  aggregationMethod="boolean",
)


(500, 6)


In [None]:
# Medication records -> per-patient occurrence counts (aggregationMethod="count").
baseDirectory = '/content/'
cohortFile = "/content/patient_cohort.pdf"
dataFile = "/content/medication_record.pdf"
medicationOutputFile = baseDirectory + 'output_medication.csv'
constructFeatures(
  cohortFile,
  dataFile,
  medicationOutputFile,
  predictionWindow=365,
  observationWindow=730,
  aggregationMethod="count",
)


(500, 51)


In [None]:
# Vitals records -> per-patient mean values (aggregationMethod="mean").
# Relies on cohortFile and baseDirectory assigned in an earlier cell.
dataFile = "/content/vitals_record.pdf"
vitalsOutputFile = baseDirectory + 'output_vital.csv'
constructFeatures(
  cohortFile,
  dataFile,
  vitalsOutputFile,
  predictionWindow=365,
  observationWindow=730,
  aggregationMethod="mean",
)


(500, 2)


In [None]:
def combineFeatures(outputFile, *featureFiles):
  """
  Combines the feature files (generated by constructFeatures) into one file
  based on patientId.

  outputFile: path of the combined CSV to write
  featureFiles: paths of CSV files whose first row is a header and whose
                columns are patientId, label, then feature columns

  Will only keep one patientId and label column (the label comes from the
  first file in which a patient appears). Feature columns are appended in
  the order the files are given; rows are written sorted by patientId as a
  string. Blank rows in the inputs are skipped.
  Prints the number of patients and the number of columns of the last row
  written (the header width when there are no data rows).
  """
  data = {}
  header = ['patientId', 'label']
  for f in featureFiles:
    # `with` closes each input; newline='' is what the csv module expects.
    with open(f, newline='') as infile:
      rows = (row for row in csv.reader(infile, delimiter=',', quotechar='"') if row)
      fileHeader = next(rows, None)
      if fileHeader is None:
        # Empty file: nothing to merge.
        continue
      header.extend(fileHeader[2:])
      for row in rows:
        pid = row[0]
        if pid not in data:
          data[pid] = [pid, row[1]]
        data[pid].extend(row[2:])

  # Output combined data
  numColumns = len(header)
  with open(outputFile, 'w', newline='') as outfile:
    outwriter = csv.writer(outfile)
    outwriter.writerow(header)
    for pid in sorted(data.keys()):
      outwriter.writerow(data[pid])
      numColumns = len(data[pid])
  # The summary used to read the loop variable after the loop, which raised
  # NameError when no data rows had been read.
  print (len(data), numColumns)


In [None]:
# Merge the vitals, diagnosis and medication feature files into one table
# keyed by patientId.
allOutputFile = baseDirectory + 'output_all.csv'
combineFeatures(
  allOutputFile,
  vitalsOutputFile,
  diagnosisOutputFile,
  medicationOutputFile,
)


500 61
