In [1]:
import numpy as np
import pandas as pd

from operator import itemgetter

### Load case data

Load the case data, count how many times each event occurs for each subject, and label every row with HF = 1.

In [2]:
# Switch for the whole notebook: True selects the plain exports, False the
# "_w_time" exports (presumably the ones that keep a timestamp per event).
without_time = False

# Path of the case-cohort events file matching the switch above.
caseFilename = (
    "./cleaned_data/case.csv"
    if without_time
    else "./cleaned_data/case_w_time.csv"
)


In [3]:
# Read the case events; a comma is already read_csv's default separator.
caseDF = pd.read_csv(caseFilename)
# Preview the first five rows.
caseDF.head(5)

Unnamed: 0.1,Unnamed: 0,SUBJECT_ID,TIME,EVE_INDEX
0,0,10168,2110-12-02 14:57:00,1146
1,1,10168,2110-12-02 14:57:00,1146
2,2,10168,2110-12-02 14:57:00,343
3,3,10168,2110-12-02 14:57:00,335
4,4,10168,2110-12-02 14:57:00,346


In [4]:
# Collapse the raw case events to one row per (SUBJECT_ID, EVE_INDEX) pair,
# with VALUE holding the number of rows in that pair.
#
# The count is taken on the whole group and named via reset_index(name=...).
# Selecting ['EVE_INDEX'] before .size() gives the resulting Series the same
# name as one of its index levels on recent pandas releases, and a plain
# reset_index() then fails with "cannot insert EVE_INDEX, already exists".
#
# NOTE: this rebinds caseDF, so re-running this cell without first re-running
# the load cell aggregates the already-aggregated frame.
caseDF = (
    caseDF
    .groupby(['SUBJECT_ID', 'EVE_INDEX'])
    .size()
    .reset_index(name='VALUE')
)
# Every row in the case cohort is labelled HF = 1.
caseDF["HF"] = 1
caseDF.head()

Unnamed: 0,SUBJECT_ID,EVE_INDEX,VALUE,HF
0,111,202,1,1
1,111,206,1,1
2,111,326,1,1
3,111,331,1,1
4,111,334,1,1


### Load control data

Load the control data, count how many times each event occurs for each subject, and label every row with HF = 0.

In [5]:
# Path of the control-cohort events file, chosen by the without_time switch.
controlFilename = (
    "./cleaned_data/control.csv"
    if without_time
    else "./cleaned_data/control_w_time.csv"
)

In [6]:
# Read the control events; a comma is already read_csv's default separator.
controlDF = pd.read_csv(controlFilename)
# Preview the first five rows.
controlDF.head(5)

Unnamed: 0.1,Unnamed: 0,SUBJECT_ID,TIME,EVE_INDEX
0,0,10170,2117-09-18 00:00:00,2623
1,1,10170,2117-09-18 00:00:00,2623
2,2,10170,2117-09-18 00:00:00,4902
3,3,10170,2117-09-18 00:00:00,3282
4,4,10170,2117-09-18 00:00:00,3282


In [7]:
# Collapse the raw control events to one row per (SUBJECT_ID, EVE_INDEX) pair,
# with VALUE holding the number of rows in that pair.
#
# The count is taken on the whole group and named via reset_index(name=...).
# Selecting ['EVE_INDEX'] before .size() gives the resulting Series the same
# name as one of its index levels on recent pandas releases, and a plain
# reset_index() then fails with "cannot insert EVE_INDEX, already exists".
#
# NOTE: this rebinds controlDF, so re-running this cell without first
# re-running the load cell aggregates the already-aggregated frame.
controlDF = (
    controlDF
    .groupby(['SUBJECT_ID', 'EVE_INDEX'])
    .size()
    .reset_index(name='VALUE')
)
# Every row in the control cohort is labelled HF = 0.
controlDF["HF"] = 0
controlDF.head()

Unnamed: 0,SUBJECT_ID,EVE_INDEX,VALUE,HF
0,94,45,1,0
1,94,206,1,0
2,94,215,1,0
3,94,263,1,0
4,94,326,2,0


### Concatenate case and control data

In [8]:
# Stack the case rows on top of the control rows and renumber the index from 0.
fullDF = pd.concat((caseDF, controlDF), axis=0, ignore_index=True)


In [9]:
# Preview the first five rows of the combined frame.
fullDF.head(5)

Unnamed: 0,SUBJECT_ID,EVE_INDEX,VALUE,HF
0,111,202,1,1
1,111,206,1,1
2,111,326,1,1
3,111,331,1,1
4,111,334,1,1


In [10]:
# Largest event index present in the combined data. The call form of print
# emits the same text on Python 2 and is also valid syntax on Python 3.
print(fullDF.EVE_INDEX.values.max())

5484


### Assemble a dict of (feature, value) tuples per patient

In [11]:
# Map each patient (first column of fullDF) to a list of
# (second column, third column) pairs, in row order.
patient_features = {}
for record in fullDF.itertuples(index=False):
    subject_id, feature_pair = record[0], (record[1], record[2])
    patient_features.setdefault(subject_id, []).append(feature_pair)

hfDF = fullDF[['SUBJECT_ID', 'HF']]

# Map each patient to a 0/1 label taken from that patient's first row;
# any HF value other than 1 is recorded as 0.
hfLabel = {}
for subject_id, hf_flag in hfDF.itertuples(index=False):
    hfLabel.setdefault(subject_id, 1 if hf_flag == 1 else 0)

### Write to svmlight file

In [12]:
# Output paths, chosen by the without_time switch:
#   filename1 - svmlight-style file (label followed by feature:value pairs)
#   filename2 - same content with the patient id prepended to each line
filename1, filename2 = (
    ("./cleaned_data/features_svmlight_eventCounts.train",
     "./cleaned_data/features_eventCounts.train")
    if without_time
    else
    ("./cleaned_data/features_svmlight_eventCounts_wt.train",
     "./cleaned_data/features_eventCounts_wt.train")
)


In [13]:
# Write one line per patient, in ascending patient-id order, to two files:
#   filename1:  "<label> <feature>:<value> ... \n"
#   filename2:  "<patient> <label> <feature>:<value> ... \n"
# Within a line the feature:value pairs are sorted by feature index.
#
# The `with` block closes both files even if a write raises part-way through.
# Each line is built as text and ASCII-encoded before writing, so the binary
# ('wb') handles accept it on Python 3 as well as Python 2; the bytes written
# are the same as before.
with open(filename1, 'wb') as fileWriter1, open(filename2, 'wb') as fileWriter2:
    for patient in sorted(patient_features):
        label = '{:.0f}'.format(hfLabel[patient])
        records = sorted(patient_features[patient], key=itemgetter(0))
        features = ''.join(
            ' {:.0f}:{:.0f}'.format(record[0], record[1]) for record in records
        )
        fileWriter1.write((label + features + " \n").encode('ascii'))
        fileWriter2.write(
            ('{:.0f} '.format(patient) + label + features + " \n").encode('ascii')
        )