In [1]:
import os
import pandas as pd
import copy
# from biosppy.signals import ecg




In [2]:
# Directories handed to the get_ecg_* helpers as `base_dir` (those calls are
# currently commented out further down).
train_dir, test_dir = 'TrainECGData', 'TestECGData'

# Patient-record tables loaded with pd.read_csv below.
train_main, test_main = 'Patient_Records_Train.csv', 'Patient_Records_Test.csv'

# NOTE(review): `sample` is not referenced anywhere else in the visible notebook.
sample = 'sample_file.csv'

In [3]:
def get_ecg_records(base_dir):
    """Load every ECG CSV found one level below `base_dir` into a single DataFrame.

    Each sub-directory of `base_dir` is listed and every file in it is read with
    ``pd.read_csv(..., index_col=False)``. Two columns are added to the rows:

    * ``ecg_id`` -- the file name with ``'.csv'`` removed
    * ``pat_id`` -- the name of the sub-directory the file was found in

    Parameters
    ----------
    base_dir : str
        Directory whose sub-directories contain the CSV files.

    Returns
    -------
    pandas.DataFrame
        All rows of all files. Sub-directories appear in reverse listing order
        (the historical behaviour of this function, which prepended each new
        group). The per-file index is kept as read, so index values repeat.
        An empty DataFrame is returned when no CSV rows were found.
    """
    patient_frames = []
    for idx, f in enumerate(os.listdir(base_dir)):
        # Progress counter; the call form prints the same on Python 2 and 3.
        print(idx)
        patient_dir = os.path.join(base_dir, f)
        if not os.path.isdir(patient_dir):
            # Stray files next to the sub-directories (e.g. OS metadata) would
            # make os.listdir() below raise; they carry no ECG data.
            continue

        ecg_frames = []
        for efile in os.listdir(patient_dir):
            df_ecg = pd.read_csv(os.path.join(patient_dir, efile), index_col=False)
            df_ecg['ecg_id'] = efile.replace('.csv', '')
            ecg_frames.append(df_ecg)
        if not ecg_frames:
            continue

        # One concat per directory instead of one per file keeps this linear.
        pa_ecgs = pd.concat(ecg_frames)
        pa_ecgs['pat_id'] = f
        patient_frames.append(pa_ecgs)

    if not patient_frames:
        return pd.DataFrame()
    # Reversed so the most recently listed directory comes first, as before.
    return pd.concat(patient_frames[::-1])

In [None]:
# all_ecgs_rec_train = get_ecg_records(train_dir)

In [3]:
def get_ecg_counts(base_dir):
    """Count the files in each sub-directory of `base_dir`.

    Parameters
    ----------
    base_dir : str
        Directory whose sub-directories are counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``patientID`` (sub-directory name) and ``ecg_count`` (number of
        entries in it), one row per sub-directory, in reverse listing order
        (the order the previous prepend-based implementation produced).
        The frame has a unique 0..n-1 index; the previous implementation left
        every row with index 0. With no sub-directories the frame is empty but
        still has both columns.
    """
    rows = []
    for f in os.listdir(base_dir):
        patient_dir = os.path.join(base_dir, f)
        if not os.path.isdir(patient_dir):
            # A stray file next to the sub-directories cannot be listed.
            continue
        rows.append([f, len(os.listdir(patient_dir))])

    # Build the frame once instead of concatenating a one-row frame per entry.
    return pd.DataFrame(rows[::-1], columns=['patientID', 'ecg_count'])

In [4]:
def work_on_columns(df, to_remove_list, to_replace):
    """Normalise every column of `df` to clean text, then to float where possible.

    For each column, on a deep copy of `df`:

    1. values are cast to ``str``, lower-cased and stripped;
    2. any value equal to an entry of `to_remove_list` is set to `to_replace`;
    3. the column is converted to float after mapping ``'n/a'`` to ``'-1'``;
       if that conversion fails the column is left as text.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    to_remove_list : iterable
        Values (compared after step 1, so lower-case / stripped) to replace.
    to_replace : object
        Replacement written for every matching value.

    Returns
    -------
    pandas.DataFrame
        The normalised copy.
    """
    df = copy.deepcopy(df)

    for col in df.columns:
        df[col] = df[col].astype(str).str.lower().str.strip()

        for r in to_remove_list:
            df.loc[df[col] == r, col] = to_replace

        try:
            # Numeric columns: treat 'n/a' as -1 and store as float.
            df[col] = df[col].str.replace('n/a', '-1').astype(float)
        except (ValueError, TypeError, AttributeError):
            # Not a numeric column: keep the cleaned text. Only conversion
            # failures are tolerated, so interrupts and unrelated errors still
            # propagate (the previous bare `except:` swallowed everything).
            pass

    return df

In [5]:
def split_and_hot_encode(df, col_name, delimiter=''):
    """Replace a text column by one indicator column per distinct token.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_name : str
        Column holding strings to encode.
    delimiter : str, optional
        Separator used to split each value into tokens. With the default ``''``
        the whole (stripped) value is a single token.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` without `col_name`, joined with indicator columns named
        ``<col_name>_<token>``.

    Notes
    -----
    Tokens are joined with ``'|'`` before calling ``str.get_dummies``, so a
    token that itself contains ``'|'`` is split further.
    """
    df = copy.deepcopy(df)
    if delimiter == '':
        tokens = df[col_name].apply(lambda x: [x.strip()])
    else:
        tokens = df[col_name].apply(lambda x: [i.strip() for i in x.split(delimiter)])

    dummies = tokens.str.join('|').str.get_dummies()
    dummies.columns = [col_name + "_" + c for c in dummies.columns]

    # `axis` must be passed by keyword: the positional form `drop(col, 1)` was
    # deprecated in pandas 1.3 and removed in pandas 2.0.
    return df.drop(col_name, axis=1).join(dummies)

In [6]:
def convert_to_boolean(df, col_name, false_val='n/a'):
    """Turn a column into a 0/1 flag: 0 where it equals `false_val`, else 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_name : str
        Column to convert.
    false_val : object, optional
        Value that maps to 0 (default ``'n/a'``). Every other value maps to 1.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` whose `col_name` column holds integer 0/1 values.
    """
    df = copy.deepcopy(df)
    # Computing the flag in one step yields a real integer column. The previous
    # implementation wrote 0/1 into the existing text column cell by cell, which
    # left an object-dtype column (and is rejected by typed string columns).
    df[col_name] = (df[col_name] != false_val).astype(int)
    return df

In [7]:
def numeric_and_hot_encode(df):
    """Encode the full patient-record table for modelling.

    Three steps are applied, each on a fresh copy (the helpers copy their
    input, so `df` itself is never modified):

    1. nine text columns are one-hot encoded with `split_and_hot_encode`,
       each with its own token delimiter (``''`` means "do not split");
    2. eleven columns are reduced to 0/1 flags with `convert_to_boolean`
       (0 where the value is ``'n/a'``);
    3. ``'Chest X-ray'`` and ``'Admission date'`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns named below.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.
    """
    # (column, delimiter) pairs, applied in this order.
    to_hot_encode = [
        ('Acute infarction (localization)', '-'),
        ('Additional diagnoses', ','),
        ('Additional medication', ' '),
        ('Dosage (lytic agent)', ''),
        ('In hospital medication', ' '),
        ('Lytic agent', ' '),
        ('Smoker', ''),
        ('Sex', ''),
        ('Former infarction (localization)', ','),
    ]
    to_del_col = ['Chest X-ray', 'Admission date']
    to_convert_boolean = [
        'Catheterization date',
        'Echocardiography',
        'Infarction date',
        'Infarction date (acute)',
        'Left coronary artery stenoses (RCX)',
        'Left coronary artery stenoses (RIVA)',
        'Medication after discharge',
        'Medication pre admission',
        'Previous infarction (2) date',
        'Right coronary artery stenoses (RCA)',
        'Ventriculography',
    ]

    for col_name, delimiter in to_hot_encode:
        df = split_and_hot_encode(df, col_name, delimiter)

    for col_bool in to_convert_boolean:
        df = convert_to_boolean(df, col_bool)

    # `axis` by keyword: the positional form `drop(x, 1)` was removed in pandas 2.0.
    df = df.drop(to_del_col, axis=1)

    return df

In [671]:
# Training table: missing cells become the string '-1'.
train_df = pd.read_csv(train_main).fillna('-1')

# `use_test` keeps the test table exactly as read (it is merged back in later
# for its 'Usage' column); `test_df` is the filled working copy.
use_test = pd.read_csv(test_main)
test_df = use_test.fillna('-1')

In [672]:
# Columns carried into the model; the training frame additionally keeps the
# 'Disease' target.
imp_cols = [
    'Smoker',
    'Sex',
    'patientID',
    'Acute infarction (localization)',
    'Additional diagnoses',
    'Former infarction (localization)',
    'Ventriculography',
    'Number of coronary vessels involved',
    'Medication after discharge',
    'In hospital medication',
    'Dosage (lytic agent)',
]
train_df = train_df[imp_cols + ['Disease']]
test_df = test_df[imp_cols]

# With 'n/a' replaced by 'n/a' the replacement step is a no-op, so these calls
# only lower-case/strip the values and convert numeric columns to float.
general_garbage = ['n/a']
train_df = work_on_columns(train_df, general_garbage, 'n/a')
test_df = work_on_columns(test_df, general_garbage, 'n/a')

In [673]:
# to_use_diag = ['aortic stenosis','regurgitation','aortic','coronary artery disease','hypertrophic','renal insufficiency','ventricular','atrial fibrillation','atrial flutter','congenital complete av-block','hypertropic','palpitation'] 
# Substrings matched (with `in`) against each 'Additional diagnoses' entry to
# decide which diagnoses are kept as features. Note that 'no' is a plain
# substring test, so it also matches any diagnosis text containing "no".
to_use_diag = ['cardiomyopathy','no'] 

In [674]:
# Every distinct comma-separated diagnosis entry seen in the training frame
# (entries are not stripped at this point).
all_diags = set(','.join(list(train_df['Additional diagnoses'].unique())).split(','))

# Keep the stripped entries that contain at least one `to_use_diag` substring.
to_use_diag_cols = [
    diag.strip()
    for diag in all_diags
    if any(to_use in diag for to_use in to_use_diag)
]

In [675]:
def _keep_selected_diagnoses(cell):
    """Keep only the comma-separated entries of `cell` listed in `to_use_diag_cols`."""
    entries = set([i.strip() for i in cell.split(',')])
    return ','.join(list(entries & set(to_use_diag_cols)))


def _collapse_acute(cell):
    """Keep 'no' / 'n/a' / 'unknown'; every other value becomes 'position'."""
    return cell if cell in ('no', 'n/a', 'unknown') else 'position'


def _collapse_former(cell):
    """Keep 'no' / 'unknown'; every other value (including 'n/a') becomes 'position'."""
    return cell if cell in ('no', 'unknown') else 'position'


# The same three clean-ups are applied to the training and the test frame.
for _frame in (train_df, test_df):
    _frame['Additional diagnoses'] = _frame['Additional diagnoses'].apply(_keep_selected_diagnoses)
    # Rows left without any selected diagnosis get a placeholder category.
    _frame.loc[_frame['Additional diagnoses'] == '', ['Additional diagnoses']] = 'default'

    _frame['Acute infarction (localization)'] = _frame['Acute infarction (localization)'].apply(_collapse_acute)
    _frame['Former infarction (localization)'] = _frame['Former infarction (localization)'].apply(_collapse_former)

In [679]:
# train_df.loc[train_df['Additional diagnoses'].str.contains('cardiomyopathy'), ['Additional diagnoses']] = 'cardiomyopathy'
# test_df.loc[test_df['Additional diagnoses'].str.contains('cardiomyopathy'), ['Additional diagnoses']] = 'cardiomyopathy'

In [243]:
# train_df['Additional diagnoses'] = train_df['Additional diagnoses'].apply(lambda x: x if (x=='no' or x=='cardiomyopathy') else 'position')
# test_df['Additional diagnoses'] = test_df['Additional diagnoses'].apply(lambda x: x if (x=='no' or x=='cardiomyopathy') else 'position')

In [302]:
def clear_records(df, col_name, to_replace):
    """Return a copy of `df` with `col_name` rebuilt from `to_replace`.

    Every cell value ``x`` of the column is replaced by a list with one element
    per entry ``i`` of `to_replace`: the entry itself, or the whole `to_replace`
    object when ``x in i`` is true.

    NOTE(review): the result depends on the cell only through the ``x in i``
    test and nests `to_replace` inside itself; this looks unintended, and the
    only visible call sites are commented out -- confirm before relying on it.
    """
    cleaned = copy.deepcopy(df)

    def _rebuild(cell):
        rebuilt = []
        for entry in to_replace:
            if cell in entry:
                rebuilt.append(to_replace)
            else:
                rebuilt.append(entry)
        return rebuilt

    cleaned[col_name] = cleaned[col_name].apply(_rebuild)
    return cleaned

In [303]:
# to_replace = ['hyperlipoproteinemia','aortic stenosis','cardioverter-defibrillator','dilated cardiomyopathy','heart failure','hypertrophic obstructive cardiomyopathy','mitral regurgitation','postop. acvb','recurrent ventricular tachycardia','atherosclerosis','wpw','dilated cardiomyopathy']
# col_name = 'Additional diagnoses'
# # train_df[col_name] = train_df[col_name].str.split(',')
# # test_df[col_name] = test_df[col_name].str.split(',')

In [393]:
# train_ecg_counts = get_ecg_counts(train_dir).astype(float).reset_index(drop=True)
# test_ecg_counts = get_ecg_counts(test_dir).astype(float).reset_index(drop=True)

In [14]:
# train_df = train_df.merge(train_ecg_counts, on=['patientID'], how='inner')
# test_df = test_df.merge(test_ecg_counts, on=['patientID'], how='inner')

In [680]:
# train_df[['ecg_count','Disease']].value_counts()
def selected_cols(df):
    """One-hot encode the nine categorical columns used by the model.

    Each column is passed through `split_and_hot_encode` with the delimiter
    listed next to it (``''`` means the value is not split). The input frame
    is copied first and returned encoded; `df` itself is not modified.
    """
    encoding_plan = (
        ('Former infarction (localization)', '-'),
        ('Acute infarction (localization)', ''),
        ('Additional diagnoses', ','),
        ('Ventriculography', ' '),
        ('Medication after discharge', ' '),
        ('In hospital medication', ' '),
        ('Dosage (lytic agent)', ' '),
        ('Sex', ''),
        ('Smoker', ''),
    )

    encoded = copy.deepcopy(df)
    for column, delimiter in encoding_plan:
        encoded = split_and_hot_encode(encoded, column, delimiter)
    return encoded

In [681]:
# train_df = numeric_and_hot_encode(train_df)
# test_df = numeric_and_hot_encode(test_df)

def _normalise_name(column):
    """Lower-case and strip a column label."""
    return column.lower().strip()


# Encode both frames, then normalise their column labels.
train_df = selected_cols(train_df).rename(columns=_normalise_name)
test_df = selected_cols(test_df).rename(columns=_normalise_name)

# Only indicator columns present in both frames can be used by the model.
common_cols = list(set(train_df.columns) & set(test_df.columns))

In [682]:
# Target column name (lower-case because all column labels were lower-cased above).
label = 'disease'

# Restrict both frames to the shared columns; only the training frame keeps the target.
train_df = train_df[common_cols+[label]]
test_df = test_df[common_cols]

In [683]:
from sklearn.preprocessing import LabelEncoder
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The fallback
# keeps the notebook importable on the old release it was written against.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Needed by the cell that builds `model = GradientBoostingClassifier(...)`;
# with this import commented out that cell fails on a fresh kernel.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression
from sklearn import svm

In [684]:
# Encode the disease names as integers. `lb_make` is kept so predictions can be
# mapped back to names with `inverse_transform` later on.
lb_make = LabelEncoder()
train_df[label] = lb_make.fit_transform(train_df[label])

In [685]:
# Features: every column except the target. `Index.difference` returns the
# labels sorted, so the feature order is deterministic.
X = train_df[train_df.columns.difference([label])]
# Target kept as a one-column DataFrame (double brackets), not a Series.
Y = train_df[[label]]

In [686]:
# Exclude `patientid` from the features; it is only used later to label the predictions.
X = X[X.columns.difference(['patientid'])]

In [687]:
# Hold out 25% of the rows with a fixed seed. In the active code only
# `X_train.columns` is used afterwards; the evaluation lines that use the
# held-out part are commented out in the fit cell.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [706]:
# X_train.columns
# Gradient-boosted trees: 50 estimators, each split considering 80% of the features.
# `GradientBoostingClassifier` comes from `sklearn.ensemble` and must be imported
# before this cell runs.
model = GradientBoostingClassifier(n_estimators=50, max_features=0.8)

In [704]:
# Random forest alternative: 50 trees, 50% of the features per split, Gini criterion.
# NOTE(review): run top to bottom, this rebinds `model` and replaces the
# GradientBoostingClassifier from the previous cell, yet the saved output of the
# fit cell shows a GradientBoostingClassifier -- the cells were apparently run in
# a different order. Confirm which estimator is intended and drop the other.
model = RandomForestClassifier(max_features=0.5, n_estimators=50, criterion='gini')

In [707]:
# Fit on the complete training set (no hold-out). `Y` is a one-column DataFrame;
# passing the label column as a 1-D Series gives scikit-learn the (n_samples,)
# target shape it expects and avoids its column-vector DataConversionWarning.
model.fit(X, Y[label])

# model.fit(X_train, y_train)

# ypredicted = model.predict(X_test)
# print model.score(X_test, y_test)


# print f1_score(y_test.disease.values, ypredicted, average='weighted')

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=0.8, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=50, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [424]:
# Pair each feature importance (rounded to 4 decimals) with its column name and
# list the pairs from most to least important.
rounded_importances = [round(x, 4) for x in model.feature_importances_]
important_cols = sorted(zip(rounded_importances, X_train.columns), reverse=True)

In [425]:
# Display the (importance, column) pairs, highest importance first.
important_cols

[(0.1891, 'acute infarction (localization)_no'),
 (0.1889, 'additional diagnoses_no'),
 (0.0931, 'additional diagnoses_default'),
 (0.0849, 'former infarction (localization)_no'),
 (0.0772, 'number of coronary vessels involved'),
 (0.0769, 'additional diagnoses_hypertrophic obstructive cardiomyopathy'),
 (0.072, 'ventriculography_n/a'),
 (0.0519, 'former infarction (localization)_n/a'),
 (0.0486, 'former infarction (localization)_unknown'),
 (0.0437, 'acute infarction (localization)_n/a'),
 (0.038, 'additional diagnoses_atrial fibrillation'),
 (0.0207, 'additional diagnoses_congenital complete av-block'),
 (0.0077, 'in hospital medication_n/a'),
 (0.0072, 'medication after discharge_n/a')]

In [708]:
# Predict on the test frame using the same (sorted) feature columns as in
# training, then map the integer classes back to disease names.
pat_id_test = test_df.patientid.values
results = model.predict(test_df[test_df.columns.difference(['patientid'])])
results = list(lb_make.inverse_transform(results))

# Build the frame from named columns rather than from `zip(...)`: on Python 3
# `zip` is a one-shot iterator, which older pandas releases refuse as input.
pred_result = pd.DataFrame({'patientID': pat_id_test, 'Disease': results},
                           columns=['patientID', 'Disease'])
pred_result.patientID = pred_result.patientID.astype(int)
# Capitalise the first letter only; `x[:1]` (unlike `x[0]`) is safe for an empty label.
pred_result.Disease = pred_result.Disease.apply(lambda x: x[:1].upper() + x[1:])

pred_result.Disease.value_counts()

Myocardial infarction    50
Healthy control          28
Dysrhythmia              11
Unidentified              8
Hypertrophy               6
Cardiomyopathy            5
Bundle branch block       2
Name: Disease, dtype: int64

In [716]:
# Write the predictions (patientID, Disease) to CSV without the index column.
pred_result.to_csv('pred_v9.csv', index=False)


In [709]:
# Load a second prediction file to compare against.
# NOTE(review): presumably another submission for the same patients -- confirm.
umer = pd.read_csv('nsubmit11.csv', index_col=False)
# Both frames have a 'Disease' column, so the merge yields 'Disease_x' (this
# notebook's prediction) and 'Disease_y' (the value from the file).
comp = pred_result.merge(umer, on=['patientID'])
# Attach the 'Usage' column from the unmodified test table.
comp = comp.merge(use_test[['patientID','Usage']], on=['patientID'], how='inner')
# Rows on which the two predictions disagree.
a = comp[comp.Disease_x != comp.Disease_y]


In [714]:
# Class distribution of this notebook's predictions (same expression as at the
# end of the prediction cell).
pred_result.Disease.value_counts()

Myocardial infarction    50
Healthy control          28
Dysrhythmia              11
Unidentified              8
Hypertrophy               6
Cardiomyopathy            5
Bundle branch block       2
Name: Disease, dtype: int64

In [715]:
# Class distribution of the comparison file.
# NOTE(review): the saved output lists 'Bundle branch block' twice, which suggests
# two distinct strings (e.g. differing whitespace) in that file -- confirm and
# normalise before comparing labels.
umer.Disease.value_counts()

Myocardial infarction     50
Healthy control           27
Unidentified               8
Cardiomyopathy             7
Bundle branch block        5
Hypertrophy                4
Dysrhythmia                4
Myocarditis                2
Valvular heart disease     2
Bundle branch block        1
Name: Disease, dtype: int64

In [651]:
# # dis = 'Myocardial infarction'
# # dis = 'Unidentified'
# # dis = 'Healthy control'
# # dis = 'Hypertrophy'
# # dis = 'Dysrhythmia'
# dis = 'Cardiomyopathy'
# # dis = 'Dysrhythmia'
# # dis = 'Bundle branch block'
# umer_dis = set(umer[umer.Disease == dis].patientID.values)
# model_dis = set(pred_result[pred_result.Disease == dis].patientID.values)
# len(umer_dis & model_dis)

5

In [None]:
# Number of patients that both prediction sets assign to the disease `dis`.
# NOTE(review): this cell cannot run on a fresh kernel -- `dis` is only assigned
# in the commented-out cell above, and `abc` is not defined anywhere in the
# visible notebook (the commented-out version of this computation uses
# `pred_result` in its place). Confirm the intended frame and define `dis`.
umer_dis = set(umer[umer.Disease == dis].patientID.values)
model_dis = set(abc[abc.Disease == dis].patientID.values)
len(umer_dis & model_dis)

In [342]:
# Write the predictions to CSV without the index column.
# NOTE(review): this exports the same `pred_result` frame as the 'pred_v9.csv'
# cell under another file name; the lower execution count suggests it is a
# leftover from an earlier run -- confirm whether it is still needed.
pred_result.to_csv('pred_v4_90.csv', index=False)

In [416]:
# Column names of the first 14 entries of `important_cols`, whose items are
# (importance, column) pairs sorted from most to least important.
col_to_use = [pair[1] for pair in important_cols[:14]]

In [None]:
# Scratch cell: cache the combined ECG table and run an ECG analysis on one recording.
# NOTE(review): this cell cannot run as written on a fresh kernel --
#   * `all_pa_ecgs` only exists as a local variable inside the get_ecg_* helpers;
#     the commented-out call further up would store the result as `all_ecgs_rec_train`;
#   * `ecg` is never imported (`from biosppy.signals import ecg` is commented out
#     in the first cell);
#   * `print result` is Python 2 statement syntax.
all_pa_ecgs.to_csv('All_ECGS_Train.csv', index=False)
all_train_ecg = pd.read_csv('All_ECGS_Train.csv')
ecg_rec = pd.read_csv('TestECGData/1/0.csv')
# The column label includes literal single quotes ("'i'"); the sampling rate passed is 1000.
result = ecg.ecg(signal=ecg_rec["'i'"].values, sampling_rate=1000, show=True)
print result