In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
%matplotlib inline

In [2]:
train = pd.read_csv('../data/raw/train_values.csv')
X_train = train.loc[:,[c for c in train.columns if c != 'patient_id']]
test = pd.read_csv('../data/raw/test_values.csv')
X_test = test.loc[:,[c for c in test.columns if c != 'patient_id']]
y_train = pd.read_csv('../data/raw/train_labels.csv', usecols=['heart_disease_present'])

In [3]:
train.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0
1,ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0
2,yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1
3,l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0
4,oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0


In [4]:
df_train = pd.concat([train, y_train], axis=1)

In [5]:
df_train.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,heart_disease_present
0,0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0,0
1,ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0,0
2,yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1,1
3,l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0,1
4,oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0,0


In [6]:
df_all = pd.concat([train, test], axis=0)

In [7]:
df_all.shape

(270, 14)

In [8]:
df_all.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0
1,ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0
2,yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1
3,l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0
4,oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0


In [None]:
import seaborn as sns
%matplotlib inline

corr = X_train.corr()

sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns)

In [None]:
corr[corr>0.6].iloc[1:,0][corr[corr>0.6].iloc[1:,0].notnull()]

In [9]:
len_train = train.shape[0]
len_test = test.shape[0]

In [10]:
def create_count(df, columns, output_name):
    grouped_data = df.groupby(columns).agg('count')['patient_id']
    g_d = df.join(grouped_data, on=columns, rsuffix='_r')
    g_d = g_d.rename({'patient_id_r': output_name}, axis='columns')
    return g_d

In [11]:
gd = create_count(df_all,
                  ['slope_of_peak_exercise_st_segment',
                   'thal'],
                  'count0')
gd = create_count(gd,
                  ['resting_ekg_results',
                   'slope_of_peak_exercise_st_segment'],
                  'count1')
gd = create_count(gd,
                  ['exercise_induced_angina',
                   'slope_of_peak_exercise_st_segment'],
                  'count2')
gd = create_count(gd,
                  ['chest_pain_type',
                   'num_major_vessels'],
                  'count3')
gd = create_count(gd,
                  ['exercise_induced_angina',
                   'slope_of_peak_exercise_st_segment',
                   'thal',
                   'resting_ekg_results',
                   'chest_pain_type',
                   'num_major_vessels',
                   'fasting_blood_sugar_gt_120_mg_per_dl'],
                  'count4')

In [12]:
gd.head(10)

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,count0,count1,count2,count3,count4
0,0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0,94,56,105,32,5
1,ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0,51,51,66,53,6
2,yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1,94,56,25,13,1
3,l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0,34,74,105,61,3
4,oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0,10,12,10,14,1
5,ldukkw,1,normal,130,3,0,0,0,180,0.0,1,42,150,0,94,74,105,53,13
6,2gbyh9,2,reversible_defect,150,4,2,0,2,258,2.6,0,60,157,0,60,69,66,25,2
7,daa9kp,2,fixed_defect,150,4,1,0,2,276,0.6,1,57,112,1,11,69,56,30,2
8,3nwy2n,3,reversible_defect,170,4,0,0,2,326,3.4,1,59,140,1,10,12,8,61,1
9,1r508r,2,normal,120,3,0,0,0,219,1.6,0,50,158,0,51,51,66,53,6


In [13]:
gd['presure_p_rate'] = gd['resting_blood_pressure'] * gd['max_heart_rate_achieved']
gd['cholesterol_p_age'] = gd['serum_cholesterol_mg_per_dl'] * gd['age']

In [14]:
gd.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,...,age,max_heart_rate_achieved,exercise_induced_angina,count0,count1,count2,count3,count4,presure_p_rate,cholesterol_p_age
0,0z64un,1,normal,128,2,0,0,2,308,0.0,...,45,170,0,94,56,105,32,5,21760,13860
1,ryoo3j,2,normal,110,3,0,0,0,214,1.6,...,54,158,0,51,51,66,53,6,17380,11556
2,yt1s1x,1,normal,125,4,3,0,2,304,0.0,...,77,162,1,94,56,25,13,1,20250,23408
3,l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,...,40,181,0,34,74,105,61,3,27512,8920
4,oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,...,59,145,0,10,12,10,14,1,25810,15930


In [10]:
cols_ss = [c for c in df_all.columns if c not in ['patient_id', 'thal']]

In [12]:
ss = StandardScaler()
ss.fit(df_all[cols_ss])
df_all[cols_ss] = ss.transform(df_all[cols_ss])

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
df_all.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,0z64un,-0.954234,normal,-0.18759,-1.238045,-0.711535,-0.417029,0.981664,1.130844,-0.918565,0.6895,-1.037521,0.878883,-0.701222
1,ryoo3j,0.676419,normal,-1.197209,-0.183559,-0.711535,-0.417029,-1.026285,-0.691199,0.481153,-1.450327,-0.04766,0.359914,-0.701222
2,yt1s1x,-0.954234,normal,-0.35586,0.870928,2.472682,-0.417029,0.981664,1.05331,-0.918565,0.6895,2.481986,0.532904,1.426081
3,l2xjde,-0.954234,reversible_defect,1.158569,0.870928,-0.711535,-0.417029,-1.026285,-0.516748,-0.918565,0.6895,-1.587444,1.354605,-0.701222
4,oyt4ek,2.307073,reversible_defect,2.616909,-2.292532,-0.711535,-0.417029,0.981664,0.394274,2.755695,0.6895,0.502263,-0.202302,-0.701222


In [10]:
train = df_all[:len_train]
test = df_all[len_train:]

In [15]:
train.shape, test.shape

((180, 14), (90, 14))

In [16]:
train['slope_of_peak_exercise_st_segment'].unique()

array([-0.95423434,  0.67641928,  2.3070729 ])

In [21]:
# sns.pairplot(df_train[['slope_of_peak_exercise_st_segment', 'thal',
#        'resting_blood_pressure', 'chest_pain_type', 'num_major_vessels',
#        'fasting_blood_sugar_gt_120_mg_per_dl', 'resting_ekg_results',
#        'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'sex', 'age',
#        'max_heart_rate_achieved', 'exercise_induced_angina',
#        'heart_disease_present']],
#              hue="heart_disease_present")

In [22]:
# g = sns.lmplot(x="age", y="resting_blood_pressure", hue="heart_disease_present",
#                truncate=True, height=5, data=df_train)

In [12]:
le = LabelEncoder()
le.fit(train['thal'])
train['thal'] = le.transform(train['thal'])
test['thal'] = le.transform(test['thal'])

In [13]:
test.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,olalu7,2,2,170,1,0,0,2,288,0.2,1,59,159,0
1,z9n6mx,1,1,138,4,0,0,0,183,1.4,0,35,182,0
2,5k4413,2,2,120,4,0,0,2,177,2.5,1,43,120,1
3,mrg7q5,1,1,102,3,1,0,0,318,0.0,0,60,160,0
4,uki4do,2,1,138,4,1,0,2,166,3.6,1,61,125,1


In [17]:
encoder = ce.TargetEncoder(cols=['thal'])
encoder.fit(train, y_train)
train_encoder = encoder.transform(train)
test_encoder = encoder.transform(test)

In [18]:
train_encoder.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,0z64un,-0.954234,0.204082,-0.18759,-1.238045,-0.711535,-0.417029,0.981664,1.130844,-0.918565,0.6895,-1.037521,0.878883,-0.701222
1,ryoo3j,0.676419,0.204082,-1.197209,-0.183559,-0.711535,-0.417029,-1.026285,-0.691199,0.481153,-1.450327,-0.04766,0.359914,-0.701222
2,yt1s1x,-0.954234,0.204082,-0.35586,0.870928,2.472682,-0.417029,0.981664,1.05331,-0.918565,0.6895,2.481986,0.532904,1.426081
3,l2xjde,-0.954234,0.756757,1.158569,0.870928,-0.711535,-0.417029,-1.026285,-0.516748,-0.918565,0.6895,-1.587444,1.354605,-0.701222
4,oyt4ek,2.307073,0.756757,2.616909,-2.292532,-0.711535,-0.417029,0.981664,0.394274,2.755695,0.6895,0.502263,-0.202302,-0.701222


In [19]:
test_encoder.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,olalu7,0.676419,0.756757,2.168189,-2.292532,-0.711535,-0.417029,0.981664,0.743175,-0.7436,0.6895,0.502263,0.403162,-0.701222
1,z9n6mx,-0.954234,0.204082,0.37331,0.870928,-0.711535,-0.417029,-1.026285,-1.292086,0.306188,-1.450327,-2.137367,1.397852,-0.701222
2,5k4413,0.676419,0.756757,-0.63631,0.870928,-0.711535,-0.417029,0.981664,-1.408386,1.268495,0.6895,-1.257491,-1.283487,1.426081
3,mrg7q5,-0.954234,0.204082,-1.645929,-0.183559,0.349871,-0.417029,-1.026285,1.324679,-0.918565,-1.450327,0.612248,0.446409,-0.701222
4,uki4do,0.676419,0.204082,0.37331,0.870928,0.349871,-0.417029,0.981664,-1.621604,2.230801,0.6895,0.722232,-1.06725,1.426081


In [26]:
min_max_cols = [c for c in train_encoder.columns if c not in ['patient_id', 'thal']]
min_max = MinMaxScaler()
min_max.fit(train_encoder[min_max_cols])
train_encoder[min_max_cols] = min_max.transform(train_encoder[min_max_cols])
test_encoder[min_max_cols] = min_max.transform(test_encoder[min_max_cols])

  return self.partial_fit(X, y)


In [27]:
train_encoder.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,...,age,max_heart_rate_achieved,exercise_induced_angina,count0,count1,count2,count3,count4,presure_p_rate,cholesterol_p_age
0,0z64un,0.0,0.204082,0.395349,0.333333,0.0,0.0,1.0,0.415525,0.0,...,0.333333,0.698113,0.0,1.0,0.75,1.0,0.508475,0.333333,0.573669,0.249247
1,ryoo3j,0.5,0.204082,0.186047,0.666667,0.0,0.0,0.0,0.200913,0.258065,...,0.520833,0.584906,0.0,0.537634,0.680556,0.597938,0.864407,0.416667,0.328291,0.176958
2,yt1s1x,0.0,0.204082,0.360465,1.0,1.0,0.0,1.0,0.406393,0.0,...,1.0,0.622642,1.0,1.0,0.75,0.175258,0.186441,0.0,0.489076,0.54882
3,l2xjde,0.0,0.756757,0.674419,1.0,0.0,0.0,0.0,0.221461,0.0,...,0.229167,0.801887,0.0,0.354839,1.0,1.0,1.0,0.166667,0.89591,0.094252
4,oyt4ek,1.0,0.756757,0.976744,0.0,0.0,0.0,1.0,0.328767,0.677419,...,0.625,0.462264,0.0,0.096774,0.138889,0.020619,0.20339,0.0,0.80056,0.314194


In [28]:
test_encoder.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,...,age,max_heart_rate_achieved,exercise_induced_angina,count0,count1,count2,count3,count4,presure_p_rate,cholesterol_p_age
0,olalu7,0.5,0.756757,0.883721,0.0,0.0,0.0,1.0,0.369863,0.032258,...,0.625,0.59434,0.0,0.634409,0.930556,0.597938,0.20339,0.083333,0.868908,0.347515
1,z9n6mx,0.0,0.204082,0.511628,1.0,0.0,0.0,0.0,0.130137,0.225806,...,0.125,0.811321,0.0,1.0,1.0,1.0,1.0,0.5,0.761681,0.015343
2,5k4413,0.5,0.756757,0.302326,1.0,0.0,0.0,1.0,0.116438,0.403226,...,0.291667,0.226415,1.0,0.634409,0.930556,0.494845,1.0,0.25,0.161345,0.053181
3,mrg7q5,0.0,0.204082,0.093023,0.666667,0.333333,0.0,0.0,0.438356,0.0,...,0.645833,0.603774,0.0,1.0,1.0,1.0,0.288136,0.25,0.268908,0.413027
4,uki4do,0.5,0.204082,0.511628,1.0,0.333333,0.0,1.0,0.091324,0.580645,...,0.666667,0.273585,1.0,0.537634,0.930556,0.494845,0.474576,0.166667,0.321008,0.132091


In [30]:
from sklearn.decomposition import PCA

In [41]:
num_cols = ['resting_blood_pressure', 'serum_cholesterol_mg_per_dl',
            'age', 'max_heart_rate_achieved', 'count0', 'count1', 'count2', 'count3', 'count4',
           'presure_p_rate', 'cholesterol_p_age']

In [88]:
pca = PCA(n_components=5, random_state=42)

In [103]:
pca.fit(train_encoder[num_cols])
X_train_pca = pca.transform(train_encoder[num_cols])
X_train_pca = pd.DataFrame(X_train_pca, columns=['pca0', 'pca1', 'pca2', 'pca3', 'pca4'])
X_test_pca = pca.transform(test_encoder[num_cols])
X_test_pca = pd.DataFrame(X_test_pca, columns=['pca0', 'pca1', 'pca2', 'pca3', 'pca4'])

In [104]:
len(X_train_pca), len(X_test_pca)

(180, 90)

In [105]:
X_train_pca[:5]

Unnamed: 0,pca0,pca1,pca2,pca3,pca4
0,-0.501489,0.123362,0.087445,-0.083436,-0.106061
1,-0.015952,-0.387159,-0.184208,-0.104957,-0.017874
2,0.29854,0.518534,0.023228,0.258871,-0.455704
3,-0.300924,-0.394787,0.611781,-0.214619,0.429825
4,1.005994,0.110212,0.646136,0.033044,-0.384605


In [92]:
from sklearn.decomposition import FastICA

In [93]:
f_ica = FastICA(n_components=5, random_state=42)

In [106]:
f_ica.fit(train_encoder[num_cols])
X_train_ica = f_ica.transform(train_encoder[num_cols])
X_train_ica = pd.DataFrame(X_train_ica, columns=['ica0', 'ica1', 'ica2', 'ica3', 'ica4'])
X_test_ica = f_ica.transform(test_encoder[num_cols])
X_test_ica = pd.DataFrame(X_test_ica, columns=['ica0', 'ica1', 'ica2', 'ica3', 'ica4'])

In [107]:
X_test_ica[:5]

Unnamed: 0,ica0,ica1,ica2,ica3,ica4
0,0.03487,0.093641,0.112809,-0.087931,-0.037466
1,0.028697,-0.016497,0.108571,0.08932,0.111992
2,-0.017712,-0.048292,-0.100041,0.089397,0.03867
3,-0.035569,-0.123271,0.008775,-0.059683,-0.034138
4,-0.003413,-0.010372,-0.050099,-0.021673,-0.021762


In [96]:
from sklearn.decomposition import TruncatedSVD

In [97]:
svd = TruncatedSVD(n_components=5, random_state=42)

In [108]:
svd.fit(train_encoder[num_cols])
X_train_svd = svd.transform(train_encoder[num_cols])
X_train_svd = pd.DataFrame(X_train_svd, columns=['svd0', 'svd1', 'svd2', 'svd3', 'svd4'])
X_test_svd = svd.transform(test_encoder[num_cols])
X_test_svd = pd.DataFrame(X_test_svd, columns=['svd0', 'svd1', 'svd2', 'svd3', 'svd4'])

In [109]:
X_train_svd[:5]

Unnamed: 0,svd0,svd1,svd2,svd3,svd4
0,1.998694,-0.349453,0.127289,0.189645,0.120264
1,1.621227,-0.101847,-0.385119,-0.096236,-0.047911
2,1.703174,0.432918,0.512625,-0.180056,0.376107
3,2.106877,0.082825,-0.398068,0.57339,-0.388193
4,1.045223,0.909018,0.098206,0.508642,0.461912


In [110]:
train_frames = [train_encoder, X_train_pca, X_train_ica, X_train_svd]
test_frames = [test_encoder, X_test_pca, X_test_ica, X_test_svd]

In [111]:
train_encoder = pd.concat(train_frames, axis = 1)
test_encoder = pd.concat(test_frames, axis = 1)

In [112]:
train_encoder.shape, test_encoder.shape

((180, 36), (90, 36))

In [14]:
train.to_csv('../data/Processed/train.csv', index=False)
test.to_csv('../data/Processed/test.csv', index=False)