This notebook merges the additional feature sets (smoking, conditions, transcript and diagnosis) into the medicine feature table and writes the result to a single file. That file will be used to train the machine learning model with the additional features.

In [1]:
# Data-handling and plotting libraries. Of these, only pandas (pd) is referenced
# by the cells shown in this notebook.
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt

import seaborn as sns
# Global plot styling.
# NOTE(review): the 'ticks' style requested here appears to be replaced by the
# set_style("darkgrid") call two lines below - confirm which one is intended.
sns.set(style='ticks')
sns.set_context("notebook", font_scale=1)
sns.set_style("darkgrid")


import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# would also hide pandas warnings raised by the merges below - consider narrowing.
warnings.filterwarnings('ignore')
import datetime
from datetime import date

%matplotlib inline

## Load Medicine Features

In [2]:
# Base table keyed by PatientGuid; every other feature set in this notebook is
# left-joined onto it.
medicine_features_csv = "./features/medicine_finalized_features.csv"
final_features = pd.read_csv(medicine_features_csv)
final_features.shape

(4940, 9205)

In [3]:
# Peek at the first two rows to confirm the key column and the feature layout.
final_features.head(n=2)

Unnamed: 0,PatientGuid,00002034603,00002036302,00002036303,00002036333,00002036343,00002080302,00002080303,00002109402,00002109502,...,Zymar_(gatifloxacin_ophthalmic)_ophthalmic_solution-0.3%,"ZyrTEC-D_(cetirizine-pseudoephedrine)_oral_tablet,_extended_release-5 mg-120 mg","ZyrTEC_(cetirizine)_oral_tablet,_chewable-10 mg",ZyrTEC_(cetirizine)_oral_tablet-10 mg,ZyrTEC_Hives_(cetirizine)_oral_tablet-10 mg,Zyvox_(linezolid)_oral_tablet-600 mg,alli_(orlistat)_oral_capsule-60 mg,depo-subQ_provera_104_(medroxyPROGESTERone)_subcutaneous_suspension-104 mg/0.65 mL,optive_(ocular_lubricant)_ophthalmic_solution--,acid_reflux_patient
0,00033D2D-D57C-48A7-B201-F9183F6E640B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,00044CC1-7911-4C35-B73C-B5623FA3A60F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Load and Merge smoking features

In [4]:
# Smoking-status indicator columns, keyed by PatientGuid.
smoking_features = pd.read_csv(filepath_or_buffer="./features/smoking_features.csv")
smoking_features.shape

(2195, 10)

In [5]:
# Show the first two rows of the smoking table.
smoking_features.head(n=2)

Unnamed: 0,PatientGuid,0cigarettesperday(non-smokeror,0cigarettesperday(previoussmok,1-2packsperday,2ormorepacksperday,CurrentTobaccouser,Currentstatusunknown,Few(1-3)cigarettesperday,Notacurrenttobaccouser,Upto1packperday
0,FFD99EE9-B289-4A27-9FF6-328ABFE8047E,1,0,0,0,0,0,0,0,0
1,FFD755A4-3DA7-43B8-8477-99BFBCED4A10,0,0,0,0,0,0,0,0,1


In [6]:
# Left-join the smoking indicators onto the running feature table so that every
# patient row already in final_features is kept.

# Any non-key column present on both sides would be silently renamed with
# _x/_y suffixes. That is also what happens when this cell is re-run on an
# already-merged frame, so stop early instead of corrupting the columns.
shared_columns = (
    final_features.columns
    .intersection(smoking_features.columns)
    .difference(["PatientGuid"])
)
assert shared_columns.empty, (
    "smoking_features columns already present in final_features "
    "(cell re-run? restart and run all): %s" % list(shared_columns)
)

rows_before_merge = len(final_features)
final_features = pd.merge(final_features, smoking_features, how='left', on="PatientGuid")

# A left join only adds rows when PatientGuid repeats in the right-hand table;
# that would duplicate patients in the final training file.
assert len(final_features) == rows_before_merge, (
    "merge changed the row count: duplicate PatientGuid values in smoking_features"
)
final_features.shape

(4940, 9214)

In [7]:
# Patients with no row in the smoking table come out of the left join as NaN;
# record those indicators as 0.
final_features = final_features.fillna(value=0)
final_features.head(n=5)

Unnamed: 0,PatientGuid,00002034603,00002036302,00002036303,00002036333,00002036343,00002080302,00002080303,00002109402,00002109502,...,acid_reflux_patient,0cigarettesperday(non-smokeror,0cigarettesperday(previoussmok,1-2packsperday,2ormorepacksperday,CurrentTobaccouser,Currentstatusunknown,Few(1-3)cigarettesperday,Notacurrenttobaccouser,Upto1packperday
0,00033D2D-D57C-48A7-B201-F9183F6E640B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00044CC1-7911-4C35-B73C-B5623FA3A60F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00136822-9A6F-4EF5-ACE7-8B5C836F0635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,001EDAC4-4584-4801-8383-4AD784EEA4C9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,002E58E8-8314-4000-9BDC-73D21DDD6880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Load and Merge Condition Features

In [8]:
# Condition feature columns, keyed by PatientGuid.
conditions_csv = "./features/patients_conditions_feature.csv"
patients_conditions_features = pd.read_csv(conditions_csv)
patients_conditions_features.shape

(1424, 3)

In [9]:
# Left-join the condition features onto the running feature table so that every
# patient row already in final_features is kept.

# Any non-key column present on both sides would be silently renamed with
# _x/_y suffixes. That is also what happens when this cell is re-run on an
# already-merged frame, so stop early instead of corrupting the columns.
shared_columns = (
    final_features.columns
    .intersection(patients_conditions_features.columns)
    .difference(["PatientGuid"])
)
assert shared_columns.empty, (
    "patients_conditions_features columns already present in final_features "
    "(cell re-run? restart and run all): %s" % list(shared_columns)
)

rows_before_merge = len(final_features)
final_features = pd.merge(
    final_features, patients_conditions_features, how='left', on="PatientGuid"
)

# A left join only adds rows when PatientGuid repeats in the right-hand table;
# that would duplicate patients in the final training file.
assert len(final_features) == rows_before_merge, (
    "merge changed the row count: duplicate PatientGuid values in "
    "patients_conditions_features"
)
final_features.shape

(4940, 9216)

In [10]:
# Patients with no row in the conditions table get NaN from the left join;
# replace those with 0 and preview the result.
final_features = final_features.fillna(value=0)
final_features.head(n=5)

Unnamed: 0,PatientGuid,00002034603,00002036302,00002036303,00002036333,00002036343,00002080302,00002080303,00002109402,00002109502,...,0cigarettesperday(previoussmok,1-2packsperday,2ormorepacksperday,CurrentTobaccouser,Currentstatusunknown,Few(1-3)cigarettesperday,Notacurrenttobaccouser,Upto1packperday,NOKNOWNALLERGIES,NOKNOWNMEDICATIONS
0,00033D2D-D57C-48A7-B201-F9183F6E640B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00044CC1-7911-4C35-B73C-B5623FA3A60F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00136822-9A6F-4EF5-ACE7-8B5C836F0635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,001EDAC4-4584-4801-8383-4AD784EEA4C9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,002E58E8-8314-4000-9BDC-73D21DDD6880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Load and Merge Transcript Features

In [11]:
# Transcript-derived feature columns, keyed by PatientGuid.
transcript_csv = "./features/patients_transcript_features.csv"
patients_transcript_features = pd.read_csv(transcript_csv)
patients_transcript_features.shape

(4979, 6)

In [12]:
# Left-join the transcript features onto the running feature table so that
# every patient row already in final_features is kept.

# Any non-key column present on both sides would be silently renamed with
# _x/_y suffixes. That is also what happens when this cell is re-run on an
# already-merged frame, so stop early instead of corrupting the columns.
shared_columns = (
    final_features.columns
    .intersection(patients_transcript_features.columns)
    .difference(["PatientGuid"])
)
assert shared_columns.empty, (
    "patients_transcript_features columns already present in final_features "
    "(cell re-run? restart and run all): %s" % list(shared_columns)
)

rows_before_merge = len(final_features)
final_features = pd.merge(
    final_features, patients_transcript_features, how='left', on="PatientGuid"
)

# A left join only adds rows when PatientGuid repeats in the right-hand table;
# that would duplicate patients in the final training file.
assert len(final_features) == rows_before_merge, (
    "merge changed the row count: duplicate PatientGuid values in "
    "patients_transcript_features"
)
final_features.shape

(4940, 9221)

In [13]:
# Replace every remaining NaN with 0 and preview the result.
# NOTE(review): this also zero-fills the continuous transcript columns shown in
# the preview (Weight, BMI, SystolicBP, ...), where 0 is not a plausible
# measurement - confirm that 0 is the intended "missing" value for the model.
final_features = final_features.fillna(value=0)
final_features.head(n=5)

Unnamed: 0,PatientGuid,00002034603,00002036302,00002036303,00002036333,00002036343,00002080302,00002080303,00002109402,00002109502,...,Few(1-3)cigarettesperday,Notacurrenttobaccouser,Upto1packperday,NOKNOWNALLERGIES,NOKNOWNMEDICATIONS,Weight,BMI,SystolicBP,DiastolicBP,RespiratoryRate
0,00033D2D-D57C-48A7-B201-F9183F6E640B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,190.8,30.771,141.3,72.5,15.888889
1,00044CC1-7911-4C35-B73C-B5623FA3A60F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,56.375,26.209667,52.0,33.625,16.0
2,00136822-9A6F-4EF5-ACE7-8B5C836F0635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,65.333333,26.5795,54.666667,29.333333,16.0
3,001EDAC4-4584-4801-8383-4AD784EEA4C9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,139.166667,35.27475,82.333333,54.0,15.333333
4,002E58E8-8314-4000-9BDC-73D21DDD6880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,105.9875,20.697028,120.527778,49.763889,17.138889


## Load and merge Diagnosis data

In [14]:
# Diagnosis feature columns, keyed by PatientGuid.
diagnosis_csv = "./features/patients_diagnosis_features.csv"
patients_diagnosis_features = pd.read_csv(diagnosis_csv)
patients_diagnosis_features.shape

(4964, 237)

In [15]:
# Left-join the diagnosis features onto the running feature table so that every
# patient row already in final_features is kept.

# Any non-key column present on both sides would be silently renamed with
# _x/_y suffixes. That is also what happens when this cell is re-run on an
# already-merged frame, so stop early instead of corrupting the columns.
shared_columns = (
    final_features.columns
    .intersection(patients_diagnosis_features.columns)
    .difference(["PatientGuid"])
)
assert shared_columns.empty, (
    "patients_diagnosis_features columns already present in final_features "
    "(cell re-run? restart and run all): %s" % list(shared_columns)
)

rows_before_merge = len(final_features)
final_features = pd.merge(
    final_features, patients_diagnosis_features, how='left', on="PatientGuid"
)

# A left join only adds rows when PatientGuid repeats in the right-hand table;
# that would duplicate patients in the final training file.
assert len(final_features) == rows_before_merge, (
    "merge changed the row count: duplicate PatientGuid values in "
    "patients_diagnosis_features"
)
final_features.shape

(4940, 9457)

In [16]:
# Patients with no row in the diagnosis table get NaN from the left join;
# replace those with 0 and preview the result.
final_features = final_features.fillna(value=0)
final_features.head(n=5)

Unnamed: 0,PatientGuid,00002034603,00002036302,00002036303,00002036333,00002036343,00002080302,00002080303,00002109402,00002109502,...,Substance-relateddisorders,Superficialinjury;contusion,Syncope,Systemiclupuserythematosusandconnectivetissuedisorders,Thyroiddisorders,Transientcerebralischemia,Tuberculosis,Urinarytractinfections,Varicoseveinsoflowerextremity,Viralinfection
0,00033D2D-D57C-48A7-B201-F9183F6E640B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00044CC1-7911-4C35-B73C-B5623FA3A60F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,00136822-9A6F-4EF5-ACE7-8B5C836F0635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,001EDAC4-4584-4801-8383-4AD784EEA4C9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,002E58E8-8314-4000-9BDC-73D21DDD6880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Write final features to csv

In [None]:
# Persist the merged feature table for the downstream model run. index=False
# keeps the positional row index out of the file.
final_features.to_csv(
    path_or_buf="./features/final_with_multiple_features.csv",
    index=False,
)