#### Requisite cells for sagemaker

In [33]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer 

from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
import os
import joblib
import pickle

### The next cell imports the raw data and drops
* Null columns
* Rows with disease name as null

In [34]:
patients_df = pd.read_csv("ML DATA.csv",encoding = "ISO-8859-1")
patients_df.drop(columns=['HEART_RATE','HEAD_CIRCUMFERENCE','UPPER_ARM_CIRCUMFERENCE'],inplace = True)
patients_df = patients_df[pd.notnull(patients_df['DISEASE_ID'])]
patients_df.drop(patients_df[patients_df.REFERRED == 'Y'].index, inplace=True)
patients_df.drop(patients_df[patients_df.DISEASE_ID == '0'].index, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [35]:
patients_df.head()

Unnamed: 0,PATIENT_VISIT_ID,GENDER,AGE,STATE_NAME,DISTRICT_NAME,CENTER_SHORT_NAME,SYSTOLIC_BP,DIASTOLIC_BP,PULSE,RESPIRATION_RATE,BODY_TEMPERATURE,BODY_WEIGHT,HEIGHT,SPO2,SYMPTOM_ID,SYMPTOM_DESC,MEDICAL_TEST_NAME,DISEASE_ID,REFERRED
1,PV000000554650,Male,42.0,Rajasthan,Jhunjhunun,B D K Hospital Jhunjhunu,138.0,96.0,86.0,,97.8,85.0,,98.0,SYMP00099~SYMP00499,,CBC~LIVER FUNCTION TEST (LFT),DISE00447~DISE02143,N
2,PV000000554656,Male,60.0,Rajasthan,Dholpur,Bari,120.0,79.0,73.0,,,60.0,,98.0,,COUGH BODY PAIN GHABRAHAT,"CBC~HB, TLC, DLC, ESR~X-RAY - CHEST CHEST ...",DISE00630,N
3,PV000000554828,Female,45.5,Rajasthan,Ganganagar,Govt Hospitals Sriganganagar,110.0,80.0,78.0,18.0,,,,,,BRITHLESS NESS,2D DOPPLER ECHOCARDIOGRAM WITH COLOR FLOW~SPIR...,DISE02117,N
5,PV000000554890,Male,63.0,Rajasthan,Ajmer,Y N Hospital Kishangarh,120.0,80.0,74.0,26.0,98.6,65.0,,98.0,SYMP04504,,UGI ENDOSCOPY,DISE02045,N
7,PV000000554912,Male,65.5,Rajasthan,Dholpur,Bari,146.0,100.0,89.0,,,75.0,,98.0,SYMP00875,,ECG,DISE02897,N


In [36]:
patients_df.columns

Index(['PATIENT_VISIT_ID', 'GENDER', 'AGE', 'STATE_NAME', 'DISTRICT_NAME',
       'CENTER_SHORT_NAME', 'SYSTOLIC_BP', 'DIASTOLIC_BP', 'PULSE',
       'RESPIRATION_RATE', 'BODY_TEMPERATURE', 'BODY_WEIGHT', 'HEIGHT', 'SPO2',
       'SYMPTOM_ID', 'SYMPTOM_DESC', 'MEDICAL_TEST_NAME', 'DISEASE_ID',
       'REFERRED'],
      dtype='object')

### The next cell drops all rows with wrongly entered vitals

In [37]:
patients_df.drop(patients_df[(patients_df.SYSTOLIC_BP>230) | (patients_df.DIASTOLIC_BP>150)].index, inplace = True)
patients_df.drop(patients_df[(patients_df.SYSTOLIC_BP<80) | (patients_df.DIASTOLIC_BP<50)].index, inplace=True)
patients_df.drop(patients_df[(patients_df.BODY_TEMPERATURE<90) | (patients_df.BODY_TEMPERATURE>110)].index, inplace=True)
patients_df.drop(patients_df[patients_df.AGE>100].index, inplace=True)
patients_df.drop(patients_df[(patients_df.SPO2>100) | (patients_df.SPO2<80)].index,inplace=True)
patients_df.drop(patients_df[(patients_df.PULSE<50) | (patients_df.PULSE>120)].index,inplace=True)
patients_df.drop(patients_df[(patients_df.RESPIRATION_RATE>30) | (patients_df.RESPIRATION_RATE<13)].index,inplace=True)
patients_df.drop(patients_df[patients_df.BODY_WEIGHT>200].index,inplace=True)
patients_df.drop(patients_df[(patients_df.HEIGHT>200) | (patients_df.HEIGHT<45)].index,inplace=True)

Vitals to be categorized:

- [X] Age
- [X] Sys and Dias BP
- [X] Pulse
- [X] RESPIRATION_RATE
- [ ] BODY_TEMPERATURE
- [ ] BODY_WEIGHT
- [ ] Height
- [X] SPO2

(For XGBoost, should be numeric)

### Binning age values:

In [38]:
bins = [x for x in range(0,101,5)]
labels = [x for x in range(1,21)]

patients_df['age_binned'] = pd.cut(patients_df.AGE,bins,labels=labels,include_lowest=True)

#### As we see, most of the vitals consist of null values

In [39]:
patients_df.isnull().sum()/patients_df.shape[0]*100

PATIENT_VISIT_ID      0.000000
GENDER                0.000000
AGE                   0.000000
STATE_NAME            0.000000
DISTRICT_NAME         0.000000
CENTER_SHORT_NAME     0.000000
SYSTOLIC_BP          49.047558
DIASTOLIC_BP         49.023506
PULSE                33.133438
RESPIRATION_RATE     55.549498
BODY_TEMPERATURE     42.537601
BODY_WEIGHT          30.925504
HEIGHT               61.052497
SPO2                 35.753455
SYMPTOM_ID           42.611359
SYMPTOM_DESC         57.372607
MEDICAL_TEST_NAME    89.053330
DISEASE_ID            0.000000
REFERRED              0.000000
age_binned            0.000000
dtype: float64

#### Replacing with default values if available, else median

<b>NOTE:</b> This might be risky since many patients with missing vitals may actually contain abnormal values, which the model may fail to consider. Thus, better to infer values depending on other vitals (to check later)

In [40]:
patients_df.SYSTOLIC_BP.fillna(120,inplace=True)
patients_df.DIASTOLIC_BP.fillna(80,inplace=True)
patients_df.PULSE.fillna(patients_df.PULSE.median(),inplace=True)
patients_df.RESPIRATION_RATE.fillna(patients_df.RESPIRATION_RATE.median(),inplace=True)
patients_df.BODY_TEMPERATURE.fillna(patients_df.BODY_TEMPERATURE.median(),inplace=True)
patients_df.BODY_WEIGHT.fillna(patients_df.BODY_WEIGHT.median(),inplace=True)
patients_df.HEIGHT.fillna(patients_df.HEIGHT.median(),inplace=True)
patients_df.SPO2.fillna(patients_df.SPO2.median(),inplace=True)

In [41]:
patients_df.describe()

Unnamed: 0,AGE,SYSTOLIC_BP,DIASTOLIC_BP,PULSE,RESPIRATION_RATE,BODY_TEMPERATURE,BODY_WEIGHT,HEIGHT,SPO2
count,62366.0,62366.0,62366.0,62366.0,62366.0,62366.0,62366.0,62366.0,62366.0
mean,30.88089,119.425697,78.77236,83.541978,18.724369,97.981415,43.026369,147.836538,98.399785
std,20.599724,8.676262,5.781984,9.205772,2.098382,0.782968,16.00302,16.853247,1.524062
min,0.0,80.0,50.0,50.0,13.0,90.0,0.41,45.0,80.0
25%,14.0,120.0,80.0,78.0,18.0,98.0,40.0,151.0,98.0
50%,28.0,120.0,80.0,82.0,18.0,98.0,45.0,151.0,99.0
75%,47.0,120.0,80.0,86.0,18.0,98.2,50.0,151.0,99.0
max,100.0,223.0,146.0,120.0,30.0,107.7,180.0,200.0,100.0


#### Categorizing BP

In [42]:
patients_df['blood_pressure']=0
patients_df['blood_pressure'][(patients_df.SYSTOLIC_BP<90) | (patients_df.DIASTOLIC_BP<60)] = 0
patients_df['blood_pressure'][((patients_df.SYSTOLIC_BP>=90) & (patients_df.SYSTOLIC_BP<=120)) & ((patients_df.DIASTOLIC_BP>=60) & (patients_df.DIASTOLIC_BP<=80))] = 1
patients_df['blood_pressure'][((patients_df.SYSTOLIC_BP>120) & (patients_df.SYSTOLIC_BP<130)) & ((patients_df.DIASTOLIC_BP>=60) & (patients_df.DIASTOLIC_BP<=80))] = 2
patients_df['blood_pressure'][((patients_df.SYSTOLIC_BP>=130) & (patients_df.SYSTOLIC_BP<140)) | ((patients_df.DIASTOLIC_BP>80) & (patients_df.DIASTOLIC_BP<90))] = 3
patients_df['blood_pressure'][((patients_df.SYSTOLIC_BP>=140) & (patients_df.SYSTOLIC_BP<180)) | ((patients_df.DIASTOLIC_BP>=90) & (patients_df.DIASTOLIC_BP<120))] = 4
patients_df['blood_pressure'][(patients_df.SYSTOLIC_BP>=180) | (patients_df.DIASTOLIC_BP>=120)] = 5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-vie

In [43]:
patients_df['blood_pressure'].value_counts(normalize=True)*100

1    89.348363
4     4.896899
3     4.409454
2     0.894718
5     0.266171
0     0.184395
Name: blood_pressure, dtype: float64

#### Categorizing pulse rate

In [44]:
patients_df['pulse_rate_categorized']=0
patients_df['pulse_rate_categorized'][patients_df.PULSE<60]=0
patients_df['pulse_rate_categorized'][(patients_df.PULSE>=60) & (patients_df.PULSE<=100)]=1
patients_df['pulse_rate_categorized'][patients_df.PULSE>100]=2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [45]:
patients_df.pulse_rate_categorized.value_counts(normalize=True)*100

1    93.416285
2     6.408941
0     0.174775
Name: pulse_rate_categorized, dtype: float64

#### Categorizing respiration rate

In [46]:
patients_df['respiration_rate_categorized']=0
patients_df['respiration_rate_categorized'][patients_df.RESPIRATION_RATE<16]=0
patients_df['respiration_rate_categorized'][(patients_df.RESPIRATION_RATE>=16) & (patients_df.RESPIRATION_RATE<=20)]=1
patients_df['respiration_rate_categorized'][patients_df.RESPIRATION_RATE>20]=2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [47]:
patients_df.respiration_rate_categorized.value_counts(normalize=True)*100

1    88.578713
2    11.082962
0     0.338325
Name: respiration_rate_categorized, dtype: float64

#### Categorizing SPO2

In [48]:
patients_df['SPO2_categorized']=0
patients_df['SPO2_categorized'][patients_df.SPO2>=95]=1
patients_df['SPO2_categorized'][patients_df.SPO2<95]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [49]:
patients_df.SPO2_categorized.value_counts(normalize=True)*100

1    97.918738
0     2.081262
Name: SPO2_categorized, dtype: float64

<b>For symptoms, currently working with only those with symptom ID's</b>

In [50]:
patients_df[patients_df.SYMPTOM_ID.notnull()].shape

(35791, 24)

In [51]:
patients_df_symptomId = patients_df[patients_df.SYMPTOM_ID.notnull()]

In [52]:
patients_df_symptomId.head()

Unnamed: 0,PATIENT_VISIT_ID,GENDER,AGE,STATE_NAME,DISTRICT_NAME,CENTER_SHORT_NAME,SYSTOLIC_BP,DIASTOLIC_BP,PULSE,RESPIRATION_RATE,...,SYMPTOM_ID,SYMPTOM_DESC,MEDICAL_TEST_NAME,DISEASE_ID,REFERRED,age_binned,blood_pressure,pulse_rate_categorized,respiration_rate_categorized,SPO2_categorized
1,PV000000554650,Male,42.0,Rajasthan,Jhunjhunun,B D K Hospital Jhunjhunu,138.0,96.0,86.0,18.0,...,SYMP00099~SYMP00499,,CBC~LIVER FUNCTION TEST (LFT),DISE00447~DISE02143,N,9,4,1,1,1
5,PV000000554890,Male,63.0,Rajasthan,Ajmer,Y N Hospital Kishangarh,120.0,80.0,74.0,26.0,...,SYMP04504,,UGI ENDOSCOPY,DISE02045,N,13,1,1,2,1
7,PV000000554912,Male,65.5,Rajasthan,Dholpur,Bari,146.0,100.0,89.0,18.0,...,SYMP00875,,ECG,DISE02897,N,14,4,1,1,1
9,PV000000554982,Female,26.0,Chhattisgarh,Dantewada,Nerli,110.0,80.0,84.0,22.0,...,SYMP03453,,XRAY LSSPINE AP AND LAT,DISE02534,N,6,1,1,2,1
20,PV000000555256,Male,20.0,Rajasthan,Dholpur,Sadar Hospital Dholpur,120.0,80.0,82.0,18.0,...,SYMP01844,,CBC~LIVER FUNCTION TEST (LFT)~RENAL FUNCTION T...,DISE05609,N,4,1,1,1,1


In [53]:
patients_df_symptomId.drop(columns=['SYMPTOM_DESC'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


#### Symptoms and diseases

In [54]:
symptoms = pd.read_csv('SYMPTOM MASTER.csv',encoding = "ISO-8859-1")

In [55]:
symptoms.head()

Unnamed: 0,SYMPTOM_ID,SYMPTOM_NAME,GENDER
0,SYMP04148,Increased Urinary Infection,C
1,SYMP04158,Increased Vocal Resonance,C
2,SYMP00998,indentation on either side of the nasal tip,C
3,SYMP00130,Excessive Burping/ Belching,C
4,SYMP00135,Indigestion,C


In [56]:
diseases = pd.read_csv('disease master.csv',encoding="ISO-8859-1")

In [57]:
diseases.head()

Unnamed: 0,DISEASE_ID,DISEASE_NAME,DISEASE_DESCRIPTION,DISEASE_LOCATION_ID,CREATE_DOCTOR_ID,CREATE_DATE,CREATE_FROM_HOSPITAL_ID,ICD_10_TYPE,ICD_10_BRANCH,ICD_10_ROOT,...,DISEASE_WEIGHTAGE,USER_ID,USER_DATE,MACHINE_IP_ADDRESS,MIN_LL,MAX_LL,MIN_UL,MAX_UL,AVILABLE_FLAG,CENTER_FLAG
0,DISE02063,Genetal Prolapse - Rectocele,Genetal Prolapse - Rectocele,GLID00306,,,,,,,...,0.0,,,,0.0,0.0,0.0,150.0,N,2
1,DISE02064,Genetal Prolapse - Uterine Discent,Genetal Prolapse - Uterine Discent,GLID00306,,,,,,,...,0.0,,,,0.0,0.0,0.0,150.0,N,2
2,DISE02065,Genetal Prolapse - Enterocele,Genetal Prolapse - Enterocele,GLID00306,,,,,,,...,0.0,,,,0.0,0.0,0.0,150.0,N,2
3,DISE02066,Genetal Prolapse - Relax Vaginal Outlet,Genetal Prolapse - Relax Vaginal Outlet,GLID00306,,,,,,,...,0.0,,,,0.0,0.0,0.0,150.0,N,2
4,DISE02067,Endometrial Cancer,Endometrial Cancer,GLID00306,,,,,,,...,0.0,,,,0.0,0.0,0.0,150.0,Y,2


In [58]:
diseases.shape

(18460, 23)

In [59]:
if os.path.exists('symptom_count_df.csv'):
    symptom_count_df = pd.read_csv('symptom_count_df.csv')
    
else:
    
    symptom_ids_list = sorted(list(symptoms.SYMPTOM_ID))
    symptom_count_df = pd.DataFrame(columns=['Symptom_id','Symptom_name','Count'])
    
    for i,symptom_id in enumerate(symptom_ids_list):
        count = patients_df_symptomId[patients_df_symptomId.SYMPTOM_ID.str.contains(symptom_id)].shape[0]
        name = symptoms.SYMPTOM_NAME[symptoms.SYMPTOM_ID == symptom_id].tolist()[0]
        symptom_count_df.loc[i] = [symptom_id,name,count]    
        
    symptom_count_df.to_csv('symptom_count_df.csv', index=False)
    
symptom_count_df.sort_values(by=['Count'],ascending=False)[symptom_count_df.Count>0]



Unnamed: 0,Symptom_id,Symptom_name,Count
747,SYMP00768,Itching,5588
1580,SYMP01641,Cough,4340
735,SYMP00756,Fever,3598
2526,SYMP04504,Allergy,3528
846,SYMP00875,Headache,3066
1168,SYMP01207,Joint Pain,2394
2363,SYMP04032,Skin Infections,1940
2308,SYMP03974,Pain - Lower Back,1391
120,SYMP00127,Acid Regurgitation,1233
724,SYMP00745,Dizziness,1164


In [60]:
if os.path.exists('disease_count_df.csv'):
    disease_count_df = pd.read_csv('disease_count_df.csv')
    
else:

    disease_ids_list = sorted(list(diseases.DISEASE_ID))
    disease_count_df = pd.DataFrame(columns=['Disease_id','Disease_name','Count'])

    for i,disease_id in enumerate(disease_ids_list):
        count = patients_df_symptomId[patients_df_symptomId.DISEASE_ID.str.contains(disease_id)].shape[0]
        name = diseases.DISEASE_NAME[diseases.DISEASE_ID == disease_id].tolist()[0]
        disease_count_df.loc[i] = [disease_id,name,count]
        
    disease_count_df.to_csv('disease_count_df.csv', index=False)

disease_count_df.sort_values(by=['Count'],ascending=False)[disease_count_df.Count>0]



Unnamed: 0,Disease_id,Disease_name,Count
198,DISE00200,Common Cold,5156
730,DISE00734,Scabies,3334
1532,DISE02534,Musculoskeletal Lower Back Pain,2964
334,DISE00336,Gastritis,2717
4193,DISE05607,Tinea corporis,1919
4195,DISE05609,Tinea cruris,1720
885,DISE00889,Gastroenteritis,1030
1119,DISE02112,Acne Vulgaris,973
268,DISE00270,Eczema,902
1894,DISE02897,Headache,863


In [61]:
patients_df_symptomId.SYMPTOM_ID = (patients_df_symptomId.SYMPTOM_ID.str.split('~'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [62]:
mlb_symtoms = MultiLabelBinarizer()

In [63]:
symptoms_encoded = mlb_symtoms.fit_transform(patients_df_symptomId.SYMPTOM_ID)
patients_df_symptomId['symptoms_encoded'] = symptoms_encoded.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [64]:
symptoms_encoded

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [34]:
patients_df_symptomId.DISEASE_ID = (patients_df_symptomId.DISEASE_ID.str.split('~'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [35]:
mlb_diseases = MultiLabelBinarizer()

In [36]:
diseases_encoded = mlb_diseases.fit_transform(patients_df_symptomId.DISEASE_ID)
patients_df_symptomId['diseases_encoded'] = diseases_encoded.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


<b>Finally, before moving on to training, categorizing gender and district name too</b>

In [37]:
patients_df_symptomId['gender_categorized'] = 0
patients_df_symptomId['gender_categorized'][patients_df_symptomId.GENDER == 'Male'] = 0
patients_df_symptomId['gender_categorized'][patients_df_symptomId.GENDER == 'Female'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a

In [38]:
label_enc_district = LabelEncoder()

In [39]:
patients_df_symptomId['district_categorized'] = label_enc_district.fit_transform(patients_df_symptomId.DISTRICT_NAME)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [40]:
patients_df_symptomId.head()

Unnamed: 0,PATIENT_VISIT_ID,GENDER,AGE,STATE_NAME,DISTRICT_NAME,CENTER_SHORT_NAME,SYSTOLIC_BP,DIASTOLIC_BP,PULSE,RESPIRATION_RATE,...,REFERRED,age_binned,blood_pressure,pulse_rate_categorized,respiration_rate_categorized,SPO2_categorized,symptoms_encoded,diseases_encoded,gender_categorized,district_categorized
1,PV000000554650,Male,42.0,Rajasthan,Jhunjhunun,B D K Hospital Jhunjhunu,138.0,96.0,86.0,18.0,...,N,9,4,1,1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,27
5,PV000000554890,Male,63.0,Rajasthan,Ajmer,Y N Hospital Kishangarh,120.0,80.0,74.0,26.0,...,N,13,1,1,2,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
7,PV000000554912,Male,65.5,Rajasthan,Dholpur,Bari,146.0,100.0,89.0,18.0,...,N,14,4,1,1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,16
9,PV000000554982,Female,26.0,Chhattisgarh,Dantewada,Nerli,110.0,80.0,84.0,22.0,...,N,6,1,1,2,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,15
20,PV000000555256,Male,20.0,Rajasthan,Dholpur,Sadar Hospital Dholpur,120.0,80.0,82.0,18.0,...,N,4,1,1,1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,16


<b>Preparing data for training:</b>

In [41]:
patients_df_symptomId.columns

Index(['PATIENT_VISIT_ID', 'GENDER', 'AGE', 'STATE_NAME', 'DISTRICT_NAME',
       'CENTER_SHORT_NAME', 'SYSTOLIC_BP', 'DIASTOLIC_BP', 'PULSE',
       'RESPIRATION_RATE', 'BODY_TEMPERATURE', 'BODY_WEIGHT', 'HEIGHT', 'SPO2',
       'SYMPTOM_ID', 'MEDICAL_TEST_NAME', 'DISEASE_ID', 'REFERRED',
       'age_binned', 'blood_pressure', 'pulse_rate_categorized',
       'respiration_rate_categorized', 'SPO2_categorized', 'symptoms_encoded',
       'diseases_encoded', 'gender_categorized', 'district_categorized'],
      dtype='object')

In [42]:
features_df = patients_df_symptomId[['gender_categorized','age_binned','district_categorized','blood_pressure','pulse_rate_categorized',
                              'respiration_rate_categorized','BODY_TEMPERATURE','BODY_WEIGHT','HEIGHT','SPO2_categorized']]

In [43]:
features = features_df.values

In [44]:
features = np.hstack((features, symptoms_encoded))

In [45]:
labels = diseases_encoded

In [46]:
if not os.path.exists("features.csv"):
    np.savetxt("features.csv", features, delimiter=",")
if not os.path.exists("labels.csv"):
    np.savetxt("labels.csv", labels, delimiter=",")

In [47]:
print(features.shape,labels.shape)

(35791, 787) (35791, 301)


In [48]:
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size = 0.3, random_state = 42)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(25053, 787) (10738, 787) (25053, 301) (10738, 301)


<b>Start training</b>

In [49]:
# from xgboost import XGBClassifier
# from skmultilearn.problem_transform import BinaryRelevance, LabelPowerset

<b>Using random forest model with default hyperparameters</b>

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [6]:
if os.path.exists('random_forest_model.sav'):
    clf = pickle.load(open('random_forest_model.sav', 'rb'))
else:
    clf = OneVsRestClassifier(RandomForestClassifier())
    clf.fit(X_train, Y_train)
    pickle.dump(clf, open('random_forest_model.sav', 'wb'))



In [7]:
joblib.dump(clf, "random_forest_model.joblib")

['random_forest_model.joblib']

In [52]:
Y_predict = clf.predict(X_test)

In [53]:
Y_predict_proba = clf.predict_proba(X_test)

In [54]:
Y_predict.shape

(10738, 301)

In [55]:
clf.score(X_test,Y_predict)

1.0

In [56]:
from sklearn.metrics import hamming_loss

In [57]:
hamming_loss(Y_test,Y_predict)

0.0031146566142906027

<b> Trying a lightgbm model with Grid Search</b>

In [58]:
from lightgbm import LGBMClassifier

In [59]:
clf_lgb = OneVsRestClassifier(LGBMClassifier(objective='multiclassova',num_class=301))

In [60]:
clf_lgb

OneVsRestClassifier(estimator=LGBMClassifier(boosting_type='gbdt',
                                             class_weight=None,
                                             colsample_bytree=1.0,
                                             importance_type='split',
                                             learning_rate=0.1, max_depth=-1,
                                             min_child_samples=20,
                                             min_child_weight=0.001,
                                             min_split_gain=0.0,
                                             n_estimators=100, n_jobs=-1,
                                             num_class=301, num_leaves=31,
                                             objective='multiclassova',
                                             random_state=None, reg_alpha=0.0,
                                             reg_lambda=0.0, silent=True,
                                             subsample=1.0,
                   

In [None]:
clf_lgb.fit(X_train,Y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


<b>Serving model</b>

In [12]:
# Define IAM role
role = get_execution_role()

my_region = boto3.session.Session().region_name # set the region of the instance
print("Success - the MySageMakerInstance is in the " + my_region + " region.")

Success - the MySageMakerInstance is in the us-east-1 region.


In [13]:
bucket_name = 'test-medikate'
s3 = boto3.resource('s3')

In [21]:
prefix = 'disease_classification'

sagemaker_session = sagemaker.Session()
train_input = sagemaker_session.upload_data(path='ML DATA.csv',bucket=bucket_name, key_prefix="{}/{}".format(prefix, WORK_DIRECTORY))

In [68]:
from sagemaker.sklearn.estimator import SKLearn

script_path = 'random_forest_predictor.py'

sklearn = SKLearn(
    entry_point=script_path,
    train_instance_type="ml.c4.xlarge",
    role=role,
    sagemaker_session=sagemaker_session,
    hyperparameters={'max_depth': 50})

In [69]:
sklearn.fit({'train': train_input})

2019-12-06 11:32:08 Starting - Starting the training job...
2019-12-06 11:32:09 Starting - Launching requested ML instances......
2019-12-06 11:33:18 Starting - Preparing the instances for training...
2019-12-06 11:33:55 Downloading - Downloading input data...
2019-12-06 11:34:27 Training - Training image download completed. Training in progress.[31m2019-12-06 11:34:27,816 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[31m2019-12-06 11:34:27,818 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-12-06 11:34:27,828 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[31m2019-12-06 11:34:28,098 sagemaker-containers INFO     Module random_forest_predictor does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2019-12-06 11:34:28,098 sagemaker-containers INFO     Generating setup.cfg[0m
[31m2019-12-06 11:34:28,098 sagemaker-containers INFO     Generating MANIFE















[31m2019-12-06 11:40:42,128 sagemaker-containers INFO     Reporting training SUCCESS[0m

2019-12-06 11:40:56 Uploading - Uploading generated training model
2019-12-06 11:40:56 Completed - Training job completed
Training seconds: 421
Billable seconds: 421


In [70]:
predictor = sklearn.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

--------------------------------------------------------------------------------------------------------------!