In [10]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request, csv
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                                 
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer 

from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.externals import joblib

from io import StringIO

In [35]:
# Names of the derived / raw feature columns selected for modelling.
# 'district_categorized' is deliberately left out of the list for now.
cols_list = [
    'gender_categorized',
    'age_binned',
    'blood_pressure',
    'pulse_rate_categorized',
    'respiration_rate_categorized',
    'BODY_TEMPERATURE',
    'BODY_WEIGHT',
    'HEIGHT',
    'SPO2_categorized',
]

# Raw vital-sign columns; drop_unnecessary_cols treats a 0 in any of these
# as a missing reading.
vitals = [
    'SYSTOLIC_BP',
    'DIASTOLIC_BP',
    'PULSE',
    'RESPIRATION_RATE',
    'BODY_TEMPERATURE',
    'BODY_WEIGHT',
    'HEIGHT',
    'SPO2',
]

In [13]:
def drop_unnecessary_cols(df, vital_cols=None):
    '''Remove unused columns and unusable rows, and mark zero vitals as missing.

    Works on a copy, so the caller's frame is left untouched and the function
    can be re-run safely (columns that are already gone are ignored).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the DISEASE_ID and SYMPTOM_ID columns plus every column
        named in ``vital_cols``.
    vital_cols : list of str, optional
        Columns in which a value of 0 means "not recorded". Defaults to the
        module-level ``vitals`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame without HEART_RATE / HEAD_CIRCUMFERENCE /
        UPPER_ARM_CIRCUMFERENCE, without rows whose DISEASE_ID or SYMPTOM_ID is
        null, without rows whose DISEASE_ID is '0' or 'Confusion', and with
        zeros in the vital columns replaced by NaN.
    '''
    if vital_cols is None:
        vital_cols = vitals
    vital_cols = list(vital_cols)

    unused_cols = ['HEART_RATE', 'HEAD_CIRCUMFERENCE', 'UPPER_ARM_CIRCUMFERENCE']
    trimmed = df.drop(columns=unused_cols, errors='ignore')

    # A single boolean mask filters by position, so rows that merely share an
    # index label with a rejected row are not removed along with it.
    keep = (
        trimmed['DISEASE_ID'].notnull()
        & trimmed['SYMPTOM_ID'].notnull()
        & ~trimmed['DISEASE_ID'].isin(['0', 'Confusion'])
    )
    cleaned = trimmed.loc[keep].copy()

    # A recorded 0 is treated as a missing reading so it can be imputed later.
    cleaned[vital_cols] = cleaned[vital_cols].replace(0, np.nan)

    return cleaned

def drop_wrong_vitals(df):
    '''Drop, in place, rows whose vitals or age fall outside the accepted ranges.

    Each rule is evaluated against the rows still present and the matching
    index labels are removed from ``df`` itself. Comparisons with NaN are
    False, so rows with missing readings are kept.

    Returns the same (mutated) DataFrame object.
    '''
    out_of_range_rules = (
        lambda d: (d.SYSTOLIC_BP > 230) | (d.DIASTOLIC_BP > 150),
        lambda d: (d.SYSTOLIC_BP < 80) | (d.DIASTOLIC_BP < 50),
        lambda d: (d.BODY_TEMPERATURE < 90) | (d.BODY_TEMPERATURE > 110),
        lambda d: d.AGE > 100,
        lambda d: (d.SPO2 > 100) | (d.SPO2 < 80),
        lambda d: (d.PULSE < 50) | (d.PULSE > 120),
        lambda d: (d.RESPIRATION_RATE > 30) | (d.RESPIRATION_RATE < 13),
        lambda d: d.BODY_WEIGHT > 200,
        lambda d: (d.HEIGHT > 200) | (d.HEIGHT < 45),
    )

    for is_out_of_range in out_of_range_rules:
        rejected_labels = df[is_out_of_range(df)].index
        df.drop(rejected_labels, inplace=True)

    return df

def bin_ages(df):
    '''Add an ``age_binned`` column that buckets AGE into 5-year bands.

    Bands are (0-5], (5-10], ... (95-100], labelled 1 through 20; an age of
    exactly 0 falls in band 1. Ages outside 0-100 become NaN. The column is
    added to ``df`` in place and ``df`` is returned.
    '''
    edges = list(range(0, 101, 5))
    band_labels = list(range(1, 21))

    df['age_binned'] = pd.cut(
        df.AGE,
        edges,
        labels=band_labels,
        include_lowest=True,
    )

    return df

def fill_nas(df):
    '''Fill missing vital readings with fixed default values.

    WARNING: These values need to be entered as per the median of data.

    Each filled column is assigned back to ``df``. Calling ``fillna`` with
    ``inplace=True`` on a column selection is not guaranteed to reach the
    parent frame (it does nothing under pandas Copy-on-Write).

    Returns the same DataFrame object with SYSTOLIC_BP, DIASTOLIC_BP, PULSE,
    RESPIRATION_RATE, BODY_TEMPERATURE, BODY_WEIGHT, HEIGHT and SPO2 free of
    NaN.
    '''
    default_by_column = {
        'SYSTOLIC_BP': 120,
        'DIASTOLIC_BP': 80,
        'PULSE': 84,
        'RESPIRATION_RATE': 18,
        'BODY_TEMPERATURE': 98.2,
        'BODY_WEIGHT': 45,
        'HEIGHT': 151,
        'SPO2': 99,
    }

    for column, default in default_by_column.items():
        df[column] = df[column].fillna(default)

    return df

def categorize_bp(df):
    '''Add a ``blood_pressure`` column with a 0-5 category per row.

    Rules are applied in order and a later match overrides an earlier one:
      0 - systolic < 90 or diastolic < 60 (also the default, e.g. for NaN)
      1 - systolic 90-120 and diastolic 60-80
      2 - systolic above 120 and below 130, diastolic 60-80
      3 - systolic 130-139 or diastolic 81-89
      4 - systolic 140-179 or diastolic 90-119
      5 - systolic >= 180 or diastolic >= 120

    Uses ``df.loc[mask, column]`` rather than chained indexing so the
    assignment is guaranteed to land on ``df``. The column is added in place
    and ``df`` is returned.
    '''
    systolic = df.SYSTOLIC_BP
    diastolic = df.DIASTOLIC_BP
    diastolic_normal = (diastolic >= 60) & (diastolic <= 80)

    # Ordered (mask, category) pairs; order matters because later rules win.
    rules = [
        ((systolic < 90) | (diastolic < 60), 0),
        ((systolic >= 90) & (systolic <= 120) & diastolic_normal, 1),
        ((systolic > 120) & (systolic < 130) & diastolic_normal, 2),
        (((systolic >= 130) & (systolic < 140)) | ((diastolic > 80) & (diastolic < 90)), 3),
        (((systolic >= 140) & (systolic < 180)) | ((diastolic >= 90) & (diastolic < 120)), 4),
        ((systolic >= 180) | (diastolic >= 120), 5),
    ]

    df['blood_pressure'] = 0
    for mask, category in rules:
        df.loc[mask, 'blood_pressure'] = category

    return df

def categorize_pulse_rate(df):
    '''Add a ``pulse_rate_categorized`` column derived from PULSE.

    0 - below 60 (also the default for missing values)
    1 - 60 to 100 inclusive
    2 - above 100

    Assignments go through ``df.loc`` instead of chained indexing so they are
    guaranteed to modify ``df``. The column is added in place and ``df`` is
    returned.
    '''
    pulse = df.PULSE

    df['pulse_rate_categorized'] = 0
    df.loc[(pulse >= 60) & (pulse <= 100), 'pulse_rate_categorized'] = 1
    df.loc[pulse > 100, 'pulse_rate_categorized'] = 2

    return df

def categorize_respiration_rate(df):
    '''Add a ``respiration_rate_categorized`` column derived from RESPIRATION_RATE.

    0 - below 16 (also the default for missing values)
    1 - 16 to 20 inclusive
    2 - above 20

    Assignments go through ``df.loc`` instead of chained indexing so they are
    guaranteed to modify ``df``. The column is added in place and ``df`` is
    returned.
    '''
    rate = df.RESPIRATION_RATE

    df['respiration_rate_categorized'] = 0
    df.loc[(rate >= 16) & (rate <= 20), 'respiration_rate_categorized'] = 1
    df.loc[rate > 20, 'respiration_rate_categorized'] = 2

    return df

def categorize_spo2(df):
    '''Add a ``SPO2_categorized`` flag: 1 when SPO2 >= 95, otherwise 0.

    Missing SPO2 values stay at the default 0. The assignment goes through
    ``df.loc`` instead of chained indexing so it is guaranteed to modify
    ``df``. The column is added in place and ``df`` is returned.
    '''
    df['SPO2_categorized'] = 0
    df.loc[df.SPO2 >= 95, 'SPO2_categorized'] = 1

    return df

def categorize_gender(df):
    '''Add a ``gender_categorized`` column: 1 for 'Female', 0 for everything else.

    'Male', missing values and any other GENDER string all map to 0. The
    assignment goes through ``df.loc`` instead of chained indexing so it is
    guaranteed to modify ``df``. The column is added in place and ``df`` is
    returned.
    '''
    df['gender_categorized'] = 0
    df.loc[df.GENDER == 'Female', 'gender_categorized'] = 1

    return df

def categorize_district(df):
    '''Add a ``district_categorized`` column of integer codes for DISTRICT_NAME.

    A fresh LabelEncoder is fitted on the column each call and then discarded,
    so the name-to-code mapping is not available to the caller afterwards.
    The column is added in place and ``df`` is returned.
    '''
    district_encoder = LabelEncoder()
    district_codes = district_encoder.fit_transform(df.DISTRICT_NAME)
    df['district_categorized'] = district_codes

    return df

def encode_symptoms(df):
    '''Multi-hot encode the '~'-separated SYMPTOM_ID strings.

    SYMPTOM_ID is overwritten in place with a list of the non-empty ids per
    row, and a ``symptoms_encoded`` column holding each row's indicator vector
    (as a Python list) is added. The MultiLabelBinarizer is fitted here and
    not returned, so the column-to-symptom mapping is not exposed.

    Returns ``(df, symptoms_encoded)`` where ``symptoms_encoded`` is the
    indicator matrix produced by the binarizer.
    '''
    split_rows = df.SYMPTOM_ID.str.split('~').tolist()

    # Padding such as 'A~B~~~~' yields empty strings after the split.
    cleaned_rows = []
    for parts in split_rows:
        cleaned_rows.append([part for part in parts if part != ''])
    df.SYMPTOM_ID = cleaned_rows

    symptom_binarizer = MultiLabelBinarizer()
    symptoms_encoded = symptom_binarizer.fit_transform(df.SYMPTOM_ID)

    df['symptoms_encoded'] = symptoms_encoded.tolist()

    return df, symptoms_encoded

def encode_diseases(df):
    '''Multi-hot encode the '~'-separated DISEASE_ID strings.

    DISEASE_ID is overwritten in place with the list of ids per row, and a
    ``diseases_encoded`` column holding each row's indicator vector (as a
    Python list) is added. The MultiLabelBinarizer is fitted here and not
    returned, so the column-to-disease mapping is not exposed.

    Returns ``(df, diseases_encoded)`` where ``diseases_encoded`` is the
    indicator matrix produced by the binarizer.
    '''
    disease_lists = df.DISEASE_ID.str.split('~')
    df.DISEASE_ID = disease_lists

    disease_binarizer = MultiLabelBinarizer()
    diseases_encoded = disease_binarizer.fit_transform(df.DISEASE_ID)

    df['diseases_encoded'] = diseases_encoded.tolist()

    return df, diseases_encoded

def format_data(df):
    '''Run the cleaning stages in order and return the resulting frame.

    Stages: drop_unnecessary_cols, then drop_wrong_vitals, then fill_nas.
    Range filtering happens before imputation, so the imputed defaults are
    never checked against the range rules.
    '''
    cleaning_stages = (drop_unnecessary_cols, drop_wrong_vitals, fill_nas)
    for stage in cleaning_stages:
        df = stage(df)
    return df


In [11]:
# Load the raw spreadsheet export; the path is relative to the notebook's
# working directory.
test_data = pd.read_excel('ML DATA 03012020.xlsx')

In [14]:
# Preview the first rows of the raw data before any cleaning.
test_data.head()

Unnamed: 0,PATIENT_VISIT_ID,GENDER,AGE,STATE_NAME,DISTRICT_NAME,DISEASE_ID,SYMPTOM_ID,SYSTOLIC_BP,DIASTOLIC_BP,PULSE,RESPIRATION_RATE,BODY_TEMPERATURE,BODY_WEIGHT,HEART_RATE,HEIGHT,HEAD_CIRCUMFERENCE,UPPER_ARM_CIRCUMFERENCE,SPO2
0,PV000000183054,Female,18.0,Orissa,NABARANGAPUR,DISE02857,SYMP00756~SYMP00875~~~~~~~~~,110.0,70.0,114.0,19.0,101.3,45.0,,120.0,,,
1,PV000000191142,Male,24.0,Orissa,NABARANGAPUR,DISE18070,SYMP00103~SYMP01220~~~~~~~~~,120.0,80.0,70.0,18.0,98.6,45.0,,153.0,,,
2,PV000000165538,Male,14.0,Orissa,NABARANGAPUR,DISE03581,SYMP00875~SYMP03669~~~~~~~~~,80.0,60.0,96.0,20.0,98.0,0.0,,164.0,,,
3,PV000000175974,Male,54.0,Orissa,NABARANGAPUR,DISE04688,SYMP01641~SYMP01207~~~~~~~~~,120.0,80.0,95.0,20.0,98.0,0.0,,165.0,,,
4,PV000000181454,Male,25.0,Orissa,NABARANGAPUR,DISE02587,SYMP00823~SYMP00829~~~~~~~~~,120.0,80.0,81.0,18.0,98.0,53.0,,154.0,,,


In [16]:
# Apply the cleaning pipeline; the name is rebound to the cleaned frame, so
# the raw data must be reloaded to get it back.
test_data = format_data(test_data)

In [17]:
# Preview the cleaned data (unused columns removed, missing vitals filled).
test_data.head()

Unnamed: 0,PATIENT_VISIT_ID,GENDER,AGE,STATE_NAME,DISTRICT_NAME,DISEASE_ID,SYMPTOM_ID,SYSTOLIC_BP,DIASTOLIC_BP,PULSE,RESPIRATION_RATE,BODY_TEMPERATURE,BODY_WEIGHT,HEIGHT,SPO2
0,PV000000183054,Female,18.0,Orissa,NABARANGAPUR,DISE02857,SYMP00756~SYMP00875~~~~~~~~~,110.0,70.0,114.0,19.0,101.3,45.0,120.0,99.0
1,PV000000191142,Male,24.0,Orissa,NABARANGAPUR,DISE18070,SYMP00103~SYMP01220~~~~~~~~~,120.0,80.0,70.0,18.0,98.6,45.0,153.0,99.0
2,PV000000165538,Male,14.0,Orissa,NABARANGAPUR,DISE03581,SYMP00875~SYMP03669~~~~~~~~~,80.0,60.0,96.0,20.0,98.0,45.0,164.0,99.0
3,PV000000175974,Male,54.0,Orissa,NABARANGAPUR,DISE04688,SYMP01641~SYMP01207~~~~~~~~~,120.0,80.0,95.0,20.0,98.0,45.0,165.0,99.0
4,PV000000181454,Male,25.0,Orissa,NABARANGAPUR,DISE02587,SYMP00823~SYMP00829~~~~~~~~~,120.0,80.0,81.0,18.0,98.0,53.0,154.0,99.0


In [18]:
# Save the first 1000 cleaned rows, without the index column, as the CSV that
# is replayed against the endpoint further down.
test_data.head(1000).to_csv("Test DF.csv", index = False)

In [19]:
# Look up the IAM role the notebook is running under.
role = get_execution_role()

# Read (not set) the region configured for the default boto3 session.
# NOTE(review): region_name may be None when no region is configured, in which
# case the string concatenation below raises TypeError.
my_region = boto3.session.Session().region_name # region of the current boto3 session
print("Success - the MySageMakerInstance is in the " + my_region + " region.")

Success - the MySageMakerInstance is in the us-east-1 region.


In [21]:
# Name of the already-deployed SageMaker endpoint to query.
# NOTE(review): hardcoded to one specific deployment — update when the model
# is redeployed.
endpoint_new = 'sagemaker-scikit-learn-2020-01-07-07-13-43-598'

In [22]:
# Client for the deployed endpoint; request bodies are sent as text/csv.
# No deserializer is configured, so predict() results are decoded manually
# where they are used.
predictor = sagemaker.predictor.RealTimePredictor(
    endpoint = endpoint_new,
    content_type='text/csv')

In [34]:
# Replay every saved row against the endpoint and print the recorded label
# next to the endpoint's response.
with open('Test DF.csv', 'r') as f:
    header = f.readline()

    # Find the label column by name instead of relying on a fixed position;
    # csv.reader also copes with quoted fields that contain commas.
    disease_col = next(csv.reader([header])).index('DISEASE_ID')

    # Iterate the file lazily rather than loading every line up front.
    for line in f:
        if not line.strip():
            continue  # ignore blank lines such as a trailing newline

        actual = next(csv.reader([line]))[disease_col]

        # Payload is kept exactly as before: header line, an extra newline,
        # then the data row.
        # NOTE(review): `header` already ends with a newline, so this sends a
        # blank line between header and row — confirm the endpoint's input
        # handler expects that before changing it.
        prediction_text = header + '\n' + line
        predicted = predictor.predict(prediction_text).decode('utf-8')

        print("Actual:" + actual + "\tPredicted:" + predicted)

Actual:DISE02857	Predicted:{"diseases": ["DISE02857", "DISE19879"]}
Actual:DISE18070	Predicted:{"diseases": ["DISE02386", "DISE18070"]}
Actual:DISE03581	Predicted:{"diseases": ["DISE03581"]}
Actual:DISE04688	Predicted:{"diseases": ["DISE04688"]}
Actual:DISE02587	Predicted:{"diseases": ["DISE02587"]}
Actual:DISE04122	Predicted:{"diseases": ["DISE04122"]}
Actual:DISE02120	Predicted:{"diseases": ["DISE02120"]}
Actual:DISE02120	Predicted:{"diseases": []}
Actual:DISE04000	Predicted:{"diseases": ["DISE04000"]}
Actual:DISE02534	Predicted:{"diseases": ["DISE02534"]}
Actual:DISE04394	Predicted:{"diseases": ["DISE04394", "DISE09728"]}
Actual:DISE02938	Predicted:{"diseases": ["DISE02938"]}
Actual:DISE03071	Predicted:{"diseases": ["DISE03071"]}
Actual:DISE02534	Predicted:{"diseases": ["DISE02534"]}
Actual:DISE00293	Predicted:{"diseases": ["DISE00293"]}
Actual:DISE02045	Predicted:{"diseases": ["DISE02045"]}
Actual:DISE02534	Predicted:{"diseases": ["DISE02534"]}
Actual:DISE09221	Predicted:{"diseases

KeyboardInterrupt: 

In [None]:
# NOTE(review): unfinished, never-executed cell — read_excel needs a file path
# (or buffer) argument, so this raises TypeError as written. Fill in the path
# or delete the cell.
df = pd.read_excel()