In [1]:
# General Data Science Libraries
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
import warnings
import pickle

# Machine Learning Libraries
import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.decomposition import PCA


# Natural Language Processing Libraries
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import PorterStemmer  
from sklearn.feature_extraction.text import TfidfVectorizer

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

warnings.filterwarnings('ignore')

/kaggle/input/synthea-dataset/observations.csv
/kaggle/input/synthea-dataset/careplans.csv
/kaggle/input/synthea-dataset/conditions.csv
/kaggle/input/synthea-dataset/encounters.csv
/kaggle/input/synthea-dataset/procedures.csv
/kaggle/input/synthea-dataset/allergies.csv
/kaggle/input/synthea-dataset/claims.csv
/kaggle/input/synthea-dataset/medications.csv
/kaggle/input/synthea-dataset/payers.csv
/kaggle/input/synthea-dataset/organizations.csv
/kaggle/input/synthea-dataset/imaging_studies.csv
/kaggle/input/synthea-dataset/supplies.csv
/kaggle/input/synthea-dataset/patients.csv
/kaggle/input/synthea-dataset/devices.csv
/kaggle/input/synthea-dataset/claims_transactions.csv
/kaggle/input/synthea-dataset/payer_transitions.csv
/kaggle/input/synthea-dataset/providers.csv
/kaggle/input/synthea-dataset/immunizations.csv


In [2]:
df_patients = pd.read_csv('../input/synthea-dataset/patients.csv')
df_patients.rename(columns = {'Id' : 'PATIENT'}, inplace = True)
patients_useful = ['PATIENT', 'BIRTHDATE', 'GENDER', 'ETHNICITY', 'STATE', 'HEALTHCARE_EXPENSES', 'INCOME']
df_patients = df_patients[patients_useful]
df_patients.head()

Unnamed: 0,PATIENT,BIRTHDATE,GENDER,ETHNICITY,STATE,HEALTHCARE_EXPENSES,INCOME
0,6132a397-93f1-3f41-a63b-2c86042ae94c,2000-03-01,F,nonhispanic,Massachusetts,26886.74,204335
1,d4fd6e52-f235-162f-7688-ee163a2635df,2001-11-26,F,nonhispanic,Massachusetts,30793.14,30686
2,1462dde3-e611-2c7d-1ecb-843949c77f02,1996-04-15,M,nonhispanic,Massachusetts,33688.44,93953
3,448f4d95-6ede-06fe-0142-b2afe8792711,1963-01-10,F,hispanic,Massachusetts,1635021.82,40514
4,57f4670b-b6b0-df0d-0122-5d9f68e39f63,1989-01-02,F,nonhispanic,Massachusetts,40714.75,946218


In [3]:
def text_cleaning(df, column):
    df_column_array = np.array(df[column])
    text = []
    ps = PorterStemmer()
    for index in range(len(df_column_array)):
        modifiedText = remove_stopwords(df.at[index, column])
        stemmedSentence = ""
        for word in modifiedText.split():
            stemmedSentence += ps.stem(word)
            stemmedSentence += " "
        df.at[index, column] = stemmedSentence

In [4]:
df_allergies = pd.read_csv('../input/synthea-dataset/allergies.csv')
allergies_useful = ['START', 'PATIENT', 'ENCOUNTER', 'CODE', 'DESCRIPTION', 'DESCRIPTION1', 'DESCRIPTION2']
df_allergies = df_allergies[allergies_useful]

for description_column in ['DESCRIPTION', 'DESCRIPTION1', 'DESCRIPTION2']:
    df_allergies[description_column].fillna("", inplace = True);

text_cleaning(df_allergies, 'DESCRIPTION')
text_cleaning(df_allergies, 'DESCRIPTION1')
text_cleaning(df_allergies, 'DESCRIPTION2')
    
df_allergies.tail()

Unnamed: 0,START,PATIENT,ENCOUNTER,CODE,DESCRIPTION,DESCRIPTION1,DESCRIPTION2
826,2002-11-14,856b0887-e9d8-7429-736e-fc9b8e177f7b,74e78060-ea36-6108-aa53-f26980821678,84489001,mold (organism),,
827,2002-11-14,856b0887-e9d8-7429-736e-fc9b8e177f7b,74e78060-ea36-6108-aa53-f26980821678,260147004,hous dust mite (organism),,
828,2002-11-14,856b0887-e9d8-7429-736e-fc9b8e177f7b,74e78060-ea36-6108-aa53-f26980821678,264287008,anim dander (substance),rhinoconjunct (disorder),cough (finding)
829,2002-11-14,856b0887-e9d8-7429-736e-fc9b8e177f7b,74e78060-ea36-6108-aa53-f26980821678,782576004,tree pollen (substance),,
830,2002-11-14,856b0887-e9d8-7429-736e-fc9b8e177f7b,74e78060-ea36-6108-aa53-f26980821678,442571000124108,tree nut (substance),anaphylaxi (disorder),allerg angioedema (disorder)


In [5]:
df_encounters = pd.read_csv('../input/synthea-dataset/encounters.csv')

df_encounters.rename(columns = {'Id': 'ENCOUNTER'}, inplace = True)

encounters_useful = ['ENCOUNTER', 'PATIENT', 'ORGANIZATION', 'ENCOUNTERCLASS', 'DESCRIPTION']
df_encounters = df_encounters[encounters_useful]

df_encounters.head()

Unnamed: 0,ENCOUNTER,PATIENT,ORGANIZATION,ENCOUNTERCLASS,DESCRIPTION
0,7e8c4227-0504-8acd-98d0-bfba8b5506ef,6132a397-93f1-3f41-a63b-2c86042ae94c,218e58e4-9642-3ebd-9b39-34d15f22c620,wellness,Well child visit (procedure)
1,cd5c1b99-8a07-325f-6709-23b97091612e,6132a397-93f1-3f41-a63b-2c86042ae94c,218e58e4-9642-3ebd-9b39-34d15f22c620,wellness,Well child visit (procedure)
2,a7bea3d0-0a80-6238-9833-ca0485985514,6132a397-93f1-3f41-a63b-2c86042ae94c,bdc3ee76-9cf3-316d-b202-a8da1ea3fa20,emergency,Emergency room admission (procedure)
3,4442b292-4233-8ba8-f3c6-0df9f394818b,6132a397-93f1-3f41-a63b-2c86042ae94c,218e58e4-9642-3ebd-9b39-34d15f22c620,wellness,Well child visit (procedure)
4,693b145d-cd1c-3ec1-c00e-3802b5e05e5e,6132a397-93f1-3f41-a63b-2c86042ae94c,25431f9a-00e4-36d8-9810-4125328eec50,ambulatory,Encounter for 'check-up'


In [6]:
df_allergies_joined_encounters = pd.merge(df_allergies, df_encounters, on = "ENCOUNTER")
df_allergies_joined_encounters.drop(columns = ['START', 'CODE', 'ORGANIZATION', 'PATIENT_y', 'DESCRIPTION_y'], inplace = True)
df_allergies_joined_encounters.rename(columns = {'PATIENT_x': 'PATIENT', 'DESCRIPTION_x': 'ALLERGY'}, inplace = True)

df_final = pd.merge(df_patients, df_allergies_joined_encounters, on = "PATIENT")
df_final.drop(columns = ['PATIENT'], inplace = True)

df_final_encounter = df_final.copy()
df_final.drop(columns = ['ENCOUNTER'], inplace = True)

df_final['ENCOUNTERCLASS'] = df_final['ENCOUNTERCLASS'].map({'ambulatory':0,
                                                             'emergency':1,
                                                             'inpatient':2,
                                                             'outpatient':3,
                                                             'wellness':4,
                                                            })

for i in range(len(df_final)):
    date = df_final.at[i, 'BIRTHDATE']
    df_final.at[i, 'BIRTHDATE'] = int(date[:4])
    
df_final['BIRTHDATE'] = pd.to_numeric(df_final['BIRTHDATE'])

genders = {'M': 1, 'F': 0}
df_final['GENDER'] = df_final['GENDER'].map(genders)

In [7]:
ohe = preprocessing.OneHotEncoder(sparse = False, handle_unknown = "ignore")
new_columns_generated = []

for value in sorted(df_final['ETHNICITY'].unique()):
    new_columns_generated.append(value)

for value in sorted(df_final['STATE'].unique()):
    new_columns_generated.append(value)

transformed_array = ohe.fit_transform(df_final[['ETHNICITY', 'STATE']])
df_temp = pd.DataFrame(transformed_array, columns = new_columns_generated)

df_final = pd.concat([df_final, df_temp], axis = 1)
df_final.drop(columns = ['ETHNICITY', 'STATE'], inplace = True)
df_final.head()

Unnamed: 0,BIRTHDATE,GENDER,HEALTHCARE_EXPENSES,INCOME,ALLERGY,DESCRIPTION1,DESCRIPTION2,ENCOUNTERCLASS,hispanic,nonhispanic,Massachusetts
0,1989,0,40714.75,946218,mold (organism),sneez,allerg skin rash,0,0.0,1.0,1.0
1,1995,0,51676.82,39161,mold (organism),,,0,0.0,1.0,1.0
2,1995,0,51676.82,39161,hous dust mite (organism),,,0,0.0,1.0,1.0
3,1995,0,51676.82,39161,anim dander (substance),rhinoconjunct (disorder),,0,0.0,1.0,1.0
4,1995,0,51676.82,39161,grass pollen (substance),,,0,0.0,1.0,1.0


In [8]:
patient_features = list(df_final.columns)
patient_features.remove('ENCOUNTERCLASS')
patient_features.remove('ALLERGY')
patient_features

['BIRTHDATE',
 'GENDER',
 'HEALTHCARE_EXPENSES',
 'INCOME',
 'DESCRIPTION1',
 'DESCRIPTION2',
 'hispanic',
 'nonhispanic',
 'Massachusetts']

In [9]:
for index in range(df_final.shape[0]):
    df_final.at[index, 'ALLERGY'] = df_final.at[index, 'ALLERGY'] + " " + df_final.at[index, 'DESCRIPTION1'] + " "
    df_final.at[index, 'ALLERGY'] += df_final.at[index, 'DESCRIPTION2']

df_final.drop(columns = ['DESCRIPTION1', 'DESCRIPTION2'], inplace = True)
df_final.head()

Unnamed: 0,BIRTHDATE,GENDER,HEALTHCARE_EXPENSES,INCOME,ALLERGY,ENCOUNTERCLASS,hispanic,nonhispanic,Massachusetts
0,1989,0,40714.75,946218,mold (organism) sneez allerg skin rash,0,0.0,1.0,1.0
1,1995,0,51676.82,39161,mold (organism),0,0.0,1.0,1.0
2,1995,0,51676.82,39161,hous dust mite (organism),0,0.0,1.0,1.0
3,1995,0,51676.82,39161,anim dander (substance) rhinoconjunct (disord...,0,0.0,1.0,1.0
4,1995,0,51676.82,39161,grass pollen (substance),0,0.0,1.0,1.0


In [10]:
def vectorize(data,tfidf_vect_fit):
    x_tfidf = tfidf_vect_fit.transform(data)
    words = tfidf_vect_fit.get_feature_names()
    x_tfidf_df = pd.DataFrame(x_tfidf.toarray())
    x_tfidf_df.columns = words
    return (x_tfidf_df)

In [11]:
tfidf_vect = TfidfVectorizer(analyzer = 'word', stop_words = 'english',ngram_range = (1,3), max_df = 0.5, use_idf = True, smooth_idf = True, max_features = 1000)
tfidf_vect_fit = tfidf_vect.fit(df_final['ALLERGY'])

df_final.reset_index(drop = True, inplace = True)
df_temp = vectorize(df_final['ALLERGY'], tfidf_vect_fit)
df_temp.reset_index(drop = True, inplace = True)

df_final = pd.concat([df_final, df_temp], axis = 1)
df_final.drop(columns = ['ALLERGY'], inplace = True)

df_final.head()

Unnamed: 0,BIRTHDATE,GENDER,HEALTHCARE_EXPENSES,INCOME,ENCOUNTERCLASS,hispanic,nonhispanic,Massachusetts,abdomin,abdomin pain,...,wheal finding wheez,wheat,wheat substance,wheez,wheez allerg,wheez allerg skin,wheez finding,wheez finding diarrhea,wheez nose,wheez nose run
0,1989,0,40714.75,946218,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1995,0,51676.82,39161,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1995,0,51676.82,39161,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1995,0,51676.82,39161,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1995,0,51676.82,39161,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df_final = df_final[df_final['ENCOUNTERCLASS'] != 5]
df_final['ENCOUNTERCLASS'] = df_final['ENCOUNTERCLASS'].fillna(0)

encounter_class_list = ['ambulatory', 'emergency', 'inpatient', 'outpatient', 'wellness']

In [13]:
space = {
        'objective': 'multi:softprob',
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.loguniform('reg_alpha', 1e-8,100),
        'reg_lambda' : hp.loguniform('reg_lambda', 1e-8,100),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.1,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 1800,
        'seed': 0,
        'learning_rate': hp.quniform("learning_rate", 1e-2, 0.25),
        'max_depth': hp.uniform('max_depth', 1, 7)
}

def objective(space):
    clf = XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']),
                    colsample_bytree = int(space['colsample_bytree']))
        
    xx_train = df_train.drop(columns = ['ENCOUNTERCLASS'])
    yy_train = df_train['ENCOUNTERCLASS']
    
    xx_test = df_test.drop(columns = ['ENCOUNTERCLASS'])
    yy_test = df_test['ENCOUNTERCLASS']
    
    evaluation = [(xx_train, yy_train), 
                  (xx_test, yy_test)]
    
    clf.fit(xx_train, yy_train, eval_set=evaluation, eval_metric=None, early_stopping_rounds=20,verbose=False)
    
    pred = clf.predict(xx_test)
    accuracy = accuracy_score(yy_test, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

trials = Trials()

# best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials)

In [14]:
def is_float(element) -> bool:
    try:
        float(element)
        return True
    except ValueError:
        return False

In [15]:
df_observations = pd.read_csv('../input/synthea-dataset/observations.csv')
observations_class = df_observations.groupby(df_observations['DESCRIPTION'])
filtered_observation_class = observations_class.filter(lambda x : x.shape[0] > 10000)
df_observations_grouped = filtered_observation_class.groupby(filtered_observation_class['DESCRIPTION'])
df_observations_grouped = df_observations_grouped.size().reset_index(name = 'count')

In [16]:
medical_features = ['ENCOUNTER', 'Heart rate', 'Diastolic Blood Pressure', 'Respiratory rate', 'Systolic Blood Pressure', 'Glucose', 'Tobacco smoking status', "Stress is when someone feels tense  nervous  anxious or can't sleep at night because their mind is troubled. How stressed are you?", 'Do you feel physically and emotionally safe where you currently live?']
df_features = pd.DataFrame(columns = medical_features)

encounter_index = {}
entry = 0

for index in range(df_observations.shape[0]):
    encounter = df_observations.at[index, 'ENCOUNTER']
    feature = df_observations.at[index, 'DESCRIPTION']
    value = df_observations.at[index, 'VALUE']
    if encounter in encounter_index:
        insert_at = encounter_index[encounter]
        df_features.at[insert_at, 'ENCOUNTER'] = encounter
    else:
        encounter_index[encounter] = entry
        entry = entry + 1
    if feature in medical_features:
        df_features.at[insert_at, feature] = value

df_features.head()

Unnamed: 0,ENCOUNTER,Heart rate,Diastolic Blood Pressure,Respiratory rate,Systolic Blood Pressure,Glucose,Tobacco smoking status,Stress is when someone feels tense nervous anxious or can't sleep at night because their mind is troubled. How stressed are you?,Do you feel physically and emotionally safe where you currently live?
0,cd5c1b99-8a07-325f-6709-23b97091612e,69.0,74.0,14.0,111.0,,Never smoked tobacco (finding),,
1,4442b292-4233-8ba8-f3c6-0df9f394818b,87.0,79.0,12.0,118.0,,Never smoked tobacco (finding),,
2,fa2cd5c6-de7b-5d30-4ddc-753e1c11498e,96.0,78.0,14.0,116.0,,Never smoked tobacco (finding),,
3,29956fe2-39f4-90a6-9312-ab53c1786b8f,81.0,78.0,14.0,111.0,,Never smoked tobacco (finding),,
4,042434ef-5872-e038-ee90-45d15c33c188,75.0,77.0,13.0,115.0,,Never smoked tobacco (finding),,


In [17]:
for numeric_features in medical_features[1:6]:
    df_features[numeric_features] = pd.to_numeric(df_features[numeric_features])
    df_features[numeric_features].fillna(df_features[numeric_features].mean(), inplace = True)

In [18]:
smoke = {
    'Never smoked tobacco (finding)': '0',
    'Smokes tobacco daily (finding)': '1'
}

df_features['Tobacco smoking status'] = df_features['Tobacco smoking status'].map(smoke)
df_features['Tobacco smoking status'] = pd.to_numeric(df_features['Tobacco smoking status'])
df_features['Tobacco smoking status'].fillna(0, inplace = True)
df_features['Tobacco smoking status'].value_counts()

0.0    22352
1.0       58
Name: Tobacco smoking status, dtype: int64

In [19]:
stress = {
    'Not at all': '0',
    'A little bit': '1',
    'Somewhat': '2',
    'Quite a bit': '3',
    'Very much': '4'
}

df_features[medical_features[-2]] = df_features[medical_features[-2]].map(stress)
df_features[medical_features[-2]] = pd.to_numeric(df_features[medical_features[-2]])
df_features[medical_features[-2]].fillna(0, inplace = True)
df_features[medical_features[-2]].value_counts()

0.0    16056
1.0     2897
2.0     2010
3.0      897
4.0      550
Name: Stress is when someone feels tense  nervous  anxious or can't sleep at night because their mind is troubled. How stressed are you?, dtype: int64

In [20]:
safe = {
    'Yes': '1',
    'No': '0',
    'Unsure': '0',
    'I choose not to answer this question': '0'
}

df_features[medical_features[-1]] = df_features[medical_features[-1]].map(safe)
df_features[medical_features[-1]] = pd.to_numeric(df_features[medical_features[-1]])
df_features[medical_features[-1]].fillna(0, inplace = True)
df_features[medical_features[-1]].value_counts()

0.0    11966
1.0    10444
Name: Do you feel physically and emotionally safe where you currently live?, dtype: int64

In [21]:
df_final['ENCOUNTER'] = df_final_encounter['ENCOUNTER']

In [22]:
df_final.head()

Unnamed: 0,BIRTHDATE,GENDER,HEALTHCARE_EXPENSES,INCOME,ENCOUNTERCLASS,hispanic,nonhispanic,Massachusetts,abdomin,abdomin pain,...,wheat,wheat substance,wheez,wheez allerg,wheez allerg skin,wheez finding,wheez finding diarrhea,wheez nose,wheez nose run,ENCOUNTER
0,1989,0,40714.75,946218,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7d7fb0e7-a45e-ffc2-ca9c-d5a0d8c183db
1,1995,0,51676.82,39161,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34aec7b9-5ba2-8a77-9b33-48cccae078c7
2,1995,0,51676.82,39161,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34aec7b9-5ba2-8a77-9b33-48cccae078c7
3,1995,0,51676.82,39161,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34aec7b9-5ba2-8a77-9b33-48cccae078c7
4,1995,0,51676.82,39161,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34aec7b9-5ba2-8a77-9b33-48cccae078c7


In [23]:
df_features.head()

Unnamed: 0,ENCOUNTER,Heart rate,Diastolic Blood Pressure,Respiratory rate,Systolic Blood Pressure,Glucose,Tobacco smoking status,Stress is when someone feels tense nervous anxious or can't sleep at night because their mind is troubled. How stressed are you?,Do you feel physically and emotionally safe where you currently live?
0,cd5c1b99-8a07-325f-6709-23b97091612e,69.0,74.0,14.0,111.0,87.622147,0.0,0.0,0.0
1,4442b292-4233-8ba8-f3c6-0df9f394818b,87.0,79.0,12.0,118.0,87.622147,0.0,0.0,0.0
2,fa2cd5c6-de7b-5d30-4ddc-753e1c11498e,96.0,78.0,14.0,116.0,87.622147,0.0,0.0,0.0
3,29956fe2-39f4-90a6-9312-ab53c1786b8f,81.0,78.0,14.0,111.0,87.622147,0.0,0.0,0.0
4,042434ef-5872-e038-ee90-45d15c33c188,75.0,77.0,13.0,115.0,87.622147,0.0,0.0,0.0


In [24]:
df_final = pd.merge(df_final, df_features, on = "ENCOUNTER")
df_final.head()

Unnamed: 0,BIRTHDATE,GENDER,HEALTHCARE_EXPENSES,INCOME,ENCOUNTERCLASS,hispanic,nonhispanic,Massachusetts,abdomin,abdomin pain,...,wheez nose run,ENCOUNTER,Heart rate,Diastolic Blood Pressure,Respiratory rate,Systolic Blood Pressure,Glucose,Tobacco smoking status,Stress is when someone feels tense nervous anxious or can't sleep at night because their mind is troubled. How stressed are you?,Do you feel physically and emotionally safe where you currently live?
0,2017,0,1100.0,14656,0,0.0,1.0,1.0,0.0,0.0,...,0.0,8a096ce2-050c-1cb0-d38b-cd03420fc4ac,80.252302,82.098652,14.092692,119.986929,87.622147,0.0,0.0,0.0
1,2017,0,1100.0,14656,0,0.0,1.0,1.0,0.0,0.0,...,0.0,8a096ce2-050c-1cb0-d38b-cd03420fc4ac,80.252302,82.098652,14.092692,119.986929,87.622147,0.0,0.0,0.0
2,2017,0,1100.0,14656,0,0.0,1.0,1.0,0.0,0.0,...,0.0,8a096ce2-050c-1cb0-d38b-cd03420fc4ac,80.252302,82.098652,14.092692,119.986929,87.622147,0.0,0.0,0.0
3,2017,0,1100.0,14656,0,0.0,1.0,1.0,0.0,0.0,...,0.0,8a096ce2-050c-1cb0-d38b-cd03420fc4ac,80.252302,82.098652,14.092692,119.986929,87.622147,0.0,0.0,0.0
4,2017,0,1100.0,14656,0,0.0,1.0,1.0,0.0,0.0,...,0.0,8a096ce2-050c-1cb0-d38b-cd03420fc4ac,80.252302,82.098652,14.092692,119.986929,87.622147,0.0,0.0,0.0


In [25]:
df_final.drop(columns = ['ENCOUNTER'], inplace = True)
df_final.shape

(188, 346)

In [26]:
sorted_columns = list(df_final.columns)
sorted_columns.remove('ENCOUNTERCLASS')
sorted_columns = sorted(sorted_columns)

final_ordered_features = []
for col in sorted_columns:
    if col in patient_features or col in medical_features:
        final_ordered_features.append(col)

for col in sorted_columns:
    if col not in final_ordered_features:
        final_ordered_features.append(col)

print(len(final_ordered_features))

345


In [27]:
df_final.head()
df_final = df_final[final_ordered_features + ['ENCOUNTERCLASS']]
df_final.head()

Unnamed: 0,BIRTHDATE,Diastolic Blood Pressure,Do you feel physically and emotionally safe where you currently live?,GENDER,Glucose,HEALTHCARE_EXPENSES,Heart rate,INCOME,Massachusetts,Respiratory rate,...,wheat,wheat substance,wheez,wheez allerg,wheez allerg skin,wheez finding,wheez finding diarrhea,wheez nose,wheez nose run,ENCOUNTERCLASS
0,2017,82.098652,0.0,0,87.622147,1100.0,80.252302,14656,1.0,14.092692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2017,82.098652,0.0,0,87.622147,1100.0,80.252302,14656,1.0,14.092692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2017,82.098652,0.0,0,87.622147,1100.0,80.252302,14656,1.0,14.092692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2017,82.098652,0.0,0,87.622147,1100.0,80.252302,14656,1.0,14.092692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2017,82.098652,0.0,0,87.622147,1100.0,80.252302,14656,1.0,14.092692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [28]:
x_train, x_test, y_train, y_test = train_test_split(df_final.drop(columns = 'ENCOUNTERCLASS'), df_final['ENCOUNTERCLASS'], stratify=df_final['ENCOUNTERCLASS'],random_state=1)   

In [29]:
model = XGBClassifier(random_state = 42)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print(accuracy_score(y_test, y_pred))

1.0


In [30]:
def computeAccuracy(number_of_features):
    pca = PCA(n_components = number_of_features)
    
    features = np.array(df_final.drop(columns = 'ENCOUNTERCLASS'))
    features_reduced = pca.fit_transform(features)
    
    labels = df_final['ENCOUNTERCLASS'].tolist()
    
    x_train, x_test, y_train, y_test = train_test_split(features_reduced, labels, stratify = labels, random_state = 1)
    
    model = XGBClassifier(random_state = 42)

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    return accuracy_score(y_test, y_pred);

In [31]:
score = [computeAccuracy(i) for i in range(1, 100, 50)]

In [32]:
with open('tf_idf_allergies', 'wb') as files:
    pickle.dump(tfidf_vect, files)

In [33]:
with open('allergies', 'wb') as files:
    pickle.dump(model, files)

In [34]:
with open('final_ordered_features', 'wb') as files:
    pickle.dump(final_ordered_features, files)