## Create models to predict the outcome of Diabetes and it's co-morbid diseases from our cleaned and wrangled data

Import all required Libraries

In [1]:
#Data Handling Libraries
import xmltodict
import nltk
import numpy as np
import re
import collections
import pandas as pd

#Path Setup Libraries
import os, pathlib, glob
from pathlib import Path

#Sklearn for training and testing models
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

#for hyperparameterization
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.model_selection importGridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


Read our final data frame

In [2]:
#Import the cleaned CSV
final_df = pd.read_csv("../data/output_finaldf.csv")
#final_df.columns

numeric_columns = final_df.select_dtypes(include=np.number).columns.tolist()
categorical_columns = final_df.select_dtypes(include=['object']).columns.tolist()

These are our categorical columns in the final data frame

In [None]:
#categorical_columns

In [None]:
#final_df.isnull().sum()

All co-morbid disorders we are predicting along with Diabetes

In [37]:
disorders = ['Asthma',
 'CAD',
 'CHF',
 'Depression',
 'Diabetes',
 'Gallstones',
 'GERD',
 'Gout',
 'Hypercholesterolemia',
 'Hypertension',
 'Hypertriglyceridemia',
 'OA',
 'Obesity',
 'OSA',
 'PVD',
 'Venous Insufficiency']

## Prediction!
 - We loop through all the disorders  and subset the final df
 - Hyper paramaters with grid search to find the best possible combination
 - Store our predicted values and accuracy in a results data frame



In [None]:
results = []

for disorder in disorders:
    # Prepare data for the current disorder
    one_final_df = final_df.dropna(subset=[disorder])
    one_final_df[disorder] = one_final_df[disorder].map({'N': 0, 'Y': 1})
    one_final_df = one_final_df.dropna(subset=[disorder])
    X = one_final_df[['Sign_symptom_mean_embedding',
                      'Dosage_mean_embedding', 'Medication_mean_embedding',
                      'Disease_disorder_mean_embedding']]
    y = one_final_df[disorder]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = XGBClassifier()
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'min_child_weight': [1, 3, 5],
        'gamma': [0, 0.1, 0.2]
    }
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    results.append({'Doc ID': one_final_df['doc_id'],
                    'Disorder': disorder,
                    'Best Parameters': best_params,
                    'Accuracy': accuracy,
                    'Predicted Values': y_pred})

results_df = pd.DataFrame(results)

## Finally!! Our results data frame with accuracy for each disorder

In [47]:
results_df

Unnamed: 0,Doc ID,Disorder,Best Parameters,Accuracy,Predicted Values
0,0 3 1 5 2 7 4 ...,Asthma,"{'gamma': 0, 'learning_rate': 0.01, 'max_depth...",0.915789,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0 3 1 5 2 7 4 ...,CAD,"{'gamma': 0.1, 'learning_rate': 0.01, 'max_dep...",0.521739,"[1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ..."
2,0 3 2 7 3 8 7 1...,CHF,"{'gamma': 0, 'learning_rate': 0.2, 'max_depth'...",0.609195,"[1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, ..."
3,0 3 1 5 2 7 3 ...,Depression,"{'gamma': 0, 'learning_rate': 0.01, 'max_depth...",0.822917,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,0 3 1 5 2 7 3 ...,Diabetes,"{'gamma': 0, 'learning_rate': 0.01, 'max_depth...",0.739583,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ..."
5,0 3 1 5 2 7 3 ...,Gallstones,"{'gamma': 0, 'learning_rate': 0.01, 'max_depth...",0.858586,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,0 3 2 7 3 8 4 ...,GERD,"{'gamma': 0, 'learning_rate': 0.01, 'max_depth...",0.729412,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,0 3 1 5 2 7 3 ...,Gout,"{'gamma': 0, 'learning_rate': 0.01, 'max_depth...",0.87,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,0 3 1 5 2 7 3 ...,Hypercholesterolemia,"{'gamma': 0.1, 'learning_rate': 0.01, 'max_dep...",0.517241,"[0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, ..."
9,0 3 1 5 2 7 4 ...,Hypertension,"{'gamma': 0, 'learning_rate': 0.01, 'max_depth...",0.822222,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


Write our results into a csv

In [50]:
results_df.to_csv('../data/model_results.csv', index = False)