<a href="https://colab.research.google.com/github/sebastianneri/HealthHackathon/blob/main/Disease_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Downloading

In [10]:
!mv kaggle.json ~/.kaggle/

In [11]:
!chmod 600 ~/.kaggle/kaggle.json

https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database?select=diabetes.csv

In [12]:
!kaggle datasets download -d uciml/pima-indians-diabetes-database

Downloading pima-indians-diabetes-database.zip to /content
  0% 0.00/8.91k [00:00<?, ?B/s]
100% 8.91k/8.91k [00:00<00:00, 7.02MB/s]


In [13]:
!rm ~/.kaggle/kaggle.json

In [14]:
!unzip pima-indians-diabetes-database.zip

Archive:  pima-indians-diabetes-database.zip
  inflating: diabetes.csv            


# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from scipy import stats

In [2]:
df = pd.read_csv("diabetes.csv")

In [3]:
df1 = df.loc[df['Outcome'] == 1]
df2 = df.loc[df['Outcome'] == 0]
df1 = df1.replace({'BloodPressure':0, 'BMI':0, 'Glucose':0, 'SkinThickness':0, 'Insulin:':0}, {'BloodPressure': np.mean(df1['BloodPressure']), 'BMI': np.mean(df1['BMI']), 'Glucose': np.mean(df1['Glucose']),  'SkinThickness':np.mean(df1['SkinThickness']), 'Insulin':np.mean(df1['Insulin'])})
df2 = df2.replace({'BloodPressure':0, 'BMI':0, 'Glucose':0, 'SkinThickness':0, 'Insulin:':0}, {'BloodPressure': np.mean(df2['BloodPressure']), 'BMI': np.mean(df2['BMI']), 'Glucose': np.mean(df2['Glucose']),  'SkinThickness':np.mean(df2['SkinThickness']), 'Insulin':np.mean(df2['Insulin'])})
dataframe = [df1, df2]
dataset = pd.concat(dataframe)

In [4]:
y = df["Outcome"]
df = df.iloc[:, :-1]
columns = df.columns

In [5]:
sc = StandardScaler()
df = pd.DataFrame(sc.fit_transform(df), columns=columns)

In [6]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,2.5442610000000002e-17,3.614007e-18,-1.3272440000000001e-17,7.994184000000001e-17,-3.556183e-17,2.295979e-16,2.462585e-16,1.8576e-16
std,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652
min,-1.141852,-3.783654,-3.572597,-1.288212,-0.6928906,-4.060474,-1.189553,-1.041549
25%,-0.8448851,-0.6852363,-0.3673367,-1.288212,-0.6928906,-0.5955785,-0.6889685,-0.7862862
50%,-0.2509521,-0.1218877,0.1496408,0.1545332,-0.4280622,0.0009419788,-0.3001282,-0.3608474
75%,0.6399473,0.6057709,0.5632228,0.7190857,0.4120079,0.5847705,0.4662269,0.6602056
max,3.906578,2.444478,2.734528,4.921866,6.652839,4.455807,5.883565,4.063716


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=150)

In [8]:
smt = SMOTE()
X_train, y_train = smt.fit_resample(X_train, y_train)
np.bincount(y_train)

array([400, 400])

# XGBoost

In [13]:
from sklearn.metrics import recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [48]:
model = XGBClassifier(silent=False, 
                      scale_pos_weight=1,
                      learning_rate=0.001,  
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='reg:logistic', 
                      n_estimators=700, 
                      max_depth=4)

In [49]:
model.fit(X_train, y_train)

XGBClassifier(colsample_bytree=0.4, learning_rate=0.001, max_depth=4,
              n_estimators=700, objective='reg:logistic', silent=False,
              subsample=0.8)

In [50]:
y_pred = model.predict(X_test)
print(f"Recall: {recall_score(y_pred, y_test)}, Accuracy:{accuracy_score(y_pred, y_test)}.")

Recall: 0.6885245901639344, Accuracy:0.82.


In [56]:
params = { 'max_depth': [3, 5, 6, 10, 15, 20],
           'learning_rate': np.arange(0, 0.01, 0.0001),
           'subsample': np.arange(0, 1.0, 0.1),
           'colsample_bytree': np.arange(0, 1.0, 0.1),
           'colsample_bylevel': np.arange(0, 1.0, 0.1),
           'n_estimators': [100, 500, 1000]}
xgbr = XGBClassifier(seed = 20, objective='reg:logistic')
clf = RandomizedSearchCV(estimator=xgbr, 
                   param_distributions=params,
                   scoring='recall', 
                   verbose=1, n_iter=50, n_jobs=-1)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


RandomizedSearchCV(estimator=XGBClassifier(objective='reg:logistic', seed=20),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'colsample_bylevel': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'colsample_bytree': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'learning_rate': array([0.    , 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007,
       0.0008, 0.0009, 0.001 , 0....
       0.0064, 0.0065, 0.0066, 0.0067, 0.0068, 0.0069, 0.007 , 0.0071,
       0.0072, 0.0073, 0.0074, 0.0075, 0.0076, 0.0077, 0.0078, 0.0079,
       0.008 , 0.0081, 0.0082, 0.0083, 0.0084, 0.0085, 0.0086, 0.0087,
       0.0088, 0.0089, 0.009 , 0.0091, 0.0092, 0.0093, 0.0094, 0.0095,
       0.0096, 0.0097, 0.0098, 0.0099]),
                                        'max_depth': [3, 5, 6, 10, 15, 20],
                                        'n_estimators': [100, 500, 1000],
 

In [57]:
y_pred = clf.predict(X_test)
print(f"Recall: {recall_score(y_pred, y_test)}, Accuracy:{accuracy_score(y_pred, y_test)}.")

Recall: 0.6833333333333333, Accuracy:0.8133333333333334.


# Random Forest

In [51]:
rf = RandomForestClassifier(n_estimators=300, bootstrap = True, max_features = 'sqrt')
rf.fit(X_train, y_train)

RandomForestClassifier(max_features='sqrt', n_estimators=300)

In [52]:
y_pred = rf.predict(X_test)
print(f"Recall: {recall_score(y_pred, y_test)}, Accuracy:{accuracy_score(y_pred, y_test)}")

Recall: 0.6964285714285714, Accuracy:0.8133333333333334


# Naive Bayes

In [53]:
from sklearn.naive_bayes import GaussianNB
gauss = GaussianNB()
gauss.fit(X_train, y_train)

GaussianNB()

In [54]:
y_pred = gauss.predict(X_test)
print(f"Recall: {recall_score(y_pred, y_test)}, Accuracy:{accuracy_score(y_pred, y_test)}")

Recall: 0.6923076923076923, Accuracy:0.8
