<a href="https://colab.research.google.com/github/sebastianneri/HealthHackathon/blob/main/Disease_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Downloading

In [8]:
!mv kaggle.json ~/.kaggle/

In [9]:
!chmod 600 ~/.kaggle/kaggle.json

https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database?select=diabetes.csv

In [10]:
!kaggle datasets download -d uciml/pima-indians-diabetes-database

Downloading pima-indians-diabetes-database.zip to /content
  0% 0.00/8.91k [00:00<?, ?B/s]
100% 8.91k/8.91k [00:00<00:00, 9.29MB/s]


In [11]:
!rm ~/.kaggle/kaggle.json

In [12]:
!unzip pima-indians-diabetes-database.zip

Archive:  pima-indians-diabetes-database.zip
  inflating: diabetes.csv            


# Data Preprocessing

In [446]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from scipy import stats

In [447]:
df = pd.read_csv("diabetes.csv")

In [448]:
df1 = df.loc[df['Outcome'] == 1]
df2 = df.loc[df['Outcome'] == 0]
df1 = df1.replace({'BloodPressure':0, 'BMI':0, 'Glucose':0,'Insulin:':0}, {'BloodPressure': np.mean(df1['BloodPressure']), 'BMI': np.mean(df1['BMI']), 'Glucose': np.mean(df1['Glucose']), 'Insulin':np.mean(df1['Insulin'])})
df2 = df2.replace({'BloodPressure':0, 'BMI':0, 'Glucose':0, 'Insulin:':0}, {'BloodPressure': np.mean(df2['BloodPressure']), 'BMI': np.mean(df2['BMI']), 'Glucose': np.mean(df2['Glucose']), 'Insulin':np.mean(df2['Insulin'])})
dataframe = [df1, df2]
dataset = pd.concat(dataframe)

In [449]:
y = df["Outcome"]
df.drop(["Outcome", "SkinThickness"], inplace=True, axis=1)
columns = df.columns

In [450]:
sc = StandardScaler()
df = pd.DataFrame(sc.fit_transform(df), columns=columns)

In [451]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,2.5442610000000002e-17,3.614007e-18,-1.3272440000000001e-17,-3.556183e-17,2.295979e-16,2.462585e-16,1.8576e-16
std,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652
min,-1.141852,-3.783654,-3.572597,-0.6928906,-4.060474,-1.189553,-1.041549
25%,-0.8448851,-0.6852363,-0.3673367,-0.6928906,-0.5955785,-0.6889685,-0.7862862
50%,-0.2509521,-0.1218877,0.1496408,-0.4280622,0.0009419788,-0.3001282,-0.3608474
75%,0.6399473,0.6057709,0.5632228,0.4120079,0.5847705,0.4662269,0.6602056
max,3.906578,2.444478,2.734528,6.652839,4.455807,5.883565,4.063716


In [452]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=100)

In [453]:
smt = SMOTE()
X_train, y_train = smt.fit_resample(X_train, y_train)
np.bincount(y_train)

array([447, 447])

# XGBoost

In [125]:
from sklearn.metrics import recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

## Load the model

In [455]:
model = XGBClassifier(silent=False, 
                      scale_pos_weight=1,
                      learning_rate=0.01,  
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='reg:logistic', 
                      n_estimators=300, 
                      max_depth=4)

In [456]:
model.fit(X_train, y_train)

XGBClassifier(colsample_bytree=0.4, learning_rate=0.01, max_depth=4,
              n_estimators=300, objective='reg:logistic', silent=False,
              subsample=0.8)

In [457]:
y_pred = model.predict(X_test)
print(f"Accuracy:{accuracy_score(y_pred, y_test)}.")

Accuracy:0.84.


## Retrain and save the model

In [465]:
import joblib
import pickle
def save_model(model, file_name):
  pickle.dump(model, open(file_name+".pkl", "wb"))
  joblib.dump(model, file_name+".sav")
  return 

In [466]:
best_model = XGBClassifier(silent=False, 
                      scale_pos_weight=1,
                      learning_rate=0.01,  
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='reg:logistic', 
                      n_estimators=300, 
                      max_depth=4)

In [467]:
best_model.fit(df, y)

XGBClassifier(colsample_bytree=0.4, learning_rate=0.01, max_depth=4,
              n_estimators=300, objective='reg:logistic', silent=False,
              subsample=0.8)

In [468]:
save_model(best_model, "DiabetesModel")

In [479]:
pd.DataFrame(best_model.predict_proba(df)[:, 1]).to_csv("Probs.csv")

# Random Forest

In [188]:
rf = RandomForestClassifier(n_estimators=300, bootstrap = True, max_features = 'sqrt')
rf.fit(X_train, y_train)

RandomForestClassifier(max_features='sqrt', n_estimators=300)

In [189]:
y_pred = rf.predict(X_test)
print(f"Accuracy:{accuracy_score(y_pred, y_test)}")

Accuracy:0.78


# Naive Bayes

In [100]:
from sklearn.naive_bayes import GaussianNB
gauss = GaussianNB()
gauss.fit(X_train, y_train)

GaussianNB()

In [101]:
y_pred = gauss.predict(X_test)
print(f"Accuracy:{accuracy_score(y_pred, y_test)}")

Accuracy:0.82
