In [1]:
# Import libraries
import warnings

import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Ignore every UserWarning raised from this point on, to keep cell output clean.
# NOTE(review): this is a blanket filter, so useful UserWarnings from any
# library are hidden as well — consider narrowing it to the specific message.
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
# Loading the dataset
# The relative path uses a forward slash so the literal contains no backslash
# escape sequences and resolves the same way on Windows, macOS and Linux.
diabetes_cleaned_df = pd.read_csv('dataset/diabetes_cleaned.csv')
diabetes_cleaned_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27.0,125.0,36.8,0.340,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1,126.0,60.0,29.0,125.0,30.1,0.349,47,1


In [3]:
# Target is the 'Outcome' column; every remaining column is a predictor
y = diabetes_cleaned_df['Outcome']
X = diabetes_cleaned_df.drop('Outcome', axis='columns')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
    stratify=y,
)
print(f'X_train size: {X_train.shape}, X_test size: {X_test.shape}')

X_train size: (614, 8), X_test size: (154, 8)


In [4]:
# Feature scaling: fit the scaler on the training split only, then apply that
# same fitted transformation to both splits.
# NOTE: X_train and X_test are overwritten with their scaled versions here, so
# re-run the split cell before executing this cell again (otherwise the scaler
# would be re-fitted on data that is already scaled).
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Model Building

### Balanced Random Forest Classifier

In [5]:
# Build a balanced random forest (100 trees, fixed seed) and train it on the
# scaled training split; fit() returns the fitted estimator itself
brf_model = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(
    X_train, y_train
)

# Predict the held-out test split
brf_test_predictions = brf_model.predict(X_test)

# Plain accuracy (accuracy_score, not balanced accuracy) on the test split
brf_test_acc_score = accuracy_score(y_true=y_test, y_pred=brf_test_predictions)
print(brf_test_acc_score)

0.7337662337662337


In [6]:
# Score the forest on the data it was trained on, for comparison with the
# test score above
brf_train_predictions = brf_model.predict(X_train)

# Plain accuracy (accuracy_score, not balanced accuracy) on the training split
brf_train_acc_score = accuracy_score(y_true=y_train, y_pred=brf_train_predictions)
print(brf_train_acc_score)

0.9576547231270358


### Easy Ensemble AdaBoost Classifier

In [7]:
# Train the EasyEnsembleClassifier (imported with the other libraries in the
# import cell at the top of the notebook): 100 estimators, fixed seed so the
# run is repeatable
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# fit() returns the fitted estimator, so ecc_model refers to the same object as eec
ecc_model = eec.fit(X_train, y_train)

In [8]:
# Predict the held-out test split with the fitted easy-ensemble model
ecc_test_predictions = ecc_model.predict(X_test)

# Plain accuracy on the test split
ecc_test_acc_score = accuracy_score(y_true=y_test, y_pred=ecc_test_predictions)
print(ecc_test_acc_score)

0.7467532467532467


In [9]:
# Score the easy-ensemble model on the data it was trained on, for comparison
# with the test score above
ecc_train_predictions = ecc_model.predict(X_train)

# Plain accuracy (accuracy_score, not balanced accuracy) on the training split
ecc_train_acc_score = accuracy_score(y_true=y_train, y_pred=ecc_train_predictions)
print(ecc_train_acc_score)

0.8338762214983714


### Logistic Regression

In [10]:
# Logistic regression baseline with a fixed seed, trained on the scaled
# training split
lr_model = LogisticRegression(random_state=1)
lr_model.fit(X_train, y_train)

# Predict the held-out test split
lr_test_predictions = lr_model.predict(X_test)

# Plain accuracy (accuracy_score, not balanced accuracy) on the test split
lr_test_acc_score = accuracy_score(y_true=y_test, y_pred=lr_test_predictions)
print(lr_test_acc_score)

0.7597402597402597


In [11]:
# Score the logistic regression on the data it was trained on, for comparison
# with the test score above
lr_train_predictions = lr_model.predict(X_train)

# Plain accuracy (accuracy_score, not balanced accuracy) on the training split
lr_train_acc_score = accuracy_score(y_true=y_train, y_pred=lr_train_predictions)
print(lr_train_acc_score)

0.7785016286644951


In [12]:
# Collect the test/train accuracy of each model, one row per model, so the
# three classifiers can be compared side by side
model_names = [
    'Balanced Random Forest Classifier',
    'Easy Ensemble AdaBoost Classifier',
    'Logistic Regression',
]
test_scores = [brf_test_acc_score, ecc_test_acc_score, lr_test_acc_score]
train_scores = [brf_train_acc_score, ecc_train_acc_score, lr_train_acc_score]

# Rows are [model name, test score, train score]
lst = [list(row) for row in zip(model_names, test_scores, train_scores)]

df = pd.DataFrame(lst, columns=['Model', 'Test Score', 'Train Score'])
df

Unnamed: 0,Model,Test Score,Train Score
0,Balanced Random Forest Classifier,0.733766,0.957655
1,Easy Ensemble AdaBoost Classifier,0.746753,0.833876
2,Logistic Regression,0.75974,0.778502


In [13]:
# Creating a function for prediction
def predict_diabetes(Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DPF, Age):
    """Predict the 'Outcome' label for a single patient with the balanced random forest.

    The inputs are coerced to numbers, scaled with the notebook-level scaler
    ``sc`` (fitted on the training split) and passed to ``brf_model``.

    Parameters
    ----------
    Pregnancies, Age
        Converted with ``int()`` (a float is truncated toward zero).
    Glucose, BloodPressure, SkinThickness, Insulin, BMI, DPF
        Converted with ``float()``. ``DPF`` is the DiabetesPedigreeFunction value.

    Returns
    -------
    The first (and only) element of ``brf_model.predict``: the predicted
    ``Outcome`` label, where a truthy value is treated as diabetic by the callers.

    Raises
    ------
    ValueError
        If an argument cannot be converted to the required numeric type.
    """
    # Order must match the feature column order the scaler and model were trained on
    row = [[
        int(Pregnancies),
        float(Glucose),
        float(BloodPressure),
        float(SkinThickness),
        float(Insulin),
        float(BMI),
        float(DPF),
        int(Age),
    ]]

    # When the scaler was fitted on a DataFrame it remembers the column names;
    # supplying the same names avoids scikit-learn's "X does not have valid
    # feature names" UserWarning instead of relying on it being suppressed.
    feature_names = getattr(sc, 'feature_names_in_', None)
    if feature_names is not None:
        row = pd.DataFrame(row, columns=feature_names)

    scaled_row = sc.transform(row)

    return brf_model.predict(scaled_row)[0]

In [14]:
# Prediction 1
# Argument order: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DPF, Age
prediction = predict_diabetes(6, 148, 72, 35, 125, 33.6, 0.627, 50)

# A truthy prediction means the model classified the patient as diabetic
message = 'Oops! You have diabetes.' if prediction else "Great! You don't have diabetes."
print(message)

Oops! You have diabetes.


In [15]:
# Prediction 2
# Argument order: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DPF, Age
prediction = predict_diabetes(1, 85, 66, 29, 125, 26.6, 0.351, 31)

# A truthy prediction means the model classified the patient as diabetic
message = 'Oops! You have diabetes.' if prediction else "Great! You don't have diabetes."
print(message)

Great! You don't have diabetes.
