In [133]:
import pandas as pd
# Read the dataset from 'diabetes.csv' (path is relative to the working directory) into `df`.
df = pd.read_csv('diabetes.csv')

In [134]:
# Summarize the freshly loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [135]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [136]:
# Count how many rows share each distinct raw Glucose value (before it is binned further down).
df['Glucose'].value_counts()

Glucose
99     17
100    17
111    14
129    14
125    14
       ..
191     1
177     1
44      1
62      1
190     1
Name: count, Length: 136, dtype: int64

In [137]:
def convert(value):
    """Map a numeric BMI reading to a weight-category label.

    Each threshold is an exclusive upper bound, checked in ascending order:
    below 18.5 -> 'Underweight', below 25 -> 'Normal', below 35 -> 'Overwight1'.
    Anything that matches none of them (35 and above, and also NaN, since every
    comparison with NaN is False) yields 'Overwight2'.

    The label spellings are kept exactly as they are because the one-hot column
    names produced later by ``pd.get_dummies`` are derived from them.
    """
    upper_bounds = (
        (18.5, 'Underweight'),
        (25, 'Normal'),
        (35, 'Overwight1'),
    )
    for limit, label in upper_bounds:
        if value < limit:
            return label
    return 'Overwight2'
# Overwrite the numeric BMI column with its category label, element by element.
# Not idempotent: once the column holds strings, running this again raises TypeError in convert().
df['BMI'] = df['BMI'].map(convert)

In [138]:
def convert_ins(value):
    """Label an insulin reading as 'normal' or 'abnormal'.

    A reading is 'normal' when it lies in the closed interval [35, 278];
    every other value (including NaN, which fails both comparisons) is 'abnormal'.
    """
    within_range = value >= 35 and value <= 278
    return 'normal' if within_range else 'abnormal'
# Overwrite the numeric Insulin column with its 'normal' / 'abnormal' label, element by element.
# Not idempotent: once the column holds strings, running this again raises TypeError in convert_ins().
df['Insulin'] = df['Insulin'].map(convert_ins)

In [139]:
# Re-inspect dtypes now that the BMI and Insulin columns have been replaced by string labels.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    object 
 5   BMI                       768 non-null    object 
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(1), int64(6), object(2)
memory usage: 54.1+ KB


In [140]:
def convert_glu(value):
    """Bin a numeric glucose reading into one of four labels.

    below 70 -> 'low'; 70 up to and including 99 -> 'normal';
    above 99 up to and including 125 -> 'prediabetes'; everything else -> 'diabetes'.
    NaN fails every comparison and therefore also ends up as 'diabetes'.
    """
    if value < 70:
        return 'low'
    if value <= 99:
        return 'normal'
    if value <= 125:
        return 'prediabetes'
    return 'diabetes'
# Overwrite the numeric Glucose column with its bin label, element by element.
# Not idempotent: once the column holds strings, running this again raises TypeError in convert_glu().
df['Glucose'] = df['Glucose'].map(convert_glu)

In [141]:
# One-hot encode the three label columns; each is replaced by one indicator column per label.
df = pd.get_dummies(
    data=df,
    columns=['BMI', 'Insulin', 'Glucose'],
)

In [142]:
# Check the frame after one-hot encoding: BMI, Insulin and Glucose are now indicator columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   BloodPressure             768 non-null    int64  
 2   SkinThickness             768 non-null    int64  
 3   DiabetesPedigreeFunction  768 non-null    float64
 4   Age                       768 non-null    int64  
 5   Outcome                   768 non-null    int64  
 6   BMI_Normal                768 non-null    bool   
 7   BMI_Overwight1            768 non-null    bool   
 8   BMI_Overwight2            768 non-null    bool   
 9   BMI_Underweight           768 non-null    bool   
 10  Insulin_abnormal          768 non-null    bool   
 11  Insulin_normal            768 non-null    bool   
 12  Glucose_diabetes          768 non-null    bool   
 13  Glucose_low               768 non-null    bool   
 14  Glucose_no

In [154]:
# 'Outcome' is the prediction target; every other column is used as a feature.
y = df['Outcome']
x = df.drop(columns='Outcome')

In [158]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [160]:
# Learn the scaling statistics from the training rows only, then apply that same
# transformation to both splits so nothing from the test rows influences the scaler.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [162]:
# Baseline model: logistic regression with scikit-learn's default settings,
# fitted on the standardized training features.
lgr = LogisticRegression()
lgr.fit(x_train, y_train)

In [164]:
# Score the logistic-regression baseline on the held-out split.
predictions = lgr.predict(x_test)
print('Logistic Regression:', accuracy_score(y_true=y_test, y_pred=predictions))

Logistic Regression: 0.7337662337662337


In [166]:
# k-nearest-neighbours with k=5, trained on the standardized features and scored on the held-out split.
classifier = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
predictions = classifier.predict(x_test)
print('Neighbors Classifier:', accuracy_score(y_true=y_test, y_pred=predictions))

Neighbors Classifier: 0.6818181818181818


In [170]:
# Random forest with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because the forest is
# randomized; without it the printed accuracy changes from one run to the next.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
predictions = rf.predict(x_test)
print('RandomForest Classifier:', accuracy_score(y_test, predictions))

RandomForest Classifier: 0.7272727272727273


In [172]:
# Gradient-boosted trees with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) so that the printed
# accuracy is reproducible on a fresh kernel.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(x_train, y_train)
predictions = gb.predict(x_test)
print('GradientBoosting Classifier:', accuracy_score(y_test, predictions))

GradientBoosting Classifier: 0.6948051948051948


In [174]:
# Bagging ensemble with scikit-learn's default base estimator and settings.
# random_state is pinned (same seed as the train/test split) because the bootstrap
# resampling is random; without it the printed accuracy changes from run to run.
bc = BaggingClassifier(random_state=42)
bc.fit(x_train, y_train)
predictions = bc.predict(x_test)
print('BaggingClassifier:', accuracy_score(y_test, predictions))

BaggingClassifier: 0.7337662337662337


In [176]:
# Single decision tree with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) so that the fitted tree,
# and therefore the printed accuracy, is the same on every run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
predictions = dt.predict(x_test)
print('DecisionTreeClassifier:', accuracy_score(y_test, predictions))

DecisionTreeClassifier: 0.6883116883116883


In [184]:
# Bagging ensemble whose base learner is LogisticRegression (not a decision tree,
# despite the variable name, which is kept so that any later cell using it still works).
# 100 base models are trained; max_samples / max_features of 0.8 give each model
# 80% of the training rows (drawn with replacement, bootstrap=True) and 80% of the features.
bag_des = BaggingClassifier(
    estimator=LogisticRegression(),
    n_estimators=100,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    random_state=42
)
bag_des.fit(x_train, y_train)
preds = bag_des.predict(x_test)
# The label previously read 'Bagging Decision Tree', which misreported the model being scored.
print('Bagging Logistic Regression:', accuracy_score(y_test, preds))

Bagging Decision Tree: 0.7272727272727273
