In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
diabetes_dataset = pd.read_csv('diabetes_prediction_dataset.csv')

In [3]:
diabetes_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
diabetes_dataset.tail()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0
99999,Female,57.0,0,0,current,22.43,6.6,90,0


In [5]:
diabetes_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [6]:
diabetes_dataset.shape

(100000, 9)

In [7]:
missing_values = diabetes_dataset.isna().sum()

In [8]:
missing_values

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [9]:
# NO missing values and no NAN values in the dataset

In [10]:
diabetes_dataset.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [11]:
label_encoder = LabelEncoder()

In [12]:
diabetes_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [13]:
categorical_columns = ['gender', 'age', 'smoking_history']

In [14]:
diabetes_dataset[categorical_columns] = diabetes_dataset[categorical_columns].apply(label_encoder.fit_transform)

In [15]:
diabetes_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,101,0,1,4,25.19,6.6,140,0
1,0,75,0,0,0,27.32,6.6,80,0
2,1,49,0,0,4,27.32,5.7,158,0
3,0,57,0,0,1,23.45,5.0,155,0
4,1,97,1,1,1,20.14,4.8,155,0


In [16]:
diabetes_dataset.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.41466,62.67179,0.07485,0.03942,2.17965,27.320767,5.527507,138.05806,0.085
std,0.493031,22.966612,0.26315,0.194593,1.889659,6.636783,1.070672,40.708136,0.278883
min,0.0,0.0,0.0,0.0,0.0,10.01,3.5,80.0,0.0
25%,0.0,45.0,0.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,0.0,64.0,0.0,0.0,3.0,27.32,5.8,140.0,0.0
75%,1.0,81.0,0.0,0.0,4.0,29.58,6.2,159.0,0.0
max,2.0,101.0,1.0,1.0,5.0,95.69,9.0,300.0,1.0


In [17]:
diabetes_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,101,0,1,4,25.19,6.6,140,0
1,0,75,0,0,0,27.32,6.6,80,0
2,1,49,0,0,4,27.32,5.7,158,0
3,0,57,0,0,1,23.45,5.0,155,0
4,1,97,1,1,1,20.14,4.8,155,0


In [18]:
columns_to_scale = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

In [19]:
scalar = StandardScaler()

In [20]:
diabetes_dataset[columns_to_scale] = scalar.fit_transform(diabetes_dataset[columns_to_scale])

In [21]:
diabetes_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,1.668875,0,1,4,-0.321056,1.001706,0.047704,0
1,0,0.536791,0,0,0,-0.000116,1.001706,-1.42621,0
2,1,-0.595293,0,0,4,-0.000116,0.161108,0.489878,0
3,0,-0.246959,0,0,1,-0.583232,-0.49269,0.416183,0
4,1,1.494708,1,1,1,-1.08197,-0.67949,0.416183,0


In [22]:
diabetes_dataset.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.41466,4.123923e-17,0.07485,0.03942,2.17965,1.241318e-16,4.485878e-16,3.172174e-16,0.085
std,0.493031,1.000005,0.26315,0.194593,1.889659,1.000005,1.000005,1.000005,0.278883
min,0.0,-2.728835,0.0,0.0,0.0,-2.60832,-1.893686,-1.42621,0.0
25%,0.0,-0.7694595,0.0,0.0,0.0,-0.5561106,-0.6794897,-0.9349053,0.0
50%,0.0,0.0578325,0.0,0.0,3.0,-0.0001155837,0.2545078,0.04770422,0.0
75%,1.0,0.7980411,0.0,0.0,4.0,0.3404125,0.6281067,0.5144437,0.0
max,2.0,1.668875,1.0,1.0,5.0,10.30161,3.2433,3.978142,1.0


In [23]:
X = diabetes_dataset.drop('diabetes', axis=1)
y = diabetes_dataset['diabetes']

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
X_train.shape

(80000, 8)

In [26]:
X_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
75220,1,1.364083,0,0,3,-0.38434,-1.893686,-1.42621
48955,1,1.668875,0,0,4,-0.409955,0.161108,0.17053
44966,0,-0.159876,0,0,2,-0.450637,-1.426688,0.489878
13568,0,-0.682376,0,0,5,-1.273329,-0.49269,-0.934905
92727,0,0.841583,1,0,1,-0.785138,-0.959689,-1.303384


In [27]:
X_train.tail()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
6265,1,0.319083,0,0,4,0.852711,0.161108,-1.42621
54886,0,-1.161335,0,0,4,0.117412,-0.49269,0.514444
76820,1,0.014291,0,0,4,-0.177913,0.254508,-1.303384
860,0,-0.203418,0,0,4,-0.355711,0.628107,0.489878
15795,0,-0.813001,0,0,4,0.100837,-0.49269,0.514444


In [28]:
y_train.head()

75220    0
48955    1
44966    0
13568    0
92727    0
Name: diabetes, dtype: int64

In [29]:
y_test.shape

(20000,)

In [30]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=7)

knn.fit(X_train, y_train)



KNeighborsClassifier(n_neighbors=7)

In [31]:
y_pred = knn.predict(X_test)

In [32]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)

print("The accuracy of the model is => ", accuracy)

The accuracy of the model is =>  0.96065


In [35]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [39]:
X = diabetes_dataset.drop('diabetes', axis=1)
y = diabetes_dataset['diabetes']

In [40]:
X.shape, y.shape

((100000, 8), (100000,))

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((80000, 8), (20000, 8), (80000,), (20000,))

In [44]:
model_gini = DecisionTreeClassifier(criterion='gini')
model_gini.fit(X_train, y_train)
y_pred_gini = model_gini.predict(X_test)

In [45]:
accuracy_score_gini = accuracy_score(y_test, y_pred_gini)
print(accuracy_score_gini)

0.9519


In [46]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [47]:
model_entropy = DecisionTreeClassifier(criterion='entropy')
model_entropy.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [49]:
y_pred_entropy = model_entropy.predict(X_test)

In [50]:
accuracy_score_entropy = accuracy_score(y_test, y_pred_entropy)
print(accuracy_score_entropy)

0.9536


In [51]:
from sklearn.neural_network import MLPClassifier

In [62]:
model_mlp = MLPClassifier(hidden_layer_sizes=(100, 50), activation='logistic', solver='adam', random_state=42)

In [63]:
model_mlp.fit(X_train, y_train)

MLPClassifier(activation='logistic', hidden_layer_sizes=(100, 50),
              random_state=42)

In [64]:
y_pred = model_mlp.predict(X_test)

In [65]:
accuracy_score_mlp = accuracy_score(y_test, y_pred)

In [66]:
accuracy_score_mlp

0.9724

In [67]:
from sklearn.linear_model import Perceptron

In [68]:
perceptron_model = Perceptron()

In [69]:
perceptron_model.fit(X_train, y_train)

Perceptron()

In [71]:
y_pred = perceptron_model.predict(X_test)

In [72]:
accuracy_score_perceptron = accuracy_score(y_test, y_pred)

In [73]:
accuracy_score_perceptron

0.9505