In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/content/diabetes_prediction_dataset.csv')
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


##**Exploratory Data Analysis**

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


**Checking the values available in object type columns**

In [4]:
df['gender'].value_counts()

Female    58552
Male      41430
Other        18
Name: gender, dtype: int64

In [5]:
df['smoking_history'].value_counts()

No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: smoking_history, dtype: int64

In [6]:
#Removing rows with gender value as 'Other' because it has only 18 occurences
df = df[df['gender'] != 'Other']

In [7]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


##**Creating Column Transformers**

In [29]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns = ['diabetes']), df['diabetes'], test_size = 0.2, random_state = 3)

In [19]:
x_train

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
40532,Female,59.0,0,0,not current,36.26,4.5,160
68651,Male,65.0,0,0,No Info,45.20,3.5,100
40628,Female,66.0,0,0,never,43.26,3.5,155
6508,Male,23.0,0,0,not current,27.32,6.2,159
2668,Female,54.0,0,0,never,46.67,3.5,85
...,...,...,...,...,...,...,...,...
48066,Male,47.0,0,0,never,28.46,8.8,280
59022,Male,1.4,0,0,No Info,14.58,4.5,130
77067,Male,22.0,0,0,No Info,27.32,6.0,140
67238,Male,61.0,0,0,never,21.03,5.7,300


In [10]:
y_train

40532    0
68651    0
40628    0
6508     0
2668     0
        ..
48066    1
59022    0
77067    0
67238    1
71546    0
Name: diabetes, Length: 79985, dtype: int64

In [11]:
#1st column transformer for one hot encoding of categorical columns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
trf1 = ColumnTransformer([('ohe_gender_smoke', OneHotEncoder(sparse_output = False, handle_unknown = 'ignore', drop = 'if_binary'), [0, 4])
], remainder = 'passthrough')

In [13]:
#2nd column transformer for scaling the dataset
from sklearn.preprocessing import StandardScaler
trf2 = ColumnTransformer([('scale', StandardScaler(), slice(0, 13))])

In [51]:
from sklearn.ensemble import RandomForestClassifier
trf3 = RandomForestClassifier(n_estimators=300, random_state = 3)

##**Building Pipelines**

In [14]:
from sklearn.pipeline import Pipeline

In [52]:
pipe = Pipeline([
    ('OneHotEncoder', trf1),
    ('StdScaler', trf2),
    ('RFClassifier', trf3)
])

In [53]:
pipe.fit(x_train, y_train)

In [54]:
pipe.named_steps

{'OneHotEncoder': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_gender_smoke',
                                  OneHotEncoder(drop='if_binary',
                                                handle_unknown='ignore',
                                                sparse_output=False),
                                  [0, 4])]),
 'StdScaler': ColumnTransformer(transformers=[('scale', StandardScaler(),
                                  slice(0, 13, None))]),
 'RFClassifier': RandomForestClassifier(n_estimators=300, random_state=3)}

In [55]:
y_pred = pipe.predict(x_test)

##**Checking Accuracy**

In [56]:
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test, y_pred)

0.9678451767765165

In [57]:
confusion_matrix(y_test, y_pred)

array([[18176,    52],
       [  591,  1178]])

##**Exporting the Pipeline**

In [58]:
import pickle
pickle.dump(pipe, open('pipe.pkl', 'wb'))