**Importing all the libraries needed and required for our model**

In [18]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/diabetes-prediction-dataset/diabetes_prediction_dataset.csv


**Let's explore our dataset!**

In [19]:
data = pd.read_csv("/kaggle/input/diabetes-prediction-dataset/diabetes_prediction_dataset.csv")
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [21]:
data.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


**Checking the dataset for duplicate values and then removing it**

In [22]:
duplicated_data = data[data.duplicated()]
duplicated_data.shape

(3854, 9)

In [23]:
data.drop_duplicates()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99994,Female,36.0,0,0,No Info,24.60,4.8,145,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


**Now, checking if any null values are present**

In [24]:
data.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

**Let's see how many unique values are there for every column**

In [25]:
for column in data.columns:
    unique_val = len(data[column].unique())
    print(f"{column} : {unique_val} distinct values")

gender : 3 distinct values
age : 102 distinct values
hypertension : 2 distinct values
heart_disease : 2 distinct values
smoking_history : 6 distinct values
bmi : 4247 distinct values
HbA1c_level : 18 distinct values
blood_glucose_level : 18 distinct values
diabetes : 2 distinct values


**Label Encoding!**

In [26]:
def encoding_smoking(smoking_status):
    if smoking_status in ['never', 'No Info']:
        return 'non-smoker'
    if smoking_status == 'current':
        return 'smoker'
    if smoking_status in ['ever', 'former', 'not current']:
        return 'past-smoker'
    
data['smoking_history'] = data['smoking_history'].apply(encoding_smoking)

In [27]:
data['smoking_history'].value_counts()

non-smoker     70911
past-smoker    19803
smoker          9286
Name: smoking_history, dtype: int64

In [28]:
def encoding(df, column_name):
    dummies = pd.get_dummies(df, column_name)
    df = pd.concat([df.drop(column_name, axis=1), dummies], axis = 1)
    return df

In [29]:
data = encoding(data, 'gender')
data = encoding(data, 'smoking_history')

In [30]:
data

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,age.1,hypertension.1,heart_disease.1,...,diabetes.1,gender_Female,gender_Male,gender_Other,gender_non-smoker,gender_past-smoker,gender_smoker,smoking_history_non-smoker,smoking_history_past-smoker,smoking_history_smoker
0,80.0,0,1,25.19,6.6,140,0,80.0,0,1,...,0,1,0,0,1,0,0,1,0,0
1,54.0,0,0,27.32,6.6,80,0,54.0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,28.0,0,0,27.32,5.7,158,0,28.0,0,0,...,0,0,1,0,1,0,0,1,0,0
3,36.0,0,0,23.45,5.0,155,0,36.0,0,0,...,0,1,0,0,0,0,1,0,0,1
4,76.0,1,1,20.14,4.8,155,0,76.0,1,1,...,0,0,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,0,80.0,0,0,...,0,1,0,0,1,0,0,1,0,0
99996,2.0,0,0,17.37,6.5,100,0,2.0,0,0,...,0,1,0,0,1,0,0,1,0,0
99997,66.0,0,0,27.83,5.7,155,0,66.0,0,0,...,0,0,1,0,0,1,0,0,1,0
99998,24.0,0,0,35.42,4.0,100,0,24.0,0,0,...,0,1,0,0,1,0,0,1,0,0


**Finally, defining our features and target for the model**

In [31]:
X = data.drop('diabetes', axis=1)
y = data['diabetes']

In [32]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(n_estimators= 20, criterion="entropy")
rf.fit(x_train, y_train)

In [33]:
y_pred = rf.predict(x_test)
y_pred

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       ...,
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]])

In [34]:
accuracy_score(y_test, y_pred)

0.96805