## About dataset
url: https://www.kaggle.com/datasets/prosperchuks/health-dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
raw_diabetes = pd.read_csv('diabetes_data.csv')

In [3]:
raw_diabetes.head(2)

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0


In [4]:
# rows and cols
raw_diabetes.shape

(70692, 18)

In [5]:
# check for missing values
raw_diabetes.isna().sum()

Age                     0
Sex                     0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Stroke                  0
HighBP                  0
Diabetes                0
dtype: int64

In [6]:
# remove duplicates
raw_diabetes.drop_duplicates(inplace=True)

In [7]:
raw_diabetes.dtypes

Age                     float64
Sex                     float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Stroke                  float64
HighBP                  float64
Diabetes                float64
dtype: object

In [8]:
# numeric cols
num_cols = ['BMI', 'MentHlth', 'PhysHlth']

# categorical cols
cat_cols = raw_diabetes.columns.tolist()
cat_cols = [col for col in cat_cols if col not in num_cols]

In [9]:
# convert the num cols which are categories to categorical cols
for cat in cat_cols:
    raw_diabetes[cat] = raw_diabetes[cat].astype('category')

In [10]:
raw_diabetes.dtypes

Age                     category
Sex                     category
HighChol                category
CholCheck               category
BMI                      float64
Smoker                  category
HeartDiseaseorAttack    category
PhysActivity            category
Fruits                  category
Veggies                 category
HvyAlcoholConsump       category
GenHlth                 category
MentHlth                 float64
PhysHlth                 float64
DiffWalk                category
Stroke                  category
HighBP                  category
Diabetes                category
dtype: object

In [11]:
# Train Test Split
features = raw_diabetes.drop('Diabetes', axis=1)
target = raw_diabetes['Diabetes']

In [12]:
train_x, test_x, train_y, test_y = train_test_split(features, target, test_size=0.2, random_state=42)

In [13]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((51216, 17), (51216,), (12804, 17), (12804,))

In [20]:
# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score

In [15]:
rf = RandomForestClassifier(n_estimators=200)
lr = LogisticRegression()
svc = SVC()

In [16]:
# random forest
rf.fit(train_x, train_y)
pred_y = rf.predict(test_x)
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

         0.0       0.69      0.64      0.66      6055
         1.0       0.70      0.74      0.72      6749

    accuracy                           0.69     12804
   macro avg       0.69      0.69      0.69     12804
weighted avg       0.69      0.69      0.69     12804



In [17]:
# logistic regression
lr.fit(train_x, train_y)
pred_y = lr.predict(test_x)
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

         0.0       0.73      0.69      0.71      6055
         1.0       0.74      0.76      0.75      6749

    accuracy                           0.73     12804
   macro avg       0.73      0.73      0.73     12804
weighted avg       0.73      0.73      0.73     12804



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# support vector classifier
svc.fit(train_x, train_y)
pred_y = svc.predict(test_x)
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

         0.0       0.75      0.66      0.70      6055
         1.0       0.73      0.80      0.76      6749

    accuracy                           0.74     12804
   macro avg       0.74      0.73      0.73     12804
weighted avg       0.74      0.74      0.73     12804



In [19]:
# knn
from sklearn.neighbors import KNeighborsClassifier


In [21]:
for n in range(4, 12):
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(train_x, train_y)
    pred_y = knn.predict(test_x)
    print(f"{n}: {accuracy_score(test_y, pred_y)}")

4: 0.6612777257107154
5: 0.6819743830053109
6: 0.6758825367072789
7: 0.6915807560137457
8: 0.6869728209934396
9: 0.7037644486098095
10: 0.6961886910340519
11: 0.7061855670103093
