In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
# Load dataset
ds = load_dataset("marianeft/diabetes_prediction_dataset")

# Usually datasets have 'train' split; convert to pandas DataFrame
df = pd.DataFrame(ds['train'])
print(df.head())
print(df.info())
print(df.describe())


   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age               

In [80]:
# Gender mapping: Female=1, Male=0
df['gender'] = df['gender'].map({'Female': 1, 'Male': 0})

# Smoking History mapping
df['smoking_history'] = df['smoking_history'].map({
    'never': 0,
    'current': 1,
    'former': 2,
    'No Info': 3
})



In [81]:
print(df[['gender','smoking_history']].isnull().sum())



gender                18
smoking_history    10451
dtype: int64


In [82]:
# Numeric columns → median
numeric_cols = df.select_dtypes(include=['int64','float64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Categorical columns (gender, smoking_history) → mode
categorical_cols = ['gender','smoking_history']
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])


In [83]:
print(df[['gender','smoking_history']].isnull().sum())

gender             0
smoking_history    0
dtype: int64


In [84]:
X = df.drop(columns=['diabetes'])
Y = df['diabetes']
print(X)
print(Y)

       gender   age  hypertension  heart_disease  smoking_history    bmi  \
0         1.0  80.0             0              1              0.0  25.19   
1         1.0  54.0             0              0              3.0  27.32   
2         0.0  28.0             0              0              0.0  27.32   
3         1.0  36.0             0              0              1.0  23.45   
4         0.0  76.0             1              1              1.0  20.14   
...       ...   ...           ...            ...              ...    ...   
99995     1.0  80.0             0              0              3.0  27.32   
99996     1.0   2.0             0              0              3.0  17.37   
99997     0.0  66.0             0              0              2.0  27.83   
99998     1.0  24.0             0              0              0.0  35.42   
99999     1.0  57.0             0              0              1.0  22.43   

       HbA1c_level  blood_glucose_level  
0              6.6                  140  
1  

In [85]:
scalar=StandardScaler()
scalar.fit(X)
standardized_data=scalar.transform(X)
print(standardized_data)
X=standardized_data
Y=df['diabetes']
print(X)
print(Y)

[[ 8.41046203e-01  1.69270354e+00 -2.84439447e-01 ... -3.21055792e-01
   1.00170572e+00  4.77042159e-02]
 [ 8.41046203e-01  5.38006427e-01 -2.84439447e-01 ... -1.15583678e-04
   1.00170572e+00 -1.42620999e+00]
 [-1.18899532e+00 -6.16690686e-01 -2.84439447e-01 ... -1.15583678e-04
   1.61108022e-01  4.89878478e-01]
 ...
 [-1.18899532e+00  1.07094356e+00 -2.84439447e-01 ...  7.67292549e-02
   1.61108022e-01  4.16182767e-01]
 [ 8.41046203e-01 -7.94336396e-01 -2.84439447e-01 ...  1.22036126e+00
  -1.42668764e+00 -9.34905254e-01]
 [ 8.41046203e-01  6.71240710e-01 -2.84439447e-01 ... -7.36921977e-01
   1.00170572e+00 -1.18055762e+00]]
[[ 8.41046203e-01  1.69270354e+00 -2.84439447e-01 ... -3.21055792e-01
   1.00170572e+00  4.77042159e-02]
 [ 8.41046203e-01  5.38006427e-01 -2.84439447e-01 ... -1.15583678e-04
   1.00170572e+00 -1.42620999e+00]
 [-1.18899532e+00 -6.16690686e-01 -2.84439447e-01 ... -1.15583678e-04
   1.61108022e-01  4.89878478e-01]
 ...
 [-1.18899532e+00  1.07094356e+00 -2.8443944

In [86]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [87]:
print(X.shape,X_train.shape,X_test.shape)

(100000, 8) (80000, 8) (20000, 8)


SVM MODEL


In [88]:
classifier=svm.SVC(kernel='linear')
classifier.fit(X_train,Y_train)



In [89]:
# Training predictions
Y_train_pred = classifier.predict(X_train)
training_acc = accuracy_score(Y_train, Y_train_pred)

# Testing predictions
Y_pred = classifier.predict(X_test)
testing_acc = accuracy_score(Y_test, Y_pred)

print("Training Accuracy:", training_acc)
print("Testing Accuracy:", testing_acc)

# Confusion Matrix
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:\n", cm)

# Classification Report
report = classification_report(Y_test, Y_pred)
print("Classification Report:\n", report)


Training Accuracy: 0.9607
Testing Accuracy: 0.9618
Confusion Matrix:
 [[18206    94]
 [  670  1030]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98     18300
           1       0.92      0.61      0.73      1700

    accuracy                           0.96     20000
   macro avg       0.94      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000



KNN


In [90]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Agar scaling use kiya hai
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN classifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train_scaled, Y_train)

# Training predictions
Y_train_pred = classifier.predict(X_train_scaled)
training_acc_kn = accuracy_score(Y_train, Y_train_pred)

# Testing predictions
Y_pred = classifier.predict(X_test_scaled)
testing_acc_kn = accuracy_score(Y_test, Y_pred)

print("Training Accuracy:", training_acc_kn)
print("Testing Accuracy:", testing_acc_kn)

# Confusion Matrix
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:\n", cm)

# Classification Report
report = classification_report(Y_test, Y_pred)
print("Classification Report:\n", report)

# -------------------------------
# ✅ Single Input Prediction System
# Input data example (order of features same as X columns)
input_data = (1, 50, 0, 0, 0, 27.5, 5.8, 120)  # gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level

# Convert to numpy array and reshape
input_data_as_np = np.asarray(input_data)
input_data_reshaped = input_data_as_np.reshape(1, -1)

# Scale input data (same scaler as training)
input_scaled = scaler.transform(input_data_reshaped)

# Prediction
prediction = classifier.predict(input_scaled)
print("Prediction:", prediction)

# Interpret result
if prediction[0] == 0:
    print("Not Diabetic")
else:
    print("Diabetic")


Training Accuracy: 0.969525
Testing Accuracy: 0.9634
Confusion Matrix:
 [[18147   153]
 [  579  1121]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     18300
           1       0.88      0.66      0.75      1700

    accuracy                           0.96     20000
   macro avg       0.92      0.83      0.87     20000
weighted avg       0.96      0.96      0.96     20000

Prediction: [1]
Diabetic


SVM


In [91]:
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# 1️⃣ Model banaye (SVM with linear kernel)
classifier = svm.SVC(kernel='linear')

# 2️⃣ Training
classifier.fit(X_train, Y_train)

# 3️⃣ Training predictions
Y_train_pred = classifier.predict(X_train)
training_acc_svm = accuracy_score(Y_train, Y_train_pred)

# 4️⃣ Testing predictions
Y_pred = classifier.predict(X_test)
testing_acc_svm = accuracy_score(Y_test, Y_pred)

print("Training Accuracy:", training_acc)
print("Testing Accuracy:", testing_acc)

# 5️⃣ Confusion Matrix
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:\n", cm)

# 6️⃣ Classification Report
report = classification_report(Y_test, Y_pred)
print("Classification Report:\n", report)
# ✅ Single Input Prediction System
# Input data example (order of features same as X columns)
input_data = (1, 50, 0, 0, 0, 27.5, 5.8, 120)  # gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level

# Convert to numpy array and reshape
input_data_as_np = np.asarray(input_data)
input_data_reshaped = input_data_as_np.reshape(1, -1)

# Scale input data (same scaler as training)
input_scaled = scaler.transform(input_data_reshaped)

# Prediction
prediction = classifier.predict(input_scaled)
print("Prediction:", prediction)

# Interpret result
if prediction[0] == 0:
    print("Not Diabetic")
else:
    print("Diabetic")


Training Accuracy: 0.9607
Testing Accuracy: 0.9618
Confusion Matrix:
 [[18206    94]
 [  670  1030]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98     18300
           1       0.92      0.61      0.73      1700

    accuracy                           0.96     20000
   macro avg       0.94      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000

Prediction: [1]
Diabetic


Decision Tree


In [92]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# 1️⃣ Model banaye (Decision Tree)
classifier = DecisionTreeClassifier(random_state=42)  # random_state for reproducibility

# 2️⃣ Training
classifier.fit(X_train, Y_train)

# 3️⃣ Training predictions
Y_train_pred = classifier.predict(X_train)
training_acc_dt = accuracy_score(Y_train, Y_train_pred)

# 4️⃣ Testing predictions
Y_pred = classifier.predict(X_test)
testing_acc_dt = accuracy_score(Y_test, Y_pred)

print("Training Accuracy:", training_acc_dt)
print("Testing Accuracy:", testing_acc_dt)

# 5️⃣ Confusion Matrix
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:\n", cm)

# 6️⃣ Classification Report
report = classification_report(Y_test, Y_pred)
print("Classification Report:\n", report)
# ✅ Single Input Prediction System
# Input data example (order of features same as X columns)
input_data = (1, 50, 0, 0, 0, 27.5, 5.8, 120)  # gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level

# Convert to numpy array and reshape
input_data_as_np = np.asarray(input_data)
input_data_reshaped = input_data_as_np.reshape(1, -1)

# Scale input data (same scaler as training)
input_scaled = scaler.transform(input_data_reshaped)

# Prediction
prediction = classifier.predict(input_scaled)
print("Prediction:", prediction)

# Interpret result
if prediction[0] == 0:
    print("Not Diabetic")
else:
    print("Diabetic")


Training Accuracy: 0.99915
Testing Accuracy: 0.952
Confusion Matrix:
 [[17760   540]
 [  420  1280]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97     18300
           1       0.70      0.75      0.73      1700

    accuracy                           0.95     20000
   macro avg       0.84      0.86      0.85     20000
weighted avg       0.95      0.95      0.95     20000

Prediction: [1]
Diabetic


In [93]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# ------------------------------
# 1️⃣ Logistic Regression Model
# ------------------------------
logistic_model = LogisticRegression(max_iter=500)
logistic_model.fit(X_train_scaled, Y_train)  # scaled data use karo

# Training & Testing Predictions
Y_train_pred_log = logistic_model.predict(X_train_scaled)
Y_test_pred_log = logistic_model.predict(X_test_scaled)

# Accuracy
train_acc_log = accuracy_score(Y_train, Y_train_pred_log)
test_acc_log = accuracy_score(Y_test, Y_test_pred_log)

print("----- Logistic Regression -----")
print("Training Accuracy:", train_acc_log)
print("Testing Accuracy:", test_acc_log)
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_test_pred_log))
print("Classification Report:\n", classification_report(Y_test, Y_test_pred_log))

# Single Input Prediction
input_data = (1, 50, 0, 0, 0, 27.5, 5.8, 120)
input_np = np.array(input_data).reshape(1, -1)
input_scaled = scaler.transform(input_np)

prediction_log = logistic_model.predict(input_scaled)
print("Logistic Regression Prediction:", "Diabetic" if prediction_log[0] else "Not Diabetic")

# ------------------------------
# 2️⃣ Random Forest Classifier
# ------------------------------
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, Y_train)  # Random Forest usually scaling nahi chahiye

# Training & Testing Predictions
Y_train_pred_rf = rf_model.predict(X_train)
Y_test_pred_rf = rf_model.predict(X_test)

# Accuracy
train_acc_rf = accuracy_score(Y_train, Y_train_pred_rf)
test_acc_rf = accuracy_score(Y_test, Y_test_pred_rf)

print("\n----- Random Forest -----")
print("Training Accuracy:", train_acc_rf)
print("Testing Accuracy:", test_acc_rf)
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_test_pred_rf))
print("Classification Report:\n", classification_report(Y_test, Y_test_pred_rf))

# Single Input Prediction
prediction_rf = rf_model.predict(input_np)  # raw input, scaling not needed
print("Random Forest Prediction:", "Diabetic" if prediction_rf[0] else "Not Diabetic")


----- Logistic Regression -----
Training Accuracy: 0.959975
Testing Accuracy: 0.96165
Confusion Matrix:
 [[18120   180]
 [  587  1113]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     18300
           1       0.86      0.65      0.74      1700

    accuracy                           0.96     20000
   macro avg       0.91      0.82      0.86     20000
weighted avg       0.96      0.96      0.96     20000

Logistic Regression Prediction: Diabetic

----- Random Forest -----
Training Accuracy: 0.9991375
Testing Accuracy: 0.9712
Confusion Matrix:
 [[18226    74]
 [  502  1198]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     18300
           1       0.94      0.70      0.81      1700

    accuracy                           0.97     20000
   macro avg       0.96      0.85      0.90     20000
weighted avg       0.97      0.97      0.97  

In [94]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# ------------------------------
# 1️⃣ Gaussian Naive Bayes Model
# ------------------------------
nb_model = GaussianNB()
nb_model.fit(X_train, Y_train)  # raw numeric data use karo, scaling optional

# Training & Testing Predictions
Y_train_pred_nb = nb_model.predict(X_train)
Y_test_pred_nb = nb_model.predict(X_test)

# Accuracy
train_acc_nb = accuracy_score(Y_train, Y_train_pred_nb)
test_acc_nb = accuracy_score(Y_test, Y_test_pred_nb)

print("----- Naive Bayes -----")
print("Training Accuracy:", train_acc_nb)
print("Testing Accuracy:", test_acc_nb)
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_test_pred_nb))
print("Classification Report:\n", classification_report(Y_test, Y_test_pred_nb))

# ------------------------------
# 2️⃣ Single Input Prediction
# ------------------------------
input_data = (1, 50, 0, 0, 0, 27.5, 5.8, 120)  # gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c, glucose
input_np = np.array(input_data).reshape(1, -1)

# Logistic Regression prediction (scaled)
input_scaled = scaler.transform(input_np)
prediction_log = logistic_model.predict(input_scaled)
print("Logistic Regression Prediction:", "Diabetic" if prediction_log[0] else "Not Diabetic")

# Naive Bayes prediction (raw input)
prediction_nb = nb_model.predict(input_np)
print("Naive Bayes Prediction:", "Diabetic" if prediction_nb[0] else "Not Diabetic")


----- Naive Bayes -----
Training Accuracy: 0.9034
Testing Accuracy: 0.90475
Confusion Matrix:
 [[16974  1326]
 [  579  1121]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.93      0.95     18300
           1       0.46      0.66      0.54      1700

    accuracy                           0.90     20000
   macro avg       0.71      0.79      0.74     20000
weighted avg       0.92      0.90      0.91     20000

Logistic Regression Prediction: Diabetic
Naive Bayes Prediction: Diabetic


In [95]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# ------------------------------
# 1️⃣ Continuous features ko discretize karna
# ------------------------------
# n_bins=5, encode='ordinal'
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
X_train_cat = discretizer.fit_transform(X_train)
X_test_cat = discretizer.transform(X_test)

# ------------------------------
# 2️⃣ Categorical Naive Bayes Model
# ------------------------------
cat_nb_model = CategoricalNB()
cat_nb_model.fit(X_train_cat, Y_train)

# Training & Testing Predictions
Y_train_pred_cat = cat_nb_model.predict(X_train_cat)
Y_test_pred_cat = cat_nb_model.predict(X_test_cat)

# Accuracy
train_acc_cat = accuracy_score(Y_train, Y_train_pred_cat)
test_acc_cat = accuracy_score(Y_test, Y_test_pred_cat)

print("----- Categorical Naive Bayes -----")
print("Training Accuracy:", train_acc_cat)
print("Testing Accuracy:", test_acc_cat)
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_test_pred_cat))
print("Classification Report:\n", classification_report(Y_test, Y_test_pred_cat))

# ------------------------------
# 3️⃣ Single Input Prediction
# ------------------------------
input_data = (1, 50, 0, 0, 0, 27.5, 5.8, 120)
input_np = np.array(input_data).reshape(1, -1)

# Discretize input same way as training
input_data_cat = discretizer.transform(input_np)

# Prediction
prediction_cat = cat_nb_model.predict(input_data_cat)
print("Categorical Naive Bayes Prediction:", "Diabetic" if prediction_cat[0] else "Not Diabetic")


----- Categorical Naive Bayes -----
Training Accuracy: 0.9662625
Testing Accuracy: 0.96805
Confusion Matrix:
 [[18112   188]
 [  451  1249]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98     18300
           1       0.87      0.73      0.80      1700

    accuracy                           0.97     20000
   macro avg       0.92      0.86      0.89     20000
weighted avg       0.97      0.97      0.97     20000

Categorical Naive Bayes Prediction: Diabetic


In [102]:
import pandas as pd

models = ["KNN", "SVM", "Decision Tree", "Logistic Regression",
          "Random Forest", "Gaussian NB", "Categorical NB"]

training_accuracies = [training_acc, training_acc_svm, training_acc_dt, train_acc_log,
                       train_acc_rf, train_acc_nb, train_acc_cat]

testing_accuracies = [testing_acc, testing_acc_svm, testing_acc_dt, test_acc_log,
                      test_acc_rf, test_acc_nb, test_acc_cat]

comparison_df = pd.DataFrame({
    "Model": models,
    "Training Accuracy": training_accuracies,
    "Testing Accuracy": testing_accuracies
})

comparison_df["Training Accuracy"] = comparison_df["Training Accuracy"].round(2)
comparison_df["Testing Accuracy"] = comparison_df["Testing Accuracy"].round(2)

comparison_df = comparison_df.sort_values(by="Testing Accuracy", ascending=False)

print("----- Model Comparison -----")
print(comparison_df)

best_model = comparison_df.iloc[0]["Model"]
best_accuracy = comparison_df.iloc[0]["Testing Accuracy"]
print(f"\nBest Model: {best_model} with Testing Accuracy: {best_accuracy}")


----- Model Comparison -----
                 Model  Training Accuracy  Testing Accuracy
4        Random Forest               1.00              0.97
6       Categorical NB               0.97              0.97
0                  KNN               0.96              0.96
3  Logistic Regression               0.96              0.96
1                  SVM               0.96              0.96
2        Decision Tree               1.00              0.95
5          Gaussian NB               0.90              0.90

Best Model: Random Forest with Testing Accuracy: 0.97
