In [None]:
# 1. Data Loading & Exploration
# Read the diabetes CSV into `df` and print a quick overview of it:
# first rows, column dtypes / non-null counts, summary statistics and
# per-column NaN counts.
import pandas as pd

# Load CSV
# NOTE(review): absolute '/content/...' path looks environment-specific
# (e.g. a hosted notebook) -- confirm before running elsewhere.
df = pd.read_csv('/content/diabetes.csv')

print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())
# isnull() only counts NaN values.
# NOTE(review): the sample rows show 0 in SkinThickness / Insulin; if 0 is a
# placeholder for "missing", this check will not reveal it -- confirm.
print(df.isnull().sum())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768

In [None]:
# 2. Preprocessing
# Separate the target from the predictors and standardize the predictors.
from sklearn.preprocessing import StandardScaler

# 'Outcome' is the label column; every other column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Learn per-column mean / variance over every row of X, then apply the
# standardization to those same rows.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
# 3. Train-Test Split
# Produces X_train / X_test (standardized arrays) and y_train / y_test.
from sklearn.model_selection import train_test_split

# Split the *unscaled* features first. Standardizing before the split lets the
# held-out rows contribute to the mean / variance used on the training rows
# (data leakage), which makes the test metrics optimistic.
# Same test_size and random_state as before, so the row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that one transformation
# to both partitions. `scaler` is rebound on purpose: it is the object that is
# persisted for inference, so it has to be the one the models were trained with.
# (StandardScaler is imported in the preprocessing cell.)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [None]:
# 4. Model Training
# Fit four classifiers on the training split: a decision tree, a random
# forest, an SVM (with probability estimates enabled) and XGBoost.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Every estimator below has a stochastic component; seeding them makes the
# fitted models (and therefore the reported metrics) repeatable across a
# fresh kernel run. Same seed value as the train/test split.
RANDOM_STATE = 42

clf_dt = DecisionTreeClassifier(random_state=RANDOM_STATE).fit(X_train, y_train)
clf_rf = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_train, y_train)
# probability=True enables predict_proba; its internal calibration is
# randomized, hence the seed here as well.
clf_svm = SVC(probability=True, random_state=RANDOM_STATE).fit(X_train, y_train)
# `use_label_encoder` is dropped: the installed XGBoost ignores it and only
# emits a 'Parameters: { "use_label_encoder" } are not used.' warning.
clf_xgb = XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE).fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.



In [None]:
# 5. Evaluation
# Report, for each fitted classifier, its accuracy on the test split followed
# by the per-class precision / recall / F1 table.
from sklearn.metrics import accuracy_score, classification_report

# Display label -> fitted estimator, in reporting order.
models = dict(zip(
    ['Decision Tree', 'Random Forest', 'SVM', 'XGBoost'],
    [clf_dt, clf_rf, clf_svm, clf_xgb],
))

for name in models:
    model = models[name]
    preds = model.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, preds):.4f}")
    print(classification_report(y_test, preds))


Decision Tree Accuracy: 0.7597
              precision    recall  f1-score   support

           0       0.84      0.77      0.80        99
           1       0.64      0.75      0.69        55

    accuracy                           0.76       154
   macro avg       0.74      0.76      0.75       154
weighted avg       0.77      0.76      0.76       154

Random Forest Accuracy: 0.7403
              precision    recall  f1-score   support

           0       0.78      0.83      0.80        99
           1       0.65      0.58      0.62        55

    accuracy                           0.74       154
   macro avg       0.72      0.71      0.71       154
weighted avg       0.74      0.74      0.74       154

SVM Accuracy: 0.7273
              precision    recall  f1-score   support

           0       0.77      0.82      0.79        99
           1       0.63      0.56      0.60        55

    accuracy                           0.73       154
   macro avg       0.70      0.69      0.70  

In [None]:
# Evaluation, with accuracy shown as a percentage
# Same per-model report as the fractional-accuracy evaluation, but the
# headline accuracy is scaled to 0-100 and printed with two decimals.
from sklearn.metrics import accuracy_score, classification_report

# Display label -> fitted estimator, in reporting order.
models = {}
models['Decision Tree'] = clf_dt
models['Random Forest'] = clf_rf
models['SVM'] = clf_svm
models['XGBoost'] = clf_xgb

for name, model in zip(models.keys(), models.values()):
    preds = model.predict(X_test)
    # Fraction of correct predictions, expressed as a percentage.
    acc = 100 * accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {acc:.2f}%")
    print(classification_report(y_test, preds))


Decision Tree Accuracy: 75.97%
              precision    recall  f1-score   support

           0       0.84      0.77      0.80        99
           1       0.64      0.75      0.69        55

    accuracy                           0.76       154
   macro avg       0.74      0.76      0.75       154
weighted avg       0.77      0.76      0.76       154

Random Forest Accuracy: 74.03%
              precision    recall  f1-score   support

           0       0.78      0.83      0.80        99
           1       0.65      0.58      0.62        55

    accuracy                           0.74       154
   macro avg       0.72      0.71      0.71       154
weighted avg       0.74      0.74      0.74       154

SVM Accuracy: 72.73%
              precision    recall  f1-score   support

           0       0.77      0.82      0.79        99
           1       0.63      0.56      0.60        55

    accuracy                           0.73       154
   macro avg       0.70      0.69      0.70  

In [None]:
# Model Saving (to use in Flask)
# Serialize each fitted classifier and the scaler to its own .joblib file in
# the current working directory.
import joblib

for estimator, filename in (
    (clf_dt, 'diabetes_model.joblib'),
    (clf_rf, 'diabetes_model_rfc.joblib'),
    (clf_xgb, 'diabetes_model_xgb.joblib'),
    (clf_svm, 'diabetes_model_svm.joblib'),
):
    joblib.dump(estimator, filename)

# The models were trained on scaled features, so the scaler is saved too: it
# is needed to transform inputs the same way at prediction time. Kept as the
# cell's trailing expression so its return value is still displayed.
joblib.dump(scaler, 'diabetes_scaler.joblib')
