# Multiclass Diabetes Classification

This notebook demonstrates loading the diabetes dataset, preprocessing the data, building three machine-learning models (SVM, decision tree and KNN) and evaluating their performance. It accompanies the technical report and presentation.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
%matplotlib inline


## Load the dataset

In [None]:
# Location of the raw CSV. The space before ".csv" is part of the path
# literal this notebook was written with; keep it unless the file is renamed.
DATA_PATH = '/content/sample_data/Dataset of Diabetes .csv'
df = pd.read_csv(DATA_PATH)

# Normalise the target labels: cast to str and trim surrounding whitespace so
# that variants such as 'Y ' and 'Y' collapse into a single class.
df['CLASS'] = df['CLASS'].astype(str).str.strip()

# Normalise gender the same way (plus upper-casing), then encode F -> 0, M -> 1.
gender_clean = df['Gender'].astype(str).str.strip().str.upper()
gender_codes = gender_clean.map({'F': 0, 'M': 1})

# Series.map yields NaN for any value missing from the mapping. Fail here and
# name the offending values instead of letting NaNs flow into later cells.
unmapped = gender_clean[gender_codes.isna()].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected Gender value(s): {sorted(unmapped)}")
df['Gender'] = gender_codes

print('Dataset shape:', df.shape)
df.head()

## Class distribution

In [None]:
# Number of rows for each distinct value of the CLASS column.
counts = df['CLASS'].value_counts()
print(counts)

# Bar chart of the same counts, with one listed colour per bar.
counts.plot.bar(
    color=['#083D77', '#F4D35E', '#EE964B'],
    title='Class Distribution',
)
plt.show()

## Feature correlation heatmap

In [None]:
# Pairwise correlations between the remaining columns once ID, No_Pation
# (presumably record identifiers) and the CLASS target are removed.
corr = df.drop(columns=['ID', 'No_Pation', 'CLASS']).corr()

# Annotated heatmap of the correlation matrix on a 10x8 inch figure.
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Feature Correlation Heatmap')
plt.show()

## Preprocessing and train/test split

In [None]:
# Features: every column except the target and the ID / No_Pation columns.
X = df.drop(columns=['CLASS', 'ID', 'No_Pation'])
y = df['CLASS']

# Encode the string class labels as integers. label_enc.classes_ keeps the
# original label names, which later cells use for reports and plot ticks.
label_enc = LabelEncoder().fit(y)
y_enc = label_enc.transform(y)

# Hold out 20% of the rows for testing, stratified on the encoded label,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    stratify=y_enc,
    random_state=42,
)


## Train models and evaluate

In [None]:

# Metrics collected per model, keyed by the model's display name.
results = {}

# RBF-kernel SVM. Scaling lives inside the pipeline, so the scaler is fitted
# only on whatever data the pipeline itself is fitted on.
model = SVC(kernel='rbf', gamma='scale', C=1.0)
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', model),
])

# Hold-out evaluation: fit on the training split, predict on the test split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_enc.classes_)
cm = confusion_matrix(y_test, y_pred)

# 5-fold stratified cross-validation of the same pipeline over the full
# dataset (X, y_enc), shuffled with a fixed seed.
cv_scores = cross_val_score(
    pipeline,
    X,
    y_enc,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy',
)

# Store everything the later plotting cell needs under the model's name.
results['SVM (RBF)'] = dict(
    accuracy=acc,
    classification_report=report,
    confusion_matrix=cm,
    cv_scores=cv_scores,
)

print("=== SVM (RBF) ===")
print("Test accuracy:", acc)
print("Cross-val mean accuracy:", cv_scores.mean())
print("Classification report:\n", report)
print("Confusion matrix:\n", cm)

## Confusion matrices

In [None]:
# One panel per evaluated model. The grid is sized from `results` rather than
# hard-coded to three columns, so a single model does not leave empty axes
# and models beyond the third are not silently dropped by zip().
# max(..., 1) keeps plt.subplots valid even if `results` is empty.
n_panels = max(len(results), 1)

# squeeze=False always returns a 2-D array of Axes, so flattening works for
# a single panel too. 5 inches per panel matches the original 15x4 layout
# for three models.
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), squeeze=False)

for ax, (name, res) in zip(axes.ravel(), results.items()):
    # The matrix is passed directly rather than rebound to `cm`, so the `cm`
    # computed in the training cell is not overwritten.
    sns.heatmap(
        res['confusion_matrix'],
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=label_enc.classes_,
        yticklabels=label_enc.classes_,
        ax=ax,
    )
    ax.set_title(name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()