In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import TomekLinks

In [2]:
# Read the diabetes table, then separate the label column from the features.
df = pd.read_csv("dataset/diabetes.csv")

# Y is the 'Outcome' column; X is every remaining column.
Y = df["Outcome"]
X = df.drop(columns=["Outcome"])

In [3]:
tl = TomekLinks()
X, Y = tl.fit_resample(X, Y)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)


In [5]:
# Learn the standardisation statistics from the training features only,
# then apply that same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [6]:
# Fit a seeded 100-tree random forest on the training split and predict
# labels for the test split.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-split metrics: overall accuracy, confusion matrix, and the
# per-class report in dict form (for programmatic access).
accuracy, conf_matrix, class_report = (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred, output_dict=True),
)

# Gather the metrics under the model's display name.
rf_metrics = {
    "Accuracy": accuracy,
    "Confusion Matrix": conf_matrix,
    "Classification Report": class_report,
}
results = {"Random Forest": rf_metrics}

# Human-readable summary; the text form of the report is generated
# separately because `class_report` above is a dict.
print(f"\nRandom Forest Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Random Forest Accuracy: 0.8182

Confusion Matrix:
 [[76 13]
 [13 41]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85        89
           1       0.76      0.76      0.76        54

    accuracy                           0.82       143
   macro avg       0.81      0.81      0.81       143
weighted avg       0.82      0.82      0.82       143



In [7]:
import pickle

# Persist the trained classifier to disk.
# NOTE(review): `model` was fit on features transformed by `scaler`, but only
# the model is written here. Code that loads this file must apply the same
# fitted scaler to its inputs -- confirm how consumers obtain it.
filename = 'diabetes_model.sav'

# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the bare open() call previously leaked the handle.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)