In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
df=pd.read_csv('encoded_data.csv')

In [3]:
x = df.drop('Prediabetes Status', axis=1)
y = df['Prediabetes Status']


smote = SMOTE(random_state=42)
x_amplified, y_amplified = smote.fit_resample(x, y)

In [4]:
x_amplified

Unnamed: 0,Age,Gender,BMI,Waist Circumference (cm),Physical Activity Level,Frequent Urination,Increased Thirst,Increased Hunger,Excessive Weight Loss (Past 6 Months),Blurred Vision (Recent),...,Dry Mouth,Tingling/Numbness in Hands/Feet,Frequent Skin Infections (Past Year),Darkening Skin Folds (In Armpits/Neck),Difficulty Concentrating,Irritability,Erectile Dysfunction (Male) / Irregular Periods (Female),Delayed Wound Healing (After Minor Injury),Poor Night Sleep,Increased Belly Fat
0,0.893844,0,0.811899,0.897333,2,1,0,0,0,0,...,0,0,0,0,0,2,1,0,0,1
1,0.368053,0,0.538871,0.548370,3,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,0.499501,0,0.287398,0.348963,3,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.262895,0,0.380802,0.448667,3,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.315474,0,0.423912,0.498518,3,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,0.547495,0,0.665294,0.716666,3,0,0,0,0,0,...,0,0,0,0,0,2,1,0,0,1
262,0.411333,1,0.428394,0.528606,3,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
263,0.399426,1,0.416307,0.513092,3,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
264,0.482227,1,0.537540,0.595533,3,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(x_amplified, y_amplified, test_size=0.2, random_state=42)

In [6]:
model = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_leaf=5, 
                                       max_features='sqrt', random_state=42)

In [7]:
model.fit(X_train,y_train)

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

# Make predictions on the training and test sets
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# Precision, recall, and F1-score for training data
train_precision = precision_score(y_train, train_predictions)
train_recall = recall_score(y_train, train_predictions)
train_f1_score = f1_score(y_train, train_predictions)
train_accuracy = model.score(X_train, y_train)



# Precision, recall, and F1-score for test data
test_precision = precision_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1_score = f1_score(y_test, test_predictions)
test_accuracy = model.score(X_test, y_test)

# Confusion matrix for training data
train_conf_matrix = confusion_matrix(y_train, train_predictions)

# Confusion matrix for test data
test_conf_matrix = confusion_matrix(y_test, test_predictions)

# Print evaluation metrics

print(f"Training Accuracy: {train_accuracy:.2f}")
print("Training Precision:", train_precision)
print("Training Recall:", train_recall)
print("Training F1-score:", train_f1_score)
print(f"\nTest Accuracy: {test_accuracy:.2f}")
print("Test Precision:", test_precision)
print("Test Recall:", test_recall)
print("Test F1-score:", test_f1_score)
print("\nConfusion Matrix for Training Data:")
print(train_conf_matrix)
print("\nConfusion Matrix for Test Data:")
print(test_conf_matrix)

Training Accuracy: 0.93
Training Precision: 0.9494949494949495
Training Recall: 0.9038461538461539
Training F1-score: 0.9261083743842364

Test Accuracy: 0.87
Test Precision: 0.9230769230769231
Test Recall: 0.8275862068965517
Test F1-score: 0.8727272727272727

Confusion Matrix for Training Data:
[[103   5]
 [ 10  94]]

Confusion Matrix for Test Data:
[[23  2]
 [ 5 24]]
