<h1>Diabetes Prediction Model - Random Forest Method</h1>

In [None]:
%pip install pandas numpy seaborn matplotlib scipy scikit-learn imbalanced-learn

In [2]:
import pandas as pd

## Loading the raw diabetes dataset from the CSV file next to the notebook
df = pd.read_csv(filepath_or_buffer="diabetes_prediction_dataset.csv")

In [None]:
## Previewing the first rows of the dataset
df.head()

In [None]:
## Listing the columns, their dtypes and non-null counts of the raw dataset
df.info()

In [None]:
## Summary statistics of the dataset's columns
df.describe()

In [None]:
## Counting missing values per column.
## A bare df.isnull() only shows a truncated True/False frame of the same size as
## the data, which cannot tell whether any value is actually missing.
df.isnull().sum()

In [7]:
## One-hot encoding (OHE) of the categorical columns, gender & smoking_history.
## drop_first=True removes the first level of each encoded column, which then
## serves as the implicit baseline category.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
## Re-checking the columns and dtypes after one-hot encoding
df.info()

In [9]:
from sklearn.model_selection import train_test_split

## Dividing the dataset into independent and dependent features.
## X (independent features) is every column except the "diabetes" target.
X = df.drop(columns="diabetes")

In [10]:
## Dependent feature (target): the "diabetes" column
y = df.loc[:, "diabetes"]

In [11]:
## Holding out 20% of the rows as a test set; random_state fixes the shuffle so
## the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [12]:
## Standard scaling: a feature scaling technique that standardizes each feature by
## removing the mean and scaling to unit variance.
from sklearn.preprocessing import StandardScaler

# Unfitted scaler; it is fitted on the training split in the next cell.
scaler = StandardScaler()

In [13]:
## Fit the scaler on the training split only, then apply that same transformation
## to the test split, so no test-set statistics are used during fitting.
## NOTE: X_train / X_test are overwritten by their scaled versions, so re-running
## this cell on its own would re-fit the scaler on already-scaled data.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
## Importing the Random Forest algorithm
from sklearn.ensemble import RandomForestClassifier

## Initializing the Random Forest: class_weight="balanced" compensates for the
## imbalanced classes, random_state makes the fitted forest reproducible.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight="balanced",
)

In [None]:
## Training the Random Forest on the scaled training split
rf.fit(X_train, y_train)

In [16]:
## Making predictions on the (scaled) held-out test data
y_pred = rf.predict(X_test)

In [None]:
## Evaluating the model on the held-out test set: overall accuracy plus the
## per-class precision / recall / F1 report.
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy {accuracy * 100:.2f}%")

test_report = classification_report(y_test, y_pred)
print(f"Classification report:\n{test_report}")

<h5>Predicting on Unseen Data</h5>

In [18]:
## Six hand-written patient records in the raw (pre-encoding) schema of the dataset.
feature_names = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "smoking_history",
    "bmi",
    "HbA1c_level",
    "blood_glucose_level",
]

# One tuple per patient, values in the same order as feature_names.
patient_records = [
    ("Male", 50, 0, 1, "former", 28.5, 6.3, 140),
    ("Female", 42, 1, 0, "never", 31.2, 5.9, 120),
    ("Male", 63, 1, 1, "current", 34.0, 7.2, 160),
    ("Female", 56, 0, 0, "former", 29.4, 5.7, 110),
    ("Male", 38, 0, 0, "never", 25.8, 5.6, 100),
    ("Female", 36, 1, 1, "never", 29.71, 10.3, 130),
]

unseen_data = pd.DataFrame(patient_records, columns=feature_names)

In [None]:
## Checking the columns and dtypes of the hand-written records
unseen_data.info()

In [20]:
## One-hot encode unseen_data WITHOUT drop_first. The level that drop_first removes
## depends on which categories happen to be present in the frame being encoded, so
## on this small sample it would drop a real category (e.g. "current" smokers) and
## those rows would end up encoded as the training baseline instead.
## The reindex step that follows keeps only the columns used in training.
unseen_data = pd.get_dummies(unseen_data)

In [21]:
## Feature columns in the exact order the scaler and the model were fitted on.
## They are read from X (the training feature frame) rather than from a
## hand-maintained list, so the unseen data always follows the training schema.
columns = X.columns.tolist()

## Align unseen_data with the training columns: dummy columns for categories that
## do not occur in unseen_data are added and filled with 0, and any column that
## was not part of training is dropped.
unseen_data = unseen_data.reindex(columns=columns, fill_value=0)

In [None]:
## Scaling the unseen data with the scaler that was fitted on the training split.
## Selecting [columns] passes the features in the order given by that list.
unseen_data_scaled = scaler.transform(unseen_data[columns])

In [23]:
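# Optional: convert unseen_data to a NumPy array to avoid the column-name warning.
# Left disabled because the scaled array from the previous cell is used for prediction.
# unseen_data_array = unseen_data.values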

In [24]:
## Making predictions for the scaled unseen records
predictions = rf.predict(unseen_data_scaled)

In [None]:
## Displaying the raw predictions for the new data (one entry per unseen record)
predictions

In [None]:
## Printing a readable verdict per patient (patients are numbered from 1)
for patient_number, predicted_class in enumerate(predictions, start=1):
    verdict = "Diabetic" if predicted_class == 1 else "Non-Diabetic"
    print(f"Patient {patient_number} is {verdict}")

<h5>Evaluating Model Performance on Unseen_Data</h5>

In [27]:
## Storing the model's predictions in a 'diabetes' column of unseen_data.
## NOTE: these are predicted values, not ground-truth labels.
unseen_data['diabetes'] = predictions

In [None]:
## Evaluating the model's performance on unseen_data.
## Metrics are only meaningful when predictions are compared with ground-truth
## labels. unseen_data['diabetes'] was filled from `predictions` itself, so scoring
## the predictions against that column always reports 100% accuracy and a perfect
## report, whatever the model does.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# True diabetes outcomes (0 or 1) for the rows of unseen_data, in row order.
# Leave as None while the real outcomes of these patients are unknown.
unseen_true_labels = None

if unseen_true_labels is None:
    print(
        "No ground-truth labels are available for unseen_data, so its accuracy "
        "cannot be measured; refer to the held-out test-set metrics instead."
    )
else:
    if len(unseen_true_labels) != len(predictions):
        raise ValueError(
            "unseen_true_labels must contain exactly one label per row of unseen_data"
        )

    accuracy = accuracy_score(unseen_true_labels, predictions)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    print(
        f"Classification report:\n{classification_report(unseen_true_labels, predictions)}"
    )

    # labels=[0, 1] keeps the matrix 2x2 even if only one class occurs in this
    # small sample.
    print(
        f"Confusion matrix:\n{confusion_matrix(unseen_true_labels, predictions, labels=[0, 1])}"
    )

<h5>Feature Importance Analysis</h5>

In [None]:
## Analysing which columns contribute the most to the model:
## pair every feature name with its importance from the fitted forest and
## rank them from most to least important.
importances = rf.feature_importances_
importance_table = pd.DataFrame({"Feature": columns, "Importance": importances})
feature_importance_df = importance_table.sort_values(by="Importance", ascending=False)

feature_importance_df

In [None]:
## Plotting the feature importance table as a horizontal bar chart
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance_df["Feature"], feature_importance_df["Importance"])
ax.set_xlabel("Importance")
ax.set_title("Features Importance")
ax.invert_yaxis()  # Invert the y axis so the most important feature is at the top
plt.show()