<a href="https://colab.research.google.com/github/shakombo/shakombo/blob/main/RF_SMOTE_VIF_less_than_10_10_fold_cross_validation_profile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.feature_selection import VarianceThreshold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import joblib

In [None]:
# Upload the CSV dataset into the Colab session.
# `google.colab` is only importable on Colab. Elsewhere, fall back to an empty
# upload mapping instead of crashing, so the notebook can still be run
# top-to-bottom when the CSV is already on disk.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
    print("google.colab is not available; skipping the upload widget.")
else:
    uploaded = files.upload()

In [None]:
# Step 2: Load the dataset
# Reads the CSV (relative path, resolved against the current working directory)
# into the `data` DataFrame that every later step works on.
data = pd.read_csv('updated_obstetric_data_cleaned.csv')

In [None]:
# Step 3: Data Visualization
# Histogram grid of the columns of `data` (20 bins each).
data.hist(bins=20, figsize=(15, 10))
plt.show()

# Horizontal box plots of the columns, to eyeball spread and outliers.
sns.boxplot(data=data, orient='h')
plt.show()

# Correlation heatmap. Restrict to numeric columns explicitly: with
# pandas >= 2.0, DataFrame.corr() raises on non-numeric columns instead of
# silently skipping them.
corr_matrix = data.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Step 4: Class Balance Check
# Tally how many rows fall under each distinct value of the target column,
# then print the heading and the tally on consecutive lines.
class_balance = data['Est Blood Loss'].value_counts()
print("Class Balance:", class_balance, sep="\n")

In [None]:
# Step 5: Feature Selection
# Fit a variance filter (threshold 0.01) on the numeric columns and drop the
# columns it rejects from `data`.
variance_selector = VarianceThreshold(threshold=0.01)
data_numeric = data.select_dtypes(include=[np.number])
variance_selector.fit(data_numeric)
low_variance_cols = data_numeric.columns[~variance_selector.get_support()]
data = data.drop(columns=low_variance_cols)
# Keep the numeric snapshot in sync with `data`. The VIF step consumes
# `data_numeric`; if it still held the dropped columns, VIF would be computed
# for features that no longer exist in `data`, and selecting them afterwards
# would raise KeyError.
data_numeric = data_numeric.drop(columns=low_variance_cols)

In [None]:
# Calculate VIF for numeric variables
def calculate_vif(data_frame):
    """Compute the variance inflation factor (VIF) of every column.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Columns to analyse; its `.values` array is handed to statsmodels'
        `variance_inflation_factor` as the design matrix.

    Returns
    -------
    pandas.DataFrame
        One row per input column, with columns 'Variable' (column name) and
        'VIF' (the factor computed for that column).
    """
    # NOTE(review): no intercept column is added to the design matrix here —
    # confirm whether a constant should be included before interpreting VIFs.
    design = data_frame.values  # extract once instead of once per column
    vif_data = pd.DataFrame()
    vif_data['Variable'] = data_frame.columns
    vif_data['VIF'] = [variance_inflation_factor(design, i) for i in range(design.shape[1])]
    return vif_data


# This call has to live at module level. It used to be indented under the
# function's `return`, which made it unreachable and left `vif_data` undefined
# for the prints below.
vif_data = calculate_vif(data_numeric)
print("VIF for numeric variables:")
print(vif_data)

In [None]:
# Filter features with VIF less than 10
selected_features = vif_data.loc[vif_data['VIF'] < 10, 'Variable']
# 'Previous Vacuum Extraction' is excluded explicitly, whatever its VIF.
selected_features = selected_features[selected_features != 'Previous Vacuum Extraction']
# Only keep names that still exist in `data`; a VIF row for a column that was
# removed earlier would otherwise make the selection below raise KeyError.
selected_features = selected_features[selected_features.isin(data.columns)]
# The target is not a predictor, so its own VIF must not decide whether it
# survives: the following step reads 'Est Blood Loss' from `data`.
if 'Est Blood Loss' in data.columns and not selected_features.eq('Est Blood Loss').any():
    selected_features = pd.concat(
        [selected_features, pd.Series(['Est Blood Loss'])], ignore_index=True
    )
data = data[selected_features]

In [None]:
# Step 6: Resampling for Class Imbalance using SMOTE
# Separate the target column (y) from the remaining predictor columns (X).
y = data['Est Blood Loss']
X = data.drop(columns=['Est Blood Loss'])

In [None]:
# Oversample X/y with SMOTE; the fixed random_state makes the result repeatable.
# NOTE(review): this runs on the full X/y, i.e. before any train/test split and
# before the one-hot encoding step. Resampling ahead of a hold-out split can
# let synthetic samples derived from test rows reach training — confirm the
# ordering is intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# Step 7: Encoding Categorical Variables (One-Hot Encoding)
categorical_columns = ['HIV Status', 'Previous LSCS', 'Previous ENND', 'Previous Still Births',
                       'Previous Abortion', 'Previous PPH', 'Previous Miscarriage',
                       'Previous IUD', 'Previous Ectopic Pregnancy',
                       'Previous Post Dates', 'Previous Twin Gestation', 'Previous Cord Prolapse',
                       'Previous Fetal Macrosomia', 'Previous PIH', 'Previous APH']
# Encode only the columns that are still present: earlier variance/VIF
# filtering may have removed some of them, and get_dummies raises KeyError for
# unknown column names. This also makes the cell safe to re-run.
present_categoricals = [col for col in categorical_columns if col in X_resampled.columns]
# NOTE(review): SMOTE ran before this step, so these columns may hold
# interpolated (non-integer) values in the synthetic rows — confirm, and
# consider encoding before resampling or using a categorical-aware sampler.
if present_categoricals:
    # This statement must be at module level; its previous stray indentation
    # was an IndentationError.
    X_resampled = pd.get_dummies(X_resampled, columns=present_categoricals, drop_first=True)

In [None]:
# Step 8: Train/Test Split
# X_train / y_train are fitted below and X_test / y_test are scored in the
# evaluation step, but no cell created them. Build the split here from the
# resampled, encoded data; the seed matches the other steps.
# NOTE(review): test_size=0.2 is an assumption — TODO confirm the intended ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

# Step 9: Hyperparameter Optimization for Random Forest
# Exhaustive search over the grid below, scored by ROC AUC with 10-fold
# cross-validation on the training split, using all available cores.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_model, param_grid, cv=10, n_jobs=-1, scoring='roc_auc')
grid_search.fit(X_train, y_train)

In [None]:
# Get the best Random Forest model
# Takes the estimator that the grid search exposes as its best one (the search
# was scored with 'roc_auc'); later steps evaluate and save this object.
best_rf_model = grid_search.best_estimator_

In [None]:
# Step 10: Model Evaluation

# Score X_test with the tuned model: keep column index 1 of predict_proba as
# the score vector, and the hard class predictions alongside it.
y_proba = best_rf_model.predict_proba(X_test)[:, 1]
y_pred = best_rf_model.predict(X_test)

# ROC operating points for those scores, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

In [None]:
# Plot ROC curve
# Uses the explicit figure/axes interface: the model's ROC curve in orange,
# plus a dashed navy diagonal as the chance-level reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax.legend(loc='lower right')
plt.show()

# Predictions and scores for X_test (recomputed so this part does not rely on
# values left over from an earlier cell).
y_pred = best_rf_model.predict(X_test)
y_proba = best_rf_model.predict_proba(X_test)[:, 1]

# Confusion matrix: rows are true labels, columns are predicted labels.
# Sensitivity = share of row 1 on the diagonal; specificity = same for row 0.
confusion = confusion_matrix(y_test, y_pred)
sensitivity = confusion[1, 1] / confusion[1, :2].sum()
specificity = confusion[0, 0] / confusion[0, :2].sum()

roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC AUC: {roc_auc}")

class_report = classification_report(y_test, y_pred)
print("Classification Report:", class_report, sep="\n")

print("Confusion Matrix:", confusion, sep="\n")

print("Model Evaluation Metrics:")
# Each scorer is called right before its line is printed, in this order.
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_name}: {metric_fn(y_test, y_pred)}")
print(f"Sensitivity: {sensitivity}")
print(f"Specificity: {specificity}")

# Step 11: Serialization (Save the Best Model)
# Writes the tuned estimator with joblib to a .pkl file in the current working
# directory (relative path).
joblib.dump(best_rf_model, 'RF_smote_hyperparameteroptimization_VIF_less_than_10.pkl')