<a href="https://colab.research.google.com/github/sonaliliyanahetti/Platform-Feasibility-and-Market-Analysist/blob/Model-training/platform_feasibility_using_random_forest_and_desicion_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Attach Google Drive to the Colab runtime. The cells shown in this notebook
# work on synthetic data, so the mount only matters once real files are read
# from Drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Ensemble Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:

# --- 1. DATA PREPARATION (placeholder: swap in your actual code) ---
# This cell fabricates an imbalanced binary dataset (similar to your 74:5
# ratio) purely so the notebook is runnable end to end. Replace the whole cell
# with the code that loads your own features X and labels y.

from sklearn.datasets import make_classification

# Generator settings gathered in one place: 1000 rows, 20 features of which
# 10 are informative and none redundant, one cluster per class, no label noise.
synthetic_data_config = dict(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.93, 0.07],  # 93% Class 0 (majority) vs 7% Class 1 (minority)
    flip_y=0,
    random_state=42,       # fixed seed so the same dataset is produced each run
)

X, y = make_classification(**synthetic_data_config)


In [5]:
# Hold out 20% of the rows as the test set. Stratifying on y keeps the class
# imbalance ratio the same in the training and test partitions.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


In [6]:
# Count the rows of each class in both partitions to confirm that the
# stratified split preserved the imbalance.
train_class0 = np.sum(y_train == 0)
train_class1 = np.sum(y_train == 1)
test_class0 = np.sum(y_test == 0)
test_class1 = np.sum(y_test == 1)

print(f"Training set size (Class 0 / Class 1): {train_class0} / {train_class1}")
print(f"Test set size (Class 0 / Class 1): {test_class0} / {test_class1}\n")

Training set size (Class 0 / Class 1): 744 / 56
Test set size (Class 0 / Class 1): 186 / 14



In [7]:

# --- 2. RANDOM FOREST CLASSIFIER (Recommended Model) ---

print("--- Evaluating Random Forest Classifier ---")

# Hyperparameters for the forest. class_weight='balanced' penalizes
# misclassification of the minority class, which is crucial for imbalanced
# data; n_jobs=-1 uses all available cores.
rf_settings = {
    "n_estimators": 100,
    "class_weight": 'balanced',
    "random_state": 42,
    "n_jobs": -1,
}

# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(**rf_settings).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

--- Evaluating Random Forest Classifier ---


In [11]:
# Report how the Random Forest did on the held-out test set.
print("\n--- Random Forest Evaluation Results ---")

# Compute each metric once, then print it.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print(f"Overall Accuracy: {rf_accuracy:.4f}")
print("\nConfusion Matrix:\n", rf_confusion)
# Accuracy alone is misleading on imbalanced data; read the Class 1 rows.
print("\nClassification Report (Focus on Class 1 Recall & F1-Score):\n", rf_report)



--- Random Forest Evaluation Results ---
Overall Accuracy: 0.9800

Confusion Matrix:
 [[186   0]
 [  4  10]]

Classification Report (Focus on Class 1 Recall & F1-Score):
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       186
           1       1.00      0.71      0.83        14

    accuracy                           0.98       200
   macro avg       0.99      0.86      0.91       200
weighted avg       0.98      0.98      0.98       200



In [12]:
# --- 3. DECISION TREE CLASSIFIER (Baseline Comparison) ---

print("\n\n--- Evaluating Decision Tree Classifier ---")

# Hyperparameters for the single-tree baseline. class_weight='balanced' is
# crucial for imbalanced data; capping the depth at 5 helps prevent overfitting.
dt_settings = {
    "class_weight": 'balanced',
    "max_depth": 5,
    "random_state": 42,
}

# fit() returns the estimator itself, so construction and training can be chained.
dt_model = DecisionTreeClassifier(**dt_settings).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)



--- Evaluating Decision Tree Classifier ---


In [13]:

# Evaluation of the Decision Tree baseline on the held-out test set.
# Compute each metric once, then print it.
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_confusion = confusion_matrix(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)

print(f"Overall Accuracy: {dt_accuracy:.4f}")
print("\nConfusion Matrix:\n", dt_confusion)
# Accuracy alone is misleading on imbalanced data; read the Class 1 rows.
print("\nClassification Report (Focus on Class 1 Recall & F1-Score):\n", dt_report)

Overall Accuracy: 0.9350

Confusion Matrix:
 [[176  10]
 [  3  11]]

Classification Report (Focus on Class 1 Recall & F1-Score):
               precision    recall  f1-score   support

           0       0.98      0.95      0.96       186
           1       0.52      0.79      0.63        14

    accuracy                           0.94       200
   macro avg       0.75      0.87      0.80       200
weighted avg       0.95      0.94      0.94       200

