In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

# Load dataset
# Read the student marks and, in the same expression, replace every missing
# value with 0 (a 0 indicates the student did not take that subject).
df = pd.read_csv(
    r"C:\Users\MAHE\Downloads\Hackathon\website\backend\large_student_data.csv"
).fillna(0)

# Dynamically Calculate Cutoffs Only If Subject Exists
def calculate_cutoffs(row):
    """Compute the four stream cutoff scores for one student record.

    A falsy mark (0) is treated as "subject not taken". A weighted cutoff is
    only produced when every subject it depends on was taken; otherwise that
    cutoff is 0.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys "Math", "Physics", "Chemistry", "Biology", "CS" and
            "Commerce".

    Returns:
        pd.Series holding, in order, the engineering, medical, commerce and
        arts cutoffs.
    """
    maths = row["Math"]
    physics = row["Physics"]
    chemistry = row["Chemistry"]
    biology = row["Biology"]
    cs = row["CS"]
    commerce = row["Commerce"]

    # Engineering: Math counts double, needs Math + Physics + Chemistry.
    cutoff_eng = (maths * 2 + physics + chemistry) / 4 if maths and physics and chemistry else 0
    # Medical: Biology counts double, needs Biology + Physics + Chemistry.
    cutoff_med = (biology * 2 + physics + chemistry) / 4 if biology and physics and chemistry else 0
    # Commerce: Commerce counts double, needs Commerce + Math.
    cutoff_comm = (commerce * 2 + maths) / 3 if commerce and maths else 0

    # Arts: plain average over the subjects actually taken. Averaging over all
    # six marks would count the 0 placeholders of missing subjects and deflate
    # the score, so those are filtered out first. An empty list is guarded
    # explicitly because np.mean([]) yields NaN.
    # NOTE(review): any inference code that feeds the trained model must build
    # Cutoff_Arts the same way - confirm against the prediction path.
    taken = [mark for mark in (maths, physics, chemistry, biology, cs, commerce) if mark]
    cutoff_arts = float(np.mean(taken)) if taken else 0.0

    return pd.Series([cutoff_eng, cutoff_med, cutoff_comm, cutoff_arts])

# Apply Cutoff Calculation
# Each row is passed to calculate_cutoffs; the four values it returns are
# stored as the new cutoff columns named below.
cutoff_columns = ["Cutoff_Eng", "Cutoff_Med", "Cutoff_Comm", "Cutoff_Arts"]
df[cutoff_columns] = df.apply(calculate_cutoffs, axis=1)

# Encode target variable (Recommended_Stream)
# Fit the encoder on the stream labels, then overwrite the column with the
# resulting integer codes; `le` is kept so the mapping can be saved later.
le = LabelEncoder().fit(df["Recommended_Stream"])
df["Recommended_Stream"] = le.transform(df["Recommended_Stream"])

# Select Features & Target
y = df["Recommended_Stream"]
X = df[cutoff_columns]

# Train-Test Split
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate Model
# The held-out rows were previously split off but never used; report the mean
# accuracy on them so a poorly performing model is noticed before it is saved.
test_accuracy = model.score(X_test, y_test)
print(f"Held-out accuracy: {test_accuracy:.3f}")

# Save Model & Label Encoder
# Both artifacts are written to the current working directory; the encoder is
# saved alongside the model so predicted codes can be mapped back to labels.
joblib.dump(model, "stream_recommender.pkl")
joblib.dump(le, "label_encoder.pkl")

print("Model training completed and saved!")

Model training completed and saved!
