# New section

In [10]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def load_and_prepare_data(path="StudentsPerformance.csv", *, threshold=70,
                          test_size=0.20, random_state=42):
    """Load the student-performance CSV and return a stratified train/test split.

    The binary target ``Performance_Level`` is 1 when the mean of the math,
    reading and writing scores is at least ``threshold``, otherwise 0. The
    raw scores and the derived average are removed from the feature matrix
    because the target is computed directly from them.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        threshold: Minimum average score (inclusive) for the positive class.
        test_size: Fraction of rows held out for the test set.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, feature_columns)`` where
        ``feature_columns`` is the column index of the feature frame
        (every column left after the score-derived ones are dropped).

    Raises:
        KeyError: If any of the three score columns is absent after the
            column names have been normalised.
    """
    score_columns = ["math_score", "reading_score", "writing_score"]

    df = pd.read_csv(path)
    # Normalise headers such as "math score" / "race/ethnicity" into
    # identifier-like names; the replacements are literal, not regex.
    df.columns = (
        df.columns
        .str.replace(" ", "_", regex=False)
        .str.replace("/", "_", regex=False)
    )

    # Fail early with the offending names instead of pandas' generic
    # "not in index" message.
    missing = [col for col in score_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required score column(s): {missing}")

    df["average_score"] = df[score_columns].mean(axis=1)
    df["Performance_Level"] = (df["average_score"] >= threshold).astype(int)

    # Everything derived from the scores must leave X, otherwise the label
    # leaks into the features.
    features_to_drop = score_columns + ["average_score", "Performance_Level"]
    X = df.drop(columns=features_to_drop)
    y = df["Performance_Level"]

    # stratify=y keeps the class ratio the same in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    return X_train, X_test, y_train, y_test, X.columns

def build_pipeline(categorical_columns):
    """Create the unfitted preprocessing + random-forest pipeline.

    Args:
        categorical_columns: Columns to one-hot encode; handed unchanged to
            the ``ColumnTransformer``.

    Returns:
        A ``Pipeline`` with two steps: ``"preprocessor"`` (the column
        transformer) and ``"rf"`` (a ``RandomForestClassifier`` seeded
        with ``random_state=42``).
    """
    # handle_unknown="ignore": do not raise on categories that were not
    # present when the encoder was fitted.
    one_hot = OneHotEncoder(handle_unknown="ignore")

    # Columns not listed in categorical_columns are forwarded untouched.
    column_encoder = ColumnTransformer(
        transformers=[("cat", one_hot, categorical_columns)],
        remainder="passthrough",
    )

    forest = RandomForestClassifier(random_state=42)

    return Pipeline(steps=[("preprocessor", column_encoder), ("rf", forest)])

def train_model(X_train, y_train, categorical_cols):
    """Grid-search the random-forest pipeline and return the best estimator.

    Args:
        X_train: Training features.
        y_train: Training labels.
        categorical_cols: Columns forwarded to ``build_pipeline`` for
            one-hot encoding.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``, i.e. the pipeline
        configured with the highest mean cross-validated accuracy.
    """
    # Keys use the "<step>__<param>" form to address the "rf" pipeline step.
    # 3 * 4 * 2 * 2 = 48 candidate settings, each evaluated with 5-fold CV.
    search_space = {
        "rf__n_estimators": [100, 200, 300],
        "rf__max_depth": [5, 10, 15, None],
        "rf__min_samples_split": [2, 5],
        "rf__class_weight": [None, "balanced"],
    }

    search = GridSearchCV(
        estimator=build_pipeline(categorical_cols),
        param_grid=search_space,
        scoring="accuracy",
        cv=5,
        n_jobs=-1,  # use every available core
        verbose=1,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_