In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import warnings
warnings.filterwarnings('ignore')

# --- 1. Data Preprocessing & Engineering Module (Functional Module 1) ---

def preprocess_data(file_path, performance_threshold=70):
    """
    Loads data, engineers the target variable, and prepares features for ML.

    Args:
        file_path: Path of the CSV file handed to ``pd.read_csv``.
        performance_threshold: Inclusive cut-off on the mean of the three
            score columns; rows at or above it are labelled 1, the rest 0.
            Defaults to 70.

    Returns:
        A 5-tuple ``(X_train, X_test, y_train, y_test, preprocessor)``.
        When the file does not exist, an error is printed and all five
        entries are ``None`` so callers can always unpack five values.

    Raises:
        KeyError: If, after renaming, the frame lacks ``math_score``,
            ``reading_score`` or ``writing_score``.
    """
    print("--- 1. Starting Data Preprocessing & Feature Engineering ---")

    # 1.1 Load Dataset
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        # Same arity as the success-path return: callers unpack five names,
        # so a shorter tuple would raise ValueError before they can check
        # for None.
        return None, None, None, None, None

    # Clean column names by removing spaces and slashes
    df.columns = df.columns.str.replace(' ', '_').str.replace('/', '_')

    # 1.2 Feature Engineering: Create the Target Variable ('Performance_Level')
    # Calculate the average score for each student
    score_columns = ['math_score', 'reading_score', 'writing_score']
    df['average_score'] = df[score_columns].mean(axis=1)

    # Binary classification target:
    # 1 for 'High Performer' (average score >= threshold), 0 for 'Low Performer'
    df['Performance_Level'] = (df['average_score'] >= performance_threshold).astype(int)

    print(f"Target variable created: 'Performance_Level' (1=Success, 0=Failure, Threshold: {performance_threshold})")
    print(f"Success count: {df['Performance_Level'].sum()} / {len(df)}")

    # Define features (X) and target (y)
    # Drop the original scores and the engineered average score to avoid data leakage
    X = df.drop(columns=score_columns + ['average_score', 'Performance_Level'])
    y = df['Performance_Level']

    # 1.3 Identify Feature Types
    categorical_features = X.select_dtypes(include=['object']).columns

    # 1.4 Create a Column Transformer for One-Hot Encoding
    # handle_unknown='ignore' lets prediction proceed on category values that
    # were not present when the encoder was fitted.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'  # Non-object columns are passed through unchanged
    )

    # 1.5 Split Data into Training and Testing Sets
    # stratify=y keeps the class ratio the same in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"Data split: Training samples={len(X_train)}, Testing samples={len(X_test)}")
    print("--- Data Preprocessing Complete ---")

    return X_train, X_test, y_train, y_test, preprocessor

# --- 2. Model Training & Tuning Module (Functional Module 2) ---

def train_and_tune_model(X_train, y_train, preprocessor):
    """
    Grid-searches a preprocessing + Random Forest pipeline and returns the winner.

    Args:
        X_train: Training features passed to ``GridSearchCV.fit``.
        y_train: Training labels passed to ``GridSearchCV.fit``.
        preprocessor: Transformer used as the pipeline's first step.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    print("\n--- 2. Starting Model Training & Tuning ---")

    # Pipeline: the supplied preprocessor feeds a seeded Random Forest.
    forest = RandomForestClassifier(random_state=42)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', forest)])

    # Candidate settings; the 'classifier__' prefix routes each one to the
    # pipeline step named 'classifier'.
    search_space = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [5, 10, None],
        'classifier__min_samples_split': [2, 5],
    }

    # 5-fold cross-validated search scored on accuracy, using all CPU cores.
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    print("\nBest parameters found by Grid Search:")
    print(search.best_params_)

    print("\n--- Model Training & Tuning Complete ---")

    # The best estimator from the search is the final trained model.
    return search.best_estimator_

# --- 3. Prediction & Reporting Interface Module (Functional Module 3) ---

def evaluate_model(best_model, X_test, y_test):
    """
    Scores the model on held-out data and prints accuracy, a per-class report
    and the confusion matrix.

    Args:
        best_model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True test labels (0 = Low Performer, 1 = High Performer).

    Returns:
        Tuple ``(confusion matrix, accuracy)``.
    """
    print("\n--- 3. Starting Model Evaluation & Reporting ---")

    predictions = best_model.predict(X_test)

    print("\nModel Evaluation on Test Data:")
    print("-----------------------------------")

    # Overall share of correct predictions.
    test_accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {test_accuracy:.4f}")

    # Precision / recall / F1 per class, labelled in class order 0, 1.
    class_names = ['Low Performer', 'High Performer']
    print("\nClassification Report (0=Low Performer, 1=High Performer):\n")
    report = classification_report(y_test, predictions, target_names=class_names)
    print(report)

    conf_mat = confusion_matrix(y_test, predictions)
    print("Confusion Matrix:")
    print(conf_mat)

    print("--- Model Evaluation Complete ---")

    return conf_mat, test_accuracy

def save_model(model, filename="final_ml_model.pkl"):
    """
    Pickles ``model`` into ``filename`` and prints a confirmation line.

    Args:
        model: Any picklable object (here, the trained pipeline).
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, mode='wb') as handle:
        pickle.Pickler(handle).dump(model)
    print(f"\n✅ Trained model saved as {filename}")

def predict_new_student(model, new_data):
    """
    Predicts the performance level for one student record and prints the outcome.

    Args:
        model: Fitted estimator exposing ``predict``.
        new_data: Mapping of feature name to value for a single student.

    Returns:
        The human-readable label for the predicted class.
    """
    print("\n--- Prediction Demonstration ---")

    # Wrap the single record in a one-row DataFrame before calling predict.
    student_frame = pd.DataFrame([new_data])
    predicted_classes = model.predict(student_frame)

    label_for_class = {
        0: "Low Performer (FAILURE)",
        1: "High Performer (SUCCESS)",
    }
    result = label_for_class[predicted_classes[0]]

    print(f"Input Student Data:\n{student_frame.iloc[0].to_dict()}")
    print(f"\nPredicted Performance Level: {result}")

    return result

# --- Main Execution Block ---
if __name__ == "__main__":
    # Input dataset and destination file for the pickled model.
    DATA_FILE = 'StudentsPerformance.csv'
    MODEL_OUTPUT_FILE = 'student_performance_classifier.pkl'

    # Step 1: load the CSV, derive the target and split into train/test sets.
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(DATA_FILE)

    # Only continue when training data is available.
    if X_train is not None:
        # Step 2: tune and fit the classifier, then persist it.
        trained_model = train_and_tune_model(X_train, y_train, preprocessor)
        save_model(trained_model, MODEL_OUTPUT_FILE)

        # Step 3: report performance on the held-out split.
        evaluate_model(trained_model, X_test, y_test)

        # Demo prediction. Keys use the cleaned column names (spaces and
        # slashes replaced by underscores), e.g. 'race/ethnicity' becomes
        # 'race_ethnicity'.
        sample_new_student = dict(
            gender='female',
            race_ethnicity='group D',
            parental_level_of_education="bachelor's degree",
            lunch='standard',
            test_preparation_course='completed',
        )
        predict_new_student(trained_model, sample_new_student)

--- 1. Starting Data Preprocessing & Feature Engineering ---
Target variable created: 'Performance_Level' (1=Success, 0=Failure, Threshold: 70)
Success count: 459 / 1000
Data split: Training samples=800, Testing samples=200
--- Data Preprocessing Complete ---

--- 2. Starting Model Training & Tuning ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Best parameters found by Grid Search:
{'classifier__max_depth': 5, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}

--- Model Training & Tuning Complete ---

✅ Trained model saved as student_performance_classifier.pkl

--- 3. Starting Model Evaluation & Reporting ---

Model Evaluation on Test Data:
-----------------------------------
Accuracy: 0.6250

Classification Report (0=Low Performer, 1=High Performer):

                precision    recall  f1-score   support

 Low Performer       0.63      0.74      0.68       108
High Performer       0.62      0.49      0.55        92

      accuracy              