In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
# Silence every warning category for the whole process.
# NOTE(review): this also hides pandas/scikit-learn deprecation warnings that
# may point at real problems - consider narrowing the filter.
warnings.filterwarnings("ignore")
# Apply seaborn's "whitegrid" theme to all figures created below.
sns.set(style="whitegrid")
# Path of the input CSV handed to load_data() by main().
DATA_FILE = "student_performance.csv"
# Directory that receives every generated figure and CSV file.
OUTPUT_DIR = "outputs"
def safe_mkdir(path):
    """Create directory ``path`` (including missing parents) if it is absent.

    Calling this on an already existing directory is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True replaces the former "check, then create" sequence, which
    # could fail when another process created the directory in between.
    os.makedirs(path, exist_ok=True)
def load_data(path):
    """Read the CSV file at ``path`` into a DataFrame.

    Args:
        path: Location of the CSV file.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    raise FileNotFoundError(f"File not found: {path}. Put your CSV in the same folder as this script.")
def clean_and_fill(df):
    """Return a de-duplicated copy of ``df`` with missing values imputed.

    Steps, in order:
      1. Drop exact duplicate rows and renumber the index.
      2. Strip surrounding whitespace from string column names.
      3. Turn empty / whitespace-only strings into NaN.
      4. Fill NaN in numeric columns with the column mean and in all other
         columns with the most frequent value ("Unknown" when the column has
         no non-missing value at all).

    The input frame is not modified.

    Args:
        df: Raw DataFrame.

    Returns:
        The cleaned DataFrame.
    """
    df = df.drop_duplicates().reset_index(drop=True)
    # Only string labels can be stripped; leave any other label untouched.
    df = df.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)
    df = df.replace(r'^\s*$', np.nan, regex=True)

    for col in df.columns:
        column = df[col]
        if not column.isnull().any():
            continue
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.mean()
        else:
            modes = column.mode(dropna=True)
            # mode() is empty when every value in the column is missing.
            fill_value = modes.iloc[0] if not modes.empty else "Unknown"
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: the chained in-place form operates on a temporary object
        # and leaves df unchanged when pandas Copy-on-Write is active.
        df[col] = column.fillna(fill_value)
    return df

def infer_target(df, pass_threshold=50):
    """Make sure ``df`` has an integer 'Pass' column and return its name.

    An existing pass column (matched case-insensitively, an exact 'Pass'
    preferred) is converted to integers; blank values become 0. Otherwise
    'Pass' is derived from 'Score' as ``Score >= pass_threshold``, with
    non-numeric scores counted as 0.

    ``df`` is modified in place: 'Pass' is added/overwritten, and any other
    spelling of the pass column is removed so the label cannot remain in the
    frame as a feature.

    Args:
        df: DataFrame holding a pass-like column or a 'Score' column.
        pass_threshold: Minimum score that counts as a pass when the target
            is derived from 'Score'.

    Returns:
        Tuple ``(df, 'Pass')``.

    Raises:
        ValueError: If neither column exists, or a pass value is not numeric.
    """
    def _as_flag(value):
        # Blank cells count as "not passed".
        text = str(value).strip()
        if text == "":
            return 0
        try:
            return int(value)
        except (TypeError, ValueError):
            # Accept float-formatted text such as "1.0"; text that is not a
            # number still raises ValueError here.
            return int(float(text))

    matches = [c for c in df.columns if str(c).strip().lower() == "pass"]
    if matches:
        pass_col = "Pass" if "Pass" in matches else matches[0]
        df['Pass'] = df[pass_col].apply(_as_flag)
        extras = [c for c in matches if c != "Pass"]
        if extras:
            df.drop(columns=extras, inplace=True)
        return df, 'Pass'

    if 'Score' in df.columns:
        scores = pd.to_numeric(df['Score'], errors='coerce').fillna(0)
        df['Pass'] = (scores >= pass_threshold).astype(int)
        return df, 'Pass'

    raise ValueError("No 'Pass' or 'Score' column found. Please include a 'Score' column or a 'Pass' (0/1) column in the CSV.")

def basic_eda_and_save(df):
    """Print summary statistics for ``df`` and save exploratory plots.

    Creates OUTPUT_DIR if needed, prints the shape and a transposed
    ``describe`` table, then writes up to three PNG files into OUTPUT_DIR:
    a 'Score' histogram (when 'Score' exists), a 'Score' by 'SchoolType'
    boxplot (when both exist) and a correlation heatmap (when there are at
    least two numeric columns).

    Args:
        df: DataFrame to summarise.
    """
    def _finish_figure(title, filename):
        # Title the current figure, write it to OUTPUT_DIR and release it.
        plt.title(title)
        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR, filename), dpi=150)
        plt.close()

    safe_mkdir(OUTPUT_DIR)
    print("\n-- Basic EDA --")
    print("Shape:", df.shape)
    print(df.describe(include='all').T)

    has_score = 'Score' in df.columns

    # Histogram of the score column
    if has_score:
        plt.figure(figsize=(8, 5))
        sns.histplot(df['Score'].dropna(), bins=30, kde=True)
        _finish_figure("Score Distribution", "score_distribution.png")

    # Score spread per school type
    if has_score and 'SchoolType' in df.columns:
        plt.figure(figsize=(7, 4))
        sns.boxplot(x='SchoolType', y='Score', data=df)
        _finish_figure("Score by School Type", "score_by_schooltype.png")

    # Pairwise correlations between numeric columns
    numeric_names = list(df.select_dtypes(include=[np.number]).columns)
    if len(numeric_names) >= 2:
        plt.figure(figsize=(8, 6))
        sns.heatmap(df[numeric_names].corr(), annot=True, fmt=".2f", cmap="Blues")
        _finish_figure("Correlation Heatmap (numeric)", "correlation_heatmap.png")
def prepare_features(df, drop_id=True):
    """One-hot encode the categorical columns of ``df``.

    Args:
        df: Feature DataFrame.
        drop_id: When true, columns named 'studentid', 'student_id' or 'id'
            (compared in lower case) are removed before encoding.

    Returns:
        A new DataFrame in which every object/category column is replaced by
        dummy columns, with the first level of each column dropped.
    """
    id_names = ('studentid', 'student_id', 'id')
    id_like = [name for name in df.columns if name.lower() in id_names]
    # Identifier columns carry no predictive signal, so leave them out.
    if drop_id and id_like:
        df = df.drop(columns=id_like)

    # Every string or categorical column is expanded into dummy columns.
    categorical = list(df.select_dtypes(include=['object', 'category']).columns)
    return pd.get_dummies(df, columns=categorical, drop_first=True)
def train_random_forest(X, y):
    """Fit a random forest on a stratified 75/25 split and report on it.

    Numeric columns are standardised with statistics fitted on the training
    part only. Accuracy and a classification report are printed, and the
    confusion matrix is saved as 'confusion_matrix.png' in OUTPUT_DIR.

    Args:
        X: Encoded feature DataFrame.
        y: Target labels aligned with ``X``.

    Returns:
        Tuple ``(model, X_test, y_test, y_pred)``; ``X_test`` is the
        unscaled test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    # Work on copies so the returned X_test keeps its original values.
    train_features = X_train.copy()
    test_features = X_test.copy()
    cols_to_scale = list(X.select_dtypes(include=[np.number]).columns)
    if cols_to_scale:
        scaler = StandardScaler()
        train_features[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
        test_features[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

    forest = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    forest.fit(train_features, y_train)
    predictions = forest.predict(test_features)

    print(f"\nModel Accuracy: {accuracy_score(y_test, predictions):.4f}")
    print("\nClassification Report:\n", classification_report(y_test, predictions))

    # Confusion matrix figure
    matrix = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(5, 4))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=150)
    plt.close()

    return forest, X_test, y_test, predictions

def feature_importance_plot(model, X, top_n=15):
    """Save the model's feature importances as a bar chart and a CSV file.

    Models without a ``feature_importances_`` attribute only trigger a
    printed notice. Otherwise the ``top_n`` largest importances are plotted
    to 'feature_importances.png' and the full ranking is written to
    'feature_importances.csv', both in OUTPUT_DIR.

    Args:
        model: Fitted estimator.
        X: Object whose ``columns`` label the importances.
        top_n: Number of features shown in the chart.
    """
    if not hasattr(model, "feature_importances_"):
        print("Model does not expose feature_importances_")
        return

    ranked = pd.Series(model.feature_importances_, index=X.columns)
    ranked = ranked.sort_values(ascending=False)
    leading = ranked.head(top_n)

    plt.figure(figsize=(8, 6))
    sns.barplot(x=leading.values, y=leading.index)
    plt.title("Top Feature Importances")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "feature_importances.png"), dpi=150)
    plt.close()

    # The CSV holds the complete ranking, not just the plotted subset.
    ranked.to_csv(os.path.join(OUTPUT_DIR, "feature_importances.csv"))

def main():
    """Run the pipeline: load, clean, explore, train and export results.

    Reads DATA_FILE, cleans it, determines the 'Pass' target, writes EDA
    plots and the cleaned data to OUTPUT_DIR, trains a random forest and
    saves its feature importances and test-set predictions. Exits with
    status 1 when the data cannot be loaded, no target can be inferred, or
    the data is too small to train on.
    """
    safe_mkdir(OUTPUT_DIR)
    try:
        df = load_data(DATA_FILE)
    except Exception as e:
        print("ERROR loading data:", e)
        sys.exit(1)

    print("Loaded dataset with shape:", df.shape)
    # quick columns preview
    print("Columns:", df.columns.tolist())

    # Clean missing and duplicates
    df = clean_and_fill(df)

    # Infer target (Pass) from Score or existing Pass
    try:
        df, target_col = infer_target(df)
        print(f"Using target column: '{target_col}'")
    except Exception as e:
        print("ERROR inferring target:", e)
        print("If your CSV has a different target column, add it named 'Pass' (0/1) or add a 'Score' column.")
        sys.exit(1)

    # EDA
    basic_eda_and_save(df)

    # Keep a copy of the cleaned data (target included) on disk
    df_cleaned_path = os.path.join(OUTPUT_DIR, "cleaned_student_data.csv")
    df.to_csv(df_cleaned_path, index=False)
    print(f"Cleaned data saved to: {df_cleaned_path}")

    # Separate target and features. 'Score' is the outcome that 'Pass' is
    # derived from (see infer_target), so keeping it as a feature would leak
    # the label and make the reported accuracy meaningless.
    y = df[target_col]
    leak_cols = [c for c in ("Score",) if c in df.columns and c != target_col]
    if leak_cols:
        print(f"Excluding {leak_cols} from features to avoid target leakage.")
    X_df = df.drop(columns=[target_col] + leak_cols, errors='ignore')
    X_encoded = prepare_features(X_df, drop_id=True)

    # Columns with a single unique value carry no information
    nunique = X_encoded.nunique()
    cols_to_drop = nunique[nunique <= 1].index.tolist()
    if cols_to_drop:
        X_encoded = X_encoded.drop(columns=cols_to_drop)

    print("Final feature matrix shape:", X_encoded.shape)

    if X_encoded.shape[0] < 10:
        print("Not enough rows to train a model. Need more data.")
        sys.exit(1)

    # The stratified split needs at least two classes with two rows each;
    # fail with a clear message instead of an error from inside the split.
    class_counts = y.value_counts()
    if len(class_counts) < 2 or class_counts.min() < 2:
        print("Target needs at least two classes with two or more rows each to train a model.")
        sys.exit(1)

    # Train model
    model, X_test, y_test, y_pred = train_random_forest(X_encoded, y)

    # Feature importance
    feature_importance_plot(model, X_encoded)

    # Saving the predictions is best-effort: a failure is reported but does
    # not abort the run.
    predictions_path = os.path.join(OUTPUT_DIR, "predictions_sample.csv")
    try:
        results_df = X_test.reset_index(drop=True).copy()
        results_df['Actual'] = y_test.reset_index(drop=True)
        results_df['Predicted'] = y_pred
        results_df.to_csv(predictions_path, index=False)
        print(f"Saved predictions sample to {predictions_path}")
    except Exception as e:
        print("Could not save predictions sample:", e)

    print(f"\nDone. Check the '{OUTPUT_DIR}' folder for images and CSV files.")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


Loaded dataset with shape: (1200, 14)
Columns: ['StudentID', 'Gender', 'Age', 'SchoolType', 'ParentalEducation', 'StudyHours', 'AttendancePercent', 'ExtraClasses', 'InternetAccess', 'PreviousScore', 'TestPrepCourse', 'SocioeconomicStatus', 'Score', 'Pass']
Using target column: 'Pass'

-- Basic EDA --
Shape: (1200, 14)
                      count unique        top  freq       mean        std  \
StudentID              1200   1200  STUD11183     1        NaN        NaN   
Gender                 1200      3     Female   602        NaN        NaN   
Age                  1200.0    NaN        NaN   NaN  16.555833     1.7086   
SchoolType             1200      2     Public   824        NaN        NaN   
ParentalEducation      1200      5   Bachelor   363        NaN        NaN   
StudyHours           1200.0    NaN        NaN   NaN   3.547667   1.896068   
AttendancePercent    1200.0    NaN        NaN   NaN   84.64375   9.524886   
ExtraClasses           1200      2         No   784        NaN  