# 📊 Student Performance Prediction Pipeline

Predict student exam scores using ML models.

**Kaggle Datasets:**
- [Student Performance](https://www.kaggle.com/datasets/spscientist/students-performance-in-exams)
- [Open University (OULAD)](https://www.kaggle.com/datasets/rocki37/open-university-learning-analytics-dataset)

---

## 1. Setup

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import os

# Ensure the directory that later receives the saved model artifacts exists.
# The path is relative, so it is resolved against the current working
# directory; exist_ok=True makes re-running this cell harmless.
os.makedirs('../models', exist_ok=True)
print('Setup complete')

## 2. Generate/Load Data

In [None]:
def generate_synthetic_data(n_samples=1000, random_state=42):
    """Build a reproducible synthetic student-performance dataset.

    Eight features are drawn independently, and ``final_score`` is a noisy
    linear combination of six of them, clipped to the range [0, 100].
    ``parental_education`` and ``extracurricular`` are generated but do not
    enter the target formula, so they carry no signal by construction.

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed for the private random generator. The default (42)
            reproduces the values the previous global-seed implementation
            produced. Pass ``None`` for a non-deterministic dataset.

    Returns:
        A ``pandas.DataFrame`` with ``n_samples`` rows and nine columns
        (eight features plus ``final_score``).
    """
    # A private RandomState keeps the draws reproducible without reseeding the
    # process-wide NumPy generator, which would silently affect any other code
    # relying on np.random. The draw order below must stay fixed: it
    # determines which random numbers each column receives.
    rng = np.random.RandomState(random_state)
    data = {
        'study_hours': rng.uniform(1, 10, n_samples),
        'attendance_rate': rng.uniform(50, 100, n_samples),
        'previous_score': rng.uniform(30, 100, n_samples),
        'parental_education': rng.choice(['high_school', 'bachelors', 'masters'], n_samples),
        'internet_access': rng.choice([0, 1], n_samples),
        'extracurricular': rng.choice([0, 1], n_samples),
        'sleep_hours': rng.uniform(4, 10, n_samples),
        'screen_time': rng.uniform(1, 8, n_samples),
    }
    # Gaussian noise (standard deviation 5) is drawn last, after all features.
    noise = rng.normal(0, 5, n_samples)
    target = (data['study_hours'] * 5 + data['attendance_rate'] * 0.3 +
              data['previous_score'] * 0.4 + data['internet_access'] * 5 +
              data['sleep_hours'] * 2 - data['screen_time'] * 1.5 + noise)
    # The raw combination can exceed 100, so clamp it to a valid score range.
    data['final_score'] = np.clip(target, 0, 100)
    return pd.DataFrame(data)

# Generate the dataset with the function's defaults (1000 rows, seed 42).
df = generate_synthetic_data()
print(f'Dataset: {df.shape}')
# Left as the final bare expression so a notebook displays the first rows.
df.head()

## 3. EDA

In [None]:
# Report the Pearson correlation of every numeric feature with the target.
print('=== Correlation with Final Score ===')
numeric_features = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name != 'final_score'  # the target's self-correlation is uninformative
]
for col in numeric_features:
    print(f'{col}: {df[col].corr(df["final_score"]):.3f}')

## 4. Preprocessing

In [None]:
# Separate the regression target from the feature matrix.
y = df['final_score']
X = df.drop(columns=['final_score'])

# Integer-code the only categorical column.
# NOTE(review): LabelEncoder assigns codes in sorted label order
# (bachelors=0, high_school=1, masters=2), which is not an education ranking,
# and scikit-learn documents it for targets rather than features. Confirm
# this encoding is intended before relying on the column's coefficients.
le = LabelEncoder()
le.fit(X['parental_education'])
X['parental_education'] = le.transform(X['parental_education'])

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions so no test information leaks in.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Train: {X_train_scaled.shape}, Test: {X_test_scaled.shape}')

## 5. Model Training

In [None]:
# Candidate regressors; all are trained on the same standardized features.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# best_model holds the (name, fitted estimator) pair with the highest R² seen
# so far; -inf guarantees the first evaluated model replaces the placeholder.
best_model, best_r2 = None, -np.inf

# NOTE(review): the winner is chosen on the same held-out split that is used
# for reporting, so the printed best R² is an optimistic estimate.
print('=== Model Results ===')
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # "\u00b2" is the superscript two; writing it as an escape keeps the
    # output correct even if the file is re-saved with the wrong encoding
    # (the previous literal had been mangled into "RÂ²").
    print(f'{name}: R\u00b2={r2:.3f}, RMSE={rmse:.2f}')
    if r2 > best_r2:
        best_r2, best_model = r2, (name, model)

print(f'\nBest: {best_model[0]} (R\u00b2={best_r2:.3f})')

## 6. Save Model

In [None]:
# Persist every fitted object needed to reproduce a prediction from raw input:
# the winning estimator, the feature scaler, and the label encoder for
# 'parental_education'. Without the encoder a consumer of the saved model
# cannot map category strings to the integer codes used during training.
joblib.dump(best_model[1], '../models/student_performance.joblib')
joblib.dump(scaler, '../models/student_performance_scaler.joblib')
joblib.dump(le, '../models/student_performance_encoder.joblib')
print('Model saved!')

## 7. Demo Prediction

In [None]:
# One hypothetical student, described with raw (un-encoded) feature values.
raw_sample = {
    'study_hours': 7,
    'attendance_rate': 85,
    'previous_score': 75,
    'parental_education': 'high_school',  # the fitted encoder maps this to 1
    'internet_access': 1,
    'extracurricular': 1,
    'sleep_hours': 7,
    'screen_time': 3,
}
sample = pd.DataFrame([raw_sample])
# Reuse the fitted encoder instead of hard-coding the integer code.
sample['parental_education'] = le.transform(sample['parental_education'])
# The scaler was fitted on a DataFrame, so pass the same column names in the
# same order; this avoids scikit-learn's feature-name warning and guards
# against silently misaligned columns.
sample = sample[X.columns]
sample_scaled = scaler.transform(sample)
pred = best_model[1].predict(sample_scaled)[0]
print(f'Predicted Score: {pred:.1f}')