In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import os

def create_study_dataset(n_students=1000, seed=42):
    """Create a synthetic dataset relating study habits to exam scores.

    Four features are drawn uniformly at random and combined linearly with
    Gaussian noise to form the target ``exam_score``.

    Args:
        n_students: Number of student records (rows) to generate.
        seed: Seed for the random generator. The default (42) together with
            the default ``n_students`` reproduces the same data on every
            call. Pass ``None`` for a non-deterministic dataset.

    Returns:
        A ``pandas.DataFrame`` with ``n_students`` rows and the columns
        ``study_hours``, ``previous_score``, ``sleep_hours``, ``attendance``
        and ``exam_score``, every value rounded to one decimal place.
    """
    print("Creating synthetic dataset...")

    # Use a private generator instead of np.random.seed(): it produces the
    # same legacy Mersenne-Twister stream for a given seed, but does not
    # reseed the process-wide NumPy RNG behind the caller's back.
    rng = np.random.RandomState(seed)

    # Features (draw order matters for reproducibility -- do not reorder)
    study_hours = rng.uniform(0, 12, n_students)      # 0-12 hours daily study
    previous_score = rng.uniform(30, 95, n_students)  # Previous exam scores
    sleep_hours = rng.uniform(4, 10, n_students)      # Hours of sleep
    attendance = rng.uniform(60, 100, n_students)     # Class attendance %
    noise = rng.normal(0, 8, n_students)              # Random noise, sigma = 8

    # Linear ground-truth relationship the regression is expected to recover
    exam_score = (
        study_hours * 4.2 +          # Study impact: 4.2 points per hour
        previous_score * 0.25 +      # Previous performance matters
        sleep_hours * 1.8 +          # Sleep is important: 1.8 points per hour
        attendance * 0.15 +          # Attendance impact: 0.15 per %
        noise +
        5  # Base score
    )

    # Clamp scores to the range [15, 98] so no student gets an extreme result
    exam_score = np.clip(exam_score, 15, 98)

    # Assemble the DataFrame, rounding every column to one decimal place
    data = pd.DataFrame({
        'study_hours': np.round(study_hours, 1),
        'previous_score': np.round(previous_score, 1),
        'sleep_hours': np.round(sleep_hours, 1),
        'attendance': np.round(attendance, 1),
        'exam_score': np.round(exam_score, 1)
    })

    return data

def train_and_save_model(model_path='models/study_model.pkl'):
    """Train a linear regression on the synthetic study dataset and pickle it.

    The function builds the dataset via ``create_study_dataset()``, holds out
    20% of the rows for testing, fits a ``LinearRegression`` on the remaining
    80%, prints MSE / R² for both splits plus the learned coefficients, saves
    the model to disk and finally prints predictions for four hand-written
    sample students.

    Args:
        model_path: Destination of the pickled model package. Any missing
            parent directory is created. Defaults to
            ``'models/study_model.pkl'``.

    Returns:
        None. Results are printed; the pickle written to ``model_path`` is a
        dict with the keys ``model``, ``feature_names``, ``model_type`` and
        ``performance`` (``test_mse``, ``test_r2``, ``train_r2``).
    """
    print("=== STUDY HOURS TO EXAM SCORE PREDICTOR ===\n")

    # Create dataset
    data = create_study_dataset()
    print(f"Dataset created with {len(data)} students")
    print("\nFirst 5 records:")
    print(data.head())

    print("\nDataset statistics:")
    print(data.describe())

    # Prepare features and target
    feature_columns = ['study_hours', 'previous_score', 'sleep_hours', 'attendance']
    X = data[feature_columns]
    y = data['exam_score']

    # Split the data (fixed random_state keeps the split reproducible)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print("\nData split:")
    print(f"Training set: {len(X_train)} students")
    print(f"Testing set: {len(X_test)} students")

    # Create and train the model
    print("\nTraining Linear Regression model...")
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate the model on both splits to expose over-/under-fitting
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print("\n=== MODEL PERFORMANCE ===")
    print(f"Training MSE: {train_mse:.2f}")
    print(f"Testing MSE: {test_mse:.2f}")
    print(f"Training R²: {train_r2:.4f}")
    print(f"Testing R²: {test_r2:.4f}")

    # Show feature importance (coefficients)
    print("\n=== FEATURE IMPORTANCE ===")
    for feature, coef in zip(feature_columns, model.coef_):
        print(f"{feature}: {coef:.3f}")
    print(f"Intercept: {model.intercept_:.3f}")

    # Create the destination directory if the path has one and it is missing
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Package model data together with the metadata needed to use it later
    model_package = {
        'model': model,
        'feature_names': feature_columns,
        'model_type': 'Linear Regression',
        'performance': {
            'test_mse': test_mse,
            'test_r2': test_r2,
            'train_r2': train_r2
        }
    }

    # Save the model
    with open(model_path, 'wb') as file:
        pickle.dump(model_package, file)

    print(f"\n✅ Model saved successfully to {model_path}")

    # Test with sample predictions
    print("\n=== SAMPLE PREDICTIONS ===")
    test_cases = [
        [8.0, 85.0, 8.0, 95.0],  # High study, good previous, good sleep, high attendance
        [3.0, 60.0, 6.0, 70.0],  # Low study, average previous, ok sleep, ok attendance
        [10.0, 90.0, 7.0, 98.0], # Very high study, excellent previous, good sleep, perfect attendance
        [1.0, 45.0, 5.0, 50.0]   # Very low study, poor previous, little sleep, poor attendance
    ]

    # The model was fitted on a DataFrame with named columns; predicting on
    # bare lists triggers scikit-learn's "X does not have valid feature
    # names" warning. Wrap the samples in a DataFrame with the same columns
    # and predict them in a single batch.
    sample_frame = pd.DataFrame(test_cases, columns=feature_columns)
    sample_predictions = model.predict(sample_frame)

    for i, (case, prediction) in enumerate(zip(test_cases, sample_predictions), 1):
        print(f"Student {i}: Study={case[0]}h, Previous={case[1]}%, Sleep={case[2]}h, Attendance={case[3]}%")
        print(f"  → Predicted Exam Score: {prediction:.1f}%\n")

# Entry point: run the full train/evaluate/save pipeline when this code is
# executed directly (as a script or as a notebook cell) rather than imported.
if __name__ == "__main__":
    train_and_save_model()

=== STUDY HOURS TO EXAM SCORE PREDICTOR ===

Creating synthetic dataset...
Dataset created with 1000 students

First 5 records:
   study_hours  previous_score  sleep_hours  attendance  exam_score
0          4.5            42.0          5.6        86.9        68.1
1         11.4            65.2          5.5        91.9        95.4
2          8.8            86.7          9.4        70.0        86.2
3          7.2            77.6          5.5        85.0        80.9
4          1.9            82.4          5.6        82.9        52.4

Dataset statistics:
       study_hours  previous_score  sleep_hours   attendance   exam_score
count  1000.000000     1000.000000  1000.000000  1000.000000  1000.000000
mean      5.882000       62.956600     7.015400    79.614900    69.635100
std       3.506924       18.992341     1.743549    11.458911    17.102178
min       0.100000       30.200000     4.000000    60.000000    25.900000
25%       2.800000       45.675000     5.600000    69.700000    57.100000

