<a href="https://colab.research.google.com/github/shiv-coder/AIDI1002W2026/blob/main/Module3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# ==========================================
# Week 3 — Complete Feature Engineering & Selection (Colab-ready)
# ==========================================

# STEP 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from google.colab import files

# STEP 2: Load dataset from CSV
# Upload week3_student_data.csv when prompted.
import io  # local to this step: wraps the uploaded bytes so pandas can read them

# files.upload() opens the browser picker; the result is keyed by file name and
# holds each uploaded file's content.
uploaded = files.upload()
if not uploaded:
    raise ValueError(
        "No file was uploaded; upload week3_student_data.csv and re-run this cell."
    )

# Parse the content that was just uploaded instead of re-opening a hard-coded
# path. Colab stores a repeated upload under a new name (for example
# "week3_student_data (2).csv"), so reading 'week3_student_data.csv' from disk
# would silently load an older copy of the data.
filename, file_bytes = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(file_bytes))

# STEP 3: Explore dataset
# Quick look at the raw data before any cleaning: sample rows, schema,
# descriptive statistics and the per-column count of missing values.
print("First 5 rows:")
print(df.head())
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nSummary statistics:")
print(df.describe())
print("\nMissing values:")
print(df.isnull().sum())

# STEP 4: Handle Missing Values FIRST
# Runs before the engineered features in STEP 5 are derived from these columns.
numeric_cols = ["StudyHours", "SleepHours", "PreviousScore", "Attendance"]
categorical_cols = ["ParentEducation"]

# Numeric columns: mean for StudyHours / PreviousScore, median for the others.
mean_filled = ("StudyHours", "PreviousScore")
for col in numeric_cols:
    column = df[col]
    replacement = column.mean() if col in mean_filled else column.median()
    df[col] = column.fillna(replacement)

# Categorical columns: most frequent value (the first one returned by mode()).
for col in categorical_cols:
    most_common = df[col].mode()[0]
    df[col] = df[col].fillna(most_common)

print("\nMissing values after filling:")
print(df.isnull().sum())

# STEP 5: Feature Engineering — Create new features
# Interaction term: hours studied multiplied by hours slept.
df["StudyProductivity"] = df["StudyHours"].mul(df["SleepHours"])
# Change relative to the previous score. NOTE: this is computed from ExamScore
# (the prediction target), so it must not be used as a predictor of ExamScore.
df["ScoreImprovement"] = df["ExamScore"].sub(df["PreviousScore"])
# Two-level attendance band: "High" at 85 and above, "Low/Medium" otherwise.
is_high_attendance = df["Attendance"].ge(85)
df["AttendanceCategory"] = is_high_attendance.map({True: "High", False: "Low/Medium"})

# Verify no NaNs in new features
engineered_cols = ["StudyProductivity", "ScoreImprovement", "AttendanceCategory"]
print("\nMissing values after creating new features:")
print(df[engineered_cols].isnull().sum())

# STEP 6: Encode Categorical Variables
# Integer-code the two label columns. The single encoder is refitted for each
# column, so after the loop it holds the classes of the last one only.
le = LabelEncoder()
for source_col in ("Gender", "AttendanceCategory"):
    df[f"{source_col}_Encoded"] = le.fit_transform(df[source_col])

# One-hot encode ParentEducation; drop_first removes the first level's column.
df = pd.get_dummies(df, columns=["ParentEducation"], drop_first=True)

# STEP 7: Scale Numerical Features
# Each scaler is fitted on one column and its output stored in a new column,
# leaving the original values untouched.
scaler_std, scaler_mm = StandardScaler(), MinMaxScaler()
scaling_plan = (
    (scaler_std, "PreviousScore", "PreviousScore_std"),
    (scaler_mm, "SleepHours", "SleepHours_scaled"),
)
for scaler, source_col, target_col in scaling_plan:
    df[target_col] = scaler.fit_transform(df[[source_col]])

# STEP 8: Feature Selection — Statistical (SelectKBest)
# Candidate predictors and the regression target.
X = df[["StudyHours", "Attendance", "PreviousScore", "SleepHours", "Gender_Encoded", "StudyProductivity"]]
y = df["ExamScore"]

# Score the features on training rows only. The evaluation steps further down
# hold out 20% of the rows with random_state=42; fitting the selector on every
# row would let those held-out rows influence which features are chosen and
# make the "Top 3" test error optimistically biased. Using the same test_size
# and random_state here reproduces exactly that training partition.
selection_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_select, y_select = selection_split[0], selection_split[2]

selector = SelectKBest(score_func=f_regression, k=3)
selector.fit(X_select, y_select)
scores = selector.scores_

# One row per feature, highest univariate F-score first.
feature_scores = pd.DataFrame({
    "Feature": X.columns,
    "Score": scores
}).sort_values(by="Score", ascending=False)
print("\nTop features (SelectKBest):")
print(feature_scores)

# STEP 9: Feature Selection — Model-Based (Random Forest)
# Fit a seeded forest on the candidate features and rank them by the
# importance the fitted model reports.
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y)

importance_table = pd.DataFrame({"Feature": X.columns})
importance_table["Importance"] = rf.feature_importances_
importances = importance_table.sort_values(by="Importance", ascending=False)

print("\nFeature importances (Random Forest):")
print(importances)

# STEP 10: Evaluate Model Performance — Baseline (all features)
# Hold out 20% of the rows, fit ordinary least squares on the remainder and
# report the mean squared error on the held-out rows.
baseline_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = baseline_split

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
baseline_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("\nBaseline MSE (all features):", baseline_mse)

# STEP 11: Evaluate Model Performance — Top 3 features
# feature_scores is sorted by descending score, so its first three rows name
# the best-scoring features.
top3_features = feature_scores["Feature"].head(3).values
X_top3 = df.loc[:, top3_features]

# Same split settings as the baseline so both models are scored on the same rows.
top3_split = train_test_split(X_top3, y, test_size=0.2, random_state=42)
X_train2, X_test2, y_train2, y_test2 = top3_split

lr2 = LinearRegression()
lr2.fit(X_train2, y_train2)

y_pred2 = lr2.predict(X_test2)
selected_mse = mean_squared_error(y_true=y_test2, y_pred=y_pred2)
print("MSE (Top 3 features):", selected_mse)

# STEP 12: Compare Actual vs Predicted for Top 3 Features
# Side-by-side table of held-out exam scores and the reduced model's
# predictions, indexed by the original row labels of the test rows.
comparison = y_test2.to_frame(name="Actual").assign(Predicted=y_pred2)
print("\nComparison of Actual vs Predicted (Top 3 features):")
print(comparison)


Saving week3_student_data.csv to week3_student_data (2).csv
First 5 rows:
  StudentID  Gender  StudyHours  Attendance  PreviousScore  SleepHours  \
0      S001    Male        12.0        90.0           75.0         7.0   
1      S002  Female         9.0        85.0           68.0         6.0   
2      S003    Male        15.0        95.0           80.0         8.0   
3      S004  Female         NaN        70.0           60.0         5.0   
4      S005  Female         8.0        80.0            NaN         6.0   

  ParentEducation  ExamScore  
0        Bachelor         82  
1     High School         74  
2          Master         90  
3     High School         65  
4        Bachelor         72  

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   StudentID        15 non-null     object 
 1   Gender           15 non-null     object 
 2