In [1]:

#@title Install & import (safe to re-run)
import sys, subprocess

def pip_install(pkg):
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", pkg], check=False)

pip_install("gdown>=5.1")
pip_install("pandas>=1.5")
pip_install("numpy>=1.23")
pip_install("matplotlib>=3.7")
pip_install("scikit-learn>=1.3")

import gdown, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
#package.module


In [2]:
import gdown, os, pandas as pd

GDOWN_ID = "1VXJ0VcTBUsdbKs9FBNYMUP7gnemZ7W1q"  # replace with your file’s ID
OUTPUT_CSV = "StudentsPerformance.csv"

if not os.path.exists(OUTPUT_CSV):
    url = f"https://drive.google.com/uc?id={GDOWN_ID}"
    gdown.download(url, OUTPUT_CSV, quiet=False)
else:
    print("Found existing file:", OUTPUT_CSV)

df = pd.read_csv(OUTPUT_CSV)
print("✅ Loaded:", df.shape)
df.head()


Downloading...
From: https://drive.google.com/uc?id=1VXJ0VcTBUsdbKs9FBNYMUP7gnemZ7W1q
To: /content/StudentsPerformance.csv
100%|██████████| 72.0k/72.0k [00:00<00:00, 24.8MB/s]

✅ Loaded: (1000, 8)





Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder

# 1) Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
print("✅ Standardized column names:", list(df.columns))

# 2) Check duplicates
print("Duplicates count:", df.duplicated().sum())
df = df.drop_duplicates()

# 3) Columns with missing values
print("Missing values per column:")
print(df.isnull().sum())

# 4) Numeric imputation strategy (example: mean)
num_cols = df.select_dtypes(include=np.number).columns
print("Numeric columns:", list(num_cols))

num_imputer = SimpleImputer(strategy="mean")   # or "median"
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# 5) Categorical imputation (most_frequent)
cat_cols = df.select_dtypes(exclude=np.number).columns
print("Categorical columns:", list(cat_cols))

cat_imputer = SimpleImputer(strategy="most_frequent")
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# 6) Encoding strategy (One Hot Encoding)
df = pd.get_dummies(df, drop_first=True)

# 7) Scaling strategy (StandardScaler or MinMax)
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# 8) Outlier handling (IQR method example)
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5*IQR
    upper = Q3 + 1.5*IQR
    outliers = df[(df[col] < lower) | (df[col] > upper)]
    print(f"Outliers in {col}: {len(outliers)}")

# 9) Feature selection/dropping (example: drop writing_score if highly correlated with reading_score)
# You can check correlations first
corr = df.corr()
print(corr["math_score"].sort_values(ascending=False))


✅ Standardized column names: ['gender', 'race/ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course', 'math_score', 'reading_score', 'writing_score']
Duplicates count: 0
Missing values per column:
gender                         0
race/ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64
Numeric columns: ['math_score', 'reading_score', 'writing_score']
Categorical columns: ['gender', 'race/ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
Outliers in math_score: 8
Outliers in reading_score: 8
Outliers in writing_score: 5
math_score                                       1.000000
reading_score                                    0.817580
writing_score                                    0.802642
lunch_standard                                   0.350877
race

In [None]:
corr = df.corr(numeric_only=True)
corr['math_score'].sort_values(ascending=False)


Unnamed: 0,math_score
math_score,1.0
reading_score,0.81758
writing_score,0.802642
lunch_standard,0.350877
race/ethnicity_group E,0.205855
gender_male,0.167982
parental_level_of_education_bachelor's degree,0.079664
parental_level_of_education_master's degree,0.060417
race/ethnicity_group D,0.050071
parental_level_of_education_some college,0.037056


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

X = df.drop(columns=['math_score'])
y = df['math_score']

# one-hot encode categorical features
X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2_test = r2_score(y_test, y_pred)
r2_train = model.score(X_train, y_train)
gap = abs(r2_train - r2_test)
print(mae, mse, rmse, r2_test, r2_train,gap)


0.2781012880760493 0.12667197758547108 0.3559100695196345 0.8804332983749564 0.8743172040139593 0.0061160943609971374


In [6]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Ensure X is all numeric
X_numeric = X.select_dtypes(include=[np.number])

# Compute VIF
X_vif = pd.DataFrame()
X_vif["feature"] = X_numeric.columns
X_vif["VIF"] = [variance_inflation_factor(X_numeric.values, i) for i in range(X_numeric.shape[1])]

print(X_vif)



         feature        VIF
0  reading_score  11.268556
1  writing_score  11.268556


In [8]:
import statsmodels.api as sm

# Ensure dummies are numeric
X = pd.get_dummies(X, drop_first=True)

# Force all to numeric (coerce errors → NaN)
X = X.apply(pd.to_numeric, errors='coerce')

# Drop any NaN rows (if coercion produced some)
X = X.dropna()

# Add constant again
X_vif = sm.add_constant(X)


In [9]:
print(X_vif.dtypes)   # should all be int64 or float64


const                                            float64
reading_score                                    float64
writing_score                                    float64
gender_male                                         bool
race/ethnicity_group B                              bool
race/ethnicity_group C                              bool
race/ethnicity_group D                              bool
race/ethnicity_group E                              bool
parental_level_of_education_bachelor's degree       bool
parental_level_of_education_high school             bool
parental_level_of_education_master's degree         bool
parental_level_of_education_some college            bool
parental_level_of_education_some high school        bool
lunch_standard                                      bool
test_preparation_course_none                        bool
dtype: object


In [10]:
# Convert all bool columns to int (0/1)
X_vif = X_vif.astype(float)   # or use .astype(int) if you prefer 0/1


In [11]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Ensure all features are numeric
X_vif = X_vif.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)
X_vif = X_vif.astype(float)  # make sure all are float for statsmodels

# Compute VIF
vif_data = pd.DataFrame()
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i)
                   for i in range(X_vif.shape[1])]
print(vif_data.sort_values(by="VIF", ascending=False))


                                          feature        VIF
0                                           const  20.479928
2                                   writing_score  15.197227
1                                   reading_score  13.098439
5                          race/ethnicity_group C   3.182481
6                          race/ethnicity_group D   3.020222
4                          race/ethnicity_group B   2.565534
7                          race/ethnicity_group E   2.275519
11       parental_level_of_education_some college   1.570961
9         parental_level_of_education_high school   1.569870
12   parental_level_of_education_some high school   1.545603
8   parental_level_of_education_bachelor's degree   1.372319
14                   test_preparation_course_none   1.257279
10    parental_level_of_education_master's degree   1.215077
3                                     gender_male   1.201317
13                                 lunch_standard   1.115216


In [13]:
import statsmodels.api as sm

# X_vif is your features after get_dummies and adding constant
model = sm.OLS(df['math_score'], X_vif).fit()
residuals = model.resid  # differences between predicted and actual values


In [14]:
from scipy.stats import shapiro
shapiro_test = shapiro(residuals)
shapiro_p = shapiro_test[1]  # this is what goes in your table
print("Shapiro p-value:", shapiro_p)


Shapiro p-value: 0.4088062278075787


In [15]:
from statsmodels.stats.stattools import durbin_watson
dw = durbin_watson(residuals)
print("Durbin-Watson:", dw)


Durbin-Watson: 1.9861246335872516


In [16]:
import statsmodels.stats.api as sms
bp_test = sms.het_breuschpagan(residuals, model.model.exog)
bp_p = bp_test[1]
print("Breusch-Pagan p-value:", bp_p)


Breusch-Pagan p-value: 0.2871122862369354


In [18]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np

X = df[['writing_score']]
y = df['math_score']

best_r2 = -np.inf
best_deg = 1
for deg in [1,2,3]:
    poly = PolynomialFeatures(degree=deg, include_bias=False)
    X_poly = poly.fit_transform(X)
    r2 = cross_val_score(LinearRegression(), X_poly, y, cv=5, scoring='r2').mean()
    if r2 > best_r2:
        best_r2 = r2
        best_deg = deg

print("Best degree:", best_deg, "CV R²:", best_r2)


Best degree: 3 CV R²: 0.640191134058069


In [19]:
cat_cols = df.select_dtypes(include=['object']).columns
print(cat_cols)


Index([], dtype='object')


In [21]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Target
y = df['math_score']

# Features
X = df.drop(columns=['math_score'])

# One-Hot Encode categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), cat_cols)  # drop='first' to avoid dummy variable trap
    ],
    remainder='passthrough'  # keep numeric columns as-is
)

X_enc = preprocessor.fit_transform(X)

# Ensure numeric type
import numpy as np
X_enc = X_enc.astype(float)


In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# X before encoding (only numeric columns)
X_num = df.select_dtypes(include=['int64','float64'])
y = df['math_score']

model = LinearRegression()
r2_before = cross_val_score(model, X_num, y, cv=5, scoring='r2').mean()
print("R² before encoding:", r2_before)


R² before encoding: 1.0


In [24]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

cat_cols = df.select_dtypes(include=['object']).columns
X = df.drop(columns=['math_score'])

preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(drop='first'), cat_cols)],
    remainder='passthrough'
)
X_enc = preprocessor.fit_transform(X).astype(float)

r2_after = cross_val_score(model, X_enc, y, cv=5, scoring='r2').mean()
print("R² after encoding:", r2_after)


R² after encoding: 0.8717997497009925


In [25]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.2, random_state=42)

lasso = Lasso(max_iter=10000)
alphas = [0.01, 0.1, 1, 10]
grid = GridSearchCV(lasso, param_grid={'alpha': alphas}, cv=5, scoring='r2')
grid.fit(X_train, y_train)

best_alpha = grid.best_params_['alpha']
best_r2 = grid.best_score_
test_r2 = grid.score(X_test, y_test)
non_zero = np.sum(grid.best_estimator_.coef_ != 0)

print("Best alpha:", best_alpha)
print("CV R²:", best_r2)
print("Test R²:", test_r2)
print("# non-zero coefs:", non_zero)


Best alpha: 0.01
CV R²: 0.8653723647782485
Test R²: 0.8812308883859827
# non-zero coefs: 7


In [26]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.2, random_state=42)

# Alpha values to try
alpha_grid = [0.01, 0.1, 1, 10]

# ----- Lasso -----
lasso = Lasso(max_iter=10000)
grid_lasso = GridSearchCV(lasso, param_grid={'alpha': alpha_grid}, cv=5, scoring='r2')
grid_lasso.fit(X_train, y_train)

# Values for table
lasso_best_alpha = grid_lasso.best_params_['alpha']
lasso_r2_test = grid_lasso.score(X_test, y_test)
lasso_nonzero = np.sum(grid_lasso.best_estimator_.coef_ != 0)

print("Lasso:")
print("Alpha grid:", alpha_grid)
print("Best alpha:", lasso_best_alpha)
print("Test R²:", lasso_r2_test)
print("# Non-zero coefficients:", lasso_nonzero)

# ----- Ridge -----
ridge = Ridge(max_iter=10000)
grid_ridge = GridSearchCV(ridge, param_grid={'alpha': alpha_grid}, cv=5, scoring='r2')
grid_ridge.fit(X_train, y_train)

ridge_best_alpha = grid_ridge.best_params_['alpha']
ridge_r2_test = grid_ridge.score(X_test, y_test)

print("\nRidge:")
print("Alpha grid:", alpha_grid)
print("Best alpha:", ridge_best_alpha)
print("Test R²:", ridge_r2_test)
# Ridge keeps all coefficients non-zero
print("# Non-zero coefficients: all")


Lasso:
Alpha grid: [0.01, 0.1, 1, 10]
Best alpha: 0.01
Test R²: 0.8812308883859827
# Non-zero coefficients: 7

Ridge:
Alpha grid: [0.01, 0.1, 1, 10]
Best alpha: 1
Test R²: 0.8804513937125638
# Non-zero coefficients: all


In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error
import numpy as np

cv_folds = 5
model = LinearRegression()

# R² CV scores
r2_scores = cross_val_score(model, X_enc, y, cv=cv_folds, scoring='r2')
r2_mean = r2_scores.mean()
r2_std = r2_scores.std()

# RMSE CV
y_pred = cross_val_predict(model, X_enc, y, cv=cv_folds)
rmse = np.sqrt(mean_squared_error(y, y_pred))

# RMSE CV std can be approximated per fold
from sklearn.model_selection import KFold

kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
rmse_per_fold = []
for train_idx, test_idx in kf.split(X_enc):
    model.fit(X_enc[train_idx], y.iloc[train_idx])
    y_pred_fold = model.predict(X_enc[test_idx])
    rmse_fold = np.sqrt(mean_squared_error(y.iloc[test_idx], y_pred_fold))
    rmse_per_fold.append(rmse_fold)

rmse_mean = np.mean(rmse_per_fold)
rmse_std = np.std(rmse_per_fold)

print("R² CV mean:", r2_mean)
print("R² CV std:", r2_std)
print("RMSE CV mean:", rmse_mean)
print("RMSE CV std:", rmse_std)


R² CV mean: 0.8717997497009925
R² CV std: 0.015014562412394575
RMSE CV mean: 0.35696094939528844
RMSE CV std: 0.01537149610534244
