# Q1 Linear Regression

In [1]:
# Import required libraries
import hashlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
import plotly.graph_objects as go

In [2]:
# Derive a reproducible 32-bit seed from the SHA-256 digest of the username.
username = "m.samartha"
_digest = hashlib.sha256(username.encode()).digest()
# Interpreting the digest as a big-endian integer matches parsing its hex
# form; the modulo keeps the value inside the 32-bit range.
seed = int.from_bytes(_digest, byteorder="big") % (1 << 32)
print(seed)

2669878649


In [3]:
# Read the student records CSV (path is relative to the working directory)
# into a DataFrame and echo it for a quick visual check.
df = pd.read_csv(filepath_or_buffer="Dataset/student-dataset.csv")
print(df)

      Gender   Major Program   GPA
0       Male  B.Tech     CHD  6.65
1       Male  B.Tech     CSE  7.05
2     Female  B.Tech     CSE  7.67
3       Male  B.Tech     ECE  8.89
4       Male  B.Tech     ECE  6.35
...      ...     ...     ...   ...
9995    Male  B.Tech     ECE  5.82
9996  Female  B.Tech     ECE  7.72
9997    Male     PhD     CHD  7.68
9998    Male  B.Tech     ECE  7.61
9999    Male     PhD     ECE  8.26

[10000 rows x 4 columns]


In [None]:
class PolyRegressor:
    """
    Polynomial regression with optional L1 (Lasso) or L2 (Ridge) regularization.

    Features are expanded with PolynomialFeatures, standardized with
    StandardScaler, and fitted with a scikit-learn linear model. The class
    also bundles helpers for splitting a dataframe, running a grid search
    over hyper-parameters, and plotting the resulting errors.
    """

    def __init__(self, degree=1, regularizer=None, reg_strength=0.0):
        """
        Initialize the polynomial regression model.

        Args:
            degree (int): Degree of polynomial features.
            regularizer (str or None): Type of regularization ('l1', 'l2', or None).
            reg_strength (float): Regularization strength (alpha).

        Raises:
            ValueError: If regularizer is not one of None, 'l1', 'l2'.
        """
        self.degree = degree
        self.regularizer = regularizer
        self.reg_strength = reg_strength
        # Shown as a watermark on every figure produced by this class
        self.username = "m.samartha"
        # Polynomial expansion and scaling
        self.poly = PolynomialFeatures(degree=self.degree)
        self.scaler = StandardScaler()

        # Model selection
        if self.regularizer is None:
            self.model = LinearRegression()
        elif self.regularizer == 'l1':
            self.model = Lasso(alpha=self.reg_strength, max_iter=10000)
        elif self.regularizer == 'l2':
            self.model = Ridge(alpha=self.reg_strength, max_iter=10000)
        else:
            raise ValueError("regularizer must be one of None, 'l1', 'l2'")

    def prepare_data(self, df, target_col="GPA",
                     test_size=0.2, val_size=0.2, random_state=seed):
        """
        Split dataframe into train, validation, and test sets.
        One-hot encode categorical features and return numpy arrays.

        Args:
            df (pd.DataFrame): Input dataframe with features + target.
            target_col (str): Name of the target column (default "GPA").
            test_size (float): Fraction of data to use for test.
            val_size (float): Fraction of data to use for validation.
            random_state (int): Random seed for reproducibility. The default
                is the module-level ``seed`` as it was when the class was
                defined.

        Returns:
            X_train, y_train, X_val, y_val, X_test, y_test (all numpy arrays)

        Raises:
            ValueError: If the fractions are not in (0, 1) or leave no data
                for training.
        """
        # Fail early with a clear message instead of an obscure error from
        # the second split (val_size / (1 - test_size) must stay below 1).
        if not (0 < test_size < 1 and 0 < val_size < 1
                and test_size + val_size < 1):
            raise ValueError(
                "test_size and val_size must each be in (0, 1) and sum to "
                f"less than 1; got test_size={test_size}, val_size={val_size}"
            )

        # Separate features (X) and target (y)
        X = df.drop(columns=[target_col])
        y = df[target_col]

        # One-hot encode categorical features; drop_first removes one
        # level per category so the dummy columns are not redundant
        X = pd.get_dummies(X, drop_first=True)

        # First split: train+val vs test
        X_temp, X_test, y_temp, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Second split: train vs val (from temp set). val_size is a fraction
        # of the full data, so rescale it to a fraction of the remainder.
        val_fraction = val_size / (1 - test_size)
        X_train, X_val, y_train, y_val = train_test_split(
            X_temp, y_temp, test_size=val_fraction, random_state=random_state
        )

        # Convert to numpy arrays (for sklearn)
        return (X_train.to_numpy(), y_train.to_numpy(),
                X_val.to_numpy(), y_val.to_numpy(),
                X_test.to_numpy(), y_test.to_numpy())

    def run_poly_regression(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """
        Fit model, make predictions, and return results.

        The polynomial expansion and the scaler are fitted on the training
        split only and then applied unchanged to validation and test data.

        Args:
            X_train, y_train: Training features and targets.
            X_val, y_val: Validation features and targets.
            X_test, y_test: Test features and targets.

        Returns:
            dict: Keys "train_mse", "val_mse", "test_mse" (mean squared
            errors), "coeffs" (fitted model coefficients) and "poly" (the
            fitted PolynomialFeatures transformer).
        """
        # polynomial expansion
        X_train_poly = self.poly.fit_transform(X_train)  # fit only on training data
        X_val_poly = self.poly.transform(X_val)
        X_test_poly = self.poly.transform(X_test)

        # scaling
        X_train_poly = self.scaler.fit_transform(X_train_poly)  # fit only on training data
        X_val_poly = self.scaler.transform(X_val_poly)
        X_test_poly = self.scaler.transform(X_test_poly)

        # fit model and get parameters for model
        self.model.fit(X_train_poly, y_train)

        # predictions
        y_train_pred = self.model.predict(X_train_poly)
        y_val_pred = self.model.predict(X_val_poly)
        y_test_pred = self.model.predict(X_test_poly)

        # results
        results = {
            "train_mse": mean_squared_error(y_train, y_train_pred),
            "val_mse": mean_squared_error(y_val, y_val_pred),
            "test_mse": mean_squared_error(y_test, y_test_pred),
            "coeffs": self.model.coef_,
            "poly": self.poly
        }
        return results

    def grid_search(self, X_train, y_train, X_val, y_val, X_test, y_test,
                    regularizers=(None, 'l1', 'l2'),
                    degrees=range(1, 7),
                    reg_strengths=np.logspace(-4, 2, 10)):
        """
        Run experiments across multiple regularizers, degrees, and strengths.

        Args:
            X_train, y_train: Training features and targets.
            X_val, y_val: Validation features and targets.
            X_test, y_test: Test features and targets (passed through to
                each fit; test error is not recorded here).
            regularizers (iterable): Regularizer names to try (None, 'l1', 'l2').
            degrees (iterable of int): Polynomial degrees to try.
            reg_strengths (sequence of float): Alphas tried for 'l1'/'l2'.

        Returns:
            Nested dict: results_all[reg][degree] = {train, val, alphas},
            where "train" and "val" are lists of MSEs aligned with "alphas".
        """
        # Materialize so a one-shot iterator is not exhausted after the
        # first regularizer.
        degrees = list(degrees)
        results_all = {}

        for reg in regularizers:
            # if no reg, just run once with strength=0.0
            alphas = [0.0] if reg is None else reg_strengths
            results_all[reg] = {}
            for d in degrees:
                val_mses, train_mses = [], []

                for alpha in alphas:
                    # create a fresh regressor each time
                    regressor = PolyRegressor(degree=d,
                                              regularizer=reg,
                                              reg_strength=alpha)
                    res = regressor.run_poly_regression(X_train, y_train,
                                                        X_val, y_val,
                                                        X_test, y_test)
                    val_mses.append(res["val_mse"])
                    train_mses.append(res["train_mse"])

                results_all[reg][d] = {
                    "train": train_mses,
                    "val": val_mses,
                    "alphas": alphas
                }
        return results_all

    def _add_watermark(self, fig):
        """Stamp the username in the top-right corner of a plotly figure."""
        fig.add_annotation(
            x=0.95, y=0.95,
            xref="paper", yref="paper",  # relative to axes
            text=self.username,
            showarrow=False,
            font=dict(size=10, color="gray"),
            align="right",
            opacity=0.7
        )

    def plot_degree_vs_mse(self, results, degrees=range(1, 7)):
        """
        Plot Train/Validation MSE vs Polynomial Degree for each regularizer.

        For regularized models a second figure shows validation MSE against
        regularization strength at the best-performing degree.

        Args:
            results (dict): Output of grid_search.
            degrees (iterable of int): Degrees to plot; each must be a key
                of results[reg] for every regularizer in results.
        """
        # Materialize so the degrees can be traversed several times
        degrees = list(degrees)

        for reg, by_degree in results.items():
            train_errors = []
            val_errors = []
            for d in degrees:
                idx = np.argmin(by_degree[d]["val"])  # best alpha for this degree
                val_errors.append(by_degree[d]["val"][idx])
                train_errors.append(by_degree[d]["train"][idx])

            # Degree vs MSE
            fig1 = go.Figure()

            fig1.add_trace(go.Scatter(
                x=degrees,
                y=train_errors,
                mode="lines+markers",
                name=f"{reg} Train",
                marker=dict(symbol="circle")
            ))

            fig1.add_trace(go.Scatter(
                x=degrees,
                y=val_errors,
                mode="lines+markers",
                name=f"{reg} Val",
                marker=dict(symbol="square")
            ))

            fig1.update_layout(
                width=800,
                height=500,
                title=f"Degree vs. MSE for Regularizer: {reg}",
                xaxis_title="Polynomial Degree",
                yaxis_title="MSE",
                template="plotly_white"
            )
            self._add_watermark(fig1)

            fig1.show()

            # Skip if no regularizer: there is no alpha sweep to plot
            if reg is None:
                continue

            # Regularization strength vs Val MSE
            best_degree = min(degrees, key=lambda d: min(by_degree[d]["val"]))
            alphas = by_degree[best_degree]["alphas"]
            val_mse = by_degree[best_degree]["val"]

            fig2 = go.Figure()
            fig2.add_trace(go.Scatter(
                x=alphas, y=val_mse,
                mode="lines+markers",
                marker=dict(symbol="circle"),
                name="Validation MSE"
            ))

            fig2.update_layout(
                width=800,
                height=500,
                title=f"Reg Strength vs. Val MSE (Best Degree={best_degree}, Reg: {reg})",
                xaxis=dict(title="Regularization Strength (alpha)", type="log"),
                yaxis=dict(title="Validation MSE"),
                template="plotly_white"
            )
            self._add_watermark(fig2)

            fig2.show()
            
# Driver: split the data once, sweep the hyper-parameter grid with the
# default settings, then plot the train/validation errors.
LR = PolyRegressor()
splits = LR.prepare_data(df)
X_train, y_train, X_val, y_val, X_test, y_test = splits
results = LR.grid_search(*splits)
LR.plot_degree_vs_mse(results)