In [2]:
#q1
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# --- STEP 1: Generate Highly Correlated Dataset ---
np.random.seed(42)
N = 500
X = np.random.randn(N, 7) # 7 features
# Create correlation: Feature 1 is highly dependent on F0
X[:, 1] = X[:, 0] * 0.9 + np.random.randn(N) * 0.1
X[:, 2] = X[:, 0] * 0.8 + X[:, 3] * 0.5
X[:, 3] = X[:, 4] * 0.7 + np.random.randn(N) * 0.3
# Target variable y = 1 + 2*X0 + 3*X1 + ... + noise
true_betas = np.array([2, 3, -1, 0.5, 4, -2, 1])
y = 1 + X @ true_betas + np.random.randn(N) * 20

# Scale the data and split
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


# --- STEP 2: Ridge Regression with Gradient Descent Implementation ---
class RidgeRegressionGD:
    def __init__(self, learning_rate=0.01, n_iterations=1000, lambda_param=1e-5):
        self.lr = learning_rate
        self.n_iterations = n_iterations
        self.lambda_param = lambda_param
        self.coef_ = None
        self.intercept_ = None
        self.cost_history = []

    def fit(self, X, y):
        m, n = X.shape
        # Initialize coefficients (weights + intercept) with small random values
        self.coef_ = np.random.randn(n, 1) * 0.01
        self.intercept_ = np.random.randn(1, 1) * 0.01
        
        y = y.reshape(-1, 1)

        for i in range(self.n_iterations):
            # Predictions
            y_pred = X @ self.coef_ + self.intercept_
            
            # Cost function (Mean Squared Error + L2 Regularization)
            cost = (1/m) * np.sum((y_pred - y)**2) + self.lambda_param * np.sum(self.coef_**2)
            self.cost_history.append(cost)

            # Gradients (with L2 term: 2*lambda*w)
            d_coef = (2/m) * (X.T @ (y_pred - y)) + 2 * self.lambda_param * self.coef_
            d_intercept = (2/m) * np.sum(y_pred - y)
            
            # Gradient clipping to prevent overflow
            d_coef = np.clip(d_coef, -10, 10)
            d_intercept = np.clip(d_intercept, -10, 10)
            
            # Update parameters
            self.coef_ -= self.lr * d_coef
            self.intercept_ -= self.lr * d_intercept

    def predict(self, X):
        return X @ self.coef_ + self.intercept_


# --- STEP 3: Experiment and Find Best Parameters ---
learning_rates = [0.0001, 0.001, 0.01, 0.1]
lambda_param = 1e-5
best_params = {'R2': -np.inf, 'LR': None, 'Cost': np.inf}

for lr in learning_rates:
    model = RidgeRegressionGD(learning_rate=lr, n_iterations=10000, lambda_param=lambda_param)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test).flatten()
    r2 = r2_score(y_test, y_pred)
    min_cost = model.cost_history[-1]
    
    if r2 > best_params['R2'] or (r2 == best_params['R2'] and min_cost < best_params['Cost']):
        best_params['R2'] = r2
        best_params['LR'] = lr
        best_params['Cost'] = min_cost

print("--- Q1: Ridge Regression (Gradient Descent) ---")
print(f"Regularization Parameter (λ): {lambda_param}")
print(f"Best Learning Rate (α): {best_params['LR']}")
print(f"Maximum R2 Score: {best_params['R2']:.4f}")
print(f"Minimum Cost Function Value: {best_params['Cost']:.4f}")

--- Q1: Ridge Regression (Gradient Descent) ---
Regularization Parameter (λ): 1e-05
Best Learning Rate (α): 0.01
Maximum R2 Score: 0.0825
Minimum Cost Function Value: 384.9441


In [1]:
#q2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score
import numpy as np

# --- STEP (a): Load and Pre-process Data ---
try:
    df_h = pd.read_csv('Hitters.csv').dropna()
except FileNotFoundError:
    print("NOTE: 'Hitters.csv' not found. Generating synthetic data for demonstration.")
    # Create synthetic Hitters-like dataset
    np.random.seed(42)
    n_samples = 263
    df_h = pd.DataFrame({
        'AtBat': np.random.randint(100, 700, n_samples),
        'Hits': np.random.randint(10, 200, n_samples),
        'HmRun': np.random.randint(0, 50, n_samples),
        'Runs': np.random.randint(10, 150, n_samples),
        'RBI': np.random.randint(10, 150, n_samples),
        'Walks': np.random.randint(5, 100, n_samples),
        'Years': np.random.randint(1, 25, n_samples),
        'CAtBat': np.random.randint(100, 10000, n_samples),
        'CHits': np.random.randint(10, 3000, n_samples),
        'CHmRun': np.random.randint(0, 500, n_samples),
        'CRuns': np.random.randint(10, 2000, n_samples),
        'CRBI': np.random.randint(10, 2000, n_samples),
        'CWalks': np.random.randint(5, 1500, n_samples),
        'League': np.random.choice(['A', 'N'], n_samples),
        'Division': np.random.choice(['E', 'W'], n_samples),
        'PutOuts': np.random.randint(0, 1500, n_samples),
        'Assists': np.random.randint(0, 500, n_samples),
        'Errors': np.random.randint(0, 30, n_samples),
        'Salary': np.random.randint(50, 2500, n_samples) * 1000
    })

# Separate features (X) and target (y)
y_h = df_h['Salary']
X_h = df_h.drop('Salary', axis=1)

# Identify feature types
numerical_features = X_h.select_dtypes(include=np.number).columns
categorical_features = X_h.select_dtypes(include='object').columns

# Preprocessing Pipeline (Scaling + One-Hot Encoding)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# --- STEP (b): Separate and Scale (Handled by Pipeline) ---
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    X_h, y_h, test_size=0.3, random_state=42
)

# --- STEP (c) & (d): Fit and Evaluate Models ---
alpha_param = 0.5748
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=alpha_param, random_state=42),
    "LASSO Regression": Lasso(alpha=alpha_param, random_state=42, max_iter=5000)
}

results = {}
for name, model in models.items():
    # Create and train the full pipeline
    full_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('regressor', model)])
    
    full_pipeline.fit(X_train_h, y_train_h)
    
    # Predict and evaluate
    y_pred_h = full_pipeline.predict(X_test_h)
    r2 = r2_score(y_test_h, y_pred_h)
    results[name] = r2

print("\n--- Q2: Regression Model Performance on Hitters Dataset ---")
results_df = pd.DataFrame(results.items(), columns=['Model', 'R2 Score on Test Set'])
print(results_df.sort_values(by='R2 Score on Test Set', ascending=False).to_markdown(index=False))

best_model = max(results, key=results.get)
print(f"\nBest Performing Model: **{best_model}**")
print("Explanation: Linear Regression is the baseline. **Ridge** adds L2 regularization, preventing coefficients from becoming too large (good for multicollinearity). **LASSO** adds L1 regularization, forcing some coefficients to zero (performing feature selection). The best model depends on the dataset's characteristics, specifically the degree of multicollinearity and the number of irrelevant features.")


--- Q2: Regression Model Performance on Hitters Dataset ---
| Model             |   R2 Score on Test Set |
|:------------------|-----------------------:|
| Ridge Regression  |               0.402107 |
| LASSO Regression  |               0.397415 |
| Linear Regression |               0.380623 |

Best Performing Model: **Ridge Regression**
Explanation: Linear Regression is the baseline. **Ridge** adds L2 regularization, preventing coefficients from becoming too large (good for multicollinearity). **LASSO** adds L1 regularization, forcing some coefficients to zero (performing feature selection). The best model depends on the dataset's characteristics, specifically the degree of multicollinearity and the number of irrelevant features.


In [4]:
#q3
from sklearn.datasets import fetch_california_housing # Using California Housing as Boston is deprecated
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the dataset
# NOTE: The Boston dataset (load_boston) is deprecated. Using fetch_california_housing as a modern substitute.
data = fetch_california_housing(as_frame=True)
X_cv, y_cv = data.data, data.target

# Split and Scale
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_cv, y_cv, test_size=0.3, random_state=42
)
scaler_cv = StandardScaler()
X_train_cv_scaled = scaler_cv.fit_transform(X_train_cv)
X_test_cv_scaled = scaler_cv.transform(X_test_cv)

# Set up candidate alphas
alphas = np.logspace(-4, 0, 100) # 100 alpha values from 0.0001 to 1

# --- Ridge Cross Validation (RidgeCV) ---
# cv=None uses Generalized Cross-Validation (faster LOOCV approximation)
ridge_cv = RidgeCV(alphas=alphas)
ridge_cv.fit(X_train_cv_scaled, y_train_cv)
y_pred_ridge = ridge_cv.predict(X_test_cv_scaled)
mse_ridge = mean_squared_error(y_test_cv, y_pred_ridge)

# --- Lasso Cross Validation (LassoCV) ---
# cv=5 uses 5-fold cross-validation
lasso_cv = LassoCV(alphas=alphas, cv=5, max_iter=10000, random_state=42)
lasso_cv.fit(X_train_cv_scaled, y_train_cv)
y_pred_lasso = lasso_cv.predict(X_test_cv_scaled)
mse_lasso = mean_squared_error(y_test_cv, y_pred_lasso)

print("\n--- Q3: RidgeCV and LassoCV Results ---")
print(f"Dataset Used: California Housing (substitute for deprecated Boston Housing)")
print("--- RidgeCV ---")
print(f"Optimal Alpha (λ): {ridge_cv.alpha_:.5f}")
print(f"Test Set MSE: {mse_ridge:.4f}")

print("\n--- LassoCV ---")
print(f"Optimal Alpha (λ): {lasso_cv.alpha_:.5f}")
print(f"Test Set MSE: {mse_lasso:.4f}")


--- Q3: RidgeCV and LassoCV Results ---
Dataset Used: California Housing (substitute for deprecated Boston Housing)
--- RidgeCV ---
Optimal Alpha (λ): 1.00000
Test Set MSE: 0.5305

--- LassoCV ---
Optimal Alpha (λ): 0.00285
Test Set MSE: 0.5285
