## Q1

In [1]:
import numpy as np
import pandas as pd

# Step 1: Seed NumPy's global generator so every run draws the same samples
np.random.seed(42)

# Step 2: Dataset dimensions
n_samples = 500      # rows (observations)
n_features = 7       # columns (predictors)

# Step 3: Equicorrelated covariance matrix:
# 1.0 on the diagonal (unit variance), base_corr everywhere else.
base_corr = 0.9
cov = np.where(np.eye(n_features, dtype=bool), 1.0, base_corr)

# Step 4: Draw the feature matrix from a zero-mean multivariate normal
mean = np.zeros(n_features)
X = np.random.multivariate_normal(mean, cov, size=n_samples)

# Step 5: Ground-truth coefficients and intercept of the linear model
true_weights = np.array([2.5, -1.8, 1.2, 0.8, 0.5, 1.5, -0.7])
bias = 3.0

# Step 6: Build the target as a linear function of the features plus
# Gaussian noise (standard deviation 1.5): y = X . w + bias + noise
noise = np.random.normal(loc=0, scale=1.5, size=n_samples)
y = np.dot(X, true_weights) + bias + noise

# Step 7: Collect features and target in a single DataFrame
columns = [f'Feature_{idx}' for idx in range(1, n_features + 1)]
df = pd.DataFrame(X, columns=columns).assign(Target=y)

# Step 8: Show the pairwise correlations (features and target)
print("Feature Correlation Matrix:")
print(df.corr())

# Step 9: Write the dataset to disk for the following cell
df.to_csv("highly_correlated_dataset.csv", index=False)
print("\nDataset saved as 'highly_correlated_dataset.csv'")


Feature Correlation Matrix:
           Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
Feature_1   1.000000   0.897950   0.889118   0.903796   0.896104   0.902124   
Feature_2   0.897950   1.000000   0.891180   0.906176   0.906239   0.898228   
Feature_3   0.889118   0.891180   1.000000   0.904157   0.905161   0.898464   
Feature_4   0.903796   0.906176   0.904157   1.000000   0.892401   0.911539   
Feature_5   0.896104   0.906239   0.905161   0.892401   1.000000   0.901148   
Feature_6   0.902124   0.898228   0.898464   0.911539   0.901148   1.000000   
Feature_7   0.895656   0.906879   0.899301   0.911495   0.894585   0.904232   
Target      0.897466   0.788642   0.864259   0.863332   0.846861   0.877392   

           Feature_7    Target  
Feature_1   0.895656  0.897466  
Feature_2   0.906879  0.788642  
Feature_3   0.899301  0.864259  
Feature_4   0.911495  0.863332  
Feature_5   0.894585  0.846861  
Feature_6   0.904232  0.877392  
Feature_7   1.000000  0.81478

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Step 10: Reload the saved dataset and separate the response from the predictors
df = pd.read_csv("highly_correlated_dataset.csv")
y = df["Target"].values
X = df.drop(columns=["Target"]).values

# Step 11: Hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 12: Standardise the features (important for gradient descent stability).
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 13: Intercept handling
def add_bias(X):
    """Return a copy of the 2-D array ``X`` with a column of ones prepended.

    The extra first column lets the intercept be learned as an ordinary
    weight (index 0) of the linear model.
    """
    n_rows = X.shape[0]
    ones_column = np.ones((n_rows, 1))
    return np.concatenate((ones_column, X), axis=1)

# Design matrices: the scaled features with the leading intercept column
X_train_b, X_test_b = add_bias(X_train), add_bias(X_test)

# Step 14: Ridge objective
def ridge_cost(X, y, w, lam):
    """Evaluate the ridge objective for weights ``w``.

    The value is ``sum((X @ w - y)**2) / (2n) + lam / (2n) * sum(w[1:]**2)``
    with ``n = X.shape[0]``. ``w[0]`` (the intercept weight) is left out of
    the penalty.
    """
    m = X.shape[0]
    err = X @ w - y
    data_term = np.dot(err, err) / (2 * m)
    penalty_scale = lam / (2 * m)
    penalty = penalty_scale * np.sum(np.square(w[1:]))  # intercept is not penalised
    return data_term + penalty

# Step 15: Gradient of the ridge objective
def ridge_grad(X, y, w, lam):
    """Return the gradient of ``ridge_cost`` with respect to ``w``.

    The data part is ``X.T @ (X @ w - y) / n`` with ``n = X.shape[0]``.
    ``(lam / n) * w[j]`` is added to every component except index 0, so the
    intercept weight receives no shrinkage.
    """
    n_rows = X.shape[0]
    err = X @ w - y
    data_grad = (X.T @ err) / n_rows
    shrink = (lam / n_rows) * w[1:]  # penalty gradient, intercept excluded
    return np.concatenate((data_grad[:1], data_grad[1:] + shrink))

# Step 16: Batch gradient descent for ridge regression
def ridge_gd(X, y, alpha=0.001, lam=1.0, iters=5000, tol=1e-8):
    """Minimise the ridge objective with fixed-step gradient descent.

    Parameters
    ----------
    X, y : design matrix and target vector.
    alpha : step size.
    lam : regularisation strength passed to ``ridge_cost`` / ``ridge_grad``.
    iters : maximum number of update steps.
    tol : stop once two consecutive recorded costs differ by less than this.

    Returns
    -------
    (w, history) : the final weights (started from zeros) and the list of
    costs, each recorded *before* the corresponding update.
    """
    w = np.zeros(X.shape[1])
    history = []
    previous_cost = None
    for _ in range(iters):
        current_cost = ridge_cost(X, y, w, lam)
        history.append(current_cost)

        step = alpha * ridge_grad(X, y, w, lam)
        w = w - step

        # The stopping test runs after the update, so the step computed in
        # the converging iteration is still applied.
        if previous_cost is not None and abs(previous_cost - current_cost) < tol:
            break
        previous_cost = current_cost
    return w, history

# Step 17: Grid search over step size and regularisation strength
learning_rates = [0.0001, 0.001, 0.01, 0.1]  # safer set
lambdas = [1e-15, 1e-10, 1e-5, 1e-3, 0, 1, 10, 20]

# One record per (alpha, lam) pair:
# (alpha, lam, final training cost, train R2, test R2)
results = []
for alpha in learning_rates:
    for lam in lambdas:
        w, hist = ridge_gd(X_train_b, y_train, alpha=alpha, lam=lam, iters=10000)
        r2_train = r2_score(y_train, X_train_b @ w)
        r2_test = r2_score(y_test, X_test_b @ w)
        final_cost = ridge_cost(X_train_b, y_train, w, lam)
        results.append((alpha, lam, final_cost, r2_train, r2_test))

# Keep the first record with the highest test R2.
# NOTE(review): the winner is chosen using the test split, so the reported
# test R2 is an optimistic estimate; a separate validation split would avoid this.
best_result = max(results, key=lambda record: record[4])

# Step 18: Report the winning configuration
best_alpha, best_lam, best_cost, best_r2_train, best_r2_test = best_result
print("\nBest Parameters:")
print(f"Learning Rate: {best_alpha}")
print(f"Lambda: {best_lam}")
print(f"Final Cost: {best_cost:.4f}")
print(f"Train R2: {best_r2_train:.4f}")
print(f"Test R2: {best_r2_test:.4f}")


Best Parameters:
Learning Rate: 0.01
Lambda: 0.001
Final Cost: 1.1999
Train R2: 0.8693
Test R2: 0.8781


## Q2

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Step 1: Load dataset
df = pd.read_csv("Hitters.csv")

# Step 2: Handle missing values
# Simplest approach: drop every row that contains a null.
# (Alternative: impute the numeric columns with their mean instead.)
df = df.dropna()

# Step 3: Separate features and target
X = df.drop("Salary", axis=1)   # assuming 'Salary' is the target column
y = df["Salary"]

# Step 4: Identify categorical and numeric columns
# Select by "number" / "not number" rather than by exact dtype names, so a
# column is never silently left out of the ColumnTransformer because it was
# read as e.g. int32 or a dedicated string dtype.
numeric_cols = X.select_dtypes(include="number").columns
categorical_cols = X.select_dtypes(exclude="number").columns

# Step 5: Preprocessing (scale numeric columns, one-hot encode the rest)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(drop="first"), categorical_cols)
    ]
)

# Step 6: Train-test split (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Define models
# Lasso is fitted by coordinate descent; the default iteration budget was not
# enough here (it raised a convergence warning), so it is raised to 10000.
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=0.5748),
    "Lasso": Lasso(alpha=0.5748, max_iter=10000)
}

# Step 8: Train and evaluate each model inside a preprocessing pipeline, so
# the scaler/encoder are fitted on the training rows only.
results = {}
for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor),
                           ("regressor", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = {"R2": r2, "MSE": mse}

# Step 9: Display results
print("\nModel Performance on Test Set:")
for name, metrics in results.items():
    print(f"{name}: R2={metrics['R2']:.4f}, MSE={metrics['MSE']:.2f}")



Model Performance on Test Set:
Linear: R2=0.2907, MSE=128284.35
Ridge: R2=0.3007, MSE=126484.39
Lasso: R2=0.3006, MSE=126504.31


  model = cd_fast.enet_coordinate_descent(


## Q3

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import r2_score, mean_squared_error

# Step 1: Load Boston Housing dataset from CSV
df = pd.read_csv("Boston_Housing.csv")

# Step 2: Separate features and target
# Assuming the target column is named 'MEDV' (Median value of owner-occupied homes)
X = df.drop("MEDV", axis=1)
y = df["MEDV"]

# Step 3: Train-test split (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Scale features (statistics come from the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 5: Define models
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=0.5748),
    "Lasso": Lasso(alpha=0.5748, max_iter=10000)
}

# Step 6: Train and evaluate
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = {"R2": r2, "MSE": mse}

print("\nModel Performance on Test Set:")
for name, metrics in results.items():
    print(f"{name}: R2={metrics['R2']:.4f}, MSE={metrics['MSE']:.2f}")

# Step 7: RidgeCV (cross-validation)
# The per-sample cross-validation values were never read, so they are no
# longer stored. This also avoids the `store_cv_values` keyword, which newer
# scikit-learn releases deprecate in favour of `store_cv_results`.
ridge_alphas = np.logspace(-3, 3, 50)  # 50 candidates from 1e-3 to 1e3
ridge_cv = RidgeCV(alphas=ridge_alphas)
ridge_cv.fit(X_train, y_train)
y_pred_ridgecv = ridge_cv.predict(X_test)

print("\nRidgeCV Results:")
print(f"Best Alpha: {ridge_cv.alpha_}")
print(f"R2: {r2_score(y_test, y_pred_ridgecv):.4f}, MSE: {mean_squared_error(y_test, y_pred_ridgecv):.2f}")

# Step 8: LassoCV (5-fold cross-validation over the same alpha grid)
lasso_alphas = np.logspace(-3, 3, 50)
lasso_cv = LassoCV(alphas=lasso_alphas, cv=5, random_state=42, max_iter=10000)
lasso_cv.fit(X_train, y_train)
y_pred_lassocv = lasso_cv.predict(X_test)

print("\nLassoCV Results:")
print(f"Best Alpha: {lasso_cv.alpha_}")
print(f"R2: {r2_score(y_test, y_pred_lassocv):.4f}, MSE: {mean_squared_error(y_test, y_pred_lassocv):.2f}")



Model Performance on Test Set:
Linear: R2=0.6688, MSE=24.29
Ridge: R2=0.6686, MSE=24.30
Lasso: R2=0.6276, MSE=27.31

RidgeCV Results:
Best Alpha: 6.25055192527397
R2: 0.6669, MSE: 24.42

LassoCV Results:
Best Alpha: 0.001
R2: 0.6687, MSE: 24.29


## Q4

In [11]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler

# Step 1: Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names

# Step 2: Train-test split (20% held out, stratified by class, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Feature scaling (statistics come from the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 4: Logistic Regression with an explicit One-vs-Rest strategy
# OneVsRestClassifier fits one binary logistic regression per class and
# predicts the class whose classifier gives the highest score. It replaces
# LogisticRegression(multi_class="ovr"), whose `multi_class` parameter is
# deprecated in recent scikit-learn releases.
log_reg_ovr = OneVsRestClassifier(LogisticRegression(solver="lbfgs", max_iter=1000))
log_reg_ovr.fit(X_train, y_train)

# Step 5: Predictions
y_pred = log_reg_ovr.predict(X_test)

# Step 6: Evaluation
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy (OvR Logistic Regression): {acc:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))


Accuracy (OvR Logistic Regression): 0.9000

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.89      0.80      0.84        10
   virginica       0.82      0.90      0.86        10

    accuracy                           0.90        30
   macro avg       0.90      0.90      0.90        30
weighted avg       0.90      0.90      0.90        30

