In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Build a synthetic regression problem whose seven features are all noisy
# rescalings of one latent column, then split it and standardise it.
np.random.seed(42)
n_samples = 300

# Latent column shared by every feature.
z = np.random.randn(n_samples, 1)

# Multiplier applied to z for x1..x7, in that order. The list is built left to
# right, so the noise draws happen in the same sequence as seven separate
# assignments would produce.
latent_scales = (1.0, 1.1, 0.9, 2.0, -1.5, 0.5, -0.7)
x1, x2, x3, x4, x5, x6, x7 = [
    scale * z + 0.01 * np.random.randn(n_samples, 1) for scale in latent_scales
]
X = np.concatenate([x1, x2, x3, x4, x5, x6, x7], axis=1)

# Ground-truth weights as a column vector; the target adds N(0, 0.1^2) noise.
true_w = np.array([3, -2, 1.5, 0.7, -1, 2, 0.5]).reshape(-1, 1)
y = (X @ true_w).ravel() + 0.1 * np.random.randn(n_samples)

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling statistics come from the training rows only and are reused for test.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def ridge_gradient_descent(X, y, lr, lambd, n_iters=2000):
    """Fit a linear model with an L2 penalty by batch gradient descent.

    Parameters
    ----------
    X : 2-D array, one row per sample.
    y : 1-D array of targets, one per row of X.
    lr : step size applied to every update.
    lambd : L2 penalty strength; enters the weight gradient as
        ``(lambd / n_rows) * w``. The intercept is not penalised.
    n_iters : number of update steps to run.

    Returns
    -------
    (w, b) : weight vector and intercept after the last step, or
    ``(None, None)`` as soon as either of them stops being finite.
    """
    row_count, col_count = X.shape
    weights = np.zeros(col_count)
    bias = 0.0
    grad_cap = 1e6  # every gradient component is clipped to [-grad_cap, grad_cap]
    penalty_scale = lambd / row_count

    for _step in range(n_iters):
        residual = (X @ weights + bias) - y

        grad_w = (X.T @ residual) / row_count + penalty_scale * weights
        grad_b = np.sum(residual) / row_count

        grad_w = np.clip(grad_w, -grad_cap, grad_cap)
        grad_b = float(np.clip(grad_b, -grad_cap, grad_cap))

        weights -= lr * grad_w
        bias -= lr * grad_b

        # Abort on NaN/inf parameters instead of iterating on garbage.
        if not (np.all(np.isfinite(weights)) and np.isfinite(bias)):
            return None, None

    return weights, bias

def ridge_cost(X, y, w, b, lambd):
    """Return the penalised objective ``0.5 * MSE + lambd * ||w||^2 / (2 * n)``.

    ``n`` is the number of rows of X. The intercept ``b`` contributes to the
    predictions but not to the penalty.
    """
    row_count = X.shape[0]
    residual = X @ w + b - y
    half_mse = 0.5 * np.mean(np.square(residual))
    l2_penalty = lambd * np.sum(np.square(w)) / (2 * row_count)
    return half_mse + l2_penalty

# Exhaustive search over step size and L2 strength for ridge_gradient_descent.
learning_rates = [0.0001, 0.001, 0.01, 0.1, 1, 10]
lambdas = [1e-15, 1e-10, 1e-5, 1e-3, 0, 1, 10, 20]

# Running record of the winner; stays None until some run is accepted.
best_params = None
best_cost = float("inf")
best_r2 = -float("inf")
best_model = None

for lr in learning_rates:
    for lambd in lambdas:
        w, b = ridge_gradient_descent(X_train_scaled, y_train, lr, lambd, n_iters=2000)
        if w is None:
            # The optimiser reported non-finite parameters: nothing to score.
            continue
        cost = ridge_cost(X_train_scaled, y_train, w, b, lambd)
        if not np.isfinite(cost):
            continue
        y_pred_test = X_test_scaled @ w + b
        if not np.all(np.isfinite(y_pred_test)):
            continue
        r2 = r2_score(y_test, y_pred_test)
        print("lr:", lr, "lambda:", lambd, "cost:", cost, "R2:", r2)
        # Selection rule: lowest training cost wins; test R2 is consulted only
        # to break (near-)ties in cost.
        # NOTE(review): the cost contains the lambda-weighted penalty, so costs
        # for different lambda values are not measured on the same scale.
        if (cost < best_cost) or (np.isclose(cost, best_cost) and r2 > best_r2):
            best_cost = cost
            best_r2 = r2
            best_params = (lr, lambd)
            best_model = (w, b)

# Guard the summary: if every configuration was skipped above, best_params is
# still None and indexing it would raise a TypeError.
if best_params is None:
    print("No finite model found for any (learning rate, lambda) pair.")
else:
    print("Best learning rate:", best_params[0])
    print("Best lambda:", best_params[1])
    print("Minimum cost:", best_cost)
    # NOTE(review): this is the test R2 of the minimum-cost model, which is not
    # necessarily the largest R2 printed in the log above.
    print("Max R2 on test:", best_r2)


lr: 0.0001 lambda: 1e-15 cost: 1.0460451927680494 R2: 0.9394108801075687
lr: 0.0001 lambda: 1e-10 cost: 1.0460451927688204 R2: 0.9394108801075569
lr: 0.0001 lambda: 1e-05 cost: 1.0460452702055563 R2: 0.9394108789062035
lr: 0.0001 lambda: 0.001 cost: 1.0460529365159037 R2: 0.9394107599709982
lr: 0.0001 lambda: 0 cost: 1.0460451927680494 R2: 0.9394108801075687
lr: 0.0001 lambda: 1 cost: 1.053785764408221 R2: 0.9392907116627753
lr: 0.0001 lambda: 10 cost: 1.123165749575376 R2: 0.9382063456685797
lr: 0.0001 lambda: 20 cost: 1.1996567570939654 R2: 0.9369955683555288
lr: 0.001 lambda: 1e-15 cost: 0.006133709440266483 R2: 0.9995681821801614
lr: 0.001 lambda: 1e-10 cost: 0.006133709441273493 R2: 0.9995681821801617
lr: 0.001 lambda: 1e-05 cost: 0.006133810142349584 R2: 0.9995681822136135
lr: 0.001 lambda: 0.001 cost: 0.006143779642642159 R2: 0.9995681855250091
lr: 0.001 lambda: 0 cost: 0.006133709440266473 R2: 0.9995681821801614
lr: 0.001 lambda: 1 cost: 0.016197926526472887 R2: 0.9995711591617

Q2


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Read the Hitters table and drop every row that has a missing value.
df = pd.read_csv("/content/Hitters (1).csv").dropna()

# Salary is the target; all remaining columns are predictors. Non-numeric
# columns are one-hot encoded with the first level of each dropped.
y = df["Salary"].values
X = pd.get_dummies(df.drop(columns=["Salary"]), drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, test_size=0.2, random_state=42
)

# Standardise using statistics from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate estimators, keyed by the label used in the printed report.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1, max_iter=10000),
}

# label -> (test R2, test MSE)
results = {}
for name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
    results[name] = (r2, mse)

for name, (r2, mse) in results.items():
    print(name, "R2:", r2, "MSE:", mse)

# (label, (R2, MSE)) entry with the highest test R2.
best_model = max(results.items(), key=lambda entry: entry[1][0])
print("Best model by R2:", best_model[0])


LinearRegression R2: 0.29074518557981455 MSE: 128284.34549672344
Ridge R2: 0.2994019064058938 MSE: 126718.5869812465
Lasso R2: 0.29279784955454047 MSE: 127913.07603309958
Best model by R2: Ridge


Q3


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Metrics used by the report loop at the bottom of this cell.
from sklearn.metrics import r2_score, mean_squared_error

# Same preparation as the previous Hitters cell: load, drop incomplete rows,
# separate Salary, one-hot encode the remaining non-numeric columns.
df = pd.read_csv("/content/Hitters (1).csv").dropna()
y = df["Salary"].values
X = pd.get_dummies(df.drop(columns=["Salary"]), drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, test_size=0.2, random_state=42
)

# Scaling statistics are learned on the training rows and reused for test.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ridge and Lasso share the same penalty strength here.
lin_reg = LinearRegression()
ridge_reg = Ridge(alpha=0.5748)
lasso_reg = Lasso(alpha=0.5748, max_iter=10000)
for estimator in (lin_reg, ridge_reg, lasso_reg):
    estimator.fit(X_train_scaled, y_train)

# Report test-set R2 and MSE for each fitted estimator.
for name, model in (("Linear", lin_reg), ("Ridge", ridge_reg), ("Lasso", lasso_reg)):
    y_pred = model.predict(X_test_scaled)
    r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
    print(name, "R2:", r2, "MSE:", mse)


Linear R2: 0.29074518557981455 MSE: 128284.34549672344
Ridge R2: 0.3000359698829351 MSE: 126603.9026442468
Lasso R2: 0.2996256609856722 MSE: 126678.11604014723


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import r2_score

# Download the raw Boston text file: the first 22 lines are skipped and the
# rest is parsed as whitespace-separated columns with no header row.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Rows are consumed in pairs: each even row supplies all of its columns, the
# odd row after it supplies its first two columns as further features and its
# third column as the target.
raw_values = raw_df.values
data = np.hstack([raw_values[::2, :], raw_values[1::2, :2]])
target = raw_values[1::2, 2]

X_boston, y_boston = data, target

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_boston, y_boston, test_size=0.2, random_state=42
)

# Standardise with training-set statistics only.
scaler_b = StandardScaler().fit(X_train_b)
X_train_b_scaled = scaler_b.transform(X_train_b)
X_test_b_scaled = scaler_b.transform(X_test_b)

# Penalty strengths offered to both cross-validated estimators.
alphas = [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 10]

# Ridge: 5-fold CV scored by R2, then evaluated on the held-out split.
ridge_cv = RidgeCV(alphas=alphas, scoring="r2", cv=5).fit(X_train_b_scaled, y_train_b)
y_pred_ridge_cv = ridge_cv.predict(X_test_b_scaled)
r2_ridge_cv = r2_score(y_test_b, y_pred_ridge_cv)

# Lasso: 5-fold CV over the same grid, then evaluated on the held-out split.
lasso_cv = LassoCV(alphas=alphas, cv=5, max_iter=10000).fit(X_train_b_scaled, y_train_b)
y_pred_lasso_cv = lasso_cv.predict(X_test_b_scaled)
r2_lasso_cv = r2_score(y_test_b, y_pred_lasso_cv)

for label, value in (
    ("RidgeCV best alpha:", ridge_cv.alpha_),
    ("RidgeCV test R2:", r2_ridge_cv),
    ("LassoCV best alpha:", lasso_cv.alpha_),
    ("LassoCV test R2:", r2_lasso_cv),
):
    print(label, value)


RidgeCV best alpha: 1.0
RidgeCV test R2: 0.668462435964356
LassoCV best alpha: 0.0001
LassoCV test R2: 0.6687548978932025


Q4


In [10]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load iris and hold out 20% of the rows, stratified by class, with a fixed seed.
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    The naive formula computes ``exp(-z)``, which overflows to ``inf`` (and
    emits a RuntimeWarning) for large negative ``z``. Here the exponential is
    only ever taken of a non-positive number, so it stays within (0, 1]:

    * ``z >= 0``: ``1 / (1 + exp(-z))``
    * ``z <  0``: ``exp(z) / (1 + exp(z))``

    Parameters
    ----------
    z : scalar or array-like of real numbers.

    Returns
    -------
    NumPy scalar for scalar input, otherwise an array with the shape of ``z``.
    """
    z = np.asarray(z)
    if z.dtype.kind in "biu":
        # Integer/bool input: promote so the arithmetic below is floating point.
        z = z.astype(float)
    decay = np.exp(-np.abs(z))  # exp of a non-positive value: cannot overflow
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Unwrap 0-d results so scalar callers still receive a scalar.
    return out if out.ndim else out[()]

def train_binary_logistic(X, y, lr=0.1, n_iters=3000, lambd=0.0):
    """Train a binary logistic-regression model by batch gradient descent.

    Parameters
    ----------
    X : 2-D array, one row per sample.
    y : 1-D array of 0/1 labels, one per row of X.
    lr : step size.
    n_iters : number of update steps.
    lambd : L2 strength; enters the weight gradient as ``(lambd / n_rows) * w``.
        The intercept is not penalised.

    Returns
    -------
    (w, b) : weight vector and intercept after ``n_iters`` steps, both
    starting from zero.
    """
    row_count, col_count = X.shape
    weights = np.zeros(col_count)
    bias = 0.0
    penalty_scale = lambd / row_count

    for _step in range(n_iters):
        # Predicted probability minus label.
        residual = sigmoid(X @ weights + bias) - y

        grad_w = (X.T @ residual) / row_count + penalty_scale * weights
        grad_b = np.sum(residual) / row_count

        weights -= lr * grad_w
        bias -= lr * grad_b

    return weights, bias

# One-vs-rest training: one binary logistic model per distinct training label.
classes = np.unique(y_train)
n_classes, n_features = len(classes), X_train_scaled.shape[1]

# Row i of W and entry i of b_vec hold the parameters of the model for classes[i].
W = np.zeros((n_classes, n_features))
b_vec = np.zeros(n_classes)

for idx, c in enumerate(classes):
    # 1 where the sample belongs to class c, 0 otherwise.
    y_binary = np.equal(y_train, c).astype(int)
    w_c, b_c = train_binary_logistic(
        X_train_scaled, y_binary, lr=0.1, n_iters=5000, lambd=0.01
    )
    W[idx], b_vec[idx] = w_c, b_c

def predict_ovr(X, W, b_vec):
    """Pick, for each row of X, the one-vs-rest model with the largest score.

    Parameters
    ----------
    X : 2-D array, one row per sample.
    W : 2-D array, one row of weights per binary model.
    b_vec : 1-D array, one intercept per binary model.

    Returns
    -------
    1-D integer array holding, per sample, the row index into ``W`` of the
    winning model (an index, not a label value).

    The argmax is taken over the raw logits. The logistic function is
    monotonic, so this is the same ranking as comparing probabilities, but it
    avoids the floating-point saturation where several large logits all map to
    exactly 1.0 and the argmax then falls back to the first of them.
    """
    logits = X @ W.T + b_vec
    return np.argmax(logits, axis=1)

# Score the one-vs-rest models on the held-out split.
y_pred_test = predict_ovr(X=X_test_scaled, W=W, b_vec=b_vec)
acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Test accuracy (OvR logistic):", acc)


Test accuracy (OvR logistic): 0.9
