In [None]:
#q1
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.utils import shuffle

# Read the CSV straight from its Google Drive download link.
url = "https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX"
data = pd.read_csv(url)

# 'Price' is the regression target; every other column is used as a feature.
y = data['Price'].values
X = data.drop(columns=['Price']).values

# Standardise each feature column, then shuffle rows and targets together
# with a fixed seed so the later positional splits are reproducible.
# NOTE(review): the scaler is fitted on every row before any train/test split
# is made, so held-out rows contribute to the mean/std used for scaling.
scaler = StandardScaler()
X_scaled, y = shuffle(scaler.fit(X).transform(X), y, random_state=42)

def compute_beta(X_train, y_train):
    """Fit ordinary least squares with an intercept and return the coefficients.

    A column of ones is prepended to ``X_train`` so that ``beta[0]`` is the
    intercept and ``beta[1:]`` are the per-feature coefficients.

    Parameters
    ----------
    X_train : ndarray of shape (n_samples, n_features)
        Feature matrix without an intercept column.
    y_train : ndarray of shape (n_samples,) or (n_samples, n_targets)
        Target values.

    Returns
    -------
    ndarray of shape (n_features + 1,) or (n_features + 1, n_targets)
        Least-squares coefficients. When the design matrix is rank deficient
        (collinear or constant features) the minimum-norm solution is returned.
    """
    ones = np.ones((X_train.shape[0], 1))
    design = np.hstack((ones, X_train))
    # Solve the least-squares problem directly instead of forming and
    # inverting X^T X: the explicit inverse fails (or returns garbage) on
    # singular / ill-conditioned designs, while lstsq handles them via SVD.
    beta, _residuals, _rank, _singular_values = np.linalg.lstsq(design, y_train, rcond=None)
    return beta

def predict(X, beta):
    """Return linear-model predictions for ``X`` given coefficients ``beta``.

    A leading column of ones is attached to ``X`` so that ``beta[0]`` acts as
    the intercept term; the result is the matrix product of that augmented
    matrix with ``beta``.
    """
    n_rows = X.shape[0]
    design = np.concatenate((np.ones((n_rows, 1)), X), axis=1)
    return design @ beta

# --- 5-fold cross-validation of the closed-form linear regression -----------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores, betas = [], []

for train_index, test_index in kf.split(X_scaled):
    # Fit on the training part of the fold, score on the held-out part.
    fold_beta = compute_beta(X_scaled[train_index], y[train_index])
    fold_pred = predict(X_scaled[test_index], fold_beta)
    r2_scores.append(r2_score(y[test_index], fold_pred))
    # Per-fold coefficients are kept for inspection; nothing below reads them.
    betas.append(fold_beta)

for fold_no, fold_r2 in enumerate(r2_scores, start=1):
    print(f"Fold {fold_no}: R² = {fold_r2:.4f}")

best_index = np.argmax(r2_scores)
print(f"Best R² from CV: {r2_scores[best_index]:.4f}")

# --- Separate 70/30 hold-out evaluation --------------------------------------
# The rows were already shuffled, so a positional cut gives a random split.
split = int(0.7 * len(X_scaled))
head, tail = slice(None, split), slice(split, None)
X_train, y_train = X_scaled[head], y[head]
X_test, y_test = X_scaled[tail], y[tail]

beta_final = compute_beta(X_train, y_train)
y_pred_test = predict(X_test, beta_final)
final_r2 = r2_score(y_test, y_pred_test)
print(f"Final R² on 30% test set: {final_r2:.4f}")


Fold 1: R² = 0.9186
Fold 2: R² = 0.9185
Fold 3: R² = 0.9164
Fold 4: R² = 0.9178
Fold 5: R² = 0.9176
Best R² from CV: 0.9186
Final R² on 30% test set: 0.9217


In [None]:
#q2
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.utils import shuffle

# Read the CSV straight from its Google Drive download link.
url = "https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX"
data = pd.read_csv(url)

# 'Price' is the target; all remaining columns are features.
y = data['Price'].values
X = data.drop(columns=['Price']).values

# Keep only rows that have no NaN in any feature and no NaN in the target.
mask = ~(np.isnan(X).any(axis=1) | np.isnan(y))
X, y = X[mask], y[mask]

# Shuffle once with a fixed seed so the positional split below is random
# but reproducible.
X, y = shuffle(X, y, random_state=42)

# 56 % train / 14 % validation / remaining rows (about 30 %) test.
n = X.shape[0]
n_train = int(0.56 * n)
n_val = int(0.14 * n)
n_test = n - n_train - n_val

val_end = n_train + n_val
X_train, X_val, X_test = X[:n_train], X[n_train:val_end], X[val_end:]
y_train, y_val, y_test = y[:n_train], y[n_train:val_end], y[val_end:]

# Scaling statistics come from the training rows only and are then applied
# unchanged to the validation and test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

def add_intercept(X):
    """Return a copy of 2-D array ``X`` with a column of ones prepended."""
    bias_column = np.ones((X.shape[0], 1))
    return np.concatenate((bias_column, X), axis=1)

# Prepend the bias column to each of the three scaled splits.
X_train_b, X_val_b, X_test_b = map(
    add_intercept, (X_train_scaled, X_val_scaled, X_test_scaled)
)

def gradient_descent(X, y, lr=0.01, iterations=1000, tol=None):
    """Fit linear-regression coefficients by batch gradient descent on the MSE.

    Starting from ``beta = 0``, each step moves ``beta`` against the gradient
    ``(2 / m) * X.T @ (X @ beta - y)`` of the mean squared error.

    Parameters
    ----------
    X : array-like of shape (m, d)
        Design matrix. No intercept column is added here; include one in
        ``X`` if an intercept is wanted.
    y : array-like of shape (m,) or (m, 1)
        Targets. A column vector is flattened so it cannot broadcast against
        the predictions into an (m, m) error matrix.
    lr : float, default 0.01
        Step size.
    iterations : int, default 1000
        Maximum number of update steps.
    tol : float or None, default None
        Optional early stop: when given, iteration ends as soon as every
        gradient component has absolute value <= ``tol``. ``None`` always
        runs the full number of iterations.

    Returns
    -------
    ndarray of shape (d,)
        Fitted coefficients. If the iteration diverges (the next iterate or
        its training sum of squared errors is no longer finite) a
        ``RuntimeWarning`` is issued and the last iterate with a finite
        training error is returned instead of inf/NaN values.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, has no rows, or ``y`` does not have one value
        per row of ``X``.
    """
    import warnings

    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D, got array with shape {X.shape}")
    m, d = X.shape
    if m == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != m:
        raise ValueError(f"y has {y.shape[0]} values but X has {m} rows")

    beta = np.zeros(d)
    error = X @ beta - y  # residual of the current beta, carried between steps
    # Overflow is detected explicitly below, so silence numpy's own warnings.
    with np.errstate(over="ignore", invalid="ignore"):
        for it in range(iterations):
            grad = (2.0 / m) * (X.T @ error)
            candidate = beta - lr * grad
            candidate_error = X @ candidate - y
            if not (np.all(np.isfinite(candidate))
                    and np.isfinite(candidate_error @ candidate_error)):
                warnings.warn(
                    f"gradient descent diverged at iteration {it} with lr={lr}; "
                    "returning the last finite iterate",
                    RuntimeWarning,
                    stacklevel=2,
                )
                break
            beta, error = candidate, candidate_error
            if tol is not None and np.all(np.abs(grad) <= tol):
                break
    return beta

# --- Learning-rate sweep ------------------------------------------------------
# Train once per candidate step size and record validation and test R².
learning_rates = [0.001, 0.01, 0.1, 1]
results = {}

for lr in learning_rates:
    beta = gradient_descent(X_train_b, y_train, lr=lr, iterations=1000)
    y_val_pred, y_test_pred = X_val_b @ beta, X_test_b @ beta
    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)
    results[lr] = dict(beta=beta, r2_val=r2_val, r2_test=r2_test)
    print(f"lr={lr:<6} -> R2_val={r2_val:.4f}, R2_test={r2_test:.4f}")

# The winner is chosen on validation R² only; its test R² is reported afterwards.
best_lr = max(results, key=lambda rate: results[rate]['r2_val'])
best_run = results[best_lr]
best_beta = best_run['beta']
best_val_r2 = best_run['r2_val']
best_test_r2 = best_run['r2_test']

print("\nBest learning rate (by validation R²):", best_lr)
print(f"Validation R² = {best_val_r2:.4f}")
print(f"Test R² = {best_test_r2:.4f}")
print("\nBest beta (first 10 values shown or fewer if small):")
print(best_beta[:10])

# Show up to five test rows next to the chosen model's predictions.
y_test_pred_best = X_test_b @ best_beta
for actual, predicted in zip(y_test[:5], y_test_pred_best[:5]):
    print(f"actual: {actual:.2f}, predicted: {predicted:.2f}, error: {(actual-predicted):.2f}")


lr=0.001  -> R2_val=0.6939, R2_test=0.6950
lr=0.01   -> R2_val=0.9217, R2_test=0.9216
lr=0.1    -> R2_val=0.9217, R2_test=0.9216
lr=1      -> R2_val=-inf, R2_test=-inf

Best learning rate (by validation R²): 0.01
Validation R² = 0.9217
Test R² = 0.9216

Best beta (first 10 values shown or fewer if small):
[1235840.71805373  226407.50681546  163530.556612    122976.14138357
    2093.70050298  148346.06464852]
actual: 1285098.69, predicted: 1379606.86, error: -94508.17
actual: 777718.90, predicted: 999953.60, error: -222234.70
actual: 995144.11, predicted: 1094454.23, error: -99310.12
actual: 1228532.30, predicted: 1198043.23, error: 30489.07
actual: 1127248.61, predicted: 1268899.60, error: -141650.99


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


In [None]:
#q3
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

# --- Load the UCI "imports-85" automobile data ---------------------------------
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
columns = ["symboling","normalized_losses","make","fuel_type","aspiration","num_doors",
           "body_style","drive_wheels","engine_location","wheel_base","length","width",
           "height","curb_weight","engine_type","num_cylinders","engine_size","fuel_system",
           "bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg","highway_mpg","price"]

# Column names are supplied here, and '?' cells are read as NaN.
data = pd.read_csv(url, names=columns, na_values='?')

# Rows without a price cannot be used as regression targets.
data = data.dropna(subset=['price'])

# Force the measurement columns to numeric; anything unparseable becomes NaN.
for col in ['symboling','normalized_losses','wheel_base','length','width','height','curb_weight',
            'engine_size','bore','stroke','compression_ratio','horsepower','peak_rpm',
            'city_mpg','highway_mpg','price']:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Spelled-out counts -> numbers. `.map` yields a numeric column directly
# (missing/unknown words become NaN), so this does not depend on the
# deprecated silent downcasting that `.replace` relied on.
data['num_doors'] = data['num_doors'].map({'two': 2, 'four': 4})
data['num_cylinders'] = data['num_cylinders'].map({
    'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12})

# Median-impute every numeric feature (never the target). Assigning the result
# back avoids `data[col].fillna(..., inplace=True)`, which operates on a
# temporary and stops working in pandas 3.0.
numeric_features = data.select_dtypes(include='number').columns.drop('price')
data[numeric_features] = data[numeric_features].fillna(data[numeric_features].median())

# Integer-code these categorical columns.
label_cols = ['make','aspiration','engine_location','fuel_type']
for col in label_cols:
    data[col] = LabelEncoder().fit_transform(data[col])

# One-hot encode the remaining multi-level categoricals (first level dropped).
data = pd.get_dummies(data, columns=['body_style','drive_wheels'], drop_first=True)

# Binary flags: 1 when the code contains the given substring, else 0.
data['fuel_system'] = data['fuel_system'].astype(str).str.contains('pfi', regex=False).astype(int)
data['engine_type'] = data['engine_type'].astype(str).str.contains('ohc', regex=False).astype(int)

# Explicit float arrays: the frame mixes int, float and bool columns, which
# `.values` may hand back as an object array.
X = data.drop(columns=['price']).to_numpy(dtype=float)
y = data['price'].to_numpy(dtype=float)

# Split FIRST, then fit every preprocessing step on the training rows only.
# Fitting the scaler / PCA on all rows (as before) lets test-set statistics
# leak into the features the model is trained on and inflates the test score.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)   # statistics from training rows
X_test = scaler.transform(X_test_raw)         # same statistics reused

# All rows scaled with the training statistics; kept so the name stays defined.
X_scaled = scaler.transform(X)

# --- Baseline: linear regression on all standardised features -----------------
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"R² before PCA: {r2_score(y_test, y_pred):.4f}")

# --- Linear regression on the first 10 principal components -------------------
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)      # components learned from training rows
X_test_pca = pca.transform(X_test)
X_pca = pca.transform(X_scaled)               # all rows projected; name kept defined
# Same rows as the baseline split, so the two scores are directly comparable.
y_train_pca, y_test_pca = y_train, y_test

model_pca = LinearRegression()
model_pca.fit(X_train_pca, y_train_pca)
y_pred_pca = model_pca.predict(X_test_pca)
print(f"R² after PCA: {r2_score(y_test_pca, y_pred_pca):.4f}")


R² before PCA: 0.8734
R² after PCA: 0.8437


  data['num_doors'] = data['num_doors'].replace({'two':2,'four':4})
  data['num_cylinders'] = data['num_cylinders'].replace({
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
