In [9]:
#q1
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split


url = "https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX"
df = pd.read_csv(url)

print("Shape of dataset:", df.shape)
print("First 5 rows:\n", df.head())

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []
betas = []

for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]


    X_train_b = np.c_[np.ones(X_train.shape[0]), X_train]
    X_test_b = np.c_[np.ones(X_test.shape[0]), X_test]

    beta = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train
    betas.append(beta)


    y_pred = X_test_b @ beta
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

print("\nR² Scores for each fold:", r2_scores)
best_beta = betas[np.argmax(r2_scores)]
print("\nBest β:", best_beta)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train_b = np.c_[np.ones(X_train.shape[0]), X_train]
X_test_b = np.c_[np.ones(X_test.shape[0]), X_test]

y_pred_final = X_test_b @ best_beta
print("\nFinal R² on 30% test set:", r2_score(y_test, y_pred_final))

Shape of dataset: (5000, 6)
First 5 rows:
    Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0       79545.45857             5.682861                   7.009188   
1       79248.64245             6.002900                   6.730821   
2       61287.06718             5.865890                   8.512727   
3       63345.24005             7.188236                   5.586729   
4       59982.19723             5.040555                   7.839388   

   Avg. Area Number of Bedrooms  Area Population         Price  
0                          4.09      23086.80050  1.059034e+06  
1                          3.09      40173.07217  1.505891e+06  
2                          5.13      36882.15940  1.058988e+06  
3                          3.26      34310.24283  1.260617e+06  
4                          4.23      26354.10947  6.309435e+05  

R² Scores for each fold: [0.9179971706985147, 0.9145677884802818, 0.9116116385364478, 0.9193091764960816, 0.9243869413350316]

Best β: [1.2

In [10]:
#q2
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


url = "https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX"
df = pd.read_csv(url)


X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.44, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=(30/44), random_state=42)


def add_bias(X):
    return np.c_[np.ones(X.shape[0]), X]

X_train_b, X_val_b, X_test_b = add_bias(X_train), add_bias(X_val), add_bias(X_test)


def gradient_descent(X, y, lr, iterations=1000):
    beta = np.zeros(X.shape[1])
    m = len(y)
    for _ in range(iterations):
        gradient = -(2/m) * X.T @ (y - X @ beta)
        beta -= lr * gradient
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
results = {}

for lr in learning_rates:
    beta = gradient_descent(X_train_b, y_train, lr, iterations=1000)
    val_r2 = r2_score(y_val, X_val_b @ beta)
    test_r2 = r2_score(y_test, X_test_b @ beta)
    results[lr] = (beta, val_r2, test_r2)


print("Learning Rate | Validation R² | Test R²")
print("----------------------------------------")
for lr, (beta, val_r2, test_r2) in results.items():
    print(f"{lr:<13} {val_r2:.4f}        {test_r2:.4f}")

best_lr = max(results, key=lambda x: results[x][1])
best_beta, best_val_r2, best_test_r2 = results[best_lr]

print("\nBest Learning Rate:", best_lr)
print("Best β coefficients:", best_beta)
print(f"Validation R²: {best_val_r2:.4f}, Test R²: {best_test_r2:.4f}")

Learning Rate | Validation R² | Test R²
----------------------------------------
0.001         0.6451        0.6537
0.01          0.9200        0.9134
0.1           0.9200        0.9134
1             -inf        -inf

Best Learning Rate: 0.1
Best β coefficients: [1232180.27200919  230645.88389435  165328.94019375  120045.00851908
    2945.02108903  151375.22971285]
Validation R²: 0.9200, Test R²: 0.9134


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


In [11]:
#q3
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


url_car = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration","num_doors",
           "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width",
           "height", "curb_weight", "engine_type", "num_cylinders", "engine_size",
           "fuel_system", "bore", "stroke", "compression_ratio", "horsepower",
           "peak_rpm", "city_mpg", "highway_mpg", "price"]

df_car = pd.read_csv(url_car, names=columns)

print("Dataset shape before cleaning:", df_car.shape)


df_car.replace("?", np.nan, inplace=True)

df_car.dropna(subset=["price"], inplace=True)
df_car["price"] = df_car["price"].astype(float)

df_car["num_doors"] = df_car["num_doors"].replace({"two":2, "four":4})
df_car["num_cylinders"] = df_car["num_cylinders"].replace({
    "two":2, "three":3,"four":4,"five":5,
    "six":6,"eight":8,"twelve":12
})


df_car = pd.get_dummies(df_car, columns=["body_style","drive_wheels"], drop_first=True)


for col in ["make","aspiration","engine_location","fuel_type"]:
    df_car[col] = LabelEncoder().fit_transform(df_car[col].astype(str))

df_car["fuel_system"] = df_car["fuel_system"].apply(lambda x: 1 if isinstance(x,str) and "pfi" in x else 0)
df_car["engine_type"] = df_car["engine_type"].apply(lambda x: 1 if isinstance(x,str) and "ohc" in x else 0)


imputer = SimpleImputer(strategy="mean")
df_car[df_car.columns] = imputer.fit_transform(df_car)

print("Dataset shape after preprocessing:", df_car.shape)
print("Any NaNs left?", df_car.isna().sum().sum())


X_car = df_car.drop("price", axis=1).values
y_car = df_car["price"].values


scaler = StandardScaler()
X_car_scaled = scaler.fit_transform(X_car)


X_train, X_test, y_train, y_test = train_test_split(X_car_scaled, y_car, test_size=0.3, random_state=42)


lr = LinearRegression()
lr.fit(X_train, y_train)
r2_no_pca = r2_score(y_test, lr.predict(X_test))
print("\nTest R² without PCA:", r2_no_pca)


pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train)

r2_with_pca = r2_score(y_test, lr_pca.predict(X_test_pca))
print("Test R² with PCA:", r2_with_pca)


Dataset shape before cleaning: (205, 26)
Dataset shape after preprocessing: (201, 30)
Any NaNs left? 0

Test R² without PCA: 0.8730011884362395
Test R² with PCA: 0.7906490594744899


  df_car["num_doors"] = df_car["num_doors"].replace({"two":2, "four":4})
  df_car["num_cylinders"] = df_car["num_cylinders"].replace({
