In [None]:
#Q1

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("C:\\Users\\NEW\\Downloads\\USA_Housing (1).csv")
X = df.drop(columns=["Price"]).values
y = df["Price"].values.reshape(-1,1)

def add_bias(X):
    return np.hstack([np.ones((X.shape[0],1)), X])

def normal_beta(X,y):
    return np.linalg.inv(X.T@X)@X.T@y

def r2(y,y_hat):
    return 1 - np.sum((y-y_hat)**2)/np.sum((y-y.mean())**2)

kf = KFold(n_splits=5,shuffle=True,random_state=42)
scores, betas = [], []

for tr,te in kf.split(X):
    scaler = StandardScaler().fit(X[tr])
    Xtr,Xte = scaler.transform(X[tr]), scaler.transform(X[te])
    Xtr,Xte = add_bias(Xtr), add_bias(Xte)
    beta = normal_beta(Xtr,y[tr])
    y_hat = Xte@beta
    scores.append(r2(y[te],y_hat))
    betas.append(beta)

best = np.argmax(scores)
print("Fold R2:",scores)
print("Best Fold:",best+1,"R2:",scores[best])

Xtr,Xte,ytr,yte = train_test_split(X,y,test_size=0.3,random_state=7)
scaler = StandardScaler().fit(Xtr)
Xtr,Xte = scaler.transform(Xtr),scaler.transform(Xte)
Xtr,Xte = add_bias(Xtr),add_bias(Xte)
beta = normal_beta(Xtr,ytr)
print("Final 70/30 R2:",r2(yte,Xte@beta))


Fold R2: [0.9179971706985147, 0.9145677884802819, 0.9116116385364478, 0.9193091764960816, 0.9243869413350316]
Best Fold: 5 R2: 0.9243869413350316
Final 70/30 R2: 0.9149217287708293


In [None]:
#Q2

In [8]:
from sklearn.model_selection import train_test_split

Xtr_temp,Xtest,ytr_temp,ytest = train_test_split(X,y,test_size=0.3,random_state=1)
Xtrain,Xval,ytrain,yval = train_test_split(Xtr_temp,ytr_temp,test_size=0.2,random_state=1)

scaler = StandardScaler().fit(Xtrain)
Xtrain,Xval,Xtest = scaler.transform(Xtrain),scaler.transform(Xval),scaler.transform(Xtest)
Xtrain,Xval,Xtest = add_bias(Xtrain),add_bias(Xval),add_bias(Xtest)

def gd(X,y,lr,iters):
    beta = np.zeros((X.shape[1],1))
    for _ in range(iters):
        grad = -2/X.shape[0]*X.T@(y-X@beta)
        beta -= lr*grad
    return beta

rates = [0.001,0.01,0.1,1]
for lr in rates:
    beta = gd(Xtrain,ytrain,lr,1000)
    r2_val = r2(yval,Xval@beta)
    r2_test = r2(ytest,Xtest@beta)
    print(lr,"Val R2:",r2_val,"Test R2:",r2_test)


0.001 Val R2: 0.6752036751540692 Test R2: 0.6861452071726952
0.01 Val R2: 0.9219863357213828 Test R2: 0.9167072198129036
0.1 Val R2: 0.9219864090633332 Test R2: 0.9167072348076073
1 Val R2: -inf Test R2: -inf


  return 1 - np.sum((y-y_hat)**2)/np.sum((y-y.mean())**2)


In [None]:
#Q3

In [12]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

cols=["symboling","normalized_losses","make","fuel_type","aspiration","num_doors","body_style","drive_wheels","engine_location","wheel_base","length","width","height","curb_weight","engine_type","num_cylinders","engine_size","fuel_system","bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg","highway_mpg","price"]
df2 = pd.read_csv("C:\\Users\\NEW\\Downloads\\imports-85.data",names=cols)
df2.replace("?",np.nan,inplace=True)
df2["price"]=pd.to_numeric(df2["price"],errors="coerce")
df2.dropna(subset=["price"],inplace=True)
for c in df2.columns:
    if df2[c].dtype=="O" and c!="price":
        if c in ["num_doors","num_cylinders"]:
            df2[c]=df2[c].map({"two":2,"three":3,"four":4,"five":5,"six":6,"eight":8,"twelve":12}).fillna(df2[c])
        elif c in ["body_style","drive_wheels"]:
            dummies=pd.get_dummies(df2[c],prefix=c)
            df2=pd.concat([df2.drop(c,axis=1),dummies],axis=1)
        elif c in ["make","aspiration","engine_location","fuel_type"]:
            df2[c]=LabelEncoder().fit_transform(df2[c].astype(str))
        elif c=="fuel_system":
            df2[c]=df2[c].apply(lambda x:1 if "pfi" in str(x) else 0)
        elif c=="engine_type":
            df2[c]=df2[c].apply(lambda x:1 if "ohc" in str(x) else 0)
df2=df2.apply(pd.to_numeric,errors="coerce")
df2.fillna(df2.median(),inplace=True)

X=df2.drop(columns=["price"]).values
y=df2["price"].values
scaler=StandardScaler().fit(X)
X=scaler.transform(X)
Xtr,Xte,ytr,yte=train_test_split(X,y,test_size=0.3,random_state=42)

lr=LinearRegression().fit(Xtr,ytr)
print("Original R2:",lr.score(Xte,yte))

pca=PCA(n_components=10).fit(X)
X_pca=pca.transform(X)
Xtr,Xte,ytr,yte=train_test_split(X_pca,y,test_size=0.3,random_state=42)
lr=LinearRegression().fit(Xtr,ytr)
print("PCA R2:",lr.score(Xte,yte))


Original R2: 0.8734104772978125
PCA R2: 0.8310865902221483


  df2[c]=df2[c].map({"two":2,"three":3,"four":4,"five":5,"six":6,"eight":8,"twelve":12}).fillna(df2[c])
