In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split

# Ordinary least squares on the housing data, solved in closed form.
# Workflow: hold out a test set -> scale -> 5-fold CV on the training part
# -> keep the coefficients of the best fold -> score them once on the test set.

# NOTE(review): absolute, machine-specific path; the file must exist at this
# exact location for the cell to run.
df = pd.read_csv('C:/Users/TUF Gaming/Downloads/archive/USA_Housing.csv')
X = df.drop(['price'], axis=1).values
y = df['price'].values


def add_intercept(features):
    """Return `features` with a leading column of ones (the bias term).

    Standardized features have zero mean, so a model without this column is
    forced through the origin and cannot reproduce the mean of the target.
    """
    return np.hstack([np.ones((features.shape[0], 1)), features])


# Hold the test rows out BEFORE any fitting so that neither the scaler nor the
# cross-validated coefficients ever see them.
X_train_raw, X_test_raw, y_train_final, y_test_final = train_test_split(
    X, y, test_size=0.3, random_state=42
)

scaler = StandardScaler()
X_train_final = scaler.fit_transform(X_train_raw)  # statistics from training rows only
X_test_final = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)  # kept so the name stays available after this cell

kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_r2 = -np.inf
best_beta = None

for train_index, test_index in kf.split(X_train_final):
    X_train, X_test = X_train_final[train_index], X_train_final[test_index]
    y_train, y_test = y_train_final[train_index], y_train_final[test_index]

    # Least-squares solve instead of inverting X^T X explicitly: same solution
    # when X^T X is invertible, but stable (and still defined) when it is not.
    beta, *_ = np.linalg.lstsq(add_intercept(X_train), y_train, rcond=None)
    y_pred = add_intercept(X_test) @ beta
    r2 = r2_score(y_test, y_pred)

    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print("Best R-squared:", best_r2)
# best_beta[0] is the intercept; the remaining entries follow the column order of X.
print("Best beta:", best_beta)

# Final score on rows that took no part in scaling or coefficient selection.
y_pred_final = add_intercept(X_test_final) @ best_beta
r2_final = r2_score(y_test_final, y_pred_final)
print("Final R-squared:", r2_final)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Load the housing data and build train / validation / test sets.
# NOTE(review): absolute, machine-specific path; the file must exist at this
# exact location for the cell to run.
df = pd.read_csv('C:/Users/Acer/Downloads/archive/USA_Housing.csv')
X = df.drop(['price'], axis=1).values
y = df['price'].values

# Split the RAW features first (70% temp / 30% test, then 80% / 20% of temp for
# train / validation). The row assignment depends only on the number of rows and
# random_state, so it is the same partition as splitting pre-scaled data.
X_temp_raw, X_test_raw, y_temp, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X_temp_raw, y_temp, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only; fitting on all rows would leak the
# validation/test means and variances into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

# Kept so these names stay available to any later code that expects them.
X_temp = scaler.transform(X_temp_raw)
X_scaled = scaler.transform(X)

def gradient_descent(X, y, lr, n_iter):
    """Fit linear-regression coefficients with batch gradient descent.

    A column of ones is prepended to ``X`` so the first returned coefficient
    acts as the intercept. Coefficients start at zero and are updated
    ``n_iter`` times using the gradient ``X_b.T @ (X_b @ beta - y) / m``.

    Parameters
    ----------
    X : 2-D array; ``X.shape`` is unpacked as (samples, features).
    y : array of targets, subtracted element-wise from the predictions.
    lr : step size applied to every update.
    n_iter : number of update steps (0 returns the all-zero start vector).

    Returns
    -------
    1-D array of length ``features + 1``: intercept first, then one weight
    per column of ``X``.
    """
    n_samples, n_features = X.shape
    bias_column = np.ones((n_samples, 1))
    design = np.concatenate([bias_column, X], axis=1)

    weights = np.zeros(n_features + 1)
    step = 0
    while step < n_iter:
        residual = design @ weights - y
        gradient = design.T @ residual / n_samples
        weights = weights - lr * gradient
        step += 1
    return weights

# Learning-rate sweep: train with each rate, pick the one with the best
# validation R2, then report that model's R2 on the test set.
lrs = [0.001, 0.01, 0.1, 1]
best_r2_val = -np.inf
best_beta = None
best_lr = None

# The bias-augmented evaluation matrices do not depend on the learning rate,
# so build them once (X_test_b is also needed after the loop).
X_val_b = np.hstack([np.ones((X_val.shape[0], 1)), X_val])
X_test_b = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

for lr in lrs:
    beta = gradient_descent(X_train, y_train, lr, 1000)

    # A step size that is too large makes the iterates blow up to inf/NaN;
    # r2_score rejects non-finite predictions, which would abort the sweep.
    if not np.all(np.isfinite(beta)):
        print(f"Learning rate: {lr}")
        print("Diverged (non-finite coefficients); skipping.\n")
        continue

    y_pred_val = X_val_b @ beta
    y_pred_test = X_test_b @ beta
    r2_val = r2_score(y_val, y_pred_val)
    # Test R2 is printed for information only; selection uses validation R2.
    r2_test = r2_score(y_test, y_pred_test)

    print(f"Learning rate: {lr}")
    print(f"Validation R2: {r2_val:.4f}")
    print(f"Test R2: {r2_test:.4f}")
    print(f"Beta: {beta}\n")

    if r2_val > best_r2_val:
        best_r2_val = r2_val
        best_beta = beta
        best_lr = lr

if best_beta is None:
    raise RuntimeError("No learning rate produced a usable model; try smaller values.")

print("Best validation R2:", best_r2_val)
print("Best learning rate:", best_lr)
print("Best beta:", best_beta)

y_pred_best_test = X_test_b @ best_beta
best_test_r2 = r2_score(y_test, y_pred_best_test)
print("Best Test R2 with best beta:", best_test_r2)

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

# Load the automobile data (the file has no header row) and turn every feature
# into a numeric column. '?' marks a missing value in the raw file.
column = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration", "num_doors", "body_style",
    "drive_wheels", "engine_location", "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",
    "compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"
]
df = pd.read_csv('imports-85.data', names=column, na_values='?')

# Drop rows with an unknown target BEFORE imputing. Imputing first would fill
# the missing prices with the median, turning this drop into a no-op and
# training on fabricated labels.
df = df.dropna(subset=['price'])
df['price'] = pd.to_numeric(df['price'])

# Impute the features only: median for numeric columns, most frequent value
# for everything else. Assign the result back instead of a chained
# fillna(inplace=True), which may act on a temporary copy.
for col in df.columns.drop('price'):
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].median())
    else:
        df[col] = df[col].fillna(df[col].mode()[0])

# Spelled-out counts -> integers.
doors = {'two':2, 'four':4}
cyl = {'two':2, 'three':3, 'four':4, 'five':5, 'six':6, 'eight':8, 'twelve':12}
df['num_doors'] = df['num_doors'].map(doors).astype(int)
df['num_cylinders'] = df['num_cylinders'].map(cyl).astype(int)
df = pd.get_dummies(df, columns=['body_style', 'drive_wheels'])

# Integer codes for the remaining categorical columns.
for col in ['make', 'aspiration', 'engine_location', 'fuel_type']:
    df[col] = LabelEncoder().fit_transform(df[col])

# Binary flags based on a substring match: 1 when the label contains
# 'pfi' / 'ohc', otherwise 0.
df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in x else 0)
df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in x else 0)

# Safety net: anything still non-numeric is coerced (unparseable values -> NaN).
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Compare plain linear regression with linear regression on PCA components
# that retain 95% of the variance, using the same 70/30 split for both.
X = df.drop(['price'], axis=1).values
y = df['price'].values

# Split first, then fit the scaler (and later PCA) on the training rows only,
# so no test-set statistics leak into the fitted transforms.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)  # kept so the name stays available after this cell

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
r2_initial = r2_score(y_test, y_pred)
print("R2 score without PCA:", r2_initial)

# A float n_components keeps as many components as needed to explain that
# fraction of the (training) variance.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
X_pca = pca.transform(X_scaled)  # kept so the name stays available after this cell
# Same rows as the non-PCA split, so the two scores are directly comparable.
y_train_pca, y_test_pca = y_train, y_test

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train_pca)
y_predict_pca = lr_pca.predict(X_test_pca)
r2_pca = r2_score(y_test_pca, y_predict_pca)
print("R2 score with PCA:", r2_pca)