In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)



Q1 – K-Fold Cross Validation

In [2]:
df = pd.read_csv('USA_Housing.csv')
print("House Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())


House Dataset Shape: (5000, 6)
Columns: ['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms', 'Avg. Area Number of Bedrooms', 'Area Population', 'Price']


In [3]:
X = df.drop('Price', axis=1)
y = df['Price']

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


In [4]:
n = len(X_scaled)
k = 5
fold_size = n // k
best_beta = None
best_r2 = -np.inf
results = []

for i in range(k):
    start_idx = i * fold_size
    end_idx = (i + 1) * fold_size if i < k - 1 else n

    test_indices = list(range(start_idx, end_idx))
    train_indices = list(range(0, start_idx)) + list(range(end_idx, n))

    X_train = X_scaled.iloc[train_indices]
    X_test = X_scaled.iloc[test_indices]
    y_train = y.iloc[train_indices]
    y_test = y.iloc[test_indices]

    X_train_with_intercept = np.column_stack([np.ones(len(X_train)), X_train])
    X_test_with_intercept = np.column_stack([np.ones(len(X_test)), X_test])

    beta = np.linalg.inv(X_train_with_intercept.T @ X_train_with_intercept) @ X_train_with_intercept.T @ y_train
    y_pred = X_test_with_intercept @ beta
    r2 = r2_score(y_test, y_pred)

    results.append({'fold': i + 1, 'beta': beta, 'r2_score': r2})
    print(f"Fold {i+1}: R² = {r2:.4f}")

    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print(f"\nBest R² Score: {best_r2:.4f}")


Fold 1: R² = 0.9176
Fold 2: R² = 0.9203
Fold 3: R² = 0.9152
Fold 4: R² = 0.9209
Fold 5: R² = 0.9138

Best R² Score: 0.9209


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

X_train_with_intercept = np.column_stack([np.ones(len(X_train)), X_train])
X_test_with_intercept = np.column_stack([np.ones(len(X_test)), X_test])

y_pred_test = X_test_with_intercept @ best_beta
final_r2 = r2_score(y_test, y_pred_test)

print(f"Final Test R² Score: {final_r2:.4f}")


Final Test R² Score: 0.9147


Q2 – Gradient Descent with Validation Set

In [6]:
df = pd.read_csv('USA_Housing.csv')
X = df.drop('Price', axis=1)
y = df['Price']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train_val, X_test, y_train_val, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)


In [7]:
def gradient_descent(X, y, learning_rate=0.01, iterations=1000):
    X_with_intercept = np.column_stack([np.ones(len(X)), X])
    m, n = X_with_intercept.shape
    beta = np.zeros(n)

    for i in range(iterations):
        y_pred = X_with_intercept @ beta
        gradient = (2 / m) * X_with_intercept.T @ (y_pred - y)
        beta -= learning_rate * gradient

    return beta


In [11]:
learning_rates = [0.001, 0.01, 0.1, 1]
best_lr = None
best_val_r2 = -np.inf
best_coefficients = None

for lr in learning_rates:
    coefficients = gradient_descent(X_train, y_train, lr, 1000)

    X_val_with_intercept = np.column_stack([np.ones(len(X_val)), X_val])
    X_test_with_intercept = np.column_stack([np.ones(len(X_test)), X_test])

    y_val_pred = X_val_with_intercept @ coefficients
    y_test_pred = X_test_with_intercept @ coefficients

    val_r2 = r2_score(y_val, y_val_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(f"Learning Rate: {lr}, Validation R²: {val_r2:.4f}, Test R²: {test_r2:.4f}")

    if val_r2 > best_val_r2:
        best_val_r2 = val_r2
        best_lr = lr
        best_coefficients = coefficients

print(f"\nBest Learning Rate: {best_lr}")
print(f"Best Validation R²: {best_val_r2:.4f}")


Learning Rate: 0.001, Validation R²: 0.6820, Test R²: 0.6490
Learning Rate: 0.01, Validation R²: 0.9098, Test R²: 0.9148
Learning Rate: 0.1, Validation R²: 0.9098, Test R²: 0.9148
Learning Rate: 1, Validation R²: -inf, Test R²: -inf

Best Learning Rate: 0.01
Best Validation R²: 0.9098


Q3 – Car Price Prediction with Preprocessing & PCA

In [12]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

df = pd.read_csv(url, names=columns, na_values='?')
print("Car Dataset Shape:", df.shape)


Car Dataset Shape: (205, 26)


In [15]:
df = df.dropna(subset=['price'])

numeric_columns = df.select_dtypes(include=[np.number]).columns
categorical_columns = df.select_dtypes(include=['object']).columns

for col in numeric_columns:
    if col != 'price':
        df[col] = df[col].fillna(df[col].median())

for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

door_mapping = {'two': 2, 'four': 4}
cylinder_mapping = {
    'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'eight': 8, 'twelve': 12
}
df['num_doors'] = df['num_doors'].map(door_mapping)
df['num_cylinders'] = df['num_cylinders'].map(cylinder_mapping)

for col, prefix in zip(['body_style', 'drive_wheels'], ['body_style', 'drive_wheels']):
    if col in df.columns:
        df = pd.get_dummies(df, columns=[col], prefix=[prefix])

le = LabelEncoder()
for col in ['make', 'aspiration', 'engine_location', 'fuel_type']:
    if col in df.columns:  # ✅ prevent KeyError if column missing
        df[col] = le.fit_transform(df[col])

if 'fuel_system' in df.columns:
    df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in str(x).lower() else 0)
if 'engine_type' in df.columns:
    df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in str(x).lower() else 0)

print(f"Processed Dataset Shape: {df.shape}")


Processed Dataset Shape: (201, 32)


In [20]:
df = df.dropna(subset=['price'])

numeric_columns = df.select_dtypes(include=[np.number]).columns
categorical_columns = df.select_dtypes(include=['object']).columns

for col in numeric_columns:
    if col != 'price':
        df[col] = df[col].fillna(df[col].median())

for col in categorical_columns:
    df[col] = df[col].replace('?', np.nan).fillna(df[col].mode()[0])

door_mapping = {'two': 2, 'four': 4}
cylinder_mapping = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}
df['num_doors'] = df['num_doors'].map(door_mapping).fillna(0)
df['num_cylinders'] = df['num_cylinders'].map(cylinder_mapping).fillna(0)

dummy_cols = []
if 'body_style' in df.columns:
    dummy_cols.append('body_style')
if 'drive_wheels' in df.columns:
    dummy_cols.append('drive_wheels')

if dummy_cols:
    df = pd.get_dummies(df, columns=dummy_cols, prefix=dummy_cols)

le = LabelEncoder()
for col in ['make', 'aspiration', 'engine_location', 'fuel_type']:
    if col in df.columns:
        df[col] = df[col].replace('?', np.nan).fillna(df[col].mode()[0])
        df[col] = le.fit_transform(df[col])

if 'fuel_system' in df.columns:
    df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in str(x).lower() else 0)

if 'engine_type' in df.columns:
    df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in str(x).lower() else 0)

print("Final Columns:", df.columns.tolist())
print("NaNs after preprocessing:", df.isna().sum().sum())


Final Columns: ['symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration', 'num_doors', 'engine_location', 'wheel_base', 'length', 'width', 'height', 'curb_weight', 'engine_type', 'num_cylinders', 'engine_size', 'fuel_system', 'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm', 'city_mpg', 'highway_mpg', 'price', 'body_style_convertible', 'body_style_hardtop', 'body_style_hatchback', 'body_style_sedan', 'body_style_wagon', 'drive_wheels_4wd', 'drive_wheels_fwd', 'drive_wheels_rwd']
NaNs after preprocessing: 0


In [24]:
# Impute NaNs after scaling
imputer = SimpleImputer(strategy="median")
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# ---- Original Model ----
model = LinearRegression()
model.fit(X_train_imputed, y_train)
y_pred = model.predict(X_test_imputed)
r2_original = r2_score(y_test, y_pred)
print(f"Original Model R² Score: {r2_original:.4f}")

# ---- PCA Model ----
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_imputed)
X_test_pca = pca.transform(X_test_imputed)

print(f"PCA Components: {pca.n_components_} (from {X.shape[1]} original features)")

model_pca = LinearRegression()
model_pca.fit(X_train_pca, y_train)
y_pred_pca = model_pca.predict(X_test_pca)
r2_pca = r2_score(y_test, y_pred_pca)
print(f"PCA Model R² Score: {r2_pca:.4f}")

# ---- Comparison ----
if r2_pca > r2_original:
    print("PCA improved performance!")
elif r2_pca < r2_original:
    print("PCA reduced performance.")
else:
    print("PCA had no significant impact.")

Original Model R² Score: 0.8730
PCA Components: 15 (from 31 original features)
PCA Model R² Score: 0.8506
PCA reduced performance.
