In [4]:
#q1
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# --- STEP 1: Load the dataset ---
# NOTE: Replace 'house_price_data.csv' with the actual file path/name of your downloaded dataset
try:
    df_q1 = pd.read_csv('house_price_data.csv')
except FileNotFoundError:
    print("ERROR: Please download the dataset and ensure it's in the correct path ('house_price_data.csv').")
    # Using a dummy dataframe for demonstration if file is not found (you should use your real file)
    df_q1 = pd.DataFrame(np.random.rand(100, 4), columns=['Feature1', 'Feature2', 'Feature3', 'Price'])
    # Exit or raise error if the real file is required

# a) Divide the dataset into input features (X) and output variable (y)
X = df_q1.drop('Price', axis=1) # Assuming 'Price' is the output column
y = df_q1['Price']

# b) Scale the values of input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# --- Least Square Error (LSE) Fit Function ---
def lse_fit(X_train, y_train):
    # Add a column of ones for the intercept term (β0)
    X_train_b = np.c_[np.ones((len(X_train), 1)), X_train]
    # Normal Equation: β = (X^T * X)^-1 * X^T * y
    beta = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train
    return beta

def predict(X_data, beta):
    # Add a column of ones for the intercept term (β0)
    X_data_b = np.c_[np.ones((len(X_data), 1)), X_data]
    return X_data_b @ beta

# c) Divide input and output features into five folds
# d) Run five iterations for 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []
betas = []
fold_results = []

for iteration, (train_index, test_index) in enumerate(kf.split(X_scaled)):
    # Split data into training and test sets for the current fold
    X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Calculate beta (β) matrix using LSE fit
    beta = lse_fit(X_train, y_train)
    
    # Find predicted values
    y_pred = predict(X_test, beta)
    
    # Calculate R2_score
    r2 = r2_score(y_test, y_pred)
    
    r2_scores.append(r2)
    betas.append(beta)
    
    fold_results.append({
        'Iteration': iteration + 1,
        'R2_score': r2,
        'Beta': beta
    })
    
    print(f"Fold {iteration + 1}: R2 Score = {r2:.4f}")

# Find the best β matrix (for which R2_score is maximum)
best_r2_index = np.argmax(r2_scores)
best_beta = betas[best_r2_index]
print("\n--- 5-Fold Cross Validation Results ---")
print(pd.DataFrame(fold_results))
print(f"\nBest R2 Score: {r2_scores[best_r2_index]:.4f} from Fold {best_r2_index + 1}")
print(f"Best Beta (β) Matrix:\n{best_beta}")

# e) Use the best β matrix to train regressor for 70% of data and test on 30%
print("\n--- Final Performance Test (70/30 split with Best Beta) ---")
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Use the best beta directly for prediction on the test set (no need to retrain if using LSE)
y_pred_final = predict(X_test_final, best_beta)
final_r2 = r2_score(y_test_final, y_pred_final)

print(f"R2 Score on 30% Test Data using Best Beta: {final_r2:.4f}")

ERROR: Please download the dataset and ensure it's in the correct path ('house_price_data.csv').
Fold 1: R2 Score = -0.2257
Fold 2: R2 Score = -0.0175
Fold 3: R2 Score = -0.1066
Fold 4: R2 Score = -0.0535
Fold 5: R2 Score = -0.0297

--- 5-Fold Cross Validation Results ---
   Iteration  R2_score                                               Beta
0          1 -0.225721  [0.46638947415781284, -0.03918218712227385, 0....
1          2 -0.017503  [0.4415054018263702, -0.042278672191861634, 0....
2          3 -0.106604  [0.44124435447486077, -0.052574761330143695, 0...
3          4 -0.053457  [0.4339394688781836, -0.04110638237811426, 0.0...
4          5 -0.029712  [0.4282801295041535, -0.010667762211253422, -0...

Best R2 Score: -0.0175 from Fold 2
Best Beta (β) Matrix:
[ 0.4415054  -0.04227867  0.02173684  0.00108627]

--- Final Performance Test (70/30 split with Best Beta) ---
R2 Score on 30% Test Data using Best Beta: -0.1334


In [2]:
#q2
# Reuse the data loaded and scaled in Q1
# X_scaled, y are already defined from Q1

# --- STEP 1: Split the dataset (56% Training, 14% Validation, 30% Test) ---
# Split 70% (Train+Validation) and 30% (Test) first
X_temp, X_test, y_temp, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Split 70% (X_temp, y_temp) into 80% Training and 20% Validation
# 0.7 * 0.8 = 0.56 (56% Training)
# 0.7 * 0.2 = 0.14 (14% Validation)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=(0.14/0.7), random_state=42
)

print(f"Training size: {len(X_train)} (56%), Validation size: {len(X_val)} (14%), Test size: {len(X_test)} (30%)")

# --- Gradient Descent Optimization Function ---
def gradient_descent(X_train, y_train, learning_rate, n_iterations):
    # Prepare X_train by adding intercept term
    X_b = np.c_[np.ones((len(X_train), 1)), X_train]
    y_train_array = y_train.values.reshape(-1, 1) # Ensure y is a column vector
    
    # Initialize random beta vector (intercept + features)
    beta = np.random.randn(X_b.shape[1], 1)
    m = len(y_train)
    
    for iteration in range(n_iterations):
        # Calculate gradients (partial derivatives of the cost function)
        gradients = (2/m) * X_b.T @ (X_b @ beta - y_train_array)
        # Update beta
        beta = beta - learning_rate * gradients
        
    return beta.flatten() # Return as a 1D array

def predict_gd(X_data, beta):
    # Add intercept column
    X_b = np.c_[np.ones((len(X_data), 1)), X_data]
    return X_b @ beta

# --- Experiment with different learning rates ---
learning_rates = [0.001, 0.01, 0.1, 1]
n_iterations = 1000
gd_results = []

for lr in learning_rates:
    print(f"\nTraining with Learning Rate (α) = {lr}...")
    
    # Compute regression coefficients (beta)
    beta_gd = gradient_descent(X_train, y_train, lr, n_iterations)
    
    # Compute R2_score for Validation Set
    y_val_pred = predict_gd(X_val, beta_gd)
    r2_val = r2_score(y_val, y_val_pred)
    
    # Compute R2_score for Test Set
    y_test_pred = predict_gd(X_test, beta_gd)
    r2_test = r2_score(y_test, y_test_pred)
    
    gd_results.append({
        'Learning_Rate': lr,
        'R2_Validation': r2_val,
        'R2_Test': r2_test,
        'Beta': beta_gd
    })
    
    print(f"  R2_Validation: {r2_val:.4f}, R2_Test: {r2_test:.4f}")

# Find the best value of regression coefficients (based on maximum R2_score on Validation Set)
results_df = pd.DataFrame(gd_results)
best_index = results_df['R2_Validation'].idxmax()
best_lr_result = results_df.iloc[best_index]

print("\n--- Gradient Descent Results Summary ---")
print(results_df[['Learning_Rate', 'R2_Validation', 'R2_Test']])

print(f"\nBest Learning Rate (based on R2_Validation): **{best_lr_result['Learning_Rate']}**")
print(f"Best R2_Validation: {best_lr_result['R2_Validation']:.4f}")
print(f"R2_Test with Best Beta: **{best_lr_result['R2_Test']:.4f}**")
print(f"Best Regression Coefficients (Beta):\n{best_lr_result['Beta']}")

Training size: 55 (56%), Validation size: 15 (14%), Test size: 30 (30%)

Training with Learning Rate (α) = 0.001...
  R2_Validation: -4.8947, R2_Test: -4.9527

Training with Learning Rate (α) = 0.01...
  R2_Validation: -0.1781, R2_Test: -0.3133

Training with Learning Rate (α) = 0.1...
  R2_Validation: -0.1781, R2_Test: -0.3133

Training with Learning Rate (α) = 1...
  R2_Validation: -85447766317579007391777231043577789173872963601049621298126591879036562320453573992875606291464596037245847556062355007610899027043551420253107565734110441855362503895466349078719801010225936360264340245960401309660673974573186793951814100257748015709427356139520.0000, R2_Test: -62289890294800771729119665240167488202295134601862283659957937503880661462444122069934743594408782444553167318793540933131022049913603047573728783001109931615984398442357197102473546270402530309808742306548745420105849394243210301797941656191167381343291920351232.0000

--- Gradient Descent Results Summary ---
   Learning_Rate  R2_

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# --- STEP 1: Load the dataset and replace '?' with NaN ---
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
column_names = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration", 
    "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base", 
    "length", "width", "height", "curb_weight", "engine_type", "num_cylinders", 
    "engine_size", "fuel_system", "bore", "stroke", "compression_ratio", 
    "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"
]
df_q3 = pd.read_csv(url, header=None, names=column_names)
df_q3 = df_q3.replace('?', np.nan)

# --- STEP 2: Handle Missing Values ---
# Convert columns that should be numeric but contain '?' to float
cols_to_convert = ["normalized_losses", "bore", "stroke", "horsepower", "peak_rpm", "price"]
for col in cols_to_convert:
    df_q3[col] = pd.to_numeric(df_q3[col])

# Replace all NaN values with central tendency imputation (mean for numeric, mode for object)
for col in df_q3.columns:
    if df_q3[col].dtype == 'object':
        # Mode imputation for categorical
        df_q3[col] = df_q3[col].fillna(df_q3[col].mode()[0])
    else:
        # Mean imputation for numeric
        df_q3[col] = df_q3[col].fillna(df_q3[col].mean())

# Drop the rows with NaN values in the price column (although imputation was used, this is a safety step)
df_q3.dropna(subset=['price'], inplace=True)

# --- STEP 3: Convert non-numeric values to numeric ---
print("Applying Categorical Encoding...")

# (i) Convert number names to figures
num_map = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}
df_q3['num_doors'] = df_q3['num_doors'].map({'two': 2, 'four': 4}).fillna(df_q3['num_doors']) # Fill missing due to mode being applied
df_q3['num_cylinders'] = df_q3['num_cylinders'].map(num_map)

# (ii) Dummy Encoding Scheme (One-Hot Encoding)
df_q3 = pd.get_dummies(df_q3, columns=['body_style', 'drive_wheels'], drop_first=True)

# (iii) Label Encoding Scheme
label_encode_cols = ['make', 'aspiration', 'engine_location', 'fuel_type']
le = LabelEncoder()
for col in label_encode_cols:
    df_q3[col] = le.fit_transform(df_q3[col])

# (iv) For fuel_system
df_q3['fuel_system'] = df_q3['fuel_system'].apply(lambda x: 1 if 'pfi' in str(x).lower() else 0)

# (v) For engine_type
df_q3['engine_type'] = df_q3['engine_type'].apply(lambda x: 1 if 'ohc' in str(x).lower() else 0)

# --- STEP 4: Divide and Scale ---
X_q3 = df_q3.drop('price', axis=1)
y_q3 = df_q3['price']

# Scale all input features
scaler_q3 = StandardScaler()
X_q3_scaled = scaler_q3.fit_transform(X_q3)
X_q3_scaled = pd.DataFrame(X_q3_scaled, columns=X_q3.columns)

# Split data for training/testing
X_train_q3, X_test_q3, y_train_q3, y_test_q3 = train_test_split(
    X_q3_scaled, y_q3, test_size=0.3, random_state=42
)

# --- STEP 5: Train a linear regressor (Original Data) ---
print("\n--- 5. Standard Linear Regression (70/30 Split) ---")
lr_model = LinearRegression()
lr_model.fit(X_train_q3, y_train_q3)
y_pred_lr = lr_model.predict(X_test_q3)
r2_lr = r2_score(y_test_q3, y_pred_lr)
print(f"R2 Score on Test Set (Standard LR): **{r2_lr:.4f}**")

# --- STEP 6: PCA and then train a linear regressor ---
print("\n--- 6. PCA-based Linear Regression ---")

# Determine optimal number of components for 95% variance retention
pca = PCA(n_components=0.95)
pca.fit(X_train_q3)
X_train_pca = pca.transform(X_train_q3)
X_test_pca = pca.transform(X_test_q3)

n_components = pca.n_components_
print(f"Number of components retaining 95% variance: **{n_components}** (down from {X_q3.shape[1]} features)")

# Train a linear regressor on reduced data
lr_pca_model = LinearRegression()
lr_pca_model.fit(X_train_pca, y_train_q3)
y_pred_pca = lr_pca_model.predict(X_test_pca)
r2_pca = r2_score(y_test_q3, y_pred_pca)
print(f"R2 Score on Test Set (PCA-based LR): **{r2_pca:.4f}**")

# Performance comparison
performance_improvement = r2_pca - r2_lr
print(f"\nPerformance Change (PCA R2 - Standard LR R2): {performance_improvement:.4f}")

if r2_pca > r2_lr:
    print("Conclusion: Yes, PCA decomposition **led to a performance improvement** on the test set.")
else:
    print("Conclusion: No, PCA decomposition **did not lead to a performance improvement** on the test set.")

Applying Categorical Encoding...

--- 5. Standard Linear Regression (70/30 Split) ---
R2 Score on Test Set (Standard LR): **0.8044**

--- 6. PCA-based Linear Regression ---
Number of components retaining 95% variance: **16** (down from 29 features)
R2 Score on Test Set (PCA-based LR): **0.7578**

Performance Change (PCA R2 - Standard LR R2): -0.0467
Conclusion: No, PCA decomposition **did not lead to a performance improvement** on the test set.
