# Q1: Linear Regression — House Price Prediction
Dataset: `linear_regression_dataset_houseprice.csv`

In [None]:
# Common imports used across notebooks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Single seed reused below wherever a `random_state` is required
RANDOM_STATE = 42
# Apply seaborn's default theme to all subsequent matplotlib figures
sns.set()


In [None]:
# Load data
# The CSV location is kept in one named variable so it is easy to spot and change.
DATA_PATH = '/mnt/data/aiml/linear_regression_dataset_houseprice.csv'
df = pd.read_csv(DATA_PATH)
# Preview the first rows as the cell's rich output
df.head()

In [None]:
# Quick checks
# Compute the diagnostics first, then emit them in a fixed order:
# frame dimensions, missing-value count per column, summary statistics.
missing_per_column = df.isna().sum()
summary_stats = df.describe()
print('shape:', df.shape)
print(missing_per_column)
display(summary_stats)

In [None]:
# Feature engineering
# This cell is written to be safely re-runnable: each step first checks whether
# it still has work to do, so executing the cell twice leaves `df` unchanged
# instead of raising KeyError on the already-removed `built_year` column.
REFERENCE_YEAR = 2025  # year against which the age of a house is measured
if 'built_year' in df.columns:
    # Replace the construction year with the derived `age` feature.
    df = df.assign(age=REFERENCE_YEAR - df['built_year']).drop(columns=['built_year'])

# standard boolean mapping for parking: 'Yes' -> 1, 'No' -> 0.
# Values outside the mapping are kept as they are (the fillna restores them).
# The guard skips the step when no 'Yes'/'No' labels remain, so a second run
# does not touch the column's dtype.
PARKING_MAP = {'Yes': 1, 'No': 0}
if df['parking'].isin(list(PARKING_MAP)).any():
    df['parking'] = df['parking'].map(PARKING_MAP).fillna(df['parking'])
df.head()

In [None]:
# Preprocessing pipeline
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV

# Separate the regression target from the predictors.
target = 'price'
y = df[target]
X = df.drop(columns=[target])

# 'number' selects every numeric dtype (int32, float32, ...), not only the
# 64-bit ones, so no numeric predictor is left out because of its width.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()

# ColumnTransformer drops columns that are not listed in any transformer
# (remainder='drop' is its default), so report anything that falls through
# instead of losing it silently.
unused_cols = [c for c in X.columns if c not in num_cols and c not in cat_cols]
if unused_cols:
    print('unused_cols (neither numeric nor categorical):', unused_cols)

# Numeric columns: median imputation followed by standardisation.
num_transform = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
# Categorical columns: most-frequent imputation followed by one-hot encoding.
cat_transform = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('ohe', OneHotEncoder(handle_unknown='ignore', drop='first'))])

preprocessor = ColumnTransformer([('num', num_transform, num_cols), ('cat', cat_transform, cat_cols)])

# Pipeline stores its steps by reference, so each model gets its own unfitted
# copy of the preprocessor; otherwise fitting one pipeline would refit the
# transformer object used by the other.
lr_pipeline = Pipeline([('pre', clone(preprocessor)), ('reg', LinearRegression())])
ridge_pipeline = Pipeline([('pre', clone(preprocessor)), ('reg', RidgeCV(alphas=[0.1,1,10], cv=5))])
print('num_cols:', num_cols)
print('cat_cols:', cat_cols)

In [None]:
# Train/test split and fit
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = split_parts
# Fit both models on the same training portion.
lr_pipeline.fit(X_train, y_train)
ridge_pipeline.fit(X_train, y_train)


In [None]:
# Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def eval_model(name, pipe):
    """Print test-set metrics for a fitted pipeline and plot its residuals.

    Predictions are made on the notebook-level `X_test` and compared with
    `y_test`. MAE, RMSE and R2 are printed, then a scatter plot of residuals
    (actual - predicted) against the predicted values is shown.

    Parameters
    ----------
    name : str
        Label printed above the metrics and used in the plot title.
    pipe : fitted estimator exposing `predict`
        Model to evaluate.
    """
    y_pred = pipe.predict(X_test)
    residuals = y_test - y_pred
    # RMSE is taken as the square root of the MSE. The `squared=False` keyword
    # of mean_squared_error was deprecated in scikit-learn 1.4 and later
    # removed, so passing it fails on current releases.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name)
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('RMSE:', rmse)
    print('R2:', r2_score(y_test, y_pred))
    plt.figure(figsize=(6,4))
    plt.scatter(y_pred, residuals)
    # Zero line: points above it are under-predicted, points below over-predicted.
    plt.axhline(0, linestyle='--', color='r')
    plt.xlabel('Predicted'); plt.ylabel('Residual (Actual - Pred)')
    plt.title(f'Residuals: {name}')
    plt.show()

# Evaluate both fitted pipelines in turn, plain linear regression first.
for model_label, fitted_pipe in (
    ('LinearRegression', lr_pipeline),
    ('RidgeCV', ridge_pipeline),
):
    eval_model(model_label, fitted_pipe)

### Interpretation / Discussion

- **Briefly interpret the above results here.** Explain model performance (e.g., MAE / RMSE / R² on the test set), any issues (overfitting, non-random patterns in the residual plots, outliers), and recommended next steps (feature engineering, hyperparameter tuning, regularization). Keep all outputs and interpretations visible inline — do **not** save files externally.

In [None]:
# Map coefficients back to features (for LinearRegression)
# The regression sees the columns in the order the ColumnTransformer emits them:
# first the numeric columns, then the one-hot encoded categorical columns.
pre = lr_pipeline.named_steps['pre']
num_features = list(num_cols)
if cat_cols:
    ohe = pre.named_transformers_['cat'].named_steps['ohe']
    if hasattr(ohe, 'get_feature_names_out'):
        ohe_names = list(ohe.get_feature_names_out(cat_cols))
    else:
        # scikit-learn < 1.0 exposes the same names through get_feature_names
        ohe_names = list(ohe.get_feature_names(cat_cols))
else:
    # With no categorical columns the 'cat' transformer is never fitted, so
    # asking its encoder for feature names would fail; there is nothing to add.
    ohe_names = []
feature_names = num_features + ohe_names
coefs = lr_pipeline.named_steps['reg'].coef_
# Fail with a clear message rather than an opaque DataFrame length error if the
# reconstructed names do not line up with the fitted coefficients.
if len(feature_names) != len(coefs):
    raise ValueError(
        f'Feature-name count ({len(feature_names)}) does not match '
        f'coefficient count ({len(coefs)}); cannot label coefficients.'
    )
# Rank by absolute magnitude so the most influential (scaled) features come first.
coef_df = pd.DataFrame({'feature': feature_names, 'coef': coefs}).sort_values(by='coef', key=lambda s: s.abs(), ascending=False)
display(coef_df.head(20))