# Q2: Multiple Regression — Rent Prediction
Dataset: `multiple_regression_rent.csv`

In [None]:
# Common imports used across notebooks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Activate seaborn's default theme for the matplotlib figures drawn in this notebook.
sns.set()
# Shared seed: passed as `random_state` to train_test_split so the split is reproducible.
RANDOM_STATE = 42


In [None]:
# Location of the rent dataset (absolute path as used by the original environment).
DATA_PATH = '/mnt/data/aiml/multiple_regression_rent.csv'

# Read the raw CSV into `df`; later cells derive X / y from it.
df = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell output.
df.head()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Number of missing values per column.
print(df.isna().sum())

In [None]:
# Preprocessing & model
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV

# Separate the prediction target from the feature matrix.
target = 'rent'
y = df[target]
X = df.drop(columns=[target])

# Select every numeric dtype (not only int64/float64): ColumnTransformer drops
# columns that are not listed, so an int32/float32 feature would otherwise be
# silently excluded from the model.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric features: median imputation, then standardisation (puts all features on
# a comparable scale for the ridge penalty).
num_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: most-frequent imputation, then one-hot encoding.
# No `drop='first'`: combined with handle_unknown='ignore', an unseen category is
# encoded as all zeros, which is indistinguishable from the dropped baseline level
# (and older scikit-learn versions reject the combination outright). A penalised
# model such as Ridge does not need a reference level to be identifiable.
cat_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', num_transform, num_cols),
    ('cat', cat_transform, cat_cols),
])

# Full pipeline: preprocessing followed by ridge regression whose alpha is chosen
# by 5-fold cross-validation over the listed candidates.
ridge_pipeline = Pipeline([
    ('pre', preprocessor),
    ('reg', RidgeCV(alphas=[0.1, 1, 10], cv=5)),
])

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 20% of the rows for the final evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Fit the preprocessing + ridge pipeline on the training portion only.
ridge_pipeline.fit(X_train, y_train)

# 5-fold CV R^2 over the full X / y. cross_val_score fits clones of the pipeline,
# so the estimator fitted above is left untouched.
cv_r2_scores = cross_val_score(ridge_pipeline, X, y, scoring='r2', cv=5)
print('CV R2:', cv_r2_scores.mean())

In [None]:
# Evaluation
# mean_squared_error must be imported here as well; it was used without an import,
# which raised NameError on a fresh kernel.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictions on the held-out test split.
y_pred = ridge_pipeline.predict(X_test)

print('MAE:', mean_absolute_error(y_test, y_pred))
# RMSE is computed as sqrt(MSE) instead of passing squared=False, which is
# deprecated and removed in recent scikit-learn releases.
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2:', r2_score(y_test, y_pred))

In [None]:
# VIF calculation for numeric features
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Median-fill missing values so every row can enter the auxiliary regressions.
X_num = df[num_cols].fillna(df[num_cols].median())

# variance_inflation_factor regresses each column on the remaining columns exactly
# as given and does not add an intercept itself. Without a constant column the
# auxiliary regressions are forced through the origin and the resulting VIFs are
# inflated for any feature with a non-zero mean, so a column of ones is prepended.
design = np.column_stack([np.ones(len(X_num)), X_num.to_numpy(dtype=float)])

# Column 0 of `design` is the constant; feature i lives at index i + 1. The
# constant itself is not reported.
vif = pd.DataFrame({
    'feature': X_num.columns,
    'VIF': [variance_inflation_factor(design, i + 1) for i in range(X_num.shape[1])],
})
display(vif)