# Import Libraries

In [None]:
# Importing Necessary Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Ignore every FutureWarning for the rest of the notebook.
# NOTE(review): libraries may announce upcoming API removals under this
# category, so this filter can hide deprecation notices - confirm this is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Dataset Information

In [None]:
# Read 'kc_house_data.csv' into the DataFrame `df` that the following cells work on.
# The path is relative, so the file must be in the notebook's working directory.

df = pd.read_csv('kc_house_data.csv')

In [None]:
# Preview the first five rows of the dataframe

df.head(5)

In [None]:
# Correlation of every numeric column with 'price'

price_corr = df.corr(numeric_only=True)['price']
price_corr

In [None]:
# Dataframe shape as a (rows, columns) tuple

df.shape

In [None]:
# General information about the dataframe: columns, non-null counts and dtypes

df.info()

In [None]:
# Summary statistics, transposed so that each dataframe column becomes one row

df.describe().T

In [None]:
# Number of missing values in each column

df.isnull().sum()

In [None]:
# Column names in the dataframe (useful as a reference for the feature-engineering cells)

df.columns

# EDA

## Univariate Analysis (Single Feature)

In [None]:
# Distribution of 'price': 50-bin histogram with a KDE overlay

sns.histplot(data=df, x='price', bins=50, kde=True)
plt.show()

In [None]:
# Distribution of 'sqft_living': 50-bin histogram with a KDE overlay

sns.histplot(data=df, x='sqft_living', bins=50, kde=True)
plt.show()

In [None]:
# Price distribution for each 'grade' level (boxplot)

sns.boxplot(data=df, x='grade', y='price')
plt.show()

In [None]:
# Number of houses for each 'bedrooms' value

sns.countplot(x=df['bedrooms'])
plt.show()

In [None]:
# Number of houses for each 'bathrooms' value

sns.countplot(x=df['bathrooms'])
plt.xticks(rotation=90)  # rotate the tick labels so the many categories stay readable
plt.show()

In [None]:
# Number of houses for each 'floors' value

sns.countplot(data=df, x='floors')
plt.show()

## Bivariate Analysis (Feature Relationships)

In [None]:
# Scatterplot of 'price' vs 'sqft_living'

sns.scatterplot(x=df['sqft_living'], y=df['price'])
plt.show()

In [None]:
# Scatterplot of 'price' vs 'sqft_above'

sns.scatterplot(x=df['sqft_above'], y=df['price'])
plt.show()

In [None]:
# Price distribution split by the 'waterfront' flag (boxplot)

sns.boxplot(data=df, x='waterfront', y='price')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns

corr_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(12, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.show()

# Feature Engineering

In [None]:
# Remove the outlier rows whose living area exceeds 13,000 sqft

outlier_index = df.index[df['sqft_living'] > 13000]
df = df.drop(index=outlier_index)

In [None]:
# Impute missing 'sqft_above' values with the column median.
# NOTE(review): 'sqft_above' is removed by the drop in the next cell, so this
# imputation does not reach the models - confirm whether either step is needed.

sqft_above_median = df['sqft_above'].median()
df['sqft_above'] = df['sqft_above'].fillna(sqft_above_median)

In [None]:
# Dropping the 'id', 'sqft_basement' and 'sqft_above' columns

df = df.drop(columns=['id', 'sqft_basement', 'sqft_above'])

In [None]:
# Parse 'date', derive 'year_sold' and 'month_sold' from it, then discard the raw column

sale_dates = pd.to_datetime(df['date'])
df['year_sold'] = sale_dates.dt.year
df['month_sold'] = sale_dates.dt.month
df = df.drop(columns=['date'])

In [None]:
# Correlation with 'price' once the date has been split into 'year_sold' / 'month_sold'

price_corr = df.corr()['price']
price_corr

In [None]:
# Remove the columns judged to be weakly correlated with 'price'

low_corr_columns = ['sqft_lot', 'sqft_lot15', 'condition', 'month_sold']
df = df.drop(columns=low_corr_columns)

In [None]:
# Correlation with 'price' for the columns that remain after the drop above

price_corr = df.corr()['price']
price_corr

In [None]:
# Create 'age_of_home' as the number of years between construction and sale.
# A home sold before its recorded build year would give a negative age, so the
# result is floored at 0 with the vectorised Series.clip instead of a
# per-element Python lambda.

df['age_of_home'] = (df['year_sold'] - df['yr_built']).clip(lower=0)

In [None]:
# Correlation with 'price', now including the new 'age_of_home' column

price_corr = df.corr()['price']
price_corr

In [None]:
# Remove the year/age columns judged to be weakly correlated with 'price'

low_corr_columns = ['yr_built', 'year_sold', 'age_of_home']
df = df.drop(columns=low_corr_columns)

In [None]:
# Correlation with 'price' after removing the year/age columns

price_corr = df.corr()['price']
price_corr

In [None]:
# Replace the raw 'lat' / 'long' coordinates with a 5-way KMeans cluster id.
# NOTE(review): the cluster ids are arbitrary labels, yet the later cells scale
# and regress on them as if they were ordered numbers - consider one-hot encoding.

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(df[['lat', 'long']])
df = df.assign(location_cluster=cluster_labels).drop(columns=['lat', 'long'])

In [None]:
# Correlation with 'price', now including 'location_cluster'

price_corr = df.corr()['price']
price_corr

In [None]:
# Creating 'zipcode_group': each row gets the median 'price' of its zipcode,
# and the raw 'zipcode' column is then removed.
# NOTE(review): the medians are computed from the target over ALL rows, before
# the train/validation/test split further down, so validation and test prices
# leak into this feature. Consider computing it from the training rows only.

zipcode_median_price = df.groupby('zipcode')['price'].transform('median')
df['zipcode_group'] = zipcode_median_price
df = df.drop(columns=['zipcode'])

In [None]:
# Correlation with 'price', now including 'zipcode_group'

price_corr = df.corr()['price']
price_corr

In [None]:
# Turn 'yr_renovated' into a 0/1 'was_renovated' flag (1 when the year is above 0)

df['was_renovated'] = df['yr_renovated'].gt(0).astype(int)
df = df.drop(columns=['yr_renovated'])

In [None]:
# Correlation with 'price', now including 'was_renovated'

price_corr = df.corr()['price']
price_corr

In [None]:
# Data types of the remaining columns

df.dtypes

# Data Split

In [None]:
# Separate the target ('price') from the feature matrix

y = df['price']
X = df.drop(columns=['price'])

# Train-Test Split

In [None]:
# Train / validation / test split in two stages:
#   1. hold out 30% of the rows,
#   2. split that hold-out in half into validation and test sets.

from sklearn.model_selection import train_test_split

X_train, X_other, y_train, y_other = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_other, y_other, test_size=0.5, random_state=42
)

# Scaling

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to all three splits.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Check Multicollinearity (VIF)

In [None]:
# Variance inflation factor (VIF) of each scaled training feature, highest first.
# VIF is a multicollinearity diagnostic rather than a measure of a feature's variance.

from statsmodels.stats.outliers_influence import variance_inflation_factor

n_features = X_train_scaled.shape[1]
vif_values = [
    variance_inflation_factor(X_train_scaled, col_idx)
    for col_idx in range(n_features)
]

vif_data = pd.DataFrame({"Feature": X_train.columns, "VIF": vif_values})
vif_data = vif_data.sort_values(by="VIF", ascending=False)
print(vif_data)

# Model Building

## Linear Regression

In [None]:
# Baseline: ordinary least squares on the scaled features, scored on train and validation

from sklearn.linear_model import LinearRegression

lr_model = LinearRegression().fit(X_train_scaled, y_train)

train_score = lr_model.score(X_train_scaled, y_train)
print(f"Linear Regression Train Score: {train_score:.4f}")

val_score = lr_model.score(X_val_scaled, y_val)
print(f"Linear Regression Validation Score: {val_score:.4f}")

In [None]:
# Linear Regression coefficients per feature, largest first

feature_importance = pd.DataFrame(
    {'LinearRegression_Coeff': lr_model.coef_},
    index=pd.Index(X.columns, name='Feature'),
).sort_values(by='LinearRegression_Coeff', ascending=False)

print(feature_importance)

## Polynomial Regression

In [None]:
# Degree-2 polynomial expansion followed by ordinary least squares

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

degree = 2
poly_features = PolynomialFeatures(degree=degree)

# `poly_features` stays a separate name because the coefficient table below reads it.
poly_model = make_pipeline(poly_features, LinearRegression()).fit(X_train_scaled, y_train)

train_score = poly_model.score(X_train_scaled, y_train)
print(f"Polynomial Regression Train Score: {train_score:.4f}")

val_score = poly_model.score(X_val_scaled, y_val)
print(f"Polynomial Regression Validation Score: {val_score:.4f}")

In [None]:
# Polynomial Regression coefficients per expanded feature, largest first

linear_reg = poly_model.named_steps['linearregression']
poly_feature_names = poly_features.get_feature_names_out(input_features=X.columns)

feature_importance = pd.DataFrame(
    {'PolynomialRegression_Coeff': linear_reg.coef_},
    index=pd.Index(poly_feature_names, name='Feature'),
).sort_values(by='PolynomialRegression_Coeff', ascending=False)

print(feature_importance)


## RidgeCV

In [None]:
# RidgeCV model: alpha chosen by 5-fold CV from 100 log-spaced values in [1e-3, 1e3]

from sklearn.linear_model import RidgeCV

ridge_alphas = np.logspace(-3, 3, num=100)
ridge_cv = RidgeCV(alphas=ridge_alphas, cv=5).fit(X_train_scaled, y_train)

train_score = ridge_cv.score(X_train_scaled, y_train)
val_score = ridge_cv.score(X_val_scaled, y_val)

print(f"Ridge Train Score: {train_score:.4f}")
print(f"Ridge Validation Score: {val_score:.4f}")

In [None]:
# RidgeCV coefficients per feature, largest first

feature_importance = pd.DataFrame(
    {'Ridge_Coeff': ridge_cv.coef_},
    index=pd.Index(X.columns, name='Feature'),
).sort_values(by='Ridge_Coeff', ascending=False)

print(feature_importance)

## LassoCV

In [None]:
# LassoCV model: alpha chosen by 5-fold CV from 100 log-spaced values in [1e-3, 1e3]

from sklearn.linear_model import LassoCV

lasso_alphas = np.logspace(-3, 3, num=100)
lasso_cv = LassoCV(
    alphas=lasso_alphas,
    cv=5,
    max_iter=100000,
    random_state=42,
).fit(X_train_scaled, y_train)

train_score_lasso = lasso_cv.score(X_train_scaled, y_train)
print(f"Lasso Train Score: {train_score_lasso:.4f}")

val_score_lasso = lasso_cv.score(X_val_scaled, y_val)
print(f"Lasso Validation Score: {val_score_lasso:.4f}")

In [None]:
# LassoCV coefficients per feature (kept in the original column order)

lasso_coefficients = pd.DataFrame(
    {'Lasso_Coeff': lasso_cv.coef_},
    index=pd.Index(X.columns, name='Feature'),
)

print(lasso_coefficients)

## ElasticNetCV

In [None]:
# ElasticNetCV model: 5-fold CV over 10 l1_ratio values in [0.1, 1]
# and 100 log-spaced alphas in [1e-3, 1e3]

from sklearn.linear_model import ElasticNetCV

elastic_l1_ratios = np.linspace(0.1, 1, 10)
elastic_alphas = np.logspace(-3, 3, 100)

elastic_cv = ElasticNetCV(
    l1_ratio=elastic_l1_ratios,
    alphas=elastic_alphas,
    cv=5,
    max_iter=100000,
    random_state=42,
)
elastic_cv.fit(X_train_scaled, y_train)

train_score = elastic_cv.score(X_train_scaled, y_train)
print(f"ElasticNet Train Score: {train_score:.4f}")

val_score = elastic_cv.score(X_val_scaled, y_val)
print(f"ElasticNet Validation Score: {val_score:.4f}")

In [None]:
# ElasticNetCV coefficients per feature (kept in the original column order)

elasticnet_coefficients = pd.DataFrame(
    {'Elastic_Coeff': elastic_cv.coef_},
    index=pd.Index(X.columns, name='Feature'),
)

print(elasticnet_coefficients)

## Polynomial Regression with Lasso

In [None]:
# Degree-2 polynomial expansion followed by LassoCV (alpha chosen by 5-fold CV)

from sklearn.linear_model import LassoCV

lasso_cv_poly_model = make_pipeline(
    PolynomialFeatures(degree=2),
    LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100], cv=5, max_iter=5000),
).fit(X_train_scaled, y_train)

train_score = lasso_cv_poly_model.score(X_train_scaled, y_train)
val_score = lasso_cv_poly_model.score(X_val_scaled, y_val)

# The alpha selected by cross-validation lives on the fitted LassoCV step.
best_alpha_lasso = lasso_cv_poly_model.named_steps["lassocv"].alpha_
print(f"Best Lasso Alpha: {best_alpha_lasso}")

print(f"Polynomial Lasso Regression (Degree=2) Train Score: {train_score:.4f}")
print(f"Polynomial Lasso Regression (Degree=2) Validation Score: {val_score:.4f}")

In [None]:
# LassoCV polynomial-regression coefficients per expanded feature, largest first

poly_transformer = lasso_cv_poly_model.named_steps["polynomialfeatures"]
lasso_model = lasso_cv_poly_model.named_steps["lassocv"]

feature_names = poly_transformer.get_feature_names_out(input_features=X.columns)
lasso_coefficients = lasso_model.coef_

feature_importance = pd.DataFrame(
    {'Lasso_Coefficient': lasso_coefficients},
    index=pd.Index(feature_names, name='Feature'),
).sort_values(by='Lasso_Coefficient', ascending=False)

print(feature_importance)


# Final Evaluation

In [None]:
# Final evaluation: score every fitted model on the held-out test split

test_score_lr = lr_model.score(X_test_scaled, y_test)
print(f"Linear Regression Test Score: {test_score_lr:.4f}")

test_score_pl = poly_model.score(X_test_scaled, y_test)
print(f"Polynomial Regression Test Score: {test_score_pl:.4f}")

test_score_ridge = ridge_cv.score(X_test_scaled, y_test)
print(f"Ridge Test Score: {test_score_ridge:.4f}")

test_score_lasso = lasso_cv.score(X_test_scaled, y_test)
print(f"Lasso Test Score: {test_score_lasso:.4f}")

test_score = elastic_cv.score(X_test_scaled, y_test)
print(f"ElasticNet Test Score: {test_score:.4f}")

test_score_lassopoly = lasso_cv_poly_model.score(X_test_scaled, y_test)
print(f"Lasso - Polynomial Regression Test Score: {test_score_lassopoly:.4f}")

In [None]:
# RMSE and MAE of every fitted model on the held-out test split

from sklearn.metrics import mean_squared_error, mean_absolute_error

y_test_pred_linear = lr_model.predict(X_test_scaled)
y_test_pred_polynomial = poly_model.predict(X_test_scaled)
y_test_pred_ridge = ridge_cv.predict(X_test_scaled)
y_test_pred_lasso = lasso_cv.predict(X_test_scaled)
y_test_pred_elastic = elastic_cv.predict(X_test_scaled)
y_test_pred_lassopoly = lasso_cv_poly_model.predict(X_test_scaled)

for model_name, y_pred in zip(
    ["LinearRegression", "PolynomialRegression", "RidgeCV", "LassoCV", "ElasticNetCV", "LassoPoly"],
    [y_test_pred_linear, y_test_pred_polynomial, y_test_pred_ridge, y_test_pred_lasso, y_test_pred_elastic, y_test_pred_lassopoly]
):
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas np.sqrt(MSE) gives the RMSE on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    print(f"{model_name} RMSE: {rmse:.4f}, MAE: {mae:.4f}")

# Conclusion

***Polynomial Regression significantly improves performance over Linear Regression, and adding Lasso regularization (LassoPoly) further enhances validation and test scores, achieving the lowest RMSE and MAE. This suggests that polynomial features capture complex patterns, while Lasso prevents overfitting.***