# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

import joblib

# Importing Dataset

In [2]:
# Load the housing data; the relative path points at the sibling Dataset folder.
df = pd.read_csv(filepath_or_buffer="../Dataset/HousingData.csv")

# Analysing Dataset

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [4]:
# Print the column dtypes and non-null counts to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     486 non-null    float64
 1   ZN       486 non-null    float64
 2   INDUS    486 non-null    float64
 3   CHAS     486 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      486 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    486 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [5]:
# Count the missing entries in each column of the raw frame.
df.isna().sum()

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

# Data Cleaning

In [6]:
# Mean-impute: every NaN is replaced by the mean of its own numeric column.
# NOTE(review): the means are computed over the whole frame, i.e. before the
# train/test split further down, so test rows influence the fill values.
# CHAS also looks like a 0/1 indicator and receives a fractional mean here —
# confirm both are acceptable for this analysis.
df = df.fillna(value=df.mean(numeric_only=True))

# Post-Cleaning Dataset Analysis

In [7]:
# Re-count missing entries after imputation; every column should report 0.
df.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

# Feature/Target Split

In [9]:
# MEDV is the regression target; every remaining column is used as a feature.
y = df["MEDV"]
X = df.drop(columns=["MEDV"])

# Train/Test Split

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Baseline Model Introduction: Linear Regression

In [12]:
# Fit the linear-regression baseline on the raw (unscaled) training features.
lr = LinearRegression().fit(X_train, y_train)
lr  # fit() returns the estimator itself; echo it so the cell still displays it

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [13]:
# Baseline predictions for the held-out feature rows.
lr_pred = lr.predict(X=X_test)

In [14]:
# Score the baseline against the held-out targets (R2 and mean squared error).
lr_r2 = r2_score(y_true=y_test, y_pred=lr_pred)
lr_mse = mean_squared_error(y_true=y_test, y_pred=lr_pred)

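# Baseline Model Variant: Linear Regression on Standardized Features

Standardizing the features does not change the predictions of an ordinary least-squares fit with an intercept, so this variant is expected to match the unscaled baseline's R2.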

In [15]:
# Create the (still unfitted) standardiser; centring and unit-variance scaling
# are spelled out explicitly and are the library defaults.
scaler = StandardScaler(with_mean=True, with_std=True)

In [16]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
# Fit a second linear regression, this time on the standardised training features.
lr_scaled = LinearRegression().fit(X_train_scaled, y_train)
lr_scaled  # fit() returns the estimator itself; echo it so the cell still displays it

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [19]:
# Predict from the standardised test features and score with R2.
lr_scaled_pred = lr_scaled.predict(X=X_test_scaled)
lr_scaled_r2 = r2_score(y_true=y_test, y_pred=lr_scaled_pred)

# Advanced Model Introduction: Random Forest Regression

In [20]:
# Fit a random forest on the raw (unscaled) training features; the seed fixes
# the forest's randomness so the result is repeatable.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf  # fit() returns the estimator itself; echo it so the cell still displays it

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
# Random-forest predictions for the held-out feature rows.
rf_pred = rf.predict(X=X_test)

In [22]:
# Score the random forest against the held-out targets (R2 and mean squared error).
rf_r2 = r2_score(y_true=y_test, y_pred=rf_pred)
rf_mse = mean_squared_error(y_true=y_test, y_pred=rf_pred)

# Final Result: R2 Comparison of the Models

In [23]:
# Report the held-out R2 of each fitted model, one line per model.
for label, r2_value in (
    ("Linear R2:", lr_r2),
    ("Scaled Linear R2:", lr_scaled_r2),
    ("Random Forest R2:", rf_r2),
):
    print(label, r2_value)

Linear R2: 0.658852019550814
Scaled Linear R2: 0.6588520195508119
Random Forest R2: 0.8878080447367498


# Error Table: Actual vs. Predicted Values

In [26]:
# Put the true targets next to both models' predictions, then append each
# model's absolute error per row. The frame keeps y_test's index, so rows can
# be traced back to the original dataset.
comparison = pd.DataFrame(
    {"Actual": y_test, "Linear": lr_pred, "RandomForest": rf_pred}
).assign(
    Linear_Error=lambda table: (table["Actual"] - table["Linear"]).abs(),
    RF_Error=lambda table: (table["Actual"] - table["RandomForest"]).abs(),
)
comparison.head()


Unnamed: 0,Actual,Linear,RandomForest,Linear_Error,RF_Error
173,23.6,29.14325,23.409,5.54325,0.191
274,32.4,36.535668,30.686,4.135668,1.714
491,13.6,14.492513,17.076,0.892513,3.476
72,22.8,25.08111,23.86,2.28111,1.06
452,16.1,18.456092,16.581,2.356092,0.481


# Saving Model Artifacts

In [27]:
# Persist the fitted random forest (the highest-R2 model in the comparison).
joblib.dump(value=rf, filename="boston_model_best.pkl")

['boston_model_best.pkl']

In [28]:
# Persist the fitted StandardScaler under the "scaler" file name.
# The original cell pickled `lr_scaled` (the LinearRegression fitted on scaled
# features) into this file, so the artifact's name did not match its content
# and the scaler itself was never saved.
# NOTE(review): `rf` was fitted on unscaled features, so this scaler must not
# be applied to inputs of boston_model_best.pkl; it pairs with `lr_scaled`.
joblib.dump(scaler, "boston_model_scaler.pkl")

['boston_model_scaler.pkl']