In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

Load the data set

In [None]:
# Location of the raw CSV, relative to the notebook's working directory.
file_path = "BostonHousingModel/Boston Housing Data.csv"

# Parse the whole file into a DataFrame using read_csv's default options.
df = pd.read_csv(filepath_or_buffer=file_path)

Display the first few rows 

In [None]:
# Plain-text preview of the first five rows (five is head()'s default).
print(df.head(n=5))

Basic understanding of data

In [None]:
# Column names, dtypes and non-null counts; info() writes straight to stdout.
print("Dataset Info:")
df.info()

# Descriptive statistics as produced by describe(), under a blank-line-separated header.
print("\nSummary statistics:")
print(df.describe())

Handling of missing values

In [None]:
# Number of missing (NaN/None) entries in each column.
missing_values = df.isna().sum()

# The message has no placeholders, so a plain string literal is enough.
print("missing value in each column is:")
print(missing_values)

Fill the missing values with the median of the respective column

In [None]:
# Impute gaps in each numeric column with that column's median.
# numeric_only=True stops median() from raising on non-numeric columns
# (pandas >= 2.0 no longer drops them silently). Non-numeric columns are left
# untouched, so any gaps they still have remain visible in the report below.
column_medians = df.median(numeric_only=True)

# Plain assignment instead of inplace=True: same resulting frame, but no
# in-place mutation of an object that earlier cells have already displayed.
df = df.fillna(column_medians)

print("\nMissing Values After Treatment:")
print(df.isnull().sum())

Handling outliers

In [None]:
# One box per column on a single wide figure, to eyeball outliers.
fig, ax = plt.subplots(figsize=(15, 10))
df.boxplot(ax=ax)

# Rotate the column names so they do not overlap on the x axis.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title('Boxplot of all columns to detect outliers')
plt.show()

Remove all outliers using Z-score

In [None]:
# A row is kept only if every numeric column has |z| below this threshold.
Z_THRESHOLD = 3

# Absolute z-scores, computed column-wise over the numeric columns only.
z_score = np.abs(stats.zscore(df.select_dtypes(include=np.number)))

# A zero-variance (or all-NaN) column yields NaN z-scores, and `NaN < 3` is
# False, which would silently discard every row. Treat NaN as "not an outlier"
# so such a column cannot wipe out the dataset.
within_limits = np.asarray((z_score < Z_THRESHOLD) | np.isnan(z_score))

# .copy() makes df_clean an independent frame rather than a slice of df.
df_clean = df[within_limits.all(axis=1)].copy()

# NOTE(review): if the data contains a rare 0/1 indicator column, every
# minority-class row can exceed the threshold and be dropped, leaving that
# column constant — confirm against the actual columns whether that is intended.
print(f"\nNumber of rows after outlier removal: {df_clean.shape[0]} (original: {df.shape[0]})")

Analysing variables strongly affecting prices

In [None]:
# Pairwise correlations (corr()'s default method) between the numeric columns.
# Restricting to numeric dtypes mirrors the outlier-removal step and avoids the
# error pandas >= 2.0 raises when corr() meets a non-numeric column.
corr_matrix = df_clean.select_dtypes(include=np.number).corr()

# Annotated heatmap: each cell shows the coefficient rounded to two decimals.
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

Variables most strongly correlated with house prices (MEDV)

In [None]:
# Correlation of every numeric column with the target MEDV, highest first.
# Restricting to numeric dtypes avoids the error pandas >= 2.0 raises when
# corr() meets a non-numeric column.
# The ordering is by signed value: MEDV itself (1.0) comes first and the
# strongest negative correlations appear at the bottom of the list.
important_variables = (
    df_clean.select_dtypes(include=np.number)
    .corr()['MEDV']
    .sort_values(ascending=False)
)
print("\nVariables most strongly affecting house prices:")
print(important_variables)

Prepare the dataset for linear regression

In [None]:
# Target: the MEDV column on its own.
y = df_clean.loc[:, 'MEDV']

# Features: every remaining column of the cleaned frame.
x = df_clean.drop('MEDV', axis=1)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

Initialising and training the regression model

In [None]:
# Linear regression with the estimator's default settings.
model = LinearRegression()

# Fit on the training split only; the test split stays unseen until evaluation.
model.fit(X=X_train, y=y_train)

Predict on the test set

In [None]:
# Predicted MEDV for each held-out row, in the same order as X_test.
y_pred = model.predict(X=X_test)

Evaluate the model

In [None]:
# Score the held-out predictions with each metric in turn:
# mean absolute error, mean squared error, and coefficient of determination.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Report the three scores, one per line.
print(
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R^2): {r2}",
    sep="\n",
)

Plotting predicted vs actual values

In [None]:
# Scatter of actual against predicted prices for the test split.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.7, edgecolors='k')

# Dashed red y = x reference line spanning the observed range of actual prices;
# points on this line are perfect predictions.
lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)

ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Actual vs Predicted House Prices')
plt.show()

Saving the model

In [None]:
# Output file for the fitted estimator, relative to the working directory.
model_path = 'linear_regression_model.pkl'

# Serialise the trained model with joblib so it can be reloaded later.
joblib.dump(value=model, filename=model_path)
print(f"model saved in {model_path}")