# *Importing the libraries*

In [2]:
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# *Loading the data*

In [None]:
# Raw string literal: the Windows path contains backslashes followed by letters
# ("\P", "\D", "\c"), which a normal string literal parses as invalid escape sequences.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(r"E:\Programming\Python for Data Science and Machine Learning\Datasets\california_housing_dataset.csv")
# Drop the columns this analysis does not use.
df = df.drop(columns = ['ocean_proximity', 'housing_median_age', 'latitude', 'longitude'])
# Restrict the analysis to the first 500 rows.
df = df.iloc[:500, :]
df.head()

# *Data Visualization*

In [None]:
# Grid of pairwise plots across every column of df, used to eyeball relationships and outliers.
sns.pairplot(df)
plt.show()

In [None]:
# One histogram per feature, laid out on a 2 x 3 grid of subplots.
column = ['total_rooms', 'total_bedrooms', 'households', 'population', 'median_income', 'median_house_value']
figure, axes = plt.subplots(nrows = 2, ncols = 3, figsize = (14, 6))
# Flatten the 2-D array of axes so each subplot can be addressed by a single position.
axes = axes.flatten()
for num, feature in enumerate(column):
    sns.histplot(df[feature], ax = axes[num])
plt.show()

# *Exploratory Data Analysis*

In [69]:
# Since the data contains outliers that might negatively impact our linear regression model, we replace each outlier value with the column's median, reducing the bias the outliers introduce. The histograms and pairplots above indicate the presence of outliers.

def correction(df, column_arr, factor=1.5):
    """Replace IQR outliers in the given columns with the column median.

    For every column in ``column_arr`` the fences ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` are computed, and any value strictly outside them is
    replaced by that column's median. Missing values are never treated as
    outliers and are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, so the caller's frame is not modified.
    column_arr : iterable of column labels
        Columns to treat. Each must exist in ``df``.
    factor : float, default 1.5
        Multiplier applied to the inter-quartile range when building the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the outliers of the listed columns replaced.
    """
    data = df.copy()
    for col in column_arr:
        series = data[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        reach = factor * (q3 - q1)
        lower_bound = q1 - reach
        upper_bound = q3 + reach
        # Comparisons against NaN evaluate to False, so missing values are not flagged.
        is_outlier = (series < lower_bound) | (series > upper_bound)
        # Vectorized replacement instead of a per-element Python lambda.
        data[col] = series.mask(is_outlier, series.median())
    return data

# Columns whose outliers are replaced; the treated frame is stored in cleaned_data.
arr = [
    'total_rooms',
    'total_bedrooms',
    'households',
    'population',
    'median_income',
    'median_house_value',
]
cleaned_data = correction(df, column_arr = arr)

# *Plotting boxplots to verify that the data has been treated so that the influence of outliers has been reduced/removed*

In [None]:
# One boxplot per treated feature, laid out on a 2 x 3 grid of subplots.
column = ['total_rooms', 'total_bedrooms', 'households', 'population', 'median_income', 'median_house_value']
figure, axes = plt.subplots(nrows = 2, ncols = 3, figsize = (15, 6))
# Flatten the 2-D array of axes so each subplot can be addressed by a single position.
axes = axes.flatten()
for num, feature in enumerate(column):
    sns.boxplot(data = cleaned_data, y = feature, ax = axes[num])
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of cleaned_data.
correlation_matrix = cleaned_data.corr()
sns.heatmap(correlation_matrix, annot = True, cmap = 'coolwarm')
plt.show()

*In the above heatmap, we can see that features such as total_rooms, total_bedrooms, population and households are strongly correlated with one another; hence each of them can be used to predict the others.*

# *Building the model for predicting the 'total_rooms' feature*

In [None]:
# Fill any remaining missing values with the column median so the regression receives no NaNs.
cleaned_data = cleaned_data.fillna(cleaned_data.median())
X = cleaned_data[['households', 'population', 'total_bedrooms']]
Y = cleaned_data['total_rooms']
# Fixed seed so the split, and therefore the reported metrics, are reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
model = LinearRegression()
model.fit(X_train, Y_train)
intercept = model.intercept_
# The model has one coefficient per feature, so report all of them instead of only coef_[0].
terms = ' '.join(f'{coef:+.2f} * {name}' for name, coef in zip(X.columns, model.coef_))
print(f'The regression equation was found to be {Y.name} = {intercept:.2f} {terms}')
Y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
r_square = r2_score(Y_test, Y_pred)
print(f'Mean Squared Error was found to be :  {mse:.2f}\nR² score was found to be : {r_square:.2f}')

# *Building the model for predicting the 'total_bedrooms' feature*

In [None]:
X = cleaned_data[['households', 'population', 'total_rooms']]
Y = cleaned_data['total_bedrooms']
# Fixed seed so the split, and therefore the reported metrics, are reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
model = LinearRegression()
model.fit(X_train, Y_train)
intercept = model.intercept_
# The model has one coefficient per feature, so report all of them instead of only coef_[0].
terms = ' '.join(f'{coef:+.2f} * {name}' for name, coef in zip(X.columns, model.coef_))
print(f'The regression equation was found to be {Y.name} = {intercept:.2f} {terms}')
Y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
r_square = r2_score(Y_test, Y_pred)
print(f'Mean Squared Error was found to be :  {mse:.2f}\nR² score was found to be : {r_square:.2f}')

# *Building the model for predicting the 'households' feature*

In [None]:
X = cleaned_data[['total_bedrooms', 'population', 'total_rooms']]
Y = cleaned_data['households']
# Fixed seed so the split, and therefore the reported metrics, are reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
model = LinearRegression()
model.fit(X_train, Y_train)
intercept = model.intercept_
# The model has one coefficient per feature, so report all of them instead of only coef_[0].
terms = ' '.join(f'{coef:+.2f} * {name}' for name, coef in zip(X.columns, model.coef_))
print(f'The regression equation was found to be {Y.name} = {intercept:.2f} {terms}')
Y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
r_square = r2_score(Y_test, Y_pred)
print(f'Mean Squared Error was found to be :  {mse:.2f}\nR² score was found to be : {r_square:.2f}')

# *Building the model for predicting the 'population' feature*

In [None]:
X = cleaned_data[['total_bedrooms', 'households', 'total_rooms']]
Y = cleaned_data['population']
# Fixed seed so the split, and therefore the reported metrics, are reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
model = LinearRegression()
model.fit(X_train, Y_train)
intercept = model.intercept_
# The model has one coefficient per feature, so report all of them instead of only coef_[0].
terms = ' '.join(f'{coef:+.2f} * {name}' for name, coef in zip(X.columns, model.coef_))
print(f'The regression equation was found to be {Y.name} = {intercept:.2f} {terms}')
Y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
r_square = r2_score(Y_test, Y_pred)
print(f'Mean Squared Error was found to be :  {mse:.2f}\nR² score was found to be : {r_square:.2f}')