# IMPORTING LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# plt.style.use("dark_background")
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Read the transformed housing dataset from the Kaggle input directory
housing_csv_path = '/kaggle/input/housingdataset2/Transformed_Housing_Data2.csv'
data = pd.read_csv(housing_csv_path)

In [None]:
# Preview the first five rows of the raw frame
data.head(n=5)

# SCALING THE DATASET

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Target column, kept in its original (unscaled) units
Y = data['Sale_Price']
# Every column except the target is treated as a predictor
feature_frame = data.drop(columns=['Sale_Price'])
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# performed further down, so test rows contribute to the scaling statistics.
scaled_values = scaler.fit_transform(feature_frame)
# Wrap the scaled array back into a DataFrame carrying the original column names
X = pd.DataFrame(data=scaled_values, columns=feature_frame.columns)
X.head()

# Checking correlation among the independent variables & Removing Multicollinearity

In [None]:
# Pairwise Pearson correlation matrix of the scaled predictors
X.corr(method='pearson')

In [None]:
# Find pairs of independent variables whose correlation is strong in either direction (|corr| > 0.5).
# The absolute value must be taken of the correlation itself: `corr > abs(0.5)` only compares
# against the constant 0.5 and would silently miss strongly negative correlations.
k = X.corr()
z = [
    [str(i), str(j)]
    for i in k.columns
    for j in k.columns
    if i != j and abs(k.loc[i, j]) > 0.5
]
# Each pair appears twice here, as (a, b) and (b, a)
z, len(z)

In [None]:
# Collapse mirrored pairs: sorting the two names makes (a, b) and (b, a) identical,
# and the set keeps a single copy of each.
z0 = set()
for pair in z:
    z0.add(tuple(sorted(pair)))
z0, len(z0)

# TREATING MULTICOLLINEARITY

In [None]:
# Bring in statsmodels' variance inflation factor under the short alias VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
vif_data = X
# Compute one VIF score per feature, walking the columns by position
vif_scores = []
for column_position in range(vif_data.shape[1]):
    vif_scores.append(VIF(vif_data.values, column_position))
# Label each score with its column name
vif_table = pd.Series(vif_scores, index=vif_data.columns)
vif_table

Clearly, only 7 variables show multicollinearity (VIF > 5).

In [None]:
# Label of the feature with the largest VIF (first one if several tie)
vif_table.idxmax()

In [None]:

def Mc_remover(data, threshold=5):
    """Drop the single column with the highest VIF if it exceeds ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame of numeric predictors; a VIF is computed for every column.
    threshold : float, default 5
        VIF value above which the worst column is removed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the highest-VIF column when that VIF is above
        ``threshold``; otherwise ``data`` itself, unchanged.

    Notes
    -----
    Prints which column was removed, or that no column exceeded the
    threshold. Only one column is removed per call, so call repeatedly
    to remove several.
    """
    vif_table = pd.Series(
        [VIF(data.values, i) for i in range(data.shape[1])],
        index=data.columns,
    )
    if vif_table.max() > threshold:
        # idxmax returns the label of the first maximum
        worst_column = vif_table.idxmax()
        print(worst_column, 'has been removed')
        return data.drop(columns=[worst_column])
    print('No multicollinearity present anymore')
    return data

In [None]:
# Remove the highest-VIF column one at a time until Mc_remover leaves the frame unchanged.
# Dropping one column changes the VIF of the others, so the number of removals cannot be
# fixed in advance; looping to convergence also makes this cell safe to re-run.
while vif_data.shape[1] > 1:
    reduced = Mc_remover(vif_data)
    if reduced.shape[1] == vif_data.shape[1]:
        # Nothing was dropped: every remaining VIF is within the threshold
        break
    vif_data = reduced
vif_data.head()

# REMAINING COLUMNS

In [None]:
# Recompute the VIF of every column that survived the removal step
remaining_matrix = vif_data.values
remaining_vifs = [
    VIF(remaining_matrix, position)
    for position, _ in enumerate(vif_data.columns)
]
vif_table = pd.Series(remaining_vifs, index=vif_data.columns)
vif_table, len(vif_table)

# Train Test Set Bifurcation of the model

In [None]:
# Predictors: the columns kept after VIF filtering; target: the unscaled sale price
x, y = vif_data, data['Sale_Price']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(x, y, test_size=0.3, random_state=101)
x_train, x_test, y_train, y_test = split_parts

# Shapes of the four pieces, in the order they were returned
tuple(part.shape for part in split_parts)

# Linear Regression using SkLearn

In [None]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
# No `normalize` argument is used here. Whether an intercept is estimated is governed by
# `fit_intercept`, which is left at its default, so the model fits an intercept.
lr.fit(x_train, y_train) # fits by ordinary least squares (a direct solve, not gradient descent)

In [None]:
lr.coef_  # coefficients m1..mn fitted by LinearRegression, one per column of x_train
# final equation => y = m0 + m1.x1 + m2.x2 + m3.x3 + ..... + mn.xn
# Only the features were standardized; Sale_Price was left in its original units, so the
# intercept m0 is generally NOT 0 -- inspect lr.intercept_ for its fitted value.

In [None]:
# Predict sale prices for the held-out rows
predictions = lr.predict(X=x_test)
# Coefficient of determination (R-squared) on the test set
lr.score(X=x_test, y=y_test)

# Verifying Assumptions of LinearRegression

# 1.Residuals

In [None]:
# Residual is taken here as predicted minus observed, so a positive value means over-prediction
residuals = predictions - y_test
# Pair each residual with its prediction and order the rows by predicted value
residual_table = (
    pd.DataFrame({'residuals': residuals, 'predictions': predictions})
    .sort_values(by='predictions')
)

In [None]:
# Coordinates of the horizontal y = 0 reference line drawn on the residual plot.
# A straight line only needs its two endpoints; building one point per unit of the largest
# prediction would allocate two very large lists for no visual difference.
max_prediction = residual_table['predictions'].max()
z = [0, max_prediction]
k = [0, 0]

In [None]:
plt.figure(dpi = 130, figsize=(17, 7))

# Residuals plotted against the fitted (predicted) values
plt.scatter(residual_table['predictions'], residual_table['residuals'], color = 'red', s=2)
# z / k trace the horizontal y = 0 line: it marks zero residual and is not a fitted
# regression line, so it is labelled accordingly.
plt.plot(z, k, color = 'green', linewidth = 3, label = 'zero residual reference')
# Clip the y-axis so the dense band of residuals stays readable
plt.ylim(-800000, 800000)
plt.xlabel('fitted points: ordered by predictions')
plt.ylabel('residuals')
plt.title('Residual plot')
plt.legend()
plt.show()

Most of the residuals are densely concentrated between -200,000 and 200,000, i.e. roughly centred around zero, which is consistent with (though not proof of) normally distributed residuals.
However, the residual plot resembles neither a cone shape nor a pipe shape, so there is some room for improvement in the data.
There are also some outliers.

# 2.Distribution of Errors

In [None]:
# Histogram of the residuals on an explicit figure/axes pair
fig, ax = plt.subplots(dpi=100, figsize=(10, 7))
ax.hist(residual_table['residuals'], color='blue', bins=200)
ax.set_xlabel('residuals')
ax.set_ylabel('frequency')
ax.set_title('Distribution of residuals')
plt.show()

Clearly, the bulk of the residuals are approximately normally distributed. Also,
there are some outliers on the far right.

# MODEL COEFFICIENTS

In [None]:
# One row per feature with its fitted coefficient, ordered from most negative to most positive
coeff_table = (
    pd.DataFrame({'columns': x_train.columns, 'coefficients': lr.coef_})
    .sort_values(by='coefficients')
)

In [None]:
plt.figure(figsize=(8, 6), dpi = 120)
# Use dedicated names for the plotted series: reusing `x` and `y` here would overwrite the
# feature matrix and target defined for the train/test split, breaking any re-run of those cells.
coeff_names = coeff_table['columns']
coeff_values = coeff_table['coefficients']
# Horizontal bars: one per feature, length equal to its fitted coefficient
plt.barh(coeff_names, coeff_values)
plt.xlabel('Coefficients')
plt.ylabel('Variables')
plt.title('Normalized Coefficients plot')
plt.show()

Zip code has the highest coefficient; therefore, location plays a major role in sale price.
The area of the house and its overall grade also play major roles.
Years since renovation has a significant negative coefficient, which means customers are more likely to buy houses that were recently renovated.
One important thing to note is that Longitude has a negative coefficient whereas Latitude has a positive one. This can tell us about the geographical pattern of the sale prices.