In [None]:
# Import relevant libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Load the Boston housing CSV from the Kaggle input directory.
raw_data = pd.read_csv(
    filepath_or_buffer='../input/boston-housing-dataset/HousingData.csv',
)
# Preview the first five rows.
raw_data.head(n=5)

CRIM - per capita crime rate by town

ZN - proportion of residential land zoned for lots over 25,000 sq.ft.

INDUS - proportion of non-retail business acres per town.

CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)

NOX - nitric oxides concentration (parts per 10 million)

RM - average number of rooms per dwelling

AGE - proportion of owner-occupied units built prior to 1940

DIS - weighted distances to five Boston employment centres

RAD - index of accessibility to radial highways

TAX - full-value property-tax rate per $10,000

PTRATIO - pupil-teacher ratio by town

B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town

LSTAT - percentage of the population classed as lower status

MEDV - Median value of owner-occupied homes in $1000's

In [None]:
raw_data.describe(include='all')

In [None]:
raw_data.info()

In [None]:
data=raw_data.copy()

In [None]:
data.dropna(axis=0,inplace = True)     # Drop null values

In [None]:
data.isnull().sum()

### Treating Outliers by IQR method

In [None]:
# Per-column first and third quartiles, and their difference (the IQR).
q1 = data.quantile(q=0.25)
q3 = data.quantile(q=0.75)
iqr = q3.sub(q1)
iqr

In [None]:
# Keep only the rows in which every column lies inside the Tukey fences
# [q1 - 1.5 * IQR, q3 + 1.5 * IQR]. Both fences use the same 1.5 multiplier
# so that low and high outliers are treated symmetrically.
# NOTE(review): for a binary column such as CHAS the IQR may be 0, in which
# case every row holding the minority value is dropped - confirm this is intended.
data_clean=data[~((data < (q1 - 1.5 * iqr)) | (data > (q3 + 1.5 * iqr))).any(axis=1)]
# Skewness of each column after outlier removal.
data_clean.skew()

In [None]:
# Pairwise scatter plots (and per-column distributions) of the cleaned data.
sns.pairplot(data=data_clean)

In [None]:
# Natural log of the target (median home value).
log_medv=np.log(data_clean['MEDV'])
# data_clean is a filtered selection of `data`, so writing a column straight
# into it is a chained assignment (SettingWithCopyWarning, which the
# notebook-wide warnings filter would hide). assign() returns a new frame
# that owns its data and carries the extra column.
data_clean = data_clean.assign(log_MEDV=log_medv)
data_clean.head()

In [None]:
data_clean=data_clean.drop(['MEDV'],axis = 1)

In [None]:
data_clean.corr().abs() # Analysing Correlation 

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor  # Checking for Multi-collinearity
variables = data_clean[['PTRATIO','INDUS','AGE','RM','LSTAT']]
# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept itself. Without a constant column the
# resulting VIFs are uncentred and inflated for features whose mean is far
# from zero, so prepend one explicitly.
design = sm.add_constant(variables, prepend=True, has_constant='add').values
vif = pd.DataFrame()
# Column 0 of `design` is the constant; feature i sits at position i + 1.
vif["VIF"] = [variance_inflation_factor(design, i + 1) for i in range(variables.shape[1])]
vif["Features"] = variables.columns
vif

In [None]:
data_clean.drop('PTRATIO',axis=1,inplace= True)

In [None]:
data_pre=data_clean.copy()

In [None]:
data_pre = data_clean[['INDUS','AGE','RM','LSTAT','log_MEDV']]

# Building Linear Model

In [None]:
# Dependent variable: the log of the median home value.
targets = data_pre['log_MEDV']
# Independent variables: every other column of data_pre.
inputs = data_pre.drop(columns=['log_MEDV'])

In [None]:
from sklearn.preprocessing import StandardScaler    # Scaling Data

In [None]:
scaler=StandardScaler()
scaler.fit(inputs)

In [None]:
inputs_sc = scaler.transform(inputs) 

In [None]:
from sklearn.feature_selection import f_regression     # Checking p_values

In [None]:
f_regression(inputs_sc,targets)

In [None]:
p_values = f_regression(inputs_sc,targets)[1]
p_values.round(3)

In [None]:
from sklearn.model_selection import train_test_split    # Splitting data in Train - Test
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    inputs_sc,
    targets,
    test_size=0.2,
    random_state=365,
)

In [None]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

In [None]:
reg.fit(x_train,y_train)

In [None]:
y_hat = reg.predict(x_train)

In [None]:
sns.scatterplot(x=y_train,y=y_hat)

In [None]:
# Distribution of the training residuals (log scale).
# sns.distplot is deprecated; histplot with a density-normalised histogram
# and a KDE overlay is the documented replacement for its default output.
sns.histplot(y_train - y_hat, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# R2 of the fitted model on the training split.
reg.score(X=x_train, y=y_train)

In [None]:
def adj_r2(x,y,model=None):
    """Return the adjusted R-squared of a fitted regressor on ``(x, y)``.

    Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), where ``n`` is the
    number of rows of ``x`` and ``p`` its number of columns.

    Parameters
    ----------
    x : 2-D array-like exposing ``.shape``
        Feature matrix passed to the model's ``score`` method.
    y : array-like
        Target values passed to the model's ``score`` method.
    model : object with a ``score(x, y)`` method, optional
        Fitted estimator to evaluate. When omitted, the notebook-level
        ``reg`` is used, which is the original behaviour.

    Returns
    -------
    float
        The adjusted R-squared.

    Raises
    ------
    ValueError
        If ``n - p - 1`` is not positive, in which case the adjustment is
        undefined.
    """
    estimator = reg if model is None else model
    r2 = estimator.score(x, y)
    n, p = x.shape[0], x.shape[1]
    dof = n - p - 1
    if dof <= 0:
        raise ValueError(
            "adjusted R2 needs more observations than features + 1 "
            "(got n=%d, p=%d)" % (n, p)
        )
    return 1 - (1 - r2) * (n - 1) / dof

In [None]:
# Adjusted R2 on the training split, the same rows used for the plain R2
# above; scoring on the full data set would mix training and held-out rows.
adj_r2(x_train, y_train)

In [None]:
reg.intercept_

In [None]:
reg.coef_

In [None]:
y_hat_test = reg.predict(x_test)

In [None]:
# Undo the log transform so the predictions are on the original target scale.
df_pf = pd.DataFrame({'Prediction': np.exp(y_hat_test)})
df_pf.head(n=5)

In [None]:
# y_test still carries the row labels it inherited from the source frame,
# while df_pf has a fresh 0..n-1 index. Assigning the Series directly would
# align on those labels and leave most rows NaN, so pass the raw values to
# make the assignment positional.
df_pf['Target'] = np.exp(y_test.to_numpy())
df_pf

In [None]:
y_test = y_test.reset_index(drop=True)
y_test.head()

In [None]:
df_pf['Target'] = np.exp(y_test)
df_pf

In [None]:
df_pf['Residual'] = df_pf['Target'] - df_pf['Prediction']

In [None]:
df_pf['Difference%'] = np.absolute(df_pf['Residual']/df_pf['Target']*100)
df_pf

In [None]:
# Allow up to 999 rows to be rendered so the whole table can be inspected.
pd.set_option('display.max_rows', 999)
# Show floats with two digits after the decimal point for readability.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Order the rows by percentage difference to check the model manually.
df_pf.sort_values(by='Difference%')

END