In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as stm

In [3]:
# Load the house-price data from a relative CSV path and preview the first five rows.
# NOTE(review): the recorded preview shows a blank `waterbody` cell while the null
# count reports zero missing values for that column -- presumably the CSV holds a
# literal 'None' label; confirm how the installed pandas version parses it.
df = pd.read_csv(filepath_or_buffer='Data/House_Price.csv')
df.head(n=5)

Unnamed: 0,price,crime_rate,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,bus_ter,parks
0,24.0,0.00632,32.31,0.538,6.575,65.2,4.35,3.81,4.18,4.01,24.7,4.98,YES,5.48,11.192,River,23,YES,0.049347
1,21.6,0.02731,37.07,0.469,6.421,78.9,4.99,4.7,5.12,5.06,22.2,9.14,NO,7.332,12.1728,Lake,42,YES,0.046146
2,34.7,0.02729,37.07,0.469,7.185,61.1,5.03,4.86,5.01,4.97,22.2,4.03,NO,7.394,101.12,,38,YES,0.045764
3,33.4,0.03237,32.18,0.458,6.998,45.8,6.21,5.93,6.16,5.96,21.3,2.94,YES,9.268,11.2672,Lake,45,YES,0.047151
4,36.2,0.06905,32.18,0.458,7.147,54.2,6.16,5.86,6.37,5.86,21.3,5.33,NO,8.824,11.2896,Lake,55,YES,0.039474


In [4]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

price          0
crime_rate     0
resid_area     0
air_qual       0
room_num       0
age            0
dist1          0
dist2          0
dist3          0
dist4          0
teachers       0
poor_prop      0
airport        0
n_hos_beds     8
n_hot_rooms    0
waterbody      0
rainfall       0
bus_ter        0
parks          0
dtype: int64

In [5]:
# Fill the missing n_hos_beds entries with that column's mean; every other column is left as is.
df = df.fillna({'n_hos_beds': df['n_hos_beds'].mean()})

In [6]:
# Re-count missing values per column to check the result of the imputation.
df.isna().sum()

price          0
crime_rate     0
resid_area     0
air_qual       0
room_num       0
age            0
dist1          0
dist2          0
dist3          0
dist4          0
teachers       0
poor_prop      0
airport        0
n_hos_beds     0
n_hot_rooms    0
waterbody      0
rainfall       0
bus_ter        0
parks          0
dtype: int64

In [7]:
# Pairwise correlation matrix of the numeric columns only.
# The frame also holds text columns (airport, waterbody, bus_ter). pandas >= 2.0 no
# longer drops those silently inside .corr() and raises instead, so restrict the
# frame to numeric dtypes explicitly; this works on older pandas versions as well.
df.select_dtypes(include='number').corr()

Unnamed: 0,price,crime_rate,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,poor_prop,n_hos_beds,n_hot_rooms,rainfall,parks
price,1.0,-0.389582,-0.484754,-0.4293,0.696304,-0.377999,0.251355,0.249459,0.24665,0.2482,0.505655,-0.740836,0.10888,0.023122,-0.047426,-0.391574
crime_rate,-0.389582,1.0,0.406583,0.420972,-0.219247,0.352734,-0.38005,-0.379813,-0.380069,-0.376462,-0.289946,0.455621,0.017371,0.013518,0.059204,0.383235
resid_area,-0.484754,0.406583,1.0,0.763651,-0.391676,0.644779,-0.706481,-0.707956,-0.707566,-0.705819,-0.383248,0.6038,0.005799,-0.000839,0.05581,0.707635
air_qual,-0.4293,0.420972,0.763651,1.0,-0.302188,0.73147,-0.768589,-0.769724,-0.769157,-0.764873,-0.188933,0.590879,-0.049553,-0.004882,0.092104,0.915544
room_num,0.696304,-0.219247,-0.391676,-0.302188,1.0,-0.240265,0.208464,0.203981,0.201907,0.205397,0.355501,-0.613808,0.032009,0.030674,-0.064694,-0.282817
age,-0.377999,0.352734,0.644779,0.73147,-0.240265,1.0,-0.746904,-0.746493,-0.747021,-0.746707,-0.261515,0.602339,-0.021012,0.00938,0.075198,0.67385
dist1,0.251355,-0.38005,-0.706481,-0.768589,0.208464,-0.746904,1.0,0.997905,0.997735,0.994073,0.232834,-0.498823,-0.030365,-0.014463,-0.036794,-0.706319
dist2,0.249459,-0.379813,-0.707956,-0.769724,0.203981,-0.746493,0.997905,1.0,0.998097,0.994003,0.233707,-0.495693,-0.031071,-0.010239,-0.038005,-0.708237
dist3,0.24665,-0.380069,-0.707566,-0.769157,0.201907,-0.747021,0.997735,0.998097,1.0,0.994126,0.233588,-0.49429,-0.028315,-0.010077,-0.04147,-0.709346
dist4,0.2482,-0.376462,-0.705819,-0.764873,0.205397,-0.746707,0.994073,0.994003,0.994126,1.0,0.228256,-0.496084,-0.021534,-0.00585,-0.032542,-0.703508


In [8]:
# Predictor (room_num) and response (price) for the single-variable regression.
x, y = df['room_num'], df['price']

In [9]:
# Least-squares slope: sum of the x/y cross-deviations from their means,
# divided by the sum of squared x deviations.

slope = x.sub(x.mean()).mul(y.sub(y.mean())).sum() / np.square(x.sub(x.mean())).sum()
slope

9.099669663064661

In [10]:
# Least-squares intercept: the fitted line passes through (mean of x, mean of y).

intercept = y.mean() - slope * x.mean()
intercept

-34.65924312309723

In [11]:
# Residual sum of squares (RSS): squared gaps between the observed y and the
# fitted line, summed over all rows. It feeds the RSE and R-squared cells.

rss = np.square(y - intercept - slope * x).sum()
rss

21934.391586319405

In [12]:
# Residual standard error (RSE): square root of RSS over (number of rows - 2),
# the 2 accounting for the fitted slope and intercept.

rse = np.sqrt(rss / (len(x) - 2))
rse

6.597015857705984

In [13]:
# R-squared: one minus the ratio of the residual sum of squares to the
# total sum of squares (squared deviations of y from its mean).

tss = np.square(y.sub(y.mean())).sum()
r2 = 1 - rss / tss
r2

0.48483897365368667

In [14]:
# Fitted values from the line y = m*x + b, where
#   m is the slope,
#   x is the independent variable (one value per row),
#   b is the intercept.

y_pred = slope * x + intercept
y_pred

0      25.171085
1      23.769736
2      30.721883
3      29.020245
4      30.376096
         ...    
501    25.334879
502    21.030735
503    28.820052
504    27.163913
505    20.211765
Name: room_num, Length: 506, dtype: float64

In [24]:
# Fit the same price ~ room_num regression with statsmodels OLS.
# add_constant supplies the column used for the intercept term.

y = df['price']
X = stm.add_constant(df['room_num'])

model_ols = stm.OLS(endog=y, exog=X)
result = model_ols.fit()
result.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.485
Model:,OLS,Adj. R-squared:,0.484
Method:,Least Squares,F-statistic:,474.3
Date:,"Mon, 07 Dec 2020",Prob (F-statistic):,1.31e-74
Time:,16:23:12,Log-Likelihood:,-1671.6
No. Observations:,506,AIC:,3347.0
Df Residuals:,504,BIC:,3356.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-34.6592,2.642,-13.118,0.000,-39.850,-29.468
room_num,9.0997,0.418,21.779,0.000,8.279,9.921

0,1,2,3
Omnibus:,103.753,Durbin-Watson:,0.681
Prob(Omnibus):,0.0,Jarque-Bera (JB):,633.429
Skew:,0.729,Prob(JB):,2.84e-138
Kurtosis:,8.284,Cond. No.,58.4


In [29]:
# Print the fitted coefficients and their standard errors, each followed by a
# blank-line separator, then the R-squared of the fit.
for label, value in (('Parameters: ', result.params), ('Standard Error: ', result.bse)):
    print(label, value)
    print('\n')
print('r2: ', result.rsquared)

Parameters:  const      -34.659243
room_num     9.099670
dtype: float64


Standard Error:  const       2.642136
room_num    0.417814
dtype: float64


r2:  0.4848389736536869
