In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the Saratoga houses data; the relative path is resolved against the working directory.
df = pd.read_csv('SaratogaHouses.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,heating,fuel,sewer,waterfront,newConstruction,centralAir
0,1,132500,0.09,42,50000,906,35,2,1,1.0,5,electric,electric,septic,No,No,No
1,2,181115,0.92,0,22300,1953,51,3,0,2.5,6,hot water/steam,gas,septic,No,No,No
2,3,109000,0.19,133,7300,1944,51,4,1,1.0,8,hot water/steam,gas,public/commercial,No,No,No
3,4,155000,0.41,13,18700,1944,51,3,1,1.5,5,hot air,gas,septic,No,No,No
4,5,86060,0.11,0,15000,840,51,2,0,1.0,3,hot air,gas,public/commercial,No,Yes,Yes


In [4]:
# Distinct categories present in the heating column.
df['heating'].unique()

array(['electric', 'hot water/steam', 'hot air'], dtype=object)

In [5]:
# Distinct categories present in the newConstruction column.
df['newConstruction'].unique()

array(['No', 'Yes'], dtype=object)

In [6]:
# Distinct categories present in the sewer column.
df['sewer'].unique()

array(['septic', 'public/commercial', 'none'], dtype=object)

In [7]:
# Distinct categories present in the fuel column.
df['fuel'].unique()

array(['electric', 'gas', 'oil'], dtype=object)

In [8]:
# Encode the three Yes/No columns as integer codes.
# The shared encoder is fitted ONCE, on the labels of all three columns together.
# Previously `lab` was refitted per column, so the object reused (and pickled)
# later only reflected whichever column was fitted last.
binary_cols = ['newConstruction', 'centralAir', 'waterfront']
lab = LabelEncoder()
lab.fit(df[binary_cols].values.ravel())
for col in binary_cols:
    df[col] = lab.transform(df[col])

In [9]:
# Preview the frame again after label-encoding the Yes/No columns.
df.head()

Unnamed: 0.1,Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,heating,fuel,sewer,waterfront,newConstruction,centralAir
0,1,132500,0.09,42,50000,906,35,2,1,1.0,5,electric,electric,septic,0,0,0
1,2,181115,0.92,0,22300,1953,51,3,0,2.5,6,hot water/steam,gas,septic,0,0,0
2,3,109000,0.19,133,7300,1944,51,4,1,1.0,8,hot water/steam,gas,public/commercial,0,0,0
3,4,155000,0.41,13,18700,1944,51,3,1,1.5,5,hot air,gas,septic,0,0,0
4,5,86060,0.11,0,15000,840,51,2,0,1.0,3,hot air,gas,public/commercial,0,1,1


In [10]:
# Pull each multi-category column out as its own single-column DataFrame
# (double brackets keep the 2-D shape passed to the encoders below).
sewer, fuel, heating = (df[[name]] for name in ('sewer', 'fuel', 'heating'))

In [11]:
# df[['price']]

In [12]:
# Fit a dedicated one-hot encoder on the sewer column, then encode that column with it.
sewer_onehot = OneHotEncoder().fit(sewer)
sewer = sewer_onehot.transform(sewer)

In [13]:
# Fit a dedicated one-hot encoder on the fuel column, then encode that column with it.
fuel_onehot = OneHotEncoder().fit(fuel)
fuel = fuel_onehot.transform(fuel)

In [14]:
# Fit a dedicated one-hot encoder on the heating column, then encode that column with it.
heat_onehot = OneHotEncoder().fit(heating)
heating = heat_onehot.transform(heating)

In [15]:
# Inspect what the encoder returned for the sewer column.
sewer

<1728x3 sparse matrix of type '<class 'numpy.float64'>'
	with 1728 stored elements in Compressed Sparse Row format>

In [16]:
# Convert the encoder output into a regular array via toarray().
sewer = sewer.toarray()

In [17]:
# Same conversion for the fuel and heating encodings.
fuel, heating = fuel.toarray(), heating.toarray()

In [18]:
# Show the sewer encoding after the toarray() conversion.
sewer

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [19]:
# Remove the 'Unnamed: 0' column and the three categorical columns that were
# one-hot encoded separately above.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second run no longer raises KeyError for already-dropped columns.
df = df.drop(columns=['Unnamed: 0','heating','sewer','fuel'], errors='ignore')

In [20]:
# Preview the frame after dropping the columns above.
df.head()

Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,waterfront,newConstruction,centralAir
0,132500,0.09,42,50000,906,35,2,1,1.0,5,0,0,0
1,181115,0.92,0,22300,1953,51,3,0,2.5,6,0,0,0
2,109000,0.19,133,7300,1944,51,4,1,1.0,8,0,0,0
3,155000,0.41,13,18700,1944,51,3,1,1.5,5,0,0,0
4,86060,0.11,0,15000,840,51,2,0,1.0,3,0,1,1


In [21]:
# Features are every remaining column except the target. Selecting by name
# (rather than the positional slice iloc[:, 1:]) keeps this correct even if
# 'price' is not the first column.
X = df.drop(columns=['price']).values
# Target: the sale price.
y = df['price'].values

In [22]:
# Append the sewer, fuel and heating one-hot blocks as extra columns of X.
X = np.hstack((X, sewer, fuel, heating))

In [23]:
# Check the dimensions of X after appending the one-hot columns.
X.shape

(1728, 21)

In [24]:
# First feature row, before any scaling.
X[0]

array([9.00e-02, 4.20e+01, 5.00e+04, 9.06e+02, 3.50e+01, 2.00e+00,
       1.00e+00, 1.00e+00, 5.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 1.00e+00, 1.00e+00, 0.00e+00, 0.00e+00,
       1.00e+00, 0.00e+00, 0.00e+00])

In [25]:
# Scale features and target to [0, 1] with SEPARATE scalers.
# Previously one scaler was fitted on X and then refitted on y, which discarded
# the feature ranges and made it impossible to scale new inputs for prediction.
x_minmax = MinMaxScaler()
X = x_minmax.fit_transform(X)

# `minmax` remains the target scaler, so minmax.inverse_transform(...) still
# maps predictions back to price units.
minmax = MinMaxScaler()
y = minmax.fit_transform(y.reshape(-1,1))

In [26]:
# First feature row again, now after min-max scaling.
X[0]

array([0.00737705, 0.18666667, 0.12075655, 0.06287944, 0.24193548,
       0.16666667, 0.25      , 0.22222222, 0.3       , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        ])

In [27]:
# First target value after min-max scaling (a one-element row because y was reshaped to a column).
y[0]

array([0.16558442])

In [28]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [29]:
# Dimensions of the training feature matrix.
x_train.shape

(1296, 21)

In [30]:
# Dimensions of the held-out feature matrix.
x_test.shape

(432, 21)

In [31]:
# Fit an ordinary linear regression on the training split; fit() returns the
# estimator itself, so it can be bound directly and then displayed.
regression = LinearRegression().fit(x_train, y_train)
regression

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [32]:
# Fitted coefficients, one per column of X (in the scaled feature space).
regression.coef_

array([[ 1.36001226e-01, -3.79873982e-02,  5.02267144e-01,
         4.31461701e-01, -1.80011670e-03, -5.57193252e-02,
        -1.05561725e-02,  1.44497607e-01,  3.87880033e-02,
         1.41026950e-01, -6.68925941e-02,  1.20989620e-02,
        -2.18934684e-02,  1.18268083e-02,  1.00666601e-02,
        -5.46441301e-03,  5.63932067e-03, -1.74907654e-04,
         2.53103353e-03,  3.64176303e-03, -6.17279656e-03]])

In [33]:
# Predict on the held-out rows; outputs are in the same scaled units as y.
y_pred = regression.predict(x_test)

In [34]:
# Mean squared error on the held-out split (computed on the scaled target).
mean_squared_error(y_true=y_test, y_pred=y_pred)

0.004323285516773255

In [35]:
# Coefficient of determination on the held-out split.
r2_score(y_true=y_test, y_pred=y_pred)

0.6810297917918844

In [83]:
# The estimator's own score on the held-out split, shown next to r2_score above.
regression.score(X=x_test, y=y_test)

0.6810297917918844

In [84]:
from statsmodels.api import OLS

In [86]:
# Fit statsmodels OLS on the training split to get the inference table.
# x_train is passed as-is; no constant column is appended here.
ols_result = OLS(endog=y_train, exog=x_train).fit()
ols_result.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.644
Model:,OLS,Adj. R-squared:,0.639
Method:,Least Squares,F-statistic:,128.1
Date:,"Thu, 19 Nov 2020",Prob (F-statistic):,1.74e-270
Time:,11:03:24,Log-Likelihood:,1460.4
No. Observations:,1296,AIC:,-2883.0
Df Residuals:,1277,BIC:,-2785.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.1360,0.043,3.140,0.002,0.051,0.221
x2,-0.0380,0.021,-1.811,0.070,-0.079,0.003
x3,0.5023,0.032,15.893,0.000,0.440,0.564
x4,0.4315,0.033,13.078,0.000,0.367,0.496
x5,-0.0018,0.015,-0.124,0.902,-0.030,0.027
x6,-0.0557,0.024,-2.323,0.020,-0.103,-0.009
x7,-0.0106,0.018,-0.572,0.568,-0.047,0.026
x8,0.1445,0.023,6.149,0.000,0.098,0.191
x9,0.0388,0.015,2.576,0.010,0.009,0.068

0,1,2,3
Omnibus:,461.925,Durbin-Watson:,1.963
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3855.541
Skew:,1.416,Prob(JB):,0.0
Kurtosis:,10.961,Cond. No.,1.77e+16


In [36]:
# df.head()

In [37]:
# df.insert(loc=1,column='inserted',value=df['rooms']/2)

In [38]:
# df.head()

In [39]:
# df = df.reindex(columns=['price','age'])

In [39]:
# Map the first five predictions back to price units with the scaler fitted on y.
first_five_scaled = y_pred[:5]
minmax.inverse_transform(first_five_scaled)

array([[105602.73962659],
       [216379.50394752],
       [161066.16839908],
       [198717.30156509],
       [ 95433.14311489]])

In [64]:
# Raw categorical inputs for a single hand-built example house.
newCons, waterfront, centralAir = 'Yes', 'No', 'Yes'
sewer_test, fuel_test, heating_test = 'septic', 'gas', 'electric'

In [65]:
# Encode the single newConstruction label; unpack the one-element result.
(newCons,) = lab.transform([newCons])

In [66]:
# Encode the single waterfront label; unpack the one-element result.
(waterfront,) = lab.transform([waterfront])

In [67]:
# Encode the single centralAir label; unpack the one-element result.
(centralAir,) = lab.transform([centralAir])

In [46]:
# One-hot encode the chosen sewer category as a flat vector.
# Note: this rebinds `sewer`, which earlier held the training encoding.
sewer = sewer_onehot.transform([[sewer_test]]).toarray().ravel()

In [47]:
# One-hot encode the chosen fuel category as a flat vector.
# Note: this rebinds `fuel`, which earlier held the training encoding.
fuel = fuel_onehot.transform([[fuel_test]]).toarray().ravel()

In [48]:
# One-hot encode the chosen heating category as a flat vector.
heat = heat_onehot.transform([[heating_test]]).toarray().ravel()

In [68]:
# Show the three encoded Yes/No flags.
newCons, waterfront, centralAir

(1, 0, 1)

In [72]:
# Nine numeric values followed by the three encoded flags
# (waterfront, newConstruction, centralAir), as a single-row 2-D array.
numeric_part = [0.09, 38, 50000, 906, 35, 2, 2, 2.0, 2]
flag_part = [waterfront, newCons, centralAir]
test_x = np.array([numeric_part + flag_part])

In [75]:
# Append the sewer, fuel and heating one-hot vectors as extra columns of the single row.
onehot_rows = [np.atleast_2d(vec) for vec in (sewer, fuel, heat)]
test_x = np.hstack([test_x] + onehot_rows)

In [76]:
# Display the assembled feature row.
test_x

array([[9.00e-02, 3.80e+01, 5.00e+04, 9.06e+02, 3.50e+01, 2.00e+00,
        2.00e+00, 2.00e+00, 2.00e+00, 0.00e+00, 1.00e+00, 1.00e+00,
        0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 1.00e+00, 0.00e+00,
        1.00e+00, 0.00e+00, 0.00e+00]])

In [77]:
# NOTE(review): test_x holds raw, unscaled feature values, while `regression`
# was fitted on min-max-scaled features and a min-max-scaled price. The value
# returned here is therefore not a price: the row needs the same scaling that
# was applied to X, and the output needs to be inverse-scaled like y_pred.
regression.predict(test_x)

array([[25503.01031664]])

In [78]:
import pickle as pkl

In [79]:
# Serialization/DeSerialization

In [80]:
# Persist the fitted regression model. The context manager closes the file
# even if dump() raises, which the manual open()/close() pair did not.
with open('model.pkl','wb') as file:
    pkl.dump(regression, file)

In [81]:
# Persist the label encoder used for the Yes/No columns. The context manager
# closes the file even if dump() raises.
with open('label_encoder.pkl','wb') as file:
    pkl.dump(lab, file)

In [82]:
# Persist the three fitted one-hot encoders, one file each. A single loop
# replaces three copy-pasted open/dump/close stanzas, and the context manager
# guarantees each file is closed even if dump() raises.
for path, encoder in (
    ('sewer.pkl', sewer_onehot),
    ('fuel.pkl', fuel_onehot),
    ('heat.pkl', heat_onehot),
):
    with open(path, 'wb') as file:
        pkl.dump(encoder, file)