In [1]:
import pandas as pd
import numpy as np

# phase III : modelling

In [2]:
data = pd.read_excel('cleaned_data.xlsx')
data = data.drop(['Unnamed: 0'], axis=1)
data

Unnamed: 0,location,total_sqft,bath,price,BHK
0,1st Block Jayanagar,2850.0,4,428.0,4
1,1st Block Jayanagar,1630.0,3,194.0,3
2,1st Block Jayanagar,1875.0,2,235.0,3
3,1st Block Jayanagar,1200.0,2,130.0,3
4,1st Block Jayanagar,1235.0,2,148.0,2
...,...,...,...,...,...
7356,other,1200.0,2,70.0,2
7357,other,1800.0,1,200.0,1
7358,other,1353.0,2,110.0,2
7359,other,812.0,1,26.0,1


In [3]:
X = data.drop(['price'], axis=1)
y = data[['price']]


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5888, 4)
(1473, 4)
(5888, 1)
(1473, 1)


# applying linear regression

In [7]:
column_trans = make_column_transformer((OneHotEncoder(sparse=False), ['location']), remainder='passthrough')

In [8]:
scaler = StandardScaler()

In [9]:
model = LinearRegression(normalize=True)
pipe = make_pipeline(column_trans, scaler, model)
pipe.fit(X_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression(normalize=True))])

In [10]:
y_pred_model = pipe.predict(X_test)

In [11]:
r2_score(y_test, y_pred_model)

0.8788054229820892

In [12]:
import pickle
filename = 'bangalore_housing_pred.pkl'
pickle.dump(pipe, open(filename, 'wb'))