In [72]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### 1) Load the input data

In [74]:
input_path = '/gdrive/My Drive/Lemalabs/Assignments/data/'

In [75]:
raw_data = pd.read_csv(input_path + 'Chennai_house_price_multivariate_train.csv')
raw_data.head()

Unnamed: 0,PRT_ID,AREA,INT_SQFT,DATE_SALE,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,DATE_BUILD,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS,SALES_PRICE
0,P03210,Karapakkam,1004,04-05-2011,131,1.0,1.0,3,AbNormal,Yes,15-05-1967,Commercial,AllPub,Paved,A,4.0,3.9,4.9,4.33,380000,144400,7600000
1,P09411,Anna Nagar,1986,19-12-2006,26,2.0,1.0,5,AbNormal,No,22-12-1995,Commercial,AllPub,Gravel,RH,4.9,4.2,2.5,3.765,760122,304049,21717770
2,P01812,Adyar,909,04-02-2012,70,1.0,1.0,3,AbNormal,Yes,09-02-1992,Commercial,ELO,Gravel,RL,4.1,3.8,2.2,3.09,421094,92114,13159200
3,P05346,Velachery,1855,13-03-2010,14,3.0,2.0,5,Family,No,18-03-1988,Others,NoSewr,Paved,I,4.7,3.9,3.6,4.01,356321,77042,9630290
4,P06210,Karapakkam,1226,05-10-2009,84,1.0,1.0,3,AbNormal,Yes,13-10-1979,Others,AllPub,Gravel,C,3.0,2.5,4.1,3.29,237000,74063,7406250


### 2) Preprocessing the data

Check for nulls or missing data

In [76]:
raw_data.isnull().sum(axis=0)

PRT_ID            0
AREA              0
INT_SQFT          0
DATE_SALE         0
DIST_MAINROAD     0
N_BEDROOM         1
N_BATHROOM        5
N_ROOM            0
SALE_COND         0
PARK_FACIL        0
DATE_BUILD        0
BUILDTYPE         0
UTILITY_AVAIL     0
STREET            0
MZZONE            0
QS_ROOMS          0
QS_BATHROOM       0
QS_BEDROOM        0
QS_OVERALL       48
REG_FEE           0
COMMIS            0
SALES_PRICE       0
dtype: int64

Fill the missing data with the mean values of that column. Also those records (rows) can be dropped when it contains the missing data

In [77]:
raw_data = raw_data.fillna({'N_BEDROOM': raw_data['N_BEDROOM'].mean(), 'N_BATHROOM': raw_data['N_BATHROOM'].mean(), 'QS_OVERALL': raw_data['QS_OVERALL'].mean()})

In [78]:
raw_data.isnull().sum(axis=0)

PRT_ID           0
AREA             0
INT_SQFT         0
DATE_SALE        0
DIST_MAINROAD    0
N_BEDROOM        0
N_BATHROOM       0
N_ROOM           0
SALE_COND        0
PARK_FACIL       0
DATE_BUILD       0
BUILDTYPE        0
UTILITY_AVAIL    0
STREET           0
MZZONE           0
QS_ROOMS         0
QS_BATHROOM      0
QS_BEDROOM       0
QS_OVERALL       0
REG_FEE          0
COMMIS           0
SALES_PRICE      0
dtype: int64

In [79]:
raw_data.shape

(7109, 22)

Drop the columns which are not of any significance and also split the input data (x) and output data(y)

In [80]:
y = raw_data['SALES_PRICE']
y = np.array(y)
y = y.reshape(-1,1)
y.shape

(7109, 1)

In [81]:
x_raw_data = raw_data.drop(columns=['PRT_ID','DATE_SALE', 'SALES_PRICE', 'DATE_BUILD'])
x_raw_data.head()

Unnamed: 0,AREA,INT_SQFT,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS
0,Karapakkam,1004,131,1.0,1.0,3,AbNormal,Yes,Commercial,AllPub,Paved,A,4.0,3.9,4.9,4.33,380000,144400
1,Anna Nagar,1986,26,2.0,1.0,5,AbNormal,No,Commercial,AllPub,Gravel,RH,4.9,4.2,2.5,3.765,760122,304049
2,Adyar,909,70,1.0,1.0,3,AbNormal,Yes,Commercial,ELO,Gravel,RL,4.1,3.8,2.2,3.09,421094,92114
3,Velachery,1855,14,3.0,2.0,5,Family,No,Others,NoSewr,Paved,I,4.7,3.9,3.6,4.01,356321,77042
4,Karapakkam,1226,84,1.0,1.0,3,AbNormal,Yes,Others,AllPub,Gravel,C,3.0,2.5,4.1,3.29,237000,74063


Encode all the categorical data from the input data

In [82]:
from sklearn.preprocessing import LabelEncoder

area_labelencoder = LabelEncoder()
x_raw_data["AREA"] = area_labelencoder.fit_transform(x_raw_data["AREA"])

salcond_labelencoder = LabelEncoder()
x_raw_data["SALE_COND"] = salcond_labelencoder.fit_transform(x_raw_data["SALE_COND"])

park_labelencoder = LabelEncoder()
x_raw_data["PARK_FACIL"] = park_labelencoder.fit_transform(x_raw_data["PARK_FACIL"])

btype_labelencoder = LabelEncoder()
x_raw_data["BUILDTYPE"] = btype_labelencoder.fit_transform(x_raw_data["BUILDTYPE"])

uavail_labelencoder = LabelEncoder()
x_raw_data["UTILITY_AVAIL"] = uavail_labelencoder.fit_transform(x_raw_data["UTILITY_AVAIL"])

street_labelencoder = LabelEncoder()
x_raw_data["STREET"] = street_labelencoder.fit_transform(x_raw_data["STREET"])

mzzone_labelencoder = LabelEncoder()
x_raw_data["MZZONE"] = mzzone_labelencoder.fit_transform(x_raw_data["MZZONE"])

In [83]:
x_raw_data.head()

Unnamed: 0,AREA,INT_SQFT,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS
0,12,1004,131,1.0,1.0,3,1,2,0,1,4,0,4.0,3.9,4.9,4.33,380000,144400
1,4,1986,26,2.0,1.0,5,1,0,0,1,0,3,4.9,4.2,2.5,3.765,760122,304049
2,0,909,70,1.0,1.0,3,1,2,0,2,0,4,4.1,3.8,2.2,3.09,421094,92114
3,15,1855,14,3.0,2.0,5,4,0,2,4,4,2,4.7,3.9,3.6,4.01,356321,77042
4,12,1226,84,1.0,1.0,3,1,2,2,1,0,1,3.0,2.5,4.1,3.29,237000,74063


Now all the input variables have become numeric to enable training the model

Next step is to normalize the scale of the input variables.

This creates the final set of input variables and output variable

In [84]:
from sklearn.preprocessing import StandardScaler
x_scaler = StandardScaler()
y_scaler = StandardScaler()

x = x_scaler.fit_transform(x_raw_data)
y = y_scaler.fit_transform(y)

x.shape, y.shape

((7109, 18), (7109, 1))

On the final set of input and output variables, split the data for training and testing

In [85]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((5331, 18), (1778, 18), (5331, 1), (1778, 1))

3) Create a linear regression model and train the model with the training data

In [86]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [87]:
train_pred = model.predict(x_train)
test_pred = model.predict(x_test)

score = model.score(x_train,y_train)
print('train model score : {} %'.format(score*100))

score = model.score(x_test,y_test)
print('test model score : {} %'.format(score*100))

train model score : 88.08804835818223 %
test model score : 88.14241196080476 %


Use the testing data to test the model predicting the output for the given input variables

In [88]:
from sklearn.metrics import mean_absolute_error

train_error = mean_absolute_error(y_train, train_pred)
train_accuracy = 1 - train_error
print('accuracy on train data : {} %'.format(train_accuracy*100))

test_error = mean_absolute_error(y_test,test_pred)
test_accuracy = 1 - test_error
print('accuracy on test data : {} %'.format(test_accuracy*100))

accuracy on train data : 72.31775956276361 %
accuracy on test data : 72.22733410196673 %


### 4) Now load the test data set and predict the values

This dataset should also undergo the same pre processing steps and passed to the mode to predict

In [89]:
test_path = '/gdrive/My Drive/Lemalabs/Assignments/data/'
raw_test_data = pd.read_csv(test_path + 'Chennai_house_price_multivariate_test.csv')
raw_test_data.head()

Unnamed: 0,PRT_ID,AREA,INT_SQFT,DATE_SALE,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,DATE_BUILD,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS
0,P05996,Chrompet,958,13-11-2009,185,1,1,3,AbNormal,No,20-11-1982,Others,NoSewr,Paved,RH,3.1,4.4,3.9,3.86,203260,93813
1,P09294,Anna Nagar,1807,29-01-2007,108,2,1,5,AdjLand,Yes,02-02-1990,Others,NoSeWa,No Access,RM,2.6,4.1,2.1,2.775,370410,222246
2,P03807,Karapakkam,1658,07-01-2011,59,2,2,4,AdjLand,No,11-01-1995,Others,NoSeWa,Paved,I,2.5,2.3,3.2,2.72,387972,113159
3,P00539,Anna Nagar,1592,22-01-2014,102,1,1,4,AbNormal,Yes,27-01-1995,Others,AllPub,Gravel,RL,4.1,4.8,2.5,3.635,408134,84442
4,P01448,Chrompet,857,01-05-2012,62,1,1,3,Family,No,10-05-1978,House,ELO,Gravel,RH,3.0,4.5,2.9,3.57,181212,34849


Complete the same preprocessing steps done for the previous dataset. also use the same scalers and encoder instances for this data set

In [90]:
raw_test_data.isnull().sum(axis=0)

PRT_ID            0
AREA              0
INT_SQFT          0
DATE_SALE         0
DIST_MAINROAD     0
N_BEDROOM         0
N_BATHROOM        0
N_ROOM            0
SALE_COND         0
PARK_FACIL        0
DATE_BUILD        0
BUILDTYPE         0
UTILITY_AVAIL     0
STREET            0
MZZONE            0
QS_ROOMS          0
QS_BATHROOM       0
QS_BEDROOM        0
QS_OVERALL       19
REG_FEE           0
COMMIS            0
dtype: int64

In [91]:
raw_test_data = raw_test_data.fillna({'N_BEDROOM': raw_test_data['N_BEDROOM'].mean(), 'N_BATHROOM': raw_test_data['N_BATHROOM'].mean(), 'QS_OVERALL': raw_test_data['QS_OVERALL'].mean()})
raw_test_data.isnull().sum(axis=0)

PRT_ID           0
AREA             0
INT_SQFT         0
DATE_SALE        0
DIST_MAINROAD    0
N_BEDROOM        0
N_BATHROOM       0
N_ROOM           0
SALE_COND        0
PARK_FACIL       0
DATE_BUILD       0
BUILDTYPE        0
UTILITY_AVAIL    0
STREET           0
MZZONE           0
QS_ROOMS         0
QS_BATHROOM      0
QS_BEDROOM       0
QS_OVERALL       0
REG_FEE          0
COMMIS           0
dtype: int64

In [92]:
raw_test_data = raw_test_data.drop(columns=['PRT_ID','DATE_SALE', 'DATE_BUILD'])
raw_test_data.head()

Unnamed: 0,AREA,INT_SQFT,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS
0,Chrompet,958,185,1,1,3,AbNormal,No,Others,NoSewr,Paved,RH,3.1,4.4,3.9,3.86,203260,93813
1,Anna Nagar,1807,108,2,1,5,AdjLand,Yes,Others,NoSeWa,No Access,RM,2.6,4.1,2.1,2.775,370410,222246
2,Karapakkam,1658,59,2,2,4,AdjLand,No,Others,NoSeWa,Paved,I,2.5,2.3,3.2,2.72,387972,113159
3,Anna Nagar,1592,102,1,1,4,AbNormal,Yes,Others,AllPub,Gravel,RL,4.1,4.8,2.5,3.635,408134,84442
4,Chrompet,857,62,1,1,3,Family,No,House,ELO,Gravel,RH,3.0,4.5,2.9,3.57,181212,34849


In [93]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
raw_test_data['AREA'] = labelencoder.fit_transform(raw_test_data['AREA'])
raw_test_data['SALE_COND'] = labelencoder.fit_transform(raw_test_data['SALE_COND'])
raw_test_data['PARK_FACIL'] = labelencoder.fit_transform(raw_test_data['PARK_FACIL'])
raw_test_data['BUILDTYPE'] = labelencoder.fit_transform(raw_test_data['BUILDTYPE'])
raw_test_data['UTILITY_AVAIL'] = labelencoder.fit_transform(raw_test_data['UTILITY_AVAIL'])
raw_test_data['STREET'] = labelencoder.fit_transform(raw_test_data['STREET'])
raw_test_data['MZZONE'] = labelencoder.fit_transform(raw_test_data['MZZONE'])

raw_test_data.head()

Unnamed: 0,AREA,INT_SQFT,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS
0,5,958,185,1,1,3,0,0,2,4,3,3,3.1,4.4,3.9,3.86,203260,93813
1,2,1807,108,2,1,5,2,2,2,3,1,5,2.6,4.1,2.1,2.775,370410,222246
2,9,1658,59,2,2,4,2,0,2,3,3,2,2.5,2.3,3.2,2.72,387972,113159
3,2,1592,102,1,1,4,0,2,2,1,0,4,4.1,4.8,2.5,3.635,408134,84442
4,5,857,62,1,1,3,3,0,1,2,0,3,3.0,4.5,2.9,3.57,181212,34849


In [94]:
new_x_test = x_scaler.fit_transform(raw_test_data)

new_x_test.shape

(2925, 18)

As the final test data is ready, it can passed to the model to predict the output values

In [95]:
new_y_pred = model.predict(new_x_test)

Since the input and output variables are scaled down for training the model, the predicted value should be scaled up to match the actual output value scale range

In [96]:
new_y = y_scaler.inverse_transform(new_y_pred)

In [97]:
new_y.shape

(2925, 1)

Write the final predicted output to a CSV file for validating it further

In [98]:
output_path = '/gdrive/My Drive/Lemalabs/Participants/Revanth Krishna/Week_3/Multivariate_House_Price_prediction/'
np.savetxt(output_path + "/house_price_pred_sklearn.csv", new_y, delimiter=",")

## Grid Search

In [99]:
val =list(np.arange(0.01,1,0.1))
val

[0.01,
 0.11,
 0.21000000000000002,
 0.31000000000000005,
 0.41000000000000003,
 0.51,
 0.6100000000000001,
 0.7100000000000001,
 0.81,
 0.91]

In [100]:
val =list(np.arange(0.01,1,0.1))

from sklearn.linear_model import Ridge
model = Ridge()

from sklearn.model_selection import GridSearchCV

parameters = { 'alpha':val}

clf = GridSearchCV(model, parameters, n_jobs= -2)
clf.fit(x_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=-2,
             param_grid={'alpha': [0.01, 0.11, 0.21000000000000002,
                                   0.31000000000000005, 0.41000000000000003,
                                   0.51, 0.6100000000000001, 0.7100000000000001,
                                   0.81, 0.91]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [101]:
print(clf.best_params_)

{'alpha': 0.91}


In [102]:
train_pred1 = clf.predict(x_train)

test_pred1 = clf.predict(x_test)

In [103]:
n_score = clf.score(x_train,y_train)
print('Train F1 Score = {} '.format(score))

n_score1 = clf.score(x_test, y_test)
print('Test F1 Score = {} '.format(score1))

Train F1 Score = 0.8814241196080477 
Test F1 Score = 0.8874589435650856 


In [104]:
from sklearn.metrics import mean_absolute_error

train_error1 = mean_absolute_error(train_pred1,y_train)
train_accuracy1 = 1 - train_error1
print('accuracy on train data : {} %'.format(train_accuracy1*100))

test_error1 = mean_absolute_error(test_pred1, y_test)
test_accuracy1 = 1 - test_error1
print('accuracy on test data : {} %'.format(test_accuracy1*100))

accuracy on train data : 72.31911990093334 %
accuracy on test data : 72.22824805532852 %
