In [147]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor

In [148]:
data = pd.read_csv('../data/Housing.csv')

In [149]:
data.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [150]:
categorical_col =  ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

data[categorical_col]

Unnamed: 0,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea
0,yes,no,no,no,yes,yes
1,yes,no,no,no,yes,no
2,yes,no,yes,no,no,yes
3,yes,no,yes,no,yes,yes
4,yes,yes,yes,no,yes,no
...,...,...,...,...,...,...
540,yes,no,yes,no,no,no
541,no,no,no,no,no,no
542,yes,no,no,no,no,no
543,no,no,no,no,no,no


In [151]:
def binary_map(x):
    return x.map({'yes': 1, 'no': 0})

In [152]:
data[categorical_col] = data[categorical_col].apply(binary_map)


data[categorical_col]

Unnamed: 0,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea
0,1,0,0,0,1,1
1,1,0,0,0,1,0
2,1,0,1,0,0,1
3,1,0,1,0,1,1
4,1,1,1,0,1,0
...,...,...,...,...,...,...
540,1,0,1,0,0,0
541,0,0,0,0,0,0
542,1,0,0,0,0,0
543,0,0,0,0,0,0


In [153]:
dummy_col = pd.get_dummies(data['furnishingstatus'], drop_first=True)
dummy_col.head()

Unnamed: 0,semi-furnished,unfurnished
0,False,False
1,False,False
2,True,False
3,False,False
4,False,False


In [154]:
data = pd.concat([data, dummy_col], axis=1)


data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,False,False
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,False,False
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,True,False
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,False,False
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,False,False


In [155]:
dummy_col = pd.get_dummies(data['furnishingstatus'], drop_first=True)

In [156]:
data.drop(['furnishingstatus'], axis=1, inplace=True)


data

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,False,False
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,False,False
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,True,False
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,False,False
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,False,True
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,True,False
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,False,True
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,False,False


In [157]:
scaler = MinMaxScaler()
cole_to_scale = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']
data[cole_to_scale] = scaler.fit_transform(data[cole_to_scale])



concat = + to dataset

In [158]:
x = data.drop('price',axis=1)
y = data['price']

In [159]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [160]:
model_SVR = svm.SVR()
model_SVR.fit(x_train, y_train)
y_pred = model_SVR.predict(x_test)
print('svr model accuracy:')
print(r2_score(y_test, y_pred))

svr model accuracy:
0.6412608486236433


In [161]:
model_RFR = RandomForestRegressor(n_estimators=10)
model_RFR.fit(x_train, y_train)
y_pred = model_RFR.predict(x_test)
print('Random forest model accuracy:')
print(model_RFR.score(x_test, y_test))

Random forest model accuracy:
0.6212323403977174
