## Assignment 3

Problem Statement:

Consider a real estate company that has a dataset containing the prices of properties in the Delhi region. It wishes to use the data to optimise the sale prices of the properties based on important factors such as area, bedrooms, parking, etc.

Essentially, the company wants —


- To identify the variables affecting house prices, e.g. area, number of rooms, bathrooms, etc.

- To create a linear model that quantitatively relates house prices with variables such as number of rooms, area, number of bathrooms, etc.

- To know the accuracy of the model, i.e. how well these variables can predict house prices.

In [162]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
import sklearn.ensemble as en
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate

In [2]:
housing = pd.read_csv('Housing.csv')

In [3]:
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [6]:
housing.describe(include="all")

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
count,545.0,545.0,545.0,545.0,545.0,545,545,545,545,545,545.0,545,545
unique,,,,,,2,2,2,2,2,,2,3
top,,,,,,yes,no,no,no,no,,no,semi-furnished
freq,,,,,,468,448,354,520,373,,417,227
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,,,,,,0.693578,,
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,,,,,,0.861586,,
min,1750000.0,1650.0,1.0,1.0,1.0,,,,,,0.0,,
25%,3430000.0,3600.0,2.0,1.0,1.0,,,,,,0.0,,
50%,4340000.0,4600.0,3.0,1.0,2.0,,,,,,0.0,,
75%,5740000.0,6360.0,3.0,2.0,2.0,,,,,,1.0,,


In [42]:
housing.dtypes

price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object

In [14]:
housing["furnishingstatus"].value_counts()

semi-furnished    227
unfurnished       178
furnished         140
Name: furnishingstatus, dtype: int64

In [24]:
housing.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [87]:
int_cols = [ col for col in housing.columns if housing[col].dtypes == "int64" and col != "price" ]
int_cols

['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

In [110]:
lr = lm.LinearRegression()
lr.fit( housing[int_cols], housing.price )

pred_price = lr.predict( housing[int_cols] )

lr.score( housing[int_cols], housing.price )

0.5615825438679802

### Improving Model

In [107]:
housing_new = housing.copy()

housing_new.replace( { "mainroad" : { "yes" : 1, "no" : 0 } }, inplace=True )
housing_new.replace( { "guestroom" : { "yes" : 1, "no" : 0 } }, inplace=True )
housing_new.replace( { "basement" : { "yes" : 1, "no" : 0 } }, inplace=True )
housing_new.replace( { "hotwaterheating" : { "yes" : 1, "no" : 0 } }, inplace=True )
housing_new.replace( { "airconditioning" : { "yes" : 1, "no" : 0 } }, inplace=True )
housing_new.replace( { "prefarea" : { "yes" : 1, "no" : 0 } }, inplace=True )
housing_new.replace( { "furnishingstatus" : { "unfurnished" : 0, "semi-furnished" : 1, "furnished" : 2 } }, inplace=True )

housing_new.price = np.log( housing_new.price )
housing_new.area = np.log( housing_new.area )

housing_new.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,16.403275,8.911934,4,2,3,1,0,0,0,1,2,1,2
1,16.321036,9.100526,4,4,4,1,0,0,0,1,3,0,2
2,16.321036,9.206332,3,2,2,1,0,1,0,0,2,1,1
3,16.318175,8.922658,4,2,2,1,0,1,0,1,3,1,2
4,16.250001,8.911934,4,1,2,1,1,1,0,1,2,0,2


In [111]:
lr1 = lm.LinearRegression()

lr1.fit( housing_new.iloc[ : , 1 : ], housing_new.price )

pred_price1 = lr1.predict( housing_new.iloc[ : , 1 : ] )

lr1.score( housing_new.iloc[ : , 1 : ], housing_new.price )

0.7005416693434587

### Random Forest Regressor

In [125]:
rfr = en.RandomForestRegressor(random_state=42)

rfr.fit( housing_new.iloc[ : , 1 : ], housing_new.price )

pred_price1 = rfr.predict( housing_new.iloc[ : , 1 : ] )

rfr.score( housing_new.iloc[ : , 1 : ], housing_new.price )



0.9324230242931651

### Using train_test_split

In [143]:
train_x, test_x, train_y, test_y = train_test_split( housing_new.iloc[ : , 1 : ], housing_new.price, test_size=0.20, random_state=42 )

rfr1 = en.RandomForestRegressor(random_state=42)

rfr1.fit( train_x, train_y )

pred_price1 = rfr1.predict( test_x )

rfr1.score( test_x, test_y )



0.6104181660046184

### Using Cross-Validation

In [166]:
rfr2 = en.RandomForestRegressor()

cross_validate( rfr2, housing_new.iloc[ : , 1 : ], housing_new.price, cv=5, return_estimator=True )

# rfr1.score( test_x, test_y )



{'fit_time': array([0.01499009, 0.01297712, 0.01297379, 0.01296616, 0.01195836]),
 'score_time': array([0.00199699, 0.00199723, 0.00098968, 0.00199389, 0.00199413]),
 'estimator': (RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
             oob_score=False, random_state=None, verbose=0, warm_start=False),
  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
             oob_score=False, random_state=None, verbose=0, warm_start=False),
  RandomForestRegressor

In [167]:
# sorted(sklearn.metrics.SCORERS.keys())