# Predicting House Price in Banglore

In [1]:
# Import packages
import pandas as pd

In [2]:
# Read the training data

train_df = pd.read_csv('Train.csv')

In [3]:
train_df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
train_df.dtypes

area_type        object
availability     object
location         object
size             object
society          object
total_sqft       object
bath            float64
balcony         float64
price           float64
dtype: object

In [5]:
# train_df['total_sqft'] = train_df['total_sqft'].astype('float64') # result is error

In [6]:
def change_to_float(area_size):
    if isinstance(area_size, str):
        area_size = area_size.split('Sq.')[0]
        area_size = area_size.split('Perch')[0]
        area_size = area_size.split('Acres')[0]
        area_size = area_size.split('Guntha')[0]
        area_size = area_size.split('Cents')[0]
        area_size = area_size.split('Grounds')[0]
        area_size = area_size.strip().split('-')
        area_size = list(map(float, area_size))
        area_size = sum(area_size)/len(area_size)
    return area_size

In [7]:
train_df['total_sqft'] = train_df['total_sqft'].apply(lambda x: change_to_float(x))

In [8]:
train_df.dtypes

area_type        object
availability     object
location         object
size             object
society          object
total_sqft      float64
bath            float64
balcony         float64
price           float64
dtype: object

In [9]:
size_mode = train_df['size'].mode()[0]
train_df.loc[train_df['size'].isna(), 'size'] = size_mode 

In [10]:
train_df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [12]:
train_df['size'] = train_df['size'].apply(lambda x: x.split(' ')[0])

In [14]:
train_df['size'] = train_df['size'].astype('float64')

In [15]:
train_df.dtypes

area_type        object
availability     object
location         object
size            float64
society          object
total_sqft      float64
bath            float64
balcony         float64
price           float64
dtype: object

In [16]:
train_df.describe()

Unnamed: 0,size,total_sqft,bath,balcony,price
count,13320.0,13320.0,13247.0,12711.0,13320.0
mean,2.802778,1555.971707,2.69261,1.584376,112.565627
std,1.294496,1238.902448,1.341458,0.817263,148.971674
min,1.0,1.0,1.0,0.0,8.0
25%,2.0,1100.0,2.0,1.0,50.0
50%,3.0,1275.0,2.0,2.0,72.0
75%,3.0,1679.25,3.0,2.0,120.0
max,43.0,52272.0,40.0,3.0,3600.0


In [25]:
train_df.isna().sum()

area_type       0
availability    0
location        1
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [21]:
train_df.drop('society', axis = 1, inplace = True)

In [23]:
bath_med = train_df['bath'].median()
balcony_med = train_df['balcony'].median()

In [24]:
train_df.loc[train_df['bath'].isna(), 'bath'] = bath_med
train_df.loc[train_df['balcony'].isna(), 'balcony'] = balcony_med

In [26]:
train_df['area_type'].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [27]:
train_df.corr()

Unnamed: 0,size,total_sqft,bath,balcony,price
size,1.0,0.346416,0.895786,0.194358,0.398074
total_sqft,0.346416,1.0,0.387637,0.15534,0.573396
bath,0.895786,0.387637,1.0,0.20797,0.454449
balcony,0.194358,0.15534,0.20797,1.0,0.124951
price,0.398074,0.573396,0.454449,0.124951,1.0


In [28]:
X = train_df[['size', 'total_sqft', 'bath', 'balcony']]
y = train_df[['price']]

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [36]:
preds = lr.predict(X_test)

In [37]:
from sklearn.metrics import mean_absolute_error

In [38]:
mean_absolute_error(y_test, preds)

44.79410616033218

In [39]:
from sklearn.metrics import r2_score
r2_score(y_test, preds)

0.4942909393171021

In [40]:
lr.coef_

array([[-6.09579594,  0.05006305, 38.44754206, -0.04908778]])

In [47]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=300)
rf.fit(X_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=300,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [48]:
preds = rf.predict(X_test)

In [49]:
mean_absolute_error(y_test, preds)

36.09459073985268

In [46]:
r2_score(y_test, preds)

0.5818940215587389