# Real Estate Price Prediction Mumbai

In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data Cleaning

In [123]:
df = pd.read_csv('Mumbai1.csv')
df.head()

Unnamed: 0,Price,Area,Location,No. of Bedrooms,New/Resale,Gymnasium,Lift Available,Car Parking,Maintenance Staff,24x7 Security,Children's Play Area,Clubhouse,Intercom,Landscaped Gardens,Indoor Games,Gas Connection,Jogging Track,Swimming Pool
0,4850000,720,Kharghar,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0
1,4500000,600,Kharghar,1,0,1,1,1,1,1,0,1,0,0,0,0,1,1
2,6700000,650,Kharghar,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1
3,4500000,650,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0
4,5000000,665,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0


#### Many columns are unnecessary like swimming pool, jogging track, indore games, gardens, etc. so removing those columns

In [124]:
df1 = df.drop(["New/Resale","Intercom","Gas Connection","Jogging Track","Landscaped Gardens","Indoor Games","Maintenance Staff","24x7 Security","Children's Play Area","Swimming Pool"],axis='columns')
df1.head()

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Gymnasium,Lift Available,Car Parking,Clubhouse
0,4850000,720,Kharghar,1,0,1,1,0
1,4500000,600,Kharghar,1,1,1,1,1
2,6700000,650,Kharghar,1,1,1,1,1
3,4500000,650,Kharghar,1,0,1,1,0
4,5000000,665,Kharghar,1,0,1,1,0


In [125]:
df1.rename(columns={'No. of Bedrooms':'bhk'},inplace=True)
df1.rename(columns={'Area':'sqft_area'},inplace=True)
df1.rename(columns={'Gymnasium':'gym'},inplace=True)
df1.rename(columns={'Lift Available':'lift'},inplace=True)
df1.rename(columns={'Car Parking':'parking'},inplace=True)
df1.head()

Unnamed: 0,Price,sqft_area,Location,bhk,gym,lift,parking,Clubhouse
0,4850000,720,Kharghar,1,0,1,1,0
1,4500000,600,Kharghar,1,1,1,1,1
2,6700000,650,Kharghar,1,1,1,1,1
3,4500000,650,Kharghar,1,0,1,1,0
4,5000000,665,Kharghar,1,0,1,1,0


In [126]:
df1.isnull().sum()

Price        0
sqft_area    0
Location     0
bhk          0
gym          0
lift         0
parking      0
Clubhouse    0
dtype: int64

#### As there are no null values in the data set, moving forward

### Feature Engineering

In [127]:
df1['sqft_area'].unique()

array([ 720,  600,  650, ...,  250, 1162,  435], dtype=int64)

In [128]:
def is_int(x):
    try:
        int(X)
    except:
        return False
    return True

In [129]:
df1[~(df1['sqft_area'].apply(is_int))].head()

Unnamed: 0,Price,sqft_area,Location,bhk,gym,lift,parking,Clubhouse
0,4850000,720,Kharghar,1,0,1,1,0
1,4500000,600,Kharghar,1,1,1,1,1
2,6700000,650,Kharghar,1,1,1,1,1
3,4500000,650,Kharghar,1,0,1,1,0
4,5000000,665,Kharghar,1,0,1,1,0


In [130]:
df1.isnull().sum()

Price        0
sqft_area    0
Location     0
bhk          0
gym          0
lift         0
parking      0
Clubhouse    0
dtype: int64

### Making price_per_sqft column

In [131]:
df1['price_per_sqft'] = round(df1['Price']/df1['sqft_area'],2)
df1.head()

Unnamed: 0,Price,sqft_area,Location,bhk,gym,lift,parking,Clubhouse,price_per_sqft
0,4850000,720,Kharghar,1,0,1,1,0,6736.11
1,4500000,600,Kharghar,1,1,1,1,1,7500.0
2,6700000,650,Kharghar,1,1,1,1,1,10307.69
3,4500000,650,Kharghar,1,0,1,1,0,6923.08
4,5000000,665,Kharghar,1,0,1,1,0,7518.8


In [132]:
df1['price_per_sqft'].describe()

count      6347.000000
mean      13555.465878
std        9573.921384
min        1597.440000
25%        7234.040000
50%       10493.830000
75%       17142.860000
max      109950.520000
Name: price_per_sqft, dtype: float64

In [133]:
df2 = df1.copy()

In [134]:
df2['Price'] = df2['Price']/100000
df2.head()

Unnamed: 0,Price,sqft_area,Location,bhk,gym,lift,parking,Clubhouse,price_per_sqft
0,48.5,720,Kharghar,1,0,1,1,0,6736.11
1,45.0,600,Kharghar,1,1,1,1,1,7500.0
2,67.0,650,Kharghar,1,1,1,1,1,10307.69
3,45.0,650,Kharghar,1,0,1,1,0,6923.08
4,50.0,665,Kharghar,1,0,1,1,0,7518.8


### Sorting location column

In [135]:
df2.Location.unique()

array(['Kharghar', 'Sector-13 Kharghar', 'Sector 18 Kharghar',
       'Sector 20 Kharghar', 'Sector 15 Kharghar', 'Dombivali',
       'Churchgate', 'Prabhadevi', 'Jogeshwari West', 'Kalyan East',
       'Malad East', 'Virar East', 'Virar', 'Malad West', 'Borivali East',
       'Mira Road East', 'Goregaon West', 'Kandivali West',
       'Borivali West', 'Kandivali East', 'Andheri East', 'Goregaon East',
       'Wadala', 'Ulwe', 'Dahisar', 'kandivali', 'Goregaon',
       'Bhandup West', 'thakur village kandivali east', 'Santacruz West',
       'Kanjurmarg', 'I C Colony', 'Dahisar W', 'Marol', 'Parel',
       'Lower Parel', 'Worli', 'Jogeshwari East', 'Chembur Shell Colony',
       'Central Avenue', 'Chembur East', 'Diamond Market Road', 'Mulund',
       'Nalasopara West', 'raheja vihar', 'Powai Lake', 'MHADA Colony 20',
       'Tolaram Colony', 'Taloja', 'Thane West', 'Vangani',
       'Sector 5 Ulwe', 'Sector12 New Panvel', 'Sector 17 Ulwe',
       'Sector9 Kamothe', 'Sector 19 Kharghar

In [136]:
location_stats = df2.Location.value_counts()
location_stats

Kharghar             533
Thane West           418
Mira Road East       390
Ulwe                 319
Borivali West        176
                    ... 
worli sea face         1
Sea Face               1
Shiv Sagar Estate      1
Natakwala Lane         1
Padle Gaon             1
Name: Location, Length: 413, dtype: int64

In [137]:
loc_lessthan_10 = location_stats[location_stats<=10]
loc_lessthan_10

Sector 19 Kharghar    10
Sion                  10
Majiwada              10
Sector12 Kamothe       9
kandivali              9
                      ..
worli sea face         1
Sea Face               1
Shiv Sagar Estate      1
Natakwala Lane         1
Padle Gaon             1
Name: Location, Length: 324, dtype: int64

In [138]:
df2.Location = df2.Location.apply(lambda x: 'others' if x in loc_lessthan_10 else x)
df2.head(15)

Unnamed: 0,Price,sqft_area,Location,bhk,gym,lift,parking,Clubhouse,price_per_sqft
0,48.5,720,Kharghar,1,0,1,1,0,6736.11
1,45.0,600,Kharghar,1,1,1,1,1,7500.0
2,67.0,650,Kharghar,1,1,1,1,1,10307.69
3,45.0,650,Kharghar,1,0,1,1,0,6923.08
4,50.0,665,Kharghar,1,0,1,1,0,7518.8
5,170.0,2000,Kharghar,4,1,1,1,1,8500.0
6,125.0,1550,Kharghar,3,0,1,1,0,8064.52
7,105.0,1370,others,3,0,1,1,0,7664.23
8,105.0,1356,Kharghar,3,1,1,1,1,7743.36
9,150.0,1680,Kharghar,3,1,1,1,1,8928.57


## Outlier Removal

In [139]:
df2.sqft_area.describe()

count    6347.000000
mean     1004.327084
std       556.375703
min       200.000000
25%       650.000000
50%       905.000000
75%      1182.000000
max      8511.000000
Name: sqft_area, dtype: float64

In [140]:
df2[df2.sqft_area/(df2['bhk'])<300].head()

Unnamed: 0,Price,sqft_area,Location,bhk,gym,lift,parking,Clubhouse,price_per_sqft
239,112.0,573,Ulwe,2,1,1,1,1,19546.25
503,40.0,568,Bhiwandi,2,1,1,1,0,7042.25
545,66.0,550,Thane,2,1,1,1,1,12000.0
546,69.0,565,Thane,2,1,1,1,1,12212.39
547,68.0,547,Thane,2,1,1,1,1,12431.44


In [141]:
df2.shape

(6347, 9)

#### Usually, bedroom has area of >=300 sqft. Hence, in the above line, those data points are shown where sqft_area per bedroom is <300 sqft

In [142]:
df3 = df2[~(df2.sqft_area/(df2['bhk'])<300)]
df3.head()

Unnamed: 0,Price,sqft_area,Location,bhk,gym,lift,parking,Clubhouse,price_per_sqft
0,48.5,720,Kharghar,1,0,1,1,0,6736.11
1,45.0,600,Kharghar,1,1,1,1,1,7500.0
2,67.0,650,Kharghar,1,1,1,1,1,10307.69
3,45.0,650,Kharghar,1,0,1,1,0,6923.08
4,50.0,665,Kharghar,1,0,1,1,0,7518.8


In [143]:
df3.shape

(6187, 9)

In [144]:
df3.price_per_sqft.describe()

count      6187.000000
mean      13457.088230
std        9583.492648
min        1597.440000
25%        7142.860000
50%       10307.690000
75%       17022.660000
max      109950.520000
Name: price_per_sqft, dtype: float64

In [145]:
# def remove_msd_ouliers(df):
#     df_out = pd.DataFrame()
#     for key,subdf in df.groupby('Location'):
#         m = np.mean(subdf.price_per_sqft)
#         sd = np.std(subdf.price_per_sqft)
#         reduced_df = subdf[(subdf.price_per_sqft>(m-sd)) & (subdf.price_per_sqft<=(m+sd))]
#         df_out = pd.concat([df_out,reduced_df], ignore_index=False)
#         return df_out
# df4 = remove_msd_ouliers(df3)
# df4.shape (only 19 records left)

# not removing ouliers beyond (m+sd) and (m-sd)

In [146]:
df3.head()

Unnamed: 0,Price,sqft_area,Location,bhk,gym,lift,parking,Clubhouse,price_per_sqft
0,48.5,720,Kharghar,1,0,1,1,0,6736.11
1,45.0,600,Kharghar,1,1,1,1,1,7500.0
2,67.0,650,Kharghar,1,1,1,1,1,10307.69
3,45.0,650,Kharghar,1,0,1,1,0,6923.08
4,50.0,665,Kharghar,1,0,1,1,0,7518.8


In [147]:
df3.shape

(6187, 9)

In [148]:
df4 =  df3.drop(['price_per_sqft'],axis='columns')
df4.head()

Unnamed: 0,Price,sqft_area,Location,bhk,gym,lift,parking,Clubhouse
0,48.5,720,Kharghar,1,0,1,1,0
1,45.0,600,Kharghar,1,1,1,1,1
2,67.0,650,Kharghar,1,1,1,1,1
3,45.0,650,Kharghar,1,0,1,1,0
4,50.0,665,Kharghar,1,0,1,1,0


## Data Modeling

### One Hot Encoding

In [149]:
dummies = pd.get_dummies(df4.Location)
dummies.head()

Unnamed: 0,Airoli,Ambernath East,Ambernath West,Andheri,Andheri East,Andheri West,Badlapur East,Bandra East,Bandra West,Belapur,...,Ville Parle East,Virar,Virar East,Virar West,Wadala,Wadala East Wadala,Worli,matunga east,mumbai,others
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [150]:
df5 = pd.concat([df4,dummies.drop('others',axis='columns')],axis='columns')
df5.head()

Unnamed: 0,Price,sqft_area,Location,bhk,gym,lift,parking,Clubhouse,Airoli,Ambernath East,...,Vashi,Ville Parle East,Virar,Virar East,Virar West,Wadala,Wadala East Wadala,Worli,matunga east,mumbai
0,48.5,720,Kharghar,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,45.0,600,Kharghar,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,67.0,650,Kharghar,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,45.0,650,Kharghar,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50.0,665,Kharghar,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [151]:
df5 = df5.drop(['Location'],axis='columns')
df5.head()

Unnamed: 0,Price,sqft_area,bhk,gym,lift,parking,Clubhouse,Airoli,Ambernath East,Ambernath West,...,Vashi,Ville Parle East,Virar,Virar East,Virar West,Wadala,Wadala East Wadala,Worli,matunga east,mumbai
0,48.5,720,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,45.0,600,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,67.0,650,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,45.0,650,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50.0,665,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Modeling

In [152]:
X = df5.drop('Price',axis='columns')
X.head()

Unnamed: 0,sqft_area,bhk,gym,lift,parking,Clubhouse,Airoli,Ambernath East,Ambernath West,Andheri,...,Vashi,Ville Parle East,Virar,Virar East,Virar West,Wadala,Wadala East Wadala,Worli,matunga east,mumbai
0,720,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,600,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,650,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,650,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,665,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [153]:
y = df5.Price
y.head()

0    48.5
1    45.0
2    67.0
3    45.0
4    50.0
Name: Price, dtype: float64

### Splitting data into training and test set

In [154]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2, random_state=0)

In [155]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression()

In [156]:
regressor.score(X_test,y_test)

0.7062530868106554

In [157]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.70625309, 0.67049873, 0.64913515, 0.63140057, 0.65354266])

In [161]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.662166,{'normalize': False}
1,lasso,0.583577,"{'alpha': 1, 'selection': 'random'}"
2,decision_tree,0.431438,"{'criterion': 'mse', 'splitter': 'best'}"


In [169]:
def predict_price(Location,sqft,bhk,gym,lift,parking,clubhouse):    
    loc_index = np.where(X.columns==Location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bhk
    x[2] = gym
    x[3] = lift
    x[4] = parking
    x[5] = clubhouse
    if loc_index >= 0:
        x[loc_index] = 1

    return regressor.predict([x])[0]

In [170]:
df5.head()

Unnamed: 0,Price,sqft_area,bhk,gym,lift,parking,Clubhouse,Airoli,Ambernath East,Ambernath West,...,Vashi,Ville Parle East,Virar,Virar East,Virar West,Wadala,Wadala East Wadala,Worli,matunga east,mumbai
0,48.5,720,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,45.0,600,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,67.0,650,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,45.0,650,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50.0,665,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [171]:
predict_price('Kharghar',1000,2,0,1,1,0)

76.97688851016076

In [None]:
predict_price('Andheri West',1361,2,0,0,1,0)