In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


import warnings
warnings.filterwarnings(action='ignore')

In [2]:
df1 = pd.read_csv('/Users/shubhvashishth/Downloads/MagicBricks.csv')
df1

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0
...,...,...,...,...,...,...,...,...,...,...,...
1254,4118.0,4,5.0,Unfurnished,Chittaranjan Park,3.0,55000000,Ready_to_move,New_Property,Builder_Floor,12916.0
1255,1050.0,3,2.0,Semi-Furnished,Chittaranjan Park,3.0,12500000,Ready_to_move,Resale,Builder_Floor,12916.0
1256,875.0,3,3.0,Semi-Furnished,Chittaranjan Park,3.0,17500000,Ready_to_move,New_Property,Builder_Floor,12916.0
1257,990.0,2,2.0,Unfurnished,Chittaranjan Park Block A,1.0,11500000,Ready_to_move,Resale,Builder_Floor,12916.0


In [3]:
df1.isna().sum()

Area             0
BHK              0
Bathroom         2
Furnishing       5
Locality         0
Parking         33
Price            0
Status           0
Transaction      0
Type             5
Per_Sqft       241
dtype: int64

In [4]:
dummies_1 = pd.get_dummies(df1.Furnishing)
dummies_1.head()

Unnamed: 0,Furnished,Semi-Furnished,Unfurnished
0,0,1,0
1,0,1,0
2,1,0,0
3,0,1,0
4,0,1,0


In [5]:
df2 = pd.concat([df1,dummies_1],axis='columns')
df2

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft,Furnished,Semi-Furnished,Unfurnished
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,,0,1,0
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0,0,1,0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0,1,0,0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0,0,1,0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,4118.0,4,5.0,Unfurnished,Chittaranjan Park,3.0,55000000,Ready_to_move,New_Property,Builder_Floor,12916.0,0,0,1
1255,1050.0,3,2.0,Semi-Furnished,Chittaranjan Park,3.0,12500000,Ready_to_move,Resale,Builder_Floor,12916.0,0,1,0
1256,875.0,3,3.0,Semi-Furnished,Chittaranjan Park,3.0,17500000,Ready_to_move,New_Property,Builder_Floor,12916.0,0,1,0
1257,990.0,2,2.0,Unfurnished,Chittaranjan Park Block A,1.0,11500000,Ready_to_move,Resale,Builder_Floor,12916.0,0,0,1


In [6]:
df3 = df2.drop(['Status','Per_Sqft','Furnishing'],axis='columns')
df3.head()

Unnamed: 0,Area,BHK,Bathroom,Locality,Parking,Price,Transaction,Type,Furnished,Semi-Furnished,Unfurnished
0,800.0,3,2.0,Rohini Sector 25,1.0,6500000,New_Property,Builder_Floor,0,1,0
1,750.0,2,2.0,"J R Designers Floors, Rohini Sector 24",1.0,5000000,New_Property,Apartment,0,1,0
2,950.0,2,2.0,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Resale,Apartment,1,0,0
3,600.0,2,2.0,Rohini Sector 24,1.0,4200000,Resale,Builder_Floor,0,1,0
4,650.0,2,2.0,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,New_Property,Builder_Floor,0,1,0


In [7]:
dummies2 = pd.get_dummies(df3.Transaction)
dummies2.head()

Unnamed: 0,New_Property,Resale
0,1,0
1,1,0
2,0,1
3,0,1
4,1,0


In [8]:
dummies3 = pd.get_dummies(df3.Type)
dummies3.head()

Unnamed: 0,Apartment,Builder_Floor
0,0,1
1,1,0
2,1,0
3,0,1
4,0,1


In [9]:
df4 = pd.concat([df3,dummies2,dummies3],axis='columns')
df4.head()

Unnamed: 0,Area,BHK,Bathroom,Locality,Parking,Price,Transaction,Type,Furnished,Semi-Furnished,Unfurnished,New_Property,Resale,Apartment,Builder_Floor
0,800.0,3,2.0,Rohini Sector 25,1.0,6500000,New_Property,Builder_Floor,0,1,0,1,0,0,1
1,750.0,2,2.0,"J R Designers Floors, Rohini Sector 24",1.0,5000000,New_Property,Apartment,0,1,0,1,0,1,0
2,950.0,2,2.0,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Resale,Apartment,1,0,0,0,1,1,0
3,600.0,2,2.0,Rohini Sector 24,1.0,4200000,Resale,Builder_Floor,0,1,0,0,1,0,1
4,650.0,2,2.0,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,New_Property,Builder_Floor,0,1,0,1,0,0,1


In [10]:
df5 = df4.drop(['Transaction','Type'],axis='columns')
df5.head()

Unnamed: 0,Area,BHK,Bathroom,Locality,Parking,Price,Furnished,Semi-Furnished,Unfurnished,New_Property,Resale,Apartment,Builder_Floor
0,800.0,3,2.0,Rohini Sector 25,1.0,6500000,0,1,0,1,0,0,1
1,750.0,2,2.0,"J R Designers Floors, Rohini Sector 24",1.0,5000000,0,1,0,1,0,1,0
2,950.0,2,2.0,"Citizen Apartment, Rohini Sector 13",1.0,15500000,1,0,0,0,1,1,0
3,600.0,2,2.0,Rohini Sector 24,1.0,4200000,0,1,0,0,1,0,1
4,650.0,2,2.0,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,0,1,0,1,0,0,1


In [11]:
df5.isna().sum()

Area               0
BHK                0
Bathroom           2
Locality           0
Parking           33
Price              0
Furnished          0
Semi-Furnished     0
Unfurnished        0
New_Property       0
Resale             0
Apartment          0
Builder_Floor      0
dtype: int64

In [12]:
df5.Bathroom.unique()

array([ 2.,  3.,  4.,  1., nan,  5.,  6.,  7.])

In [13]:
df5[df5.Bathroom>df5.BHK+1]

Unnamed: 0,Area,BHK,Bathroom,Locality,Parking,Price,Furnished,Semi-Furnished,Unfurnished,New_Property,Resale,Apartment,Builder_Floor
248,10350.0,4,7.0,"Maharani Bagh, New Friends Colony",3.0,160000000,0,1,0,1,0,1,0
1029,5236.0,5,7.0,"The Amaryllis, Karol Bagh",3.0,73700000,0,0,1,1,0,1,0
1081,1250.0,2,4.0,Sheikh Sarai,1.0,19000000,0,1,0,0,1,0,1
1211,4350.0,4,6.0,"Orchid Metropolis, Hauz Khas",2.0,135000000,0,1,0,1,0,1,0


In [14]:
df6 = df5[df5.Bathroom<df5.BHK+2]
df6

Unnamed: 0,Area,BHK,Bathroom,Locality,Parking,Price,Furnished,Semi-Furnished,Unfurnished,New_Property,Resale,Apartment,Builder_Floor
0,800.0,3,2.0,Rohini Sector 25,1.0,6500000,0,1,0,1,0,0,1
1,750.0,2,2.0,"J R Designers Floors, Rohini Sector 24",1.0,5000000,0,1,0,1,0,1,0
2,950.0,2,2.0,"Citizen Apartment, Rohini Sector 13",1.0,15500000,1,0,0,0,1,1,0
3,600.0,2,2.0,Rohini Sector 24,1.0,4200000,0,1,0,0,1,0,1
4,650.0,2,2.0,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,4118.0,4,5.0,Chittaranjan Park,3.0,55000000,0,0,1,1,0,0,1
1255,1050.0,3,2.0,Chittaranjan Park,3.0,12500000,0,1,0,0,1,0,1
1256,875.0,3,3.0,Chittaranjan Park,3.0,17500000,0,1,0,1,0,0,1
1257,990.0,2,2.0,Chittaranjan Park Block A,1.0,11500000,0,0,1,0,1,0,1


In [15]:
df6.Parking.unique()

array([  1.,   5.,   2.,   4.,  nan,   9.,   3.,  39.,  10., 114.])

In [16]:
df7 = df6[df6.Parking<10]
df7.head()

Unnamed: 0,Area,BHK,Bathroom,Locality,Parking,Price,Furnished,Semi-Furnished,Unfurnished,New_Property,Resale,Apartment,Builder_Floor
0,800.0,3,2.0,Rohini Sector 25,1.0,6500000,0,1,0,1,0,0,1
1,750.0,2,2.0,"J R Designers Floors, Rohini Sector 24",1.0,5000000,0,1,0,1,0,1,0
2,950.0,2,2.0,"Citizen Apartment, Rohini Sector 13",1.0,15500000,1,0,0,0,1,1,0
3,600.0,2,2.0,Rohini Sector 24,1.0,4200000,0,1,0,0,1,0,1
4,650.0,2,2.0,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,0,1,0,1,0,0,1


In [17]:
df7.isna().sum()

Area              0
BHK               0
Bathroom          0
Locality          0
Parking           0
Price             0
Furnished         0
Semi-Furnished    0
Unfurnished       0
New_Property      0
Resale            0
Apartment         0
Builder_Floor     0
dtype: int64

In [18]:
df7.Area.describe()

count     1211.000000
mean      1469.183807
std       1564.350249
min         28.000000
25%        810.000000
50%       1200.000000
75%       1700.000000
max      24300.000000
Name: Area, dtype: float64

In [19]:
df8 = df7[df7.Area>100]
df8

Unnamed: 0,Area,BHK,Bathroom,Locality,Parking,Price,Furnished,Semi-Furnished,Unfurnished,New_Property,Resale,Apartment,Builder_Floor
0,800.0,3,2.0,Rohini Sector 25,1.0,6500000,0,1,0,1,0,0,1
1,750.0,2,2.0,"J R Designers Floors, Rohini Sector 24",1.0,5000000,0,1,0,1,0,1,0
2,950.0,2,2.0,"Citizen Apartment, Rohini Sector 13",1.0,15500000,1,0,0,0,1,1,0
3,600.0,2,2.0,Rohini Sector 24,1.0,4200000,0,1,0,0,1,0,1
4,650.0,2,2.0,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,4118.0,4,5.0,Chittaranjan Park,3.0,55000000,0,0,1,1,0,0,1
1255,1050.0,3,2.0,Chittaranjan Park,3.0,12500000,0,1,0,0,1,0,1
1256,875.0,3,3.0,Chittaranjan Park,3.0,17500000,0,1,0,1,0,0,1
1257,990.0,2,2.0,Chittaranjan Park Block A,1.0,11500000,0,0,1,0,1,0,1


In [20]:
df8.Area.describe()

count     1194.000000
mean      1489.178048
std       1566.382668
min        125.000000
25%        820.000000
50%       1200.000000
75%       1700.000000
max      24300.000000
Name: Area, dtype: float64

In [21]:
df8.Locality.unique()

array(['Rohini Sector 25', 'J R Designers Floors, Rohini Sector 24',
       'Citizen Apartment, Rohini Sector 13', 'Rohini Sector 24',
       'Rohini Sector 24 carpet area 650 sqft status Ready to Move floor 4 out of 4 floors transaction New Property furnishing Semi-Furnished facing East overlooking Garden/Park, Main Road car parking 1 Open bathroom 2 balcony 1 ownership Freehold Newly Constructed Property Newly Constructed Property East Facing Property 2BHK Newly build property for Sale. A House is waiting for a Friendly Family to make it a lovely home.So please come and make his house feel alive once again. read more Contact Agent View Phone No. Share Feedback Garima properties Certified Agent Trusted by Users Genuine Listings Market Knowledge',
       'Delhi Homes, Rohini Sector 24', 'Rohini Sector 21',
       'Rohini Sector 22', 'Rohini Sector 20',
       'Rohini Sector 8 How Auctions work? The borrower has the physical possession of the Property. However the lender (Bank) can lega

In [22]:
dummies4 = pd.get_dummies(df8.Locality)
dummies4.head()

Unnamed: 0,"APL Builder Floor, Greater Kailash 1","Aashirwaad Chowk, Dwarka","Abhimanyu Apartments, Vasundhara Enclave","Abul Fazal Enclave Part 1, Okhla","Abul Fazal Enclave Part-II, Okhla","Adarsh Homes, Dwarka Mor","Ahinsha Vatika, Ram Nagar, Shahdara",Alaknanda,"Amar Colony, Lajpat Nagar","Andheria Mor, Mehrauli",...,"Vijay Enclave, Mahavir Enclave","Vijay Vihar, Rohini","Vikram Vihar, Lajpat Nagar","Vinoba Puri, Lajpat Nagar","Virat Residency, Dwarka Mor","Vishwas Nagar, Shahdara","Yamuna Apartment, Alaknanda","Yamuna Vihar, Shahdara","Zakir Nagar, New Friends Colony","mind. The space is airy, is well located and can be easily customized as per the needs. Prime facilities are easily accessible from this place with Government Girls Senior Secondary School No 2 (0 km ). Other key neighbourhood are B Block Market (0 km ). read more Contact Owner View Phone No. Share Feedback Owner vinod kumar"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df9 = pd.concat([df8,dummies4],axis='columns')
df9.head()

Unnamed: 0,Area,BHK,Bathroom,Locality,Parking,Price,Furnished,Semi-Furnished,Unfurnished,New_Property,...,"Vijay Enclave, Mahavir Enclave","Vijay Vihar, Rohini","Vikram Vihar, Lajpat Nagar","Vinoba Puri, Lajpat Nagar","Virat Residency, Dwarka Mor","Vishwas Nagar, Shahdara","Yamuna Apartment, Alaknanda","Yamuna Vihar, Shahdara","Zakir Nagar, New Friends Colony","mind. The space is airy, is well located and can be easily customized as per the needs. Prime facilities are easily accessible from this place with Government Girls Senior Secondary School No 2 (0 km ). Other key neighbourhood are B Block Market (0 km ). read more Contact Owner View Phone No. Share Feedback Owner vinod kumar"
0,800.0,3,2.0,Rohini Sector 25,1.0,6500000,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,750.0,2,2.0,"J R Designers Floors, Rohini Sector 24",1.0,5000000,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,950.0,2,2.0,"Citizen Apartment, Rohini Sector 13",1.0,15500000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,600.0,2,2.0,Rohini Sector 24,1.0,4200000,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,650.0,2,2.0,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df_final = df9.drop('Locality',axis='columns')
df_final

Unnamed: 0,Area,BHK,Bathroom,Parking,Price,Furnished,Semi-Furnished,Unfurnished,New_Property,Resale,...,"Vijay Enclave, Mahavir Enclave","Vijay Vihar, Rohini","Vikram Vihar, Lajpat Nagar","Vinoba Puri, Lajpat Nagar","Virat Residency, Dwarka Mor","Vishwas Nagar, Shahdara","Yamuna Apartment, Alaknanda","Yamuna Vihar, Shahdara","Zakir Nagar, New Friends Colony","mind. The space is airy, is well located and can be easily customized as per the needs. Prime facilities are easily accessible from this place with Government Girls Senior Secondary School No 2 (0 km ). Other key neighbourhood are B Block Market (0 km ). read more Contact Owner View Phone No. Share Feedback Owner vinod kumar"
0,800.0,3,2.0,1.0,6500000,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,750.0,2,2.0,1.0,5000000,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,950.0,2,2.0,1.0,15500000,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,600.0,2,2.0,1.0,4200000,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,650.0,2,2.0,1.0,6200000,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,4118.0,4,5.0,3.0,55000000,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1255,1050.0,3,2.0,3.0,12500000,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1256,875.0,3,3.0,3.0,17500000,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1257,990.0,2,2.0,1.0,11500000,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [27]:
x = df_final.drop('Price',axis='columns')
y = df_final.Price

In [35]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size =0.3)

In [36]:
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor()
}


for name,model in models.items():
    model.fit(x_train,y_train)
    print(name + "model trained..")
    

                     Linear Regressionmodel trained..
 Linear Regression (L2 Regularization)model trained..
 Linear Regression (L1 Regularization)model trained..
                   K-Nearest Neighborsmodel trained..
                        Neural Networkmodel trained..
Support Vector Machine (Linear Kernel)model trained..
   Support Vector Machine (RBF Kernel)model trained..
                         Decision Treemodel trained..
                         Random Forestmodel trained..
                     Gradient Boostingmodel trained..


In [37]:
for name,model in models.items():
     print(name + " R^2 Score: {:.5f}".format(model.score(x_test, y_test)))

                     Linear Regression R^2 Score: -6285322140938.29688
 Linear Regression (L2 Regularization) R^2 Score: 0.62176
 Linear Regression (L1 Regularization) R^2 Score: 0.60293
                   K-Nearest Neighbors R^2 Score: 0.55821
                        Neural Network R^2 Score: -0.67629
Support Vector Machine (Linear Kernel) R^2 Score: 0.09519
   Support Vector Machine (RBF Kernel) R^2 Score: -0.04162
                         Decision Tree R^2 Score: 0.68010
                         Random Forest R^2 Score: 0.83234
                     Gradient Boosting R^2 Score: 0.83479


In [39]:
best_model = GradientBoostingRegressor()

In [40]:
best_model.fit(x_train,y_train)

GradientBoostingRegressor()

In [60]:
best_model.score(x_test,y_test)

0.8366482711711581

In [61]:
import pickle
with open('delhi_houseprice_model.pickle','wb') as f:
    pickle.dump(best_model,f)

In [62]:
import json
columns = {
    'data_column' : [col.lower() for col in x.columns]
}

with open('columns.json','w') as f:
    f.write(json.dumps(columns))