In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xg
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [5]:
df = pd.read_csv('data_for_model.csv')

In [7]:
df

Unnamed: 0,flat_type,price,address,buildup_area,which_floor,furnishing,bedrooms,balcony
0,3 BHK Flat,3.95,Kanjurmarg West,1550,middle,Fully Furnished,3.0,1.0
1,1 BHK Flat,0.19,Palghar,630,lower,Unfurnished,1.0,2.0
2,1 BHK Flat,0.73,Mira Road East,580,middle,Semi Furnished,1.0,2.0
3,1 BHK Flat,1.10,Dahisar East,410,middle,Semi Furnished,1.0,1.0
4,3 BHK Flat,2.25,others,1052,middle,Semi Furnished,3.0,3.0
...,...,...,...,...,...,...,...,...
8310,3 BHK Flat,1.85,Mulund West,1100,lower,Fully Furnished,3.0,1.0
8311,1 BHK Flat,0.20,Naigaon East,280,lower,Semi Furnished,1.0,0.0
8312,1 BHK Flat,0.97,Tardeo,315,lower,Semi Furnished,1.0,0.0
8313,2 BHK Flat,2.36,Kandivali West,706,middle,Unfurnished,2.0,0.0


In [6]:
df.duplicated().sum()

0

In [8]:
X = df.drop(columns=['price'],axis=1)
y = df['price']

In [9]:
X

Unnamed: 0,flat_type,address,buildup_area,which_floor,furnishing,bedrooms,balcony
0,3 BHK Flat,Kanjurmarg West,1550,middle,Fully Furnished,3.0,1.0
1,1 BHK Flat,Palghar,630,lower,Unfurnished,1.0,2.0
2,1 BHK Flat,Mira Road East,580,middle,Semi Furnished,1.0,2.0
3,1 BHK Flat,Dahisar East,410,middle,Semi Furnished,1.0,1.0
4,3 BHK Flat,others,1052,middle,Semi Furnished,3.0,3.0
...,...,...,...,...,...,...,...
8310,3 BHK Flat,Mulund West,1100,lower,Fully Furnished,3.0,1.0
8311,1 BHK Flat,Naigaon East,280,lower,Semi Furnished,1.0,0.0
8312,1 BHK Flat,Tardeo,315,lower,Semi Furnished,1.0,0.0
8313,2 BHK Flat,Kandivali West,706,middle,Unfurnished,2.0,0.0


In [12]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildup_area']),
        ('cat',OrdinalEncoder(),['flat_type','which_floor','furnishing']),
        ('ohe',OneHotEncoder(),['address'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
print(score.mean(),score.std())

0.830143044361014 0.01396424780678478


In [13]:
y_transformed = np.log1p(y)

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildup_area']),
        ('cat',OrdinalEncoder(),['flat_type','which_floor','furnishing']),
        ('ohe',OneHotEncoder(),['address'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

0.9155569613484105 0.007488421119608453


In [19]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildup_area']),
        ('cat',OrdinalEncoder(),['flat_type','which_floor','furnishing']),
        ('ohe',OneHotEncoder(),['address'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))
print(mean_squared_error(np.expm1(y_test),y_pred))

0.9155569613484105 0.007488421119608453
0.563616000643587
1.4800831026113106


In [21]:
# support vector machine
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildup_area']),
        ('cat',OrdinalEncoder(),['flat_type','which_floor','furnishing']),
        ('ohe',OneHotEncoder(),['address'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',SVR())
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.9283050910119991 0.006997333302999006
0.5284570376838722


In [22]:
# Gradient Bossting
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildup_area']),
        ('cat',OrdinalEncoder(),['flat_type','which_floor','furnishing']),
        ('ohe',OneHotEncoder(),['address'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',GradientBoostingRegressor(n_estimators=100))
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.8851488355273084 0.008035196414639128
0.6511634279152343


In [23]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildup_area']),
        ('cat',OrdinalEncoder(),['flat_type','which_floor','furnishing']),
        ('ohe',OneHotEncoder(),['address'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',RandomForestRegressor(n_estimators=150))
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.9129075350906126 0.0066462776410731355
0.5446570415595213


In [24]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildup_area']),
        ('cat',OrdinalEncoder(),['flat_type','which_floor','furnishing']),
        ('ohe',OneHotEncoder(),['address'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',KNeighborsRegressor(n_neighbors=5))
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.89550281006725 0.008151236426054326
0.6018079160006978


In [25]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildup_area']),
        ('cat',OrdinalEncoder(),['flat_type','which_floor','furnishing']),
        ('ohe',OneHotEncoder(),['address'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',Ridge(alpha=0.0001))
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.9155415186155187 0.007489640955751008
0.5635281622900338


In [26]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildup_area']),
        ('cat',OrdinalEncoder(),['flat_type','which_floor','furnishing']),
        ('ohe',OneHotEncoder(),['address'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',SVR(kernel='rbf',C=3,epsilon=0.1,gamma='scale'))
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.930533499189826 0.007540979661673626
0.5091136446607928


In [27]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for GridSearchCV
param_grid = {
    'regressor__kernel': ['rbf',"linear","poly"],  # Focusing on rbf which is common for SVR
    'regressor__C': [1,5, 10, 100],
    'regressor__epsilon': [0.1, 0.5],
    'regressor__gamma': ['scale']
}


# Define the KFold cross-validator
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform GridSearchCV
# Use a smaller subset of the data
X_small, _, y_small, _ = train_test_split(X, y_transformed, test_size=0.8, random_state=42)

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kfold, scoring='r2', n_jobs=-1)
grid_search.fit(X_small, y_small)


# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)




Best parameters found:  {'regressor__C': 1, 'regressor__epsilon': 0.1, 'regressor__gamma': 'scale', 'regressor__kernel': 'rbf'}
Best cross-validation score:  nan


 nan nan nan nan nan nan]


In [28]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildup_area']),
        ('cat',OrdinalEncoder(),['flat_type','which_floor','furnishing']),
        ('ohe',OneHotEncoder(),['address'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',SVR(kernel='rbf',C=1,epsilon=0.1,gamma='scale'))
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.9283050910119991 0.006997333302999006
0.5284570376838722


In [29]:
X.head()

Unnamed: 0,flat_type,address,buildup_area,which_floor,furnishing,bedrooms,balcony
0,3 BHK Flat,Kanjurmarg West,1550,middle,Fully Furnished,3.0,1.0
1,1 BHK Flat,Palghar,630,lower,Unfurnished,1.0,2.0
2,1 BHK Flat,Mira Road East,580,middle,Semi Furnished,1.0,2.0
3,1 BHK Flat,Dahisar East,410,middle,Semi Furnished,1.0,1.0
4,3 BHK Flat,others,1052,middle,Semi Furnished,3.0,3.0


In [52]:
y[4]

2.25

In [49]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildup_area']),
        ('cat',OrdinalEncoder(),['flat_type','which_floor','furnishing']),
        ('ohe',OneHotEncoder(),['address'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',SVR(kernel='rbf',C=1,epsilon=0.1,gamma='scale'))
])
pipeline.fit(X,y_transformed)

In [7]:
import pandas as pd
import numpy as np
input = ['2 BHK Flat','Bhandup East',580,'lower','Fully Furnished',2,0]
a = pd.DataFrame([input],
             columns=X.columns)
np.expm1(pipeline.predict(a)[0])

1.3843257648445904

In [57]:
import pickle

pickle.dump(X,open('X.pkl','wb'))
pickle.dump(pipeline,open('pipeline.pkl','wb'))

In [None]:
import pickle
pipeline = pickle.load(open('pipeline.pkl','rb'))

In [6]:
X = pickle.load(open('X.pkl','rb'))

In [9]:
X['address'].unique()

array(['Kanjurmarg West', 'Palghar', 'Mira Road East', 'Dahisar East',
       'others', 'Kanjurmarg East', 'Byculla', 'Kurla West',
       'Mulund West', 'Sion', 'Powai', 'Lower Parel', 'Borivali East',
       'Malad West', 'Virar West', 'Goregaon West', 'Naigaon East',
       'Vasai East', 'Andheri West', 'Vikhroli East', 'Borivali West',
       'Goregaon East', 'Dadar East', 'Dadar West', 'Wadala East',
       'Govandi', 'Mulund East', 'Santacruz East', 'Chembur',
       'Kandivali East', 'Santacruz West', 'Vile Parle East',
       'Jogeshwari West', 'Nalasopara West', 'Worli', 'Kandivali West',
       'Prabhadevi', 'Matunga East', 'Bandra East', 'Parel',
       'Ghatkopar East', 'Andheri East', 'Jogeshwari East',
       'Dahisar West', 'Matunga West', 'Bhandup West', 'Khar West',
       'Ghatkopar West', 'Bhayandar East', 'Mumbai Central', 'Wadala',
       'Bandra West', 'Malad East', 'Nalasopara East', 'Vile Parle West',
       'Boisar', 'Vikhroli West', 'Kurla East', 'Thane West',

In [27]:
input = ['2 BHK Flat','Bhandup West' ,850,'lower','Semi Furnished',2,1]
a = pd.DataFrame([input],
             columns=X.columns)
np.expm1(pipeline.predict(a)[0])

1.529691926086305

In [12]:
X.sample(7)

Unnamed: 0,flat_type,address,buildup_area,which_floor,furnishing,bedrooms,balcony
2181,1 BHK Flat,Kandivali East,700,middle,Unfurnished,1.0,0.0
5838,2 BHK Flat,Lower Parel,1000,higher,Unfurnished,2.0,0.0
2819,1 BHK Flat,Jogeshwari West,550,middle,Unfurnished,1.0,0.0
4033,3 BHK Flat,Kurla West,2100,lower,Semi Furnished,3.0,2.0
7716,3 BHK Flat,Parel,1550,lower,Semi Furnished,3.0,0.0
2411,3 BHK Flat,Powai,2000,middle,Unfurnished,3.0,1.0
4426,1 BHK Flat,Mulund West,540,lower,Unfurnished,2.0,0.0
