In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xg
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('data_for_model.csv')

In [3]:
X = df.drop(columns=['flat_price'],axis=1)
y = df['flat_price']

In [4]:
X

Unnamed: 0,flat_type,location1,buildupArea_sqft,age_of_property,furnishing,bedrooms,bathrooms,balcony,parking
0,2 BHK Flat,Mira Road East,1060,recent construction,Semi Furnished,2,2,0,yes
1,1 BHK Flat,Borivali East,525,mid age property,Semi Furnished,1,2,0,no
2,2 BHK Flat,Bhandup West,1065,new construction,Semi Furnished,2,2,1,yes
3,3 BHK Flat,Bhandup West,1150,recent construction,Semi Furnished,3,2,0,yes
4,1 BHK Flat,Wadala East,480,new construction,Unfurnished,1,2,0,yes
...,...,...,...,...,...,...,...,...,...
7478,1 BHK Flat,Malad West,550,modern property,Unfurnished,1,1,0,no
7479,1 BHK Flat,Nalasopara West,700,recent construction,Unfurnished,1,1,0,yes
7480,1 BHK Flat,Malad West,650,new construction,Unfurnished,1,2,0,yes
7481,2 BHK Flat,Bhandup West,950,recent construction,Unfurnished,2,2,2,yes


In [5]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildupArea_sqft','bedrooms','bathrooms','balcony']),
        ('cat',OrdinalEncoder(),['flat_type','parking','age_of_property','furnishing','parking']),
        ('ohe',OneHotEncoder(),['location1'])
    ],remainder='passthrough'
)

In [6]:
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
])

In [7]:
kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')

In [8]:
score.mean()

0.8447734770427383

In [5]:
y_transformed = np.log1p(y)

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildupArea_sqft','bedrooms','bathrooms','balcony']),
        ('cat',OrdinalEncoder(),['flat_type','parking','age_of_property','furnishing','parking']),
        ('ohe',OneHotEncoder(),['location1'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

0.917551194996013 0.008074888259299235


In [11]:
X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [12]:
pipeline.fit(X_train,y_train)

In [13]:
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
mean_absolute_error(np.expm1(y_test),y_pred)

0.4949232879379806

In [13]:
# random forest
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildupArea_sqft','bedrooms','bathrooms','balcony']),
        ('cat',OrdinalEncoder(),['flat_type','parking','age_of_property','furnishing','parking']),
        ('ohe',OneHotEncoder(),['location1'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',RandomForestRegressor(n_estimators=150))
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.9161500057224646 0.006523317391574304
0.5109201155162906


In [14]:
# support vector machine
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildupArea_sqft','bedrooms','bathrooms','balcony']),
        ('cat',OrdinalEncoder(),['flat_type','parking','age_of_property','furnishing','parking']),
        ('ohe',OneHotEncoder(),['location1'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',SVR())
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.9266212150382371 0.004657720264716049
0.48310923214626506


In [15]:
# Gradient Bossting
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildupArea_sqft','bedrooms','bathrooms','balcony']),
        ('cat',OrdinalEncoder(),['flat_type','parking','age_of_property','furnishing','parking']),
        ('ohe',OneHotEncoder(),['location1'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',GradientBoostingRegressor(n_estimators=100))
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.8828888679307202 0.006607482944678712
0.6133618024831997


In [24]:
# Ridge
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildupArea_sqft','bedrooms','bathrooms','balcony']),
        ('cat',OrdinalEncoder(),['flat_type','parking','age_of_property','furnishing','parking']),
        ('ohe',OneHotEncoder(),['location1'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',Ridge(alpha=0.0001))
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.9174944246428274 0.008055108592808823
0.49518914699151767


In [18]:
# decision tree
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildupArea_sqft','bedrooms','bathrooms','balcony']),
        ('cat',OrdinalEncoder(),['flat_type','parking','age_of_property','furnishing','parking']),
        ('ohe',OneHotEncoder(),['location1'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',DecisionTreeRegressor())
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.863548458147472 0.008131613777667487
0.6270428145919493


In [19]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildupArea_sqft','bedrooms','bathrooms','balcony']),
        
        ('ohe',OneHotEncoder(),['location1','flat_type','parking','age_of_property','furnishing'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',xg.XGBRFRegressor(objective ='reg:linear',n_estimators = 100))
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))



0.8191688210266881 0.012051608542453005
0.7469919473987942




In [27]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildupArea_sqft','bedrooms','bathrooms','balcony']),
        ('cat',OrdinalEncoder(),['flat_type','furnishing']),
        ('ohe',OneHotEncoder(),['location1','parking','age_of_property'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.9182652107950238 0.007983961109307295
0.4933184097534738


In [33]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['buildupArea_sqft','bedrooms','bathrooms','balcony']),
        ('cat',OrdinalEncoder(),['flat_type','furnishing']),
        ('ohe',OneHotEncoder(),['location1','parking','age_of_property'])
    ],remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',SVR(kernel='rbf',C=3,epsilon=0.1,gamma='scale'))
])

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
print(score.mean(),score.std())

X_train, X_test, y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
print(mean_absolute_error(np.expm1(y_test),y_pred))

0.9303541674958297 0.004679064757509158
0.4486041248480058


In [12]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for GridSearchCV
param_grid = {
    'regressor__kernel': ['rbf'],  # Focusing on rbf which is common for SVR
    'regressor__C': [1,5, 10, 100],
    'regressor__epsilon': [0.1, 0.5],
    'regressor__gamma': ['scale']
}


# Define the KFold cross-validator
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform GridSearchCV
# Use a smaller subset of the data
X_small, _, y_small, _ = train_test_split(X, y_transformed, test_size=0.8, random_state=42)

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kfold, scoring='r2', n_jobs=-1)
grid_search.fit(X_small, y_small)


# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)




Best parameters found:  {'regressor__C': 1, 'regressor__epsilon': 0.1, 'regressor__gamma': 'scale', 'regressor__kernel': 'rbf'}
Best cross-validation score:  nan




In [51]:
X.sample(4)

Unnamed: 0,flat_type,location1,buildupArea_sqft,age_of_property,furnishing,bedrooms,bathrooms,balcony,parking
4742,3 BHK Flat,Parel,1468,new construction,Semi Furnished,3,3,0,yes
4801,2 BHK Flat,Mulund West,755,mid age property,Unfurnished,2,2,0,yes
5933,3 BHK Flat,Sion,1596,recent construction,Semi Furnished,3,2,2,yes
840,3 BHK Flat,Powai,1470,recent construction,Semi Furnished,3,2,0,yes


In [34]:
# train on full data
pipeline.fit(X,y_transformed)

In [80]:
input = ['1 BHK Flat','Mulund West',600,'recent construction','Fully Furnished',1,1,1,'yes']
a = pd.DataFrame([input],
             columns=X.columns)

In [81]:
np.expm1(pipeline.predict(a)[0])

0.9929375646131227

In [77]:
np.expm1(pipeline.predict(a)[0])+np.expm1(pipeline.predict(a)[0])*0.05

0.43300104546906176

In [66]:
5.54830216350218 +0.277415108175109

5.82571727167729

In [53]:
import pickle

pickle.dump(X,open('df.pkl','wb'))
pickle.dump(pipeline,open('pipeline.pkl','wb'))

In [79]:
pickle.dump(X,open('df.pkl','wb'))