# Project Name: Real Estate Price Prediction Project

## Model Selection

### Common Library Call

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

### Dataset Call

In [6]:
# calling the dataset
data=pd.read_csv("data_for_model_sel.csv")
data.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,Devarachikkanahalli,1250.0,2.0,2.0,40.0,2
1,Devarachikkanahalli,1200.0,2.0,2.0,83.0,2
2,Devarachikkanahalli,1170.0,2.0,2.0,40.0,2
3,Devarachikkanahalli,1425.0,2.0,2.0,65.0,3
4,Devarachikkanahalli,947.0,2.0,2.0,43.0,2


### Encoding

In [7]:
# here to process the data into a model we must get dummy varaibles for the column locations
dummies=pd.get_dummies(data["location"])

In [8]:
# we append those dummy variables to the original dataframe
df1=pd.concat([data,dummies.drop("others",axis=1)],axis=1)
df1.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,Devarachikkanahalli,1250.0,2.0,2.0,40.0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Devarachikkanahalli,1200.0,2.0,2.0,83.0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Devarachikkanahalli,1170.0,2.0,2.0,40.0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Devarachikkanahalli,1425.0,2.0,2.0,65.0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Devarachikkanahalli,947.0,2.0,2.0,43.0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# now we Have to drop the original location columns since its of No use now
df2=df1.drop("location",axis=1)
df2.head()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1250.0,2.0,2.0,40.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1200.0,2.0,2.0,83.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1170.0,2.0,2.0,40.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1425.0,2.0,2.0,65.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,947.0,2.0,2.0,43.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
df2.shape

(7276, 245)

### X and y constructions

In [11]:
# our entire dataframe will be our independent varaible excluding the price columns
X=df2.drop("price",axis=1)
X.head()

Unnamed: 0,total_sqft,bath,balcony,bhk,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1250.0,2.0,2.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1200.0,2.0,2.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1170.0,2.0,2.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1425.0,2.0,2.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,947.0,2.0,2.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# the price columns will be the dependent column
y=df2["price"]
y.head()

0    40.0
1    83.0
2    40.0
3    65.0
4    43.0
Name: price, dtype: float64

### Train-Test-Split

In [13]:
# now we will divide the data set into 80%  training data and 20% testing data as we need to see the accuracy of any selected
# model, after applying the model onto to data
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=101)

### Cross Validater

In [15]:
# setting up a cross validator that will apply cross validation technique on the model to geat an idea of accuracy
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv=ShuffleSplit(n_splits=10,test_size=0.2,random_state=101)

def cross_val_summary(cvs):
    df=pd.DataFrame(columns=["Mean","Std","Min","Max"])
    cv_mean=np.mean(cvs)
    cv_std=np.std(cvs)
    cv_min=np.min(cvs)
    cv_max=np.max(cvs)
    df.loc[len(df.index)]=[cv_mean,cv_std,cv_min,cv_max]
    return df

### Trying Linear Model 

In [16]:
from sklearn.linear_model import LinearRegression

cvs=cross_val_score(LinearRegression(),X,y,cv=cv)
cross_val_summary(cvs)

Unnamed: 0,Mean,Std,Min,Max
0,0.859081,0.019077,0.811624,0.887092


### Searching Over Algorithms

In [17]:
# here we will not go over different different models and check for which model we get the Best accuracy
# we will automate the system by Grid search CV 
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model(X,y):
    algo={
        "linear_regression":{
            "model":LinearRegression(),
            "params":{
                "normalize":[True,False]
            }
        },
        "lasso_regression":{
            "model":Lasso(),
            "params":{
                "alpha":[1,2],
                "selection":["random","cyclic"],
            }
        },
        "decision_tree":{
            "model":DecisionTreeRegressor(),
            "params":{
                "criterion":["mse","Friedman_mse"],
                "splitter":["random","best"]      
            }
        }
    }
    cv=ShuffleSplit(n_splits=5,test_size=0.2,random_state=101)
    score=[]
    for algo_name,config in algo.items():
        gs=GridSearchCV(config["model"],config["params"],cv=cv,return_train_score=False)
        gs.fit(X,y)
        score.append({
            "model":algo_name,
            "best_score":gs.best_score_,
            "best_params":gs.best_params_
        })
    return pd.DataFrame(score,columns=["model","best_score","best_params"])

result=find_best_model(X,y)
result

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.868581,{'normalize': False}
1,lasso_regression,0.718093,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.732793,"{'criterion': 'mse', 'splitter': 'best'}"


In [13]:
# here one can see that for the LinearRegression The accuracy is best

### Final Model

In [18]:
# selecting linear regression as our final model
lin_model=LinearRegression(**result["best_params"][0])
lin_model.fit(X,y)

### Prediction

In [29]:
# prediction function when a single record is provided
def predict_price(sqft,bath,balcony,bhk,location,columnNames):
    x=np.zeros(len(columnNames))
    x[0]=sqft
    x[1]=bath
    x[2]=balcony
    x[3]=bhk
    
    location_index=np.where(columnNames==location)[0][0]
    if location_index>=0:
        x[location_index]=1
    
    return lin_model.predict([x])[0]

In [30]:
# testing the prediction model
predict_price(1000,2,1,3,"1st Phase JP Nagar",X.columns)

84.8430564971884

### Saving informations

In [23]:
# we have to save the selected model  
import pickle
with open("Bengaluru_realestate_price_model.pkl","wb") as f:
    pickle.dump(lin_model,f)

In [24]:
# also we need the column names to be stored
# we will store all the column name into small case format
import json
columns=[loc.lower() for loc in X.columns]
with open("columns.json","w") as f:
    f.write(json.dumps({"columns":columns}))