In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the raw housing CSV from the Kaggle input directory and preview the first rows.
df1= pd.read_csv("../input/bengaluru-house-price-data/Bengaluru_House_Data.csv")
df1.head()

In [None]:
# Count the missing values in every column of the raw frame.
df1.isnull().sum()

In [None]:
# Build the working frame from the raw one: discard three columns that are
# not used further, then discard every row that still has a missing value
# (only about 3 to 5 percent of the data points are affected).

df2 = (
    df1
    .drop(columns=['area_type', 'availability', 'society'])
    .dropna()
    .reset_index(drop=True)
)
df2.head(10)

In [None]:
# preprocessing size column
# list the distinct raw values before converting them

df2['size'].unique()

In [None]:
# Keep only the text before the first space of each size string and store
# it as a float.

df2['size'] = df2['size'].str.split(' ').str[0].astype(float)
df2.head(10)

In [None]:
# preprocessing total_sqft attribute
# analysing what types of values are there and displaying only range like values

def is_float(x):
    """Return True when ``x`` can be converted with ``float()``, else False.

    Parameters
    ----------
    x : object
        Value to probe, e.g. one raw ``total_sqft`` entry.

    Returns
    -------
    bool
        True if ``float(x)`` succeeds, False if the conversion fails.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # Only a failed conversion means "not a float". A bare ``except``
        # would also swallow KeyboardInterrupt and unrelated errors.
        return False

    return True

# ~ is the element-wise negation of a boolean mask, so this shows the rows
# whose total_sqft value is NOT accepted by is_float

df2[~df2['total_sqft'].apply(is_float)]

In [None]:
# converting range like values in string to numeric values
# range value is converted to mean value

# values with different units (eg: "50 sq meters") converted to none so that
# later it is removed from the dataframe

def convertSqftToNum(x):
    """Convert one raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        A plain number ("2100"), a range ("1133 - 1384"), or anything else
        (for example a value carrying a unit).

    Returns
    -------
    float or None
        The number itself, the mean of the two ends of a range, or None when
        the entry cannot be interpreted as either.
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Two tokens that are not both numbers (e.g. a range with
                # units, or a leading minus sign): fall through to the plain
                # conversion instead of raising.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None
    
# Convert every total_sqft entry, then discard the rows left with a missing
# value (entries that could not be converted come back as None).
df2['total_sqft'] = df2['total_sqft'].apply(convertSqftToNum)
df2 = df2.dropna().reset_index(drop=True)
df2.head(10)

In [None]:
# Trim surrounding whitespace from the location names, then count the rows
# per location, listed from the least to the most frequent.

df2['location'] = df2['location'].str.strip()
location_count = df2.groupby("location")["location"].count().sort_values()
location_count

In [None]:
# there are too many unique values in location which is not great for any conversion
# methods like one-hot encoding.

# so the check_location function groups the rare ones together, based on the number
# of data points per location, using a threshold

def check_location(x, threshold=10, counts=None):
    """Map a rarely seen location to the shared 'others' bucket.

    Parameters
    ----------
    x : str
        Location name to classify.
    threshold : int, optional
        Locations with at most this many rows are bucketed. Defaults to 10,
        the value that was previously hard-coded.
    counts : mapping or pandas.Series, optional
        Rows per location; any object with a ``.get(key, default)`` method.
        Defaults to the notebook-level ``location_count``.

    Returns
    -------
    str
        'others' when the location has ``threshold`` rows or fewer (or is
        absent from ``counts``), otherwise ``x`` unchanged.
    """
    if counts is None:
        counts = location_count
    # A location absent from the counts has no rows, so it is rare as well.
    if counts.get(x, 0) <= threshold:
        return 'others'
    return x
    
# Bucket every row's location, then recount the rows per location.
df2['location'] = df2['location'].map(check_location)
location_count = df2.groupby("location")["location"].agg("count")
location_count

In [None]:
# preview the frame after the location values were rewritten
df2.head(20)

## Outlier Detection

In [None]:
# A home with under 300 sqft per bedroom is treated as implausible; list
# the rows that fall below that threshold.

df2.loc[df2["total_sqft"].div(df2["size"]).lt(300)]

In [None]:
# These anomalies are few compared with the whole data set, so drop them.

df2 = df2.loc[~df2["total_sqft"].div(df2["size"]).lt(300)]

In [None]:
# Add a price_per_sqft column; it exists only to support outlier removal.
# Within each location, only the rows whose price_per_sqft lies within one
# standard deviation of that location's mean are retained.
# NOTE(review): price * 100000 presumably converts lakhs to rupees — confirm units.

# df2 comes out of a boolean filter, so assigning a column on it in place can
# raise SettingWithCopyWarning; assign() returns a fresh frame instead.
df2 = df2.assign(price_per_sqft=df2["price"] * 100000 / df2["total_sqft"])

def remove_pps_outliers(df):
    """Drop price-per-sqft outliers separately within each location.

    For every ``location`` group, only the rows whose ``price_per_sqft`` lies
    strictly within one (population) standard deviation of the group mean
    are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The retained rows, grouped by location in sorted order, with a fresh
        0..n-1 index. An input with no rows yields an empty frame that keeps
        the input's columns.
    """
    kept = []
    for _, subdf in df.groupby("location"):
        pps = subdf["price_per_sqft"]
        m = pps.mean()
        sd = pps.std(ddof=0)  # population std, as np.std computes it
        if sd == 0:
            # No spread (e.g. a single listing) means there is nothing to
            # flag; the strict inequalities below would drop the whole group.
            kept.append(subdf)
            continue
        inliers = subdf[(pps > (m - sd)) & (pps < (m + sd))]
        if not inliers.empty:
            kept.append(inliers)
    if not kept:
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once: concat inside the loop is quadratic, and starting
    # from an empty DataFrame is deprecated in recent pandas.
    return pd.concat(kept, ignore_index=True)

# print the shape before and after removing the price_per_sqft outliers
print(df2.shape)
df2=remove_pps_outliers(df2)
print(df2.shape)

In [None]:
# Bathroom-based outlier removal: the threshold is n+1 bathrooms for n
# bedrooms, so only rows with fewer than size + 2 bathrooms are kept.

df2 = df2.loc[df2["bath"] < df2["size"] + 2]
print(df2.shape)

# price_per_sqft was added only for outlier removal and is redundant now
df2 = df2.drop(columns="price_per_sqft").reset_index(drop=True)
df2

In [None]:
# column dtypes and non-null counts of the cleaned frame
df2.info()

In [None]:
# One-hot encode the location names and put the indicator columns in place
# of the text column; the balcony column is dropped as well.
dummies = pd.get_dummies(df2["location"])
df3 = pd.concat(
    [df2.drop(columns=["location", "balcony"]), dummies],
    axis=1,
).reset_index(drop=True)
df3.head()

In [None]:
# rows and columns of the encoded frame
print(df3.shape)

In [None]:
# Target is the price column; the features are every other column.
y = df3['price']
X = df3.drop(columns=['price'])

print(len(X), len(y))

## Model Building

In [None]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Candidate models with the hyper-parameter grids to search:
# 2 + 4 + 4 = 10 parameter combinations in total.

algos={

    "linear_regression":{
        "model":LinearRegression(),
        "params":{
            # 'normalize' was removed from LinearRegression in scikit-learn
            # 1.2 and makes the grid search fail there, so the grid searches
            # 'fit_intercept' instead.
            'fit_intercept':[True,False]
        }
    },

    "lasso":{
        "model":Lasso(),
        "params":{
            'alpha':[1,2],
            'selection':['random','cyclic']
        }
    },

    "decision_tree":{
        "model":DecisionTreeRegressor(),
        "params":{
            # The squared-error criterion is spelled 'mse' before
            # scikit-learn 1.0 and 'squared_error' from 1.0 on ('mse' was
            # removed in 1.2). Reading the estimator's default gives the
            # spelling the installed version accepts.
            'criterion':[DecisionTreeRegressor().criterion,'friedman_mse'],
            'splitter':['best','random']

        }
    }

}


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

# Grid-search every candidate over five shuffled splits (25% held out each
# time) and tabulate the best score and parameters found per model.
cv = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
scores = []

for algo_name, algo in algos.items():
    gs = GridSearchCV(algo['model'], algo['params'], cv=cv, return_train_score=False)
    gs.fit(X, y)
    scores.append(dict(
        model=algo_name,
        best_score=gs.best_score_,
        best_params=gs.best_params_,
    ))

scores = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
scores

In [None]:
# From the scores above, linear regression is the best of the models that
# were trained, so it is fitted on the full data set.
# The 'normalize' argument was removed from LinearRegression in scikit-learn
# 1.2 and raises TypeError there. It only rescaled the features before the
# fit, which leaves ordinary-least-squares predictions unchanged up to
# floating-point error, so it is simply omitted.

mdl = LinearRegression()
mdl.fit(X,y)

def predict_price(location,sqft,bath,bhk):
    """Predict the price of one home with the fitted model ``mdl``.

    Parameters
    ----------
    location : str
        Location name. Surrounding whitespace is ignored. A name that has no
        indicator column of its own is treated as 'others', the bucket the
        rare locations were merged into before training.
    sqft : float
        Total area in square feet.
    bath : float
        Number of bathrooms.
    bhk : float
        Number of bedrooms (the ``size`` feature).

    Returns
    -------
    The result of ``mdl.predict`` for the single-row input.

    Raises
    ------
    ValueError
        If the location is unknown and ``X`` has no 'others' column either.
    """
    feature_names = list(X.columns)

    # creating x instance from the given details
    # assumes the first three columns of X are size, total_sqft and bath, in
    # that order (the positional layout this function has always relied on)
    x = np.zeros(len(feature_names))
    x[0] = bhk
    x[1] = sqft
    x[2] = bath

    # Look only among the indicator columns so that a location string can
    # never match one of the three numeric feature columns.
    location_columns = feature_names[3:]
    key = location.strip() if isinstance(location, str) else location
    if key not in location_columns:
        key = 'others'
    if key not in location_columns:
        raise ValueError(
            "unknown location %r and no 'others' column to fall back on" % (location,)
        )
    x[feature_names.index(key, 3)] = 1

    # Pass a frame carrying the training column names: mdl was fitted on a
    # DataFrame, and a bare array makes scikit-learn warn about missing
    # feature names.
    return mdl.predict(pd.DataFrame([x], columns=X.columns))
 

In [None]:
# example: location "Indira Nagar", 1000 sqft, 2 bathrooms, 2 bedrooms
predict_price("Indira Nagar",1000,2,2)

In [None]:
# example: location "1st Block Jayanagar", 1000 sqft, 2 bathrooms, 2 bedrooms
predict_price("1st Block Jayanagar",1000,2,2)

## Exporting model

In [None]:
import pickle

# Serialise the fitted model to a pickle file in the working directory.
with open('house_price_prediction.pickle', mode='wb') as model_file:
    pickle.dump(mdl, model_file)

In [None]:
import json

# Save the feature column names, in order, for the server-side code to use.
columns = {'data_columns': list(X.columns)}

with open('house_prediction_columns.json', 'w') as columns_file:
    json.dump(columns, columns_file)