In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import matplotlib.pyplot as plt

In [4]:
from google.colab import files
uploaded = files.upload()
import io

ModuleNotFoundError: No module named 'google'

In [None]:
##change the size of a figure in matplotlib in which the width and height ratio will be 20:10
plt.rcParams["figure.figsize"] = [20, 10]

In [None]:
df1=pd.read_csv(io.BytesIO(uploaded["Bengaluru_House_Data.csv"]))

In [None]:
df1.head()

In [None]:
df1.shape

In [None]:
#count samples in each of area type categories
df1.groupby('area_type')['area_type'].agg('count')

In [None]:
##Drop certain columns which are not so important in determining the price
df2=df1.drop(['availability','society','balcony','area_type'],axis='columns')

In [None]:
df2.head()

In [None]:
##Tells you the number of rows where a particular column value is NA
df2.isnull().sum()

In [None]:
##Dropping all the rows where data is NA
df3=df2.dropna()
df3.isnull().sum()

In [None]:
##Returns a pandas series which contains all the unique values of the column size
df3['size'].unique()

In [None]:
##Basically BHK and bedroom is same so we split the string on the basis of space and take the first token i.e the number and keep in a new column
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))


In [None]:
df3.head()

In [None]:
df3['bhk'].unique()

In [None]:
df3[df3['bhk']>20]

In [None]:
df3['total_sqft'].unique()

In [None]:
## function to Determine whether a given value is float or not in the total_sqft column

def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [None]:
## ~(negate)operation which returns the rows which are not float
df3[~df3['total_sqft'].apply(is_float)].head()

In [None]:
##Function which splits those rows having sqft values in range on the basis of '_' and takes the mean of the two tokens
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [None]:
##Create a new data frame and apply this function on the total_sqft column
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)
df4.head()

In [None]:
df4.head(10)

In [None]:
#Feature engineering: Add new feature called price_per_sqft

In [None]:
df5=df4.copy()
df5['price_per_sqft']=df5['price']*100000/df5['total_sqft']
df5.head()

In [None]:
len(df5['location'].unique())

In [None]:
##Strip extra leading or trailing spaces from location to make it clean
df5['location'] = df5['location'].apply(lambda x : x.strip())
##Grouping and arranging the data points in ascending order
location_stats = df5.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats

In [None]:
len(location_stats[location_stats<=10])

In [None]:
#Grouping locations having less than 10 data points (rows) into a category called 'other'
#Dimensionality Reduction
#Any location having less than 10 data points should be tagged as "other" location. This way number of categories can be reduced by huge amount. Later on when we do one hot encoding, it will help us with having fewer dummy columns

location_stats_less_than_10 = location_stats[location_stats<=10]

In [None]:
df5['location'] = df5['location'].apply(lambda x: 'other' if x in location_stats_less_than_10 else x)

In [None]:
len(df5['location'].unique())

In [None]:
df5.head(10)

In [None]:
##Outlier Removal Using Business Logic
##As a data scientist when you have a conversation with your business manager (who has expertise in real estate), he will tell you that normally square ft per bedroom is 300 (i.e. 2 bhk apartment is minimum 600 sqft. If you have for example 400 sqft apartment with 2 bhk than that seems suspicious and can be removed as an outlier. We will remove such outliers by keeping our minimum thresold per bhk to be 300 sqft

In [None]:
df5[df5['total_sqft']/df5['bhk']<300]

In [None]:
#Drop all the rows simply using negation symbol

In [None]:
df6 = df5[~(df5['total_sqft']/df5['bhk']<300)]

In [None]:
df5.shape

In [None]:
df6.shape

In [None]:
df6['price_per_sqft'].describe()

In [None]:
#Removing the extreme cases i.e the properties having extremely high and extremely low prices because we are buiding a generic model

In [None]:
#Function which removes outliers per location using mean and one standard deviation
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out
df7 = remove_pps_outliers(df6)
df7.shape

In [None]:
#Function which visualises how many data points are there in which the prices of 2BHK apartments are more than that of 3 BHKs

In [None]:

def plot_scatter_chart(df,location):
    bhk2 = df[(df.location==location) & (df.bhk==2)]
    bhk3 = df[(df.location==location) & (df.bhk==3)]
    plt.rcParams['figure.figsize'] = (15,10)
    plt.scatter(bhk2.total_sqft,bhk2.price,color='blue',label='2 BHK', s=50)
    plt.scatter(bhk3.total_sqft,bhk3.price,marker='+', color='green',label='3 BHK', s=50)
    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price (Lakh Indian Rupees)")
    plt.title(location)
    plt.legend()
    
plot_scatter_chart(df7,"Rajaji Nagar")



We should also remove properties where for same location, the price of (for example) 3 bedroom apartment is less than 2 bedroom apartment (with same square ft area). What we will do is for a given location, we will build a dictionary of stats per bhk, i.e.

{
    '1' : {
        'mean': 4000,
        'std: 2000,
        'count': 34
    },
    '2' : {
        'mean': 4300,
        'std: 2300,
        'count': 22
    },    
}

In [None]:
#Now we can remove those 2 BHK apartments whose price_per_sqft is less than mean price_per_sqft of 1 BHK apartment

In [None]:
def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')
df8 = remove_bhk_outliers(df7)
# df8 = df7.copy()
df8.shape

In [None]:
plot_scatter_chart(df8,"Rajaji Nagar")

In [None]:
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(df8.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

In [None]:
df8['bath'].unique()

In [None]:
df8[df8['bath']>10]

In [None]:

plt.hist(df8.bath,rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")

In [None]:
df8[df8.bath>df8.bhk+2]

In [None]:
df9 = df8[df8.bath<df8.bhk+2]
df9.shape

In [None]:
df10 = df9.drop(['size','price_per_sqft'],axis='columns')

In [None]:
df10.shape

 Undergoing one hot encoding/dummy method to convert the location column which is a categorical feature into numberts

In [None]:
##For each of the locations it will create a new column
dummies = pd.get_dummies(df10['location'])
dummies.head(13)

In [None]:
##Concatinating the two data frames dummies and df10 along columns and removing one column in order to avoid dummy variable trap
df11 = pd.concat([df10,dummies.drop('other',axis='columns')],axis='columns')
df11.head(3)

In [None]:
df12=df11.drop('location',axis = 'columns')
df12.shape

In [None]:
##Dropping dependent variable from the data frame
X = df12.drop('price',axis = 'columns')
X.head()

In [None]:
y = df12['price']
y.head(3)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

In [None]:
df12.shape

In [None]:
X.shape

In [None]:
y.shape

In [None]:
##Use K Fold cross validation to measure accuracy of our LinearRegression model

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

In [None]:
##Find best algorithm for our data set using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

In [None]:
##Test the model for few properties

In [None]:
def predict_price(location,sqft,bath,bhk):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1

    return lr_clf.predict([x])[0]

In [None]:
predict_price("Indira Nagar",1000,3,3)

In [None]:
##Export the tested model to a pickle file and it will be used by flask server

In [None]:
import pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

In [None]:
##Export location and column information to a file that will be useful later on in our prediction application

In [None]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))