In [33]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [34]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
matplotlib.rcParams["figure.figsize"]=(20,10)

In [35]:
# Load the house-price CSV from the Kaggle input directory and preview the first rows.
df=pd.read_csv("../input/house-prediction-dataset/Bengaluru_House_Data.csv")
df.head()

In [36]:
df.shape

# Data Cleaning

In [37]:
# Keep only the columns used downstream; the four columns listed here are discarded.

df2 = df.drop(columns=['area_type', 'society', 'balcony', 'availability'])
df2.head()

In [38]:
# Counting the number of missing values in the dataset

df2.isnull().sum()


In [39]:
# Drop every row that has a missing value in any of the remaining columns,
# then re-count nulls per column to confirm none are left.

df3=df2.dropna()
df3.isnull().sum()

In [40]:
df3.shape

In [41]:
df3.head()

In [42]:
df3['size'].unique()

In [43]:
# Derive the bedroom count from the leading integer of `size` (the text before the first space).
# `assign` returns a new frame instead of writing a column into the result of dropna(),
# which can trigger pandas' SettingWithCopyWarning; re-running this cell is also idempotent.
df3 = df3.assign(bhk=df3['size'].str.split(' ').str[0].astype(int))

In [44]:
df3.head()

In [45]:
df3['bhk'].unique()

In [46]:
df3[df3.bhk>20]

In [47]:
df3['total_sqft'].unique()

In [48]:
# Show the rows whose total_sqft is not a plain number (for example a 'low - high' range)

def isfloat(x):
    """Return True if ``x`` can be converted with ``float()``, otherwise False.

    Parameters
    ----------
    x : object
        Any value; typically a ``total_sqft`` entry read from the CSV.

    Returns
    -------
    bool
        False when ``float(x)`` rejects the value, True otherwise.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric text; TypeError: None or an unsupported type;
        # OverflowError: an int too large for a float. Anything else (for example
        # KeyboardInterrupt) is deliberately allowed to propagate.
        return False
    return True

df3[~df3['total_sqft'].apply(isfloat)].head()

In [49]:
# Convert total_sqft to a number; a 'low-high' range is replaced by its midpoint

def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``"1200"``) or a two-part range separated by a
        single ``-`` (``"2100 - 2850"``).

    Returns
    -------
    float or None
        The numeric value, the midpoint of a range, or ``None`` when the entry
        is neither (for example text with a unit suffix).
    """
    # Plain numbers first, so signed or exponent forms such as "-5" or "1e-5"
    # are not mistaken for a range just because they contain a '-'.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Range "low-high": both parts must parse, otherwise the entry is unusable.
    # The unpacking raises ValueError unless there are exactly two parts;
    # AttributeError covers non-string input that has no .split().
    try:
        low, high = x.split('-')
        return (float(low) + float(high)) / 2
    except (AttributeError, ValueError):
        return None
    
    
# Build df4 from df3 with total_sqft passed through convert_sqft_to_num; df3 itself is left untouched.
df4 = df3.assign(total_sqft=df3['total_sqft'].apply(convert_sqft_to_num))
df4.head()

In [50]:
df4['total_sqft'].unique()

In [51]:
# Looking for the missing values

df4.isnull().sum()

In [52]:
# Drop the rows that still contain a missing value, e.g. total_sqft entries for which
# convert_sqft_to_num returned None.
df4=df4.dropna()

In [53]:
# Price per square foot for every row: price scaled by 100000, divided by total_sqft.
# NOTE(review): the 100000 factor presumably converts a price quoted in lakhs — confirm against the dataset.

df5 = df4.assign(price_per_sqft=df4['price'] * 100000 / df4['total_sqft'])
df5.head()

In [54]:
# Number of rows per location (indexed by location name).
location_data=df5["location"].value_counts()

In [55]:
# Select the locations that occur 10 times or fewer. Despite the variable name, the
# comparison is `<=`, so a location with exactly 10 rows is included as well.

location_count_lessthan10=location_data[location_data<=10]
location_count_lessthan10

In [56]:
def handle_location_column(value):
    """Return 'others' for a location found in ``location_count_lessthan10``.

    The membership test uses ``in`` on that object; for a pandas Series this
    checks the index labels (here the location names), not the counts.
    Any other value is returned unchanged.
    """
    return 'others' if value in location_count_lessthan10 else value
    
# Relabel the infrequent locations and show the resulting distribution.
df5['location'] = df5['location'].map(handle_location_column)
df5['location'].value_counts()

In [57]:
df5.head()

In [58]:
# Inspect outliers and anomalies in the total_sqft column:
# show every row with less than 300 sqft per bedroom (total_sqft / bhk < 300).

df5[(df5.total_sqft/df5.bhk<300)]

In [59]:
# Keep only the rows that are NOT flagged by the 300-sqft-per-bedroom rule.

df6 = df5.loc[~((df5['total_sqft'] / df5['bhk']) < 300)]
df6.shape


In [60]:
# Finding outliers for price_per_sqft column

df6.price_per_sqft.describe()

In [61]:
# Scratch arithmetic (evaluates to 19); the result is not assigned to any name.
3+8*2

In [62]:
def remove_pps_outliers(df, n_std=3):
    """Drop rows whose ``price_per_sqft`` is far from their location's mean.

    For each ``location`` group, the mean and the population standard deviation
    (``ddof=0``, the same convention as ``np.std``) of ``price_per_sqft`` are
    computed, and only rows within ``n_std`` standard deviations of the mean
    are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.
    n_std : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The kept rows, ordered group by group (groups in the order ``groupby``
        yields them), with a fresh 0..n-1 index. Empty input yields an empty
        frame with the same columns.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = pps.mean()
        st = pps.std(ddof=0)
        lower, upper = m - n_std * st, m + n_std * st
        # Both bounds are inclusive: with a strict lower bound a group whose
        # prices are all identical (st == 0, including single-row groups)
        # would lose every row, because no value is strictly greater than m.
        kept.append(subdf[(pps >= lower) & (pps <= upper)])

    if not kept:
        # pd.concat rejects an empty list; preserve the input's columns instead.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once rather than growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)
# Apply the per-location price_per_sqft filter and report the resulting shape.
df7 = remove_pps_outliers(df6)
df7.shape

In [66]:
# Histogram of price_per_sqft for the rows that survived the per-location filter.

plt.rcParams["figure.figsize"] = (20, 10)
plt.hist(df7["price_per_sqft"], rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

In [67]:
# Outlier Removal Using Bathrooms Feature

df7.bath.unique()

In [69]:

df7[df7.bath>10]

### It is unusual for a home to have two or more bathrooms beyond its number of bedrooms

In [71]:
# Show listings with at least two more bathrooms than bedrooms (bath >= bhk + 2).
# `>=` rather than `>` so this preview lists exactly the rows that the
# `bath < bhk + 2` filter in the following cell removes.
df7[df7.bath >= df7.bhk + 2]

In [72]:
# Keep rows with fewer than bhk + 2 bathrooms. The comparison is strict, so rows where
# bath == bhk + 2 are removed as well.
df8 = df7[df7.bath<df7.bhk+2]
df8.shape

In [73]:
# `size` has been replaced by the numeric `bhk` column, and `price_per_sqft` was only
# needed for outlier detection, so both are removed before modelling.

df9 = df8.drop(columns=['size', 'price_per_sqft'])
df9.head(3)

# Using One Hot Encoding for Location Data

In [75]:
# One indicator column per distinct value of `location`.
dummies=pd.get_dummies(df9.location)
dummies.head()

In [76]:
# Attach the location indicators to the frame, leaving out the 'others' column so that
# category acts as the baseline (avoids multicollinearity among the dummy columns).

df10 = pd.concat([df9,dummies.drop('others',axis=1)],axis='columns')
df10.head()

In [78]:
# The text `location` column is now represented by the indicator columns, so remove it.
df11 = df10.drop(columns='location')
df11.head()

In [79]:
df11.shape

# Building the Model

In [83]:
# Features: every column except the target. Target: the `price` column.
X = df11.drop(columns=['price'])
y = df11['price']

In [84]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=10)

In [85]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split, then evaluate it on the held-out split.
# For a scikit-learn regressor, `score` returns the coefficient of determination (R^2).
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

# Testing the model

In [87]:
def predict_price(location,sqft,bath,bhk):
    """Predict the price of one home with the fitted ``lr_clf`` model.

    Parameters
    ----------
    location : str
        Location name. If ``X`` has an indicator column with this name it is
        set to 1. Otherwise every indicator stays 0, which is the encoding of
        the baseline 'others' category whose dummy column was dropped before
        training, so rare or unknown locations are predicted as 'others'
        instead of raising IndexError.
    sqft : float
        Total area, written to the first feature column.
    bath : float
        Number of bathrooms, written to the second feature column.
    bhk : float
        Number of bedrooms, written to the third feature column.

    Returns
    -------
    float
        The model's prediction, in the same unit as the training ``price``.
    """
    # NOTE(review): the positional writes assume X's first three columns are
    # total_sqft, bath, bhk in that order — confirm if the pipeline changes.
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    # Positions of the matching indicator column; empty when there is none.
    matches = np.flatnonzero(X.columns == location)
    if matches.size > 0:
        x[matches[0]] = 1

    # Predict from a one-row frame carrying X's column names, matching how the
    # model was fitted (a bare list has no feature names).
    features = pd.DataFrame([x], columns=X.columns)
    return lr_clf.predict(features)[0]

In [88]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

In [89]:
predict_price('1st Phase JP Nagar',1000, 3, 3)

# Export the tested model to a pickle file

In [None]:
# import pickle
# with open('banglore_home_prices_model.pickle','wb') as f:
#     pickle.dump(lr_clf,f)

In [None]:
# import json
# columns = {
#     'data_columns' : [col.lower() for col in X.columns]
# }
# with open("columns.json","w") as f:
#     f.write(json.dumps(columns))