In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Data Preprocessing**

In [None]:
import pandas as pd
import numpy as np
from matplotlib.pyplot import plot
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"]=(20,10)
import tensorflow as tf

In [None]:
# Load the Bengaluru house-price CSV from the Kaggle input directory into `df`,
# the raw frame every later df1..dfN is derived from.
df=pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# (rows, columns) of the raw data.
df.shape

In [None]:
# Number of rows for each distinct area_type value.
area_type_groups = df.groupby('area_type')['area_type']
area_type_groups.count()

**Data Cleaning**

In [None]:
# Drop the columns that are not used in the rest of the analysis; `df` itself is unchanged.
unused_columns = ['area_type', 'availability', 'society', 'balcony']
df1 = df.drop(columns=unused_columns)
df1.head()

In [None]:
# Count missing values per column before deciding how to handle them.
df1.isnull().sum()

In [None]:
# Discard every row that has a missing value in any remaining column.
df2=df1.dropna()

In [None]:
# Confirm that no missing values remain after dropna().
df2.isnull().sum()

In [None]:
# (rows, columns) after removing rows with missing values.
df2.shape

In [None]:
# Inspect the distinct raw values of the 'size' column before parsing a number out of it.
df2['size'].unique()

In [None]:
# Derive the integer bedroom count `bhk` from the leading token of `size`
# (the text before the first space, converted with int()).
# `assign` rebinds df2 to a new frame instead of writing a column into the
# result of `df1.dropna()`, which avoids pandas' SettingWithCopyWarning and
# keeps this cell safe to re-run.
df2 = df2.assign(bhk=df2['size'].apply(lambda x: int(x.split(' ')[0])))

In [None]:
# Preview the frame with the new `bhk` column.
df2.head()

In [None]:
# Distinct bedroom counts, to spot implausible values.
df2['bhk'].unique()

In [None]:
# Rows reporting more than 20 bedrooms.
# NOTE(review): presumably these are inspected as likely data errors; confirm.
df2[df2.bhk>20]

In [None]:
# Inspect the distinct raw total_sqft values; is_float / convert_sqft_to_num below
# deal with the ones that float() cannot parse directly.
df2.total_sqft.unique()

In [None]:
def is_float(x):
    """Return True if ``x`` can be converted with ``float()``, otherwise False.

    Parameters
    ----------
    x : any
        Value to test (typically a raw ``total_sqft`` cell).

    Returns
    -------
    bool
        False when ``float(x)`` raises TypeError, ValueError or OverflowError.
    """
    try:
        float(x)
    # Catch only conversion failures; a bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
# Show up to 10 rows whose total_sqft is NOT directly convertible by float() (per is_float).
df2[~df2['total_sqft'].apply(is_float)].head(10)

In [None]:
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` value to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``'1200'``, ``1200``) or a range written as
        ``'low - high'``.

    Returns
    -------
    float or None
        The number itself, the midpoint of a ``'low - high'`` range, or
        ``None`` when the value cannot be interpreted either way
        (for example ``'34.46Sq. Meter'``).
    """
    # Try the plain-number case first so values containing a single '-' that are
    # valid numbers ('-5', '1e-5') are not mistaken for ranges.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        pass

    # Only strings can be ranges; anything else is unparseable at this point.
    if not isinstance(x, str):
        return None

    tokens = x.split('-')
    if len(tokens) != 2:
        return None
    try:
        return (float(tokens[0]) + float(tokens[1])) / 2
    except ValueError:
        # Two tokens, but at least one of them is not numeric.
        return None

In [None]:
# Spot-check the range branch: the midpoint of 3090 and 5002 is 4046.0.
convert_sqft_to_num('3090 - 5002')

In [None]:
# Build df3 from df2 with total_sqft converted to numbers; df2 is left untouched.
# Values convert_sqft_to_num cannot parse come back as None.
numeric_sqft = df2['total_sqft'].apply(convert_sqft_to_num)
df3 = df2.assign(total_sqft=numeric_sqft)

In [None]:
# Preview the frame with numeric total_sqft.
df3.head()

In [None]:
# Inspect the row whose index label is 30 (label-based, not positional).
# NOTE(review): presumably picked because its raw total_sqft was a range; confirm.
df3.loc[30]

**Feature Engineering**

In [None]:
# Add price_per_sqft = price * 100000 / total_sqft on a new frame; df3 is left untouched.
# NOTE(review): the 100000 factor presumably converts a price quoted in lakhs
# to rupees; confirm against the dataset description.
df4 = df3.assign(price_per_sqft=(df3['price'] * 100000) / df3['total_sqft'])
df4.head()

In [None]:
# Number of distinct location values before any normalisation.
len(df4.location.unique())

In [None]:
# Trim leading/trailing whitespace so the same place is not counted under two spellings.
df4['location'] = df4['location'].map(lambda name: name.strip())

In [None]:
# Rows per location, most frequent first.
location_stats = (
    df4.groupby('location')['location']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Display the per-location row counts (sorted descending).
location_stats

In [None]:
# How many locations have 10 or fewer rows.
len(location_stats[location_stats<=10])

In [None]:
# Locations with 10 or fewer rows. Despite the "less_than_10" name the filter
# is <= 10, so locations with exactly 10 rows are included.
location_stats_less_than_10= location_stats[location_stats<=10]
location_stats_less_than_10

In [None]:
# Distinct locations before the rare ones are merged into 'other'.
len(df4.location.unique())

In [None]:
# Replace every rare location with the single label 'other'.
# `name in <Series>` tests membership in the Series *index*, which here holds
# the location names of location_stats_less_than_10.
df4['location'] = df4['location'].apply(
    lambda name: name if name not in location_stats_less_than_10 else 'other'
)
len(df4['location'].unique())

In [None]:
# Distinct locations after bucketing (same value as displayed by the previous cell).
len(df4.location.unique())

In [None]:
# Preview the first 10 rows after location clean-up.
df4.head(10)

**Outlier Removal**

In [None]:
# Rows with under 300 sqft per bedroom; the same condition is used further down to filter df4.
df4[df4.total_sqft/df4.bhk<300].head()

In [None]:
# Preview the first 10 rows of df4 (same expression as the earlier preview cell).
df4.head(10)

**Outlier Removal**

In [None]:
# Rows with under 300 sqft per bedroom (same expression as the earlier inspection cell).
df4[df4.total_sqft/df4.bhk<300].head()

In [None]:
# (rows, columns) before the sqft-per-bedroom filter.
df4.shape

In [None]:
# Keep every row that is NOT flagged as having under 300 sqft per bedroom.
# Because the test is a negated `<`, rows whose ratio is NaN (e.g. a total_sqft
# that convert_sqft_to_num could not parse) compare False and are therefore kept.
df5=df4[~(df4.total_sqft/df4.bhk<300)]
df5.shape

In [None]:
# Summary statistics of price_per_sqft, to see how extreme the tails are.
df5.price_per_sqft.describe()

In [None]:
def remove_pps_outliers(df):
    """Keep, per location, only rows whose price_per_sqft is within one std of the mean.

    For each ``location`` group the mean ``m`` and standard deviation ``st``
    (``np.std``) of ``price_per_sqft`` are computed, and a row is kept when
    ``m - st < price_per_sqft <= m + st``. Rows with a NaN ``price_per_sqft``
    fail both comparisons and are dropped. A group whose std is 0 is dropped
    entirely because the lower bound is strict.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        Surviving rows grouped by location (in group-key order) with a fresh
        0..n-1 index. For an input with no groups, an empty frame with the
        same columns as ``df`` is returned.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        within_band = (subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))
        kept.append(subdf[within_band])

    # pd.concat raises on an empty list, so handle the no-groups case explicitly.
    if not kept:
        return df.iloc[:0].reset_index(drop=True)

    # Concatenate once instead of growing a DataFrame inside the loop, which
    # re-copies all accumulated rows on every iteration.
    return pd.concat(kept, ignore_index=True)

In [None]:
# Apply the per-location price_per_sqft filter and check how many rows remain.
df6=remove_pps_outliers(df5)
df6.shape

In [None]:
def plot_scatter_chart(df,location):
    """Scatter price_per_sqft against total_sqft for 2- and 3-bedroom rows of one location.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk``, ``total_sqft`` and ``price_per_sqft``.
    location : str
        Location value to filter on; also used as the chart title.

    Draws on the current matplotlib figure and sets the global default figure
    size to (15, 10). Returns None.
    """
    # The notebook-level `plot` name is the pyplot.plot *function*
    # (`from matplotlib.pyplot import plot`), which has no .scatter/.xlabel/...
    # attributes, so use the pyplot module itself here.
    import matplotlib.pyplot as plt

    in_location = df[df.location == location]
    bhk2 = in_location[in_location.bhk == 2]
    bhk3 = in_location[in_location.bhk == 3]

    plt.rcParams['figure.figsize'] = (15, 10)
    plt.scatter(bhk2.total_sqft, bhk2.price_per_sqft, color='green', label='2 bhk', s=50)
    plt.scatter(bhk3.total_sqft, bhk3.price_per_sqft, color='blue', label='3 bhk', s=50, marker='+')
    plt.xlabel('Total Square Feet Area')
    plt.ylabel('Price Per Square Feet')
    # Title the chart with the location actually plotted, not the literal word 'Location'.
    plt.title(location)
    plt.legend()

In [None]:
# Compare 2- and 3-bedroom price_per_sqft against total_sqft for one example location.
plot_scatter_chart(df6,'Murugeshpalya')