In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# loading the input data 
df = pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')
df.shape

In [None]:
# take a look at few records of dataframe
df.head()

In [None]:
# Discard the columns that are not used in the rest of the analysis.
unused_columns = ['availability', 'society', 'area_type']
df1 = df.drop(columns=unused_columns)
df1

In [None]:
# 'BHK' and 'Bedroom' in the size column are synonyms, so only the leading
# count is kept, in a new 'bhk' column.
# Some records report very large counts (e.g. 43 bedrooms); those should be
# checked for validity.


def _leading_token(size_value):
    """Return the text before the first space of ``str(size_value)``."""
    return str(size_value).split(' ')[0]


df1['bhk'] = df['size'].apply(_leading_token)
df1.dtypes

In [None]:
# Count the distinct values in the location column.
location_values = df1['location'].unique()
print('Number of unique values in location column : ', len(location_values))

In [None]:
# The location column has too many distinct values to one-hot encode directly
# (each one would become its own column), so sparsely represented locations
# are merged into a single 'other' label in the following cells.
# Start by counting the records per location, most frequent first.
records_per_location = df1.groupby('location')['location'].count()
locations = records_per_location.sort_values(ascending=False)
locations

In [None]:
locations_less_than_equal_10_records = locations[(locations<=10)]
locations_less_than_equal_10_records

In [None]:
# Relabel every sparsely represented location as 'other'; all remaining
# values (including missing ones) are kept unchanged.
rare_location_names = locations_less_than_equal_10_records.index
is_rare_location = df['location'].isin(rare_location_names)
df1['location'] = df['location'].mask(is_rare_location, 'other')
len(df1['location'].unique())

# now we have only 243 unique locations

In [None]:
df1.head()

In [None]:
## finding how many values are null or na
df1.isna().sum()

In [None]:
# Keep only the rows that have a 'size' value. The explicit copy makes df2 an
# independent frame, so the column assignments made on df2 in later cells do
# not trigger pandas' SettingWithCopyWarning.
df2 = df1[df1['size'].notna()].copy()
df2.isna().sum()

In [None]:
import math
bathroom_mean = df2['bath'].mean()
bathroom_mean_floor_val = math.floor(bathroom_mean)
bathroom_mean_floor_val

In [None]:
# Fill missing bathroom counts with the floored column mean
# (bathroom_mean_floor_val). assign() binds a new frame to df2 instead of
# writing into a filtered slice, which avoids pandas' SettingWithCopyWarning.
# The former first line of this cell only built a frame that was never shown
# (only a cell's last expression is displayed), so it has been removed.
df2 = df2.assign(bath=df2['bath'].fillna(bathroom_mean_floor_val))


In [None]:
df2.isna().sum()

In [None]:
df2[df2['balcony'].isna()]

In [None]:
balcony_mean = df2['balcony'].mean() # taking mean of balconies
balcony_mean
balcony_mean_floor = math.floor(balcony_mean)
balcony_mean_floor

In [None]:
# Fill missing balcony counts with balcony_mean_floor, i.e. the column mean
# rounded DOWN by math.floor (the original note records a mean of about 1.58).
# assign() binds a new frame to df2 instead of writing into a filtered slice,
# which avoids pandas' SettingWithCopyWarning.
df2 = df2.assign(balcony=df2['balcony'].fillna(balcony_mean_floor))

In [None]:
df2.isna().sum()

In [None]:
df2[df2['location'].isna()]

In [None]:
df3 = df2[df2['location'].notna()]
df3.isna().sum()

In [None]:

df4 = df3.drop(['size'],axis = 1)
print(df4)
print(df4.dtypes)

In [None]:
df4['bhk'].unique()
# Convert the 'bhk' strings to numbers; entries that cannot be parsed become NaN.
bhk_as_number = pd.to_numeric(df4['bhk'], errors='coerce')
df4['bhk'] = bhk_as_number
#df4.total_sqft = pd.to_numeric(df4.total_sqft, errors='coerce')


In [None]:
df4.dtypes

In [None]:
# helper function to find if arg passed is float or not
def is_float(x):
    """Return True if ``float(x)`` succeeds, otherwise False.

    Only the errors ``float()`` raises for input it cannot convert are
    treated as "not a float". Anything else (for example KeyboardInterrupt)
    propagates instead of being swallowed by a bare ``except``.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
# some values in the total_sqft column are not in the expected numeric format;
# some of them are given as a range
df4[~df4['total_sqft'].apply(is_float)].head(10)

In [None]:
# helper function to take avg of such cases
def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a float.

    Plain numbers and numeric strings are converted directly. A range written
    as ``"low - high"`` is replaced by the midpoint of its two bounds.
    Anything else yields None.

    This function never raises for unconvertible input: a non-string value no
    longer fails on ``.split`` and a malformed range no longer escapes the
    ``try`` block.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        pass  # not a plain number; try the range form below
    if not isinstance(x, str):
        return None
    tokens = x.split('-')
    if len(tokens) != 2:
        return None
    try:
        return (float(tokens[0]) + float(tokens[1])) / 2
    except ValueError:
        return None

In [None]:
# Convert total_sqft to numbers and keep only the rows where a value resulted.
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)
has_total_sqft = df4['total_sqft'].notnull()
df4 = df4[has_total_sqft]
print(df4.head(2))
print(df4.shape)

In [None]:
# Treat homes with less than 300 sqft per bedroom as outliers and drop them
# (300 sqft is taken as the average size of one bedroom). Rows whose ratio is
# NaN are kept, exactly as before.
# .copy() makes df5 an independent frame, so the z-score column added to it in
# a later cell does not trigger pandas' SettingWithCopyWarning.
sqft_per_bhk = df4['total_sqft'] / df4['bhk']
df5 = df4[~(sqft_per_bhk < 300)].copy()
df5.shape

In [None]:
df5[df5['bhk'] >= 10]

In [None]:
# # droping those rows where total_sqft is <= 2500 and bedroom is >= 10
# #df3 = df2[df2['location'].notna()]

# index_to_be_dropped = df4[ (df4['total_sqft'] <= 2500) & (df4['bhk'] >= 10)].index
  
# # drop these given row
# # indexes from dataFrame
# df4.drop(index_to_be_dropped, inplace = True)



In [None]:
# df4[df4['bhk'] >= 10]

In [None]:
# index_to_be_dropped_with_bed_bath_27 = df4[ (df4['bhk'] == 27) & (df4['bath'] >= 27)].index
# df4.drop(index_to_be_dropped_with_bed_bath_27, inplace = True)

In [None]:
# Check total_sqft for outliers using the absolute z-score.

from scipy import stats
import numpy as np

# |z| is the distance of each value from the column mean in standard deviations.
z = np.abs(stats.zscore(df5['total_sqft']))
# assign() binds a new frame to df5 instead of writing a column into a
# filtered slice, which avoids pandas' SettingWithCopyWarning.
df5 = df5.assign(total_sqft_zscore=z)

# Rows with df5['total_sqft_zscore'] > 3 are the outliers filtered out next.
print(df5[df5['total_sqft_zscore'] > 3])

print(df5.shape)

In [None]:
df6 = df5[df5['total_sqft_zscore']<=3]
df6.shape

In [None]:
# Absolute z-score of the bathroom count; the rows shown have |z| > 3.
z_bath = np.abs(stats.zscore(df6['bath']))
# assign() binds a new frame to df6 instead of writing a column into the
# filtered slice of df5, which avoids pandas' SettingWithCopyWarning.
df6 = df6.assign(bath_zscore=z_bath)
df6[df6['bath_zscore'] > 3]

In [None]:
df6.shape

In [None]:
df7 = df6[df6['bath_zscore']<=3]
df7.shape

In [None]:
# Absolute z-score of the balcony count; the rows shown have |z| > 3.
z_balcony = np.abs(stats.zscore(df7['balcony']))
# assign() binds a new frame to df7 instead of writing a column into the
# filtered slice of df6, which avoids pandas' SettingWithCopyWarning.
df7 = df7.assign(balcony_zscore=z_balcony)
df7[df7['balcony_zscore'] > 3]

# no outliers in balcony column

In [None]:
# Absolute z-score of the bedroom count; the rows shown have |z| > 3.
z_bhk = np.abs(stats.zscore(df7['bhk']))
# assign() binds a new frame to df7 instead of writing a column into a
# filtered slice, which avoids pandas' SettingWithCopyWarning.
df7 = df7.assign(bhk_zscore=z_bhk)
df7[df7['bhk_zscore'] > 3]

In [None]:
# take  only those records who are not outlier or whose zscore is less than equal to 3

df8 = df7[df7['bhk_zscore']<=3]
df8.head()

In [None]:
# Remove the helper z-score columns. drop() without inplace returns a new
# frame, so the filtered slice df8 came from is not mutated (no
# SettingWithCopyWarning). errors='ignore' lets the cell be re-run after the
# columns are already gone.
zscore_columns = ['total_sqft_zscore', 'bath_zscore', 'balcony_zscore', 'bhk_zscore']
df8 = df8.drop(columns=zscore_columns, errors='ignore')
df8.head()

In [None]:
# Check the price column for outliers the same way: the rows shown have |z| > 3.
z_price = np.abs(stats.zscore(df8['price']))
# assign() binds a new frame to df8 instead of writing a column into a
# filtered slice, which avoids pandas' SettingWithCopyWarning.
df8 = df8.assign(price_zscore=z_price)
df8[df8['price_zscore'] > 3]

In [None]:
df9 = df8[df8['price_zscore']<=3]
df9.shape

In [None]:

# Drop the helper price_zscore column. drop() without inplace returns a new
# frame, so the filtered slice df9 came from is not mutated (no
# SettingWithCopyWarning). errors='ignore' lets the cell be re-run safely.
df9 = df9.drop(columns=['price_zscore'], errors='ignore')
df9.head()

In [None]:
# encode location column using pd.get_dummies function

dummies = pd.get_dummies(df9.location)
dummies.head(3)

In [None]:
# Append the location dummy columns to the frame, leaving out the 'other' column.
location_dummies = dummies.drop(columns='other')
df10 = pd.concat([df9, location_dummies], axis='columns')
df10.shape
df10.head()

In [None]:
df11 = df10.drop(['location'],axis='columns')
df11.shape

In [None]:
# Flag the rows that still contain a missing value in any column.
is_NaN = df11.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df11[row_has_NaN]
# Drop every incomplete row, not only those missing total_sqft: the
# LinearRegression and Lasso models fitted below do not accept NaN inputs.
df12 = df11[~row_has_NaN]
df12.shape

In [None]:
y = df12['price']
y

In [None]:
df13=df12.drop(['price'],axis ='columns')
df13

In [None]:
X= df13
X

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)


In [None]:
from sklearn.linear_model import Lasso
# selection='random' updates a randomly chosen coefficient on each iteration,
# so a fixed random_state is needed for the fit and its score to be
# reproducible across runs.
lasso = Lasso(alpha=0.0001, selection='random', random_state=10)
lasso.fit(X_train, y_train)
lasso.score(X_test, y_test)

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    """Grid-search three regressors and summarise the best result of each.

    LinearRegression, Lasso and DecisionTreeRegressor are each tuned with
    GridSearchCV over a small parameter grid, using the same 5-split
    ShuffleSplit (30% held out per split, fixed seed).

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with the columns 'model', 'best_score' and
        'best_params'.
    """
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                # The 'normalize' option was removed from LinearRegression in
                # scikit-learn 1.2, and GridSearchCV raises on unknown
                # parameters, so 'fit_intercept' is tuned instead.
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            # Seeded so that selection='random' gives reproducible scores.
            'model': Lasso(random_state=0),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            # Seeded so that splitter='random' gives reproducible scores.
            'model': DecisionTreeRegressor(random_state=0),
            'params': {
                # 'mse' was renamed to 'squared_error'; the old spelling was
                # removed in scikit-learn 1.2.
                'criterion' : ['squared_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)