In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path of the Bengaluru house-price CSV inside the Kaggle input directory
data = "/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv"
# Read the CSV into `df`, the raw frame that the cleaning cells start from
df = pd.read_csv(data) 
# Display the first rows as a quick check of the load
df.head() 

In [None]:
df.shape

In [None]:
#lets see distribution of data
df.describe()


In [None]:
#check data types
df.info() 

# Data cleaning

In [None]:
df.isna().sum()

In [None]:
# The "society" column has too many missing values to be useful, so remove it
df = df.drop(columns=['society'])

In [None]:
df.shape

In [None]:
df.dropna(inplace=True) 

In [None]:
#count the values of size column
df["size"].value_counts()

In [None]:
# Keep only the leading number of "size" (e.g. "2 BHK" -> "2", "4 Bedroom" -> "4")
# so that the column can be converted to an integer later on.
df1 = df.copy()
# str.split(..., expand=True) returns a two-column frame, which cannot be
# assigned to the single "size" column; take the first token of each split.
df1['size'] = df1['size'].str.split(' ', n=1).str[0]

In [None]:
df1.head()

In [None]:
df1['size'].value_counts() 

In [None]:
df1['Bhk']= df1['size'].copy()


In [None]:
df1.drop('size', axis=1, inplace=True) 

In [None]:
#convert dtype of BHk from string to integer
df1.Bhk= df1['Bhk'].astype(int) 

In [None]:
df['total_sqft'].unique() 

In [None]:
#Find range value
def is_float(x):
    """Return True if ``x`` can be converted with ``float()``, else False.

    Used to find ``total_sqft`` entries that are not plain numbers.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a float"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being hidden.
        return False
    return True

df1[~df1['total_sqft'].apply(is_float)].head() 

In [None]:
#convert range  into single value
def convert_range_to_num(x):
    """Convert a ``total_sqft`` entry to a single float.

    * A range written as "low - high" becomes the midpoint of the two bounds.
    * A plain number (string or numeric) becomes ``float(x)``.
    * Anything that cannot be parsed returns ``None``.
    """
    if isinstance(x, str):
        parts = x.split('-')
        if len(parts) == 2:
            try:
                return (float(parts[0]) + float(parts[1])) / 2
            except ValueError:
                # Contains one '-' but is not a numeric range (e.g. "-5" or
                # "1e-3"); fall through to the plain conversion below.
                pass

    try:
        return float(x)
    except (TypeError, ValueError):
        return None

In [None]:
df2 = df1.copy()
df2['total_sqft']= df2['total_sqft'].apply(convert_range_to_num) 

In [None]:
# drop availability column
df2.drop('availability', axis=1, inplace=True) 

In [None]:
df2.head()

# Data exploration

In [None]:
df.area_type.value_counts() 

In [None]:
# import model required for plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
plt.bar(df2.area_type,df2.price)

In [None]:
df2.area_type.value_counts().values

In [None]:
# Share of each area_type in the data, shown as a pie chart
area_type_counts = df['area_type'].value_counts()
plt.pie(
    area_type_counts.values,
    labels=area_type_counts.index,
    explode=(0.1, 0, 0, 0.1),
    shadow=True,
    autopct='%1.1f%%',
)

plt.title("Area___Type")

In [None]:
#for area type we will use label encoder to turn it into numerical form

from sklearn import preprocessing 
le = preprocessing.LabelEncoder()
df2['area_type'] = le.fit_transform(df2['area_type']) 

In [None]:
df2['area_type'].unique()

In [None]:
df2.columns


In [None]:
df['location'].value_counts()

In [None]:
# Trim surrounding whitespace from the location names, then count the
# number of rows per location and show the counts from largest to smallest.
df2.location = df2.location.str.strip()
location_stats = df2.groupby('location')['location'].count()
location_stats.sort_values(ascending=False)

***Locations with 15 or fewer listings will be grouped under "other".***

In [None]:
len(location_stats[location_stats<=15]) 

In [None]:
location_less_than_15 = location_stats[location_stats<=15]
location_less_than_15

In [None]:
# Replace every location that appears in location_less_than_15 with 'other'
rare_locations = location_less_than_15.index
df2.location = df2.location.mask(df2.location.isin(rare_locations), 'other')
df2.location.value_counts()

In [None]:
df2.head()

In [None]:
df2.describe().T

In [None]:
df2.isnull().sum()

In [None]:
# Drop the rows of df2 that still contain missing values (the nulls listed by
# the previous cell). The original call targeted `df`, which had already been
# cleaned earlier, so it left df2 unchanged.
df2 = df2.dropna()

## Remove outliers

In [None]:
sns.boxplot(x= 'price', data = df2) 

In [None]:
sns.displot(df2.price);

In [None]:
# define a function to remove outliers
def outlier_remover(df, x, factor=1.5):
    """Return ``df`` without the rows whose ``x`` value is an IQR outlier.

    A row is kept when its value lies strictly between
    ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``. Rows whose ``x`` is NaN
    fail both comparisons and are therefore dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    x : str
        Name of the numeric column to filter on.
    factor : float, default 1.5
        Multiple of the IQR used as the whisker length.

    Returns
    -------
    pandas.DataFrame
        The filtered rows of ``df``.
    """
    # Series.quantile skips NaN, whereas np.percentile returns NaN as soon as
    # one value is missing, which would turn both bounds into NaN and drop
    # every row.
    q25, q75 = df[x].quantile(0.25), df[x].quantile(0.75)
    whisker = factor * (q75 - q25)
    lower, upper = q25 - whisker, q75 + whisker
    filtered = df[(df[x] > lower) & (df[x] < upper)]

    print('Outliers of "{}" are removed\n'.format(x))
    return filtered
  
             
  

In [None]:
df3 = df2.copy()
df3 = outlier_remover(df3, 'price') 

In [None]:
df3.shape

In [None]:
sns.displot(df3.price) 

In [None]:
# Pass the column by keyword: the first positional argument of sns.boxplot is
# `data`, so a positional 'price' clashes with data=df3.
sns.boxplot(x='price', data=df3)

In [None]:
plt.scatter(df2.total_sqft,df2.price)
plt.xlabel('Total_square_feet_Area') 
plt.ylabel('Price of house') 
plt.title('Relationship between price and total square feet before removing outliers') 


In [None]:
plt.scatter(df3.total_sqft, df3.price) 
plt.xlabel('Total square feet area') 
plt.ylabel('Price of house') 
plt.title('Relationship between Price and total square feet after removing outliers') 



In [None]:
# check the distribution of data
df3.describe()



**Let's look at the correlation between the features.**

In [None]:
# Correlate the numeric columns only; df3 still holds the text column
# "location", which newer pandas versions refuse to correlate.
df3.select_dtypes(include='number').corr()

In [None]:
# Visualize the correlation matrix as a heatmap.
# Only numeric columns are used because df3 still holds the text column
# "location", which newer pandas versions refuse to correlate.
sns.heatmap(df3.select_dtypes(include='number').corr(), annot=True);

*According to the figure above, features such as balcony and area type have little effect on the house price, so it is better to remove them from the dataset.*

In [None]:
#drop area_type and balcony columns
df3.drop(['balcony','area_type'],axis=1,inplace=True) 


In [None]:
df3.head()

In [None]:
# get numerical value of location column using pd. get_dummies
dummies = pd.get_dummies(df3.location)


In [None]:
# merge dummies with df3
df4 = pd.concat([df3, dummies.drop('other',axis=1)],axis=1) 


In [None]:
df4.head()

In [None]:
df4.drop('location', axis=1, inplace=True) 

In [None]:
df4.head()

In [None]:
df4.isnull().sum()

In [None]:
# Forward-fill any remaining missing values; DataFrame.ffill() replaces the
# deprecated fillna(method='ffill') spelling and gives the same result.
df4 = df4.ffill()

# Model Building

In [None]:
#base model

from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()

In [None]:
# split the data into the feature columns (X) and the price column (y)

X = df4.drop('price', axis=1) 
y = df4.price

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25,random_state=10) 
lr_model.fit(X_train, y_train) 


In [None]:
print('train_score :', lr_model.score(X_train, y_train))
print('test_score :',lr_model.score(X_test, y_test) ) 

Here we can conclude that linear regression is not a good model for this data.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fix the seed (same value as the train/test split) so that the forest, and
# the scores reported below, are reproducible on a fresh run.
Rdrm = RandomForestRegressor(random_state=10)
Rdrm.fit(X_train, y_train)


In [None]:
print('test_score',Rdrm.score(X_test, y_test) ) 
print("train_score", Rdrm.score(X_train, y_train) ) 

      

Random Forest gives a considerably better result than linear regression, but it is still not good enough.

In [None]:
#try hyperparameter tuning concept using GridSearchCV
from sklearn.model_selection import GridSearchCV
gd = GridSearchCV(Rdrm, {
    'n_estimators':[ 15,17,18,19,23]
    },cv=5, return_train_score=False) 

gd.fit(X_train, y_train) 

In [None]:
print(gd.best_score_) 
gd.best_params_


In [None]:
#check out for better modelling
# Fit an XGBoost regressor with default hyperparameters on the same
# X_train / y_train split used for the models above, then score it on both
# the training and the test portion.
from xgboost import XGBRegressor
X_model = XGBRegressor() 
X_model.fit(X_train, y_train) 
# NOTE(review): for a regressor, .score() presumably returns R^2 rather than
# a classification accuracy, so the "accuracy" labels are misleading - confirm.
print("Training_accuracy :", X_model.score(X_train, y_train))
print("Test_accuracy :", X_model.score(X_test,y_test)) 

      