In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df = pd.read_csv("/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv")

In [None]:
df.shape

In [None]:
df.head()

* Looking at columns like **availability** and **size**, we might have to perform some data cleaning in order to explore the data properly.

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df.area_type.unique()

In [None]:
df.area_type.value_counts()

* **area_type** has 4 standard values

In [None]:
df.availability.unique()

* The date-like values most probably refer to future availability dates. Let's relabel them as 'In Future'.

In [None]:
df1 = df.copy()

In [None]:
# Keep the two explicit labels as they are; every other value becomes 'In Future'.
kept_labels = ['Ready To Move', 'Immediate Possession']
df1["availability"] = df1["availability"].where(
    df1["availability"].isin(kept_labels), 'In Future'
)

In [None]:
df1.availability.unique()

In [None]:
df1.availability.value_counts()

In [None]:
df1["size"].unique()

In [None]:
# Inspect the working copy (df1), like the surrounding cells, rather than the raw frame.
df1["size"].value_counts()

In [None]:
df1[df1["size"].isnull()]

* The records with a null **size** are Plot Area type properties, which might not have any construction yet. Let's impute the value 0, as there is no BHK value for them.

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form warns in pandas 2.x and does not
# update df1 at all once Copy-on-Write is enabled.
df1["size"] = df1["size"].fillna("0")

In [None]:
# Split "size" on the space: the first token becomes Room, the second Room_Type.
# Rows whose size has no space (the "0" filler) get a missing Room_Type.
size_parts = df1["size"].str.split(" ", expand=True)
df1["Room"], df1["Room_Type"] = size_parts[0], size_parts[1]

df1.head()

In [None]:
len(df1.society.unique())

* There are so many unique values in the column **society**. Looking at them, they don't seem to provide any meaningful information either. Let's drop the column from the dataset for simplicity.

In [None]:
df2 = df1.copy()

In [None]:
# Build df2 from df1 without the "society" column. Deriving it from df1 (instead
# of dropping in place on df2) keeps this cell idempotent: re-running it no longer
# raises KeyError because the column was already removed.
df2 = df1.drop(columns="society")

In [None]:
len(df2.location.unique())

* Similar to **society**, we have many unique values in **location** as well. However, unlike **society**, we know that location is an important feature in deciding the price of a real-estate property.

* Let's explore further how we can keep the maximum information from this column.

In [None]:
# Number of records per location, most frequent first.
location_stat = (
    df2.groupby("location")["location"]
    .count()
    .sort_values(ascending=False)
)
# Show the locations that occur fewer than 10 times.
location_stat[location_stat < 10]

In [None]:
location_stat_less_than_10 = location_stat[location_stat.values<10]

In [None]:
location_stat_less_than_10.index

In [None]:
# Collapse every location listed in location_stat_less_than_10 into one 'Rare' label.
rare_locations = location_stat_less_than_10.index
df2["location"] = df2["location"].mask(df2["location"].isin(rare_locations), 'Rare')

In [None]:
df2.head()

In [None]:
df2.info()

**total_sqft** has an object data type, and we also saw above that for some Plot Area properties this column seems to hold a range value.

In [None]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only conversion failures are treated as "not a float". Unrelated exceptions
    such as KeyboardInterrupt are allowed to propagate instead of being
    swallowed by a bare ``except``.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparseable text;
        # OverflowError: integer too large to represent as a float.
        return False
    return True

In [None]:
df2[~df2["total_sqft"].apply(is_float)].head(10)

In [None]:
def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a float.

    * A string of the form ``"low - high"`` (exactly one ``-``) is converted to
      the midpoint of the two numbers.
    * Anything else is passed to ``float``.

    Returns:
        The numeric value, or ``None`` when the entry cannot be parsed
        (for example ``"34.46Sq. Meter"`` or a missing value of ``None``).
    """
    if isinstance(x, str):
        tokens = x.split("-")
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. "abc - def" or "-5");
                # fall through to the plain conversion below.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None

In [None]:
convert_sqft_to_num('2166')

In [None]:
convert_sqft_to_num('2100 - 2850')

In [None]:
convert_sqft_to_num('34.46Sq. Meter')

In [None]:
df3 = df2.copy()

In [None]:
df3["total_sqft"] = df3["total_sqft"].apply(convert_sqft_to_num)
df3.head()

In [None]:
df3.loc[30]

In [None]:
df3.head()

In [None]:
df3.info()

In [None]:
df4 = df3.copy()

* We can drop **size** column as we have already created 2 features out of it keeping all the information from it

In [None]:
df4[df4["Room_Type"].isnull()]

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form warns in pandas 2.x and does not
# update df4 at all once Copy-on-Write is enabled.
df4["Room_Type"] = df4["Room_Type"].fillna("NA")

In [None]:
# Remove the original "size" column from df4.
df4 = df4.drop(columns=["size"])

# Exploratory Data Analysis

* We are now done with the data cleaning part.
* Primarily we'll check for 2 parts - 1. How are the values distributed for each independent feature and 2. How does the target variable **price** change with the independent feature

In [None]:
df5 = df4.copy()
df5.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Name the column by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positionally passed Series is no longer interpreted as `x`.
sns.countplot(data=df5, x="area_type")
plt.show()

* The majority of the properties are of Super built-up Area type

In [None]:
temp = df5.groupby("area_type")["price"].median()
sns.barplot(x=temp.index, y=temp.values)
plt.show()

* The median price for Plot Area type properties is almost double that of the other categories.

In [None]:
# Name the column by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positionally passed Series is no longer interpreted as `x`.
sns.countplot(data=df5, x="availability")
plt.show()

* Most of the properties are Ready to Move.
* A few properties will be available in the near future.
* There are very few properties which are available for immediate possession as well.

In [None]:
temp = df5.groupby("availability")["price"].median()
sns.barplot(x=temp.index, y=temp.values)
plt.show()

* The immediate possession properties have a lower median price, which suggests that these properties might be quite old and hence may not be attracting many buyers.
* The other categories denote recently developed properties and hence have higher median prices.

In [None]:
plt.hist(df5["total_sqft"], bins=50)
plt.show()

* The feature **total_sqft** is right skewed.
* Most of the properties have areas less than 5000 sq ft. There are some exceptions, though, as clearly indicated by the long tail of the histogram.

In [None]:
# Name the column by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positionally passed Series is no longer interpreted as `x`.
sns.boxplot(data=df5, x="total_sqft")
plt.show()

In [None]:
sns.scatterplot(x="total_sqft", y="price", data=df5)
plt.show()

* As expected, the price of the property increases with the total area. There are some outliers visible above 30000 sq ft

In [None]:
sns.scatterplot(x="bath", y="price", data=df5)
plt.show()

* **bath** column doesn't show any significant direct impact on the price

In [None]:
# Name the column by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positionally passed Series is no longer interpreted as `x`.
sns.boxplot(data=df5, x="bath")
plt.show()

* There are outliers in the **bath** column. Most importantly, notice the 2 extreme values above 25.

In [None]:
# Wide figure so that every bathroom count gets a readable bar.
plt.figure(figsize=(18,7))
# Name the column by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positionally passed Series is no longer interpreted as `x`.
sns.countplot(data=df5, x="bath")
plt.show()

* Most of the properties have bathrooms less than 8

In [None]:
sns.scatterplot(x="balcony", y="price", data=df5)
plt.show()

In [None]:
# Name the column by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positionally passed Series is no longer interpreted as `x`.
sns.countplot(data=df5, x="balcony")
plt.show()

In [None]:
temp = df5.groupby("balcony")["price"].median()
sns.barplot(x=temp.index, y=temp.values)
plt.show()

* The median price of properties with 1 balcony is lower than that of properties with no balcony.
* Based on this, we might say that 1-balcony properties are low in demand, i.e., prices are higher for properties with no balcony or with more than 1 balcony.

In [None]:
# Name the column by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positionally passed Series is no longer interpreted as `x`.
sns.countplot(data=df5, x="Room")
plt.show()

* Majority of the records are with 2-4 rooms

In [None]:
# Median price per room count, drawn on a wide figure.
plt.figure(figsize=(18,7))
median_price_by_room = df5.groupby("Room")["price"].median()
# Cast the group labels to float before plotting them on the x axis.
median_price_by_room.index = median_price_by_room.index.astype("float")
sns.barplot(x=median_price_by_room.index, y=median_price_by_room.values)
plt.show()

* Primarily, the median price increases as the number of rooms increases.
* There are some variations when the number of rooms goes above 10.

In [None]:
# Name the column by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positionally passed Series is no longer interpreted as `x`.
sns.countplot(data=df5, x="Room_Type")
plt.show()

In [None]:
temp = df5.groupby("Room_Type")["price"].median()
sns.barplot(x=temp.index, y=temp.values)
plt.show()

* Bedroom-type properties are fewer than BHK ones, yet they have a higher median price than BHK.
* We might want to check more about the entries with Bedroom.

In [None]:
df6 = df5.copy()
df6.head()

In [None]:
df6["price_per_sqft"] = df6["price"]/df6["total_sqft"]

In [None]:
#"area_type","availability","bath","balcony","Room"
temp = df6.groupby("area_type")["price_per_sqft"].median()
sns.barplot(x=temp.index, y=temp.values)
plt.show()

* We saw earlier that the median prices for Built-up Area, Carpet Area and Super built-up Area were almost the same; however, the price per square foot shows some differences among them.

In [None]:
temp = df6.groupby("availability")["price_per_sqft"].median()
sns.barplot(x=temp.index, y=temp.values)
plt.show()

In [None]:
# Median price per square foot per room count, drawn on a wide figure.
plt.figure(figsize=(18,7))
median_ppsf_by_room = df6.groupby("Room")["price_per_sqft"].median()
# Cast the group labels to float before plotting them on the x axis.
median_ppsf_by_room.index = median_ppsf_by_room.index.astype("float")
sns.barplot(x=median_ppsf_by_room.index, y=median_ppsf_by_room.values)
plt.show()

* The price per sq ft shows a linear relation with the number of rooms; however, as seen with the price as well, irregularities appear when the number of rooms goes beyond 10.