In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [3]:
dataset = pd.read_csv("Bengaluru_House_Data.csv")

In [4]:
dataset.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


# DATA CLEANING

    * counting each values in the column

In [5]:
dataset['area_type'].value_counts()

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64

    * Drop some unwanted columns

In [6]:
dataset.drop(['area_type','availability','society','balcony'],axis=1,inplace=True)

In [7]:
dataset.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


    * Checking for NaN values

In [8]:
print(dataset.isnull().sum())

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64


#### If we want to fill the null value we can use the <span style="color: yellow;">_datsaet.fillna()_</span> method if we want to fill with the specific values such as mean, median the we can use <span style="color: yellow;">_dataset.fillna(dataset.mean())_</span> or <span style="color: yellow;">_dataset.fillna(dataset.meian())_</span> and other specific values such as 0 then <span style="color: yellow;">_dataset.fillna(0)_</span> and using forward and backward filling <span style="color: yellow;">_dataset.ffill()_</span> for forward filling and <span style="color: yellow;">_dataset.bfill()_</span> for backward filling or else we can simply drop the Null rows using <span style="color: yellow;">_dataset.dropna()_</span>

In [9]:
## remove the null value

df2 = dataset.dropna()

In [10]:
df2.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [11]:
df2.head(10)

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0
5,Whitefield,2 BHK,1170,2.0,38.0
6,Old Airport Road,4 BHK,2732,4.0,204.0
7,Rajaji Nagar,4 BHK,3300,4.0,600.0
8,Marathahalli,3 BHK,1310,3.0,63.25
9,Gandhi Bazar,6 Bedroom,1020,6.0,370.0


In [12]:
## make size column more convenient to read by creating new column name BHK 

##Lambda functions are often used in situations where a small, short-lived function is needed,
# and defining a full function using the def keyword might be overly verbose.
# They are commonly used with functions like map, filter, and apply in Python.

df2['BHK']=df2['size'].apply(lambda x: x.split(' ')[0]).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['BHK']=df2['size'].apply(lambda x: x.split(' ')[0]).astype(int)


In [13]:
## Find the unique values
df2['BHK'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18])

In [14]:
## Access the dataset which has the BHK value greater than 20
df2[df2.BHK>20]

Unnamed: 0,location,size,total_sqft,bath,price,BHK
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [15]:
df2['total_sqft'].unique() ## There are ranges in the total_sqft columns

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [16]:
def is_float(X):
    try:
        float(X)
        #float(df2.iloc[1,2])
    except:
        return False
        #float(df2.iloc[30,2])
    return True



df2[~df2['total_sqft'].apply(is_float)].head(10)


Unnamed: 0,location,size,total_sqft,bath,price,BHK
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [17]:
## Handle the ranges and NaN values

def convert_range_to_num(X):
    tokens = X.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(X)
    except:
        return None
    
df2['total_sqft'] = df2['total_sqft'].apply(convert_range_to_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['total_sqft'] = df2['total_sqft'].apply(convert_range_to_num)


In [18]:
df2.isnull().sum()

location       0
size           0
total_sqft    46
bath           0
price          0
BHK            0
dtype: int64

In [19]:
## drop NaN values
df3=df2.dropna()

In [20]:
df3.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
BHK           0
dtype: int64

# FEATURE ENGINEERING

#### Feature engineering is the process of creating new features or modifying existing ones in a dataset to improve the performance of a machine learning model. It involves transforming raw data into a format that better represents the underlying problem and enhances the model's ability to make accurate predictions.

In [21]:
df3.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [28]:
## Creating new column name price_per_sqft to store the price

df3['price_per_sqft'] = (df3['price'] * 100000 )/ df3['total_sqft']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['price_per_sqft'] = (df3['price'] * 100000 )/ df3['total_sqft']


In [31]:
## Location column is actully a categorical column so if we have so many category it will be a problem

print(f"Total categories in the location column : {len(df3.location.unique())}")

Total categories in the location column : 1298


In [43]:
## Get the counts of location rows per location

df3.location = df3.location.apply(lambda x: x.strip())
location_stats = df3.groupby('location')['location'].agg('count').sort_values(ascending = False)
location_stats

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.location = df3.location.apply(lambda x: x.strip())


location
Whitefield               533
Sarjapur  Road           392
Electronic City          304
Kanakpura Road           264
Thanisandra              235
                        ... 
1 Giri Nagar               1
Kanakapura Road,           1
Kanakapura main  Road      1
Kannur                     1
whitefiled                 1
Name: location, Length: 1287, dtype: int64

#### I am going to create a one location call other location and assign all the locations which has less than 10 rows per location

In [46]:
len(location_stats[location_stats<10])
#location_less_than_10_data = df3.location

1033

In [47]:
location_with_lessthan_10_raws = location_stats[location_stats<10]

In [48]:
df3.location = df3.location.apply(lambda x: 'other' if x in location_with_lessthan_10_raws else x) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.location = df3.location.apply(lambda x: 'other' if x in location_with_lessthan_10_raws else x)


In [50]:
df3

Unnamed: 0,location,size,total_sqft,bath,price,BHK,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689.834926
13316,other,4 BHK,3600.0,5.0,400.00,4,11111.111111
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258.545136
13318,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407.336319


In [52]:
print(f"After assign location to others in the location column, unique category is : {len(df3.location.unique())}")

After assign location to others in the location column, unique category is : 255
