In [61]:
import numpy as np
import pandas as pd
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib
# Default size (width, height in inches) applied to every figure created afterwards.
matplotlib.rcParams["figure.figsize"]= (20, 10)

In [62]:
from sklearn.preprocessing import LabelEncoder

In [63]:
# Load the raw listings; the relative path is resolved against the working directory.
df = pd.read_csv(
    'bengaluru_house_prices.csv',
)

In [64]:
# Preview the first five rows of the loaded data.
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


##### Data Preprocessing

In [65]:
# Columns removed before any further preprocessing.
cols_to_drop = ['society', 'availability']

# errors='ignore' skips names that are already gone, so the cell can be re-run safely.
df.drop(columns=cols_to_drop, errors='ignore', inplace=True)

In [66]:
# Check that the dropped columns no longer appear in the frame.
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [67]:
# Inspect dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13320 non-null  object 
 1   location    13319 non-null  object 
 2   size        13304 non-null  object 
 3   total_sqft  13320 non-null  object 
 4   bath        13247 non-null  float64
 5   balcony     12711 non-null  float64
 6   price       13320 non-null  float64
dtypes: float64(3), object(4)
memory usage: 728.6+ KB


In [68]:
# Keep only rows in which every column has a value.
df = df.dropna(how='any')

In [69]:
# Parse the leading integer out of strings such as '2 BHK' or '10 Bedroom'.
# The whole first whitespace-separated token is converted: reading only the
# first character would silently turn '10 Bedroom' into 1.
df['bhk'] = df['size'].apply(lambda x: int(x.split()[0]))

In [70]:
# 'size' is now represented by the numeric 'bhk' column, so remove the text version.
df.drop(columns=['size'], inplace=True)

In [71]:
# List the distinct location values.
df['location'].unique()

array(['Electronic City Phase II', 'Chikka Tirupathi', 'Uttarahalli', ...,
       '12th cross srinivas nagar banshankari 3rd stage',
       'Havanur extension', 'Abshot Layout'], dtype=object)

In [72]:
# Re-check dtypes and non-null counts after dropping incomplete rows and adding 'bhk'.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12710 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   12710 non-null  object 
 1   location    12710 non-null  object 
 2   total_sqft  12710 non-null  object 
 3   bath        12710 non-null  float64
 4   balcony     12710 non-null  float64
 5   price       12710 non-null  float64
 6   bhk         12710 non-null  int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 794.4+ KB


In [73]:
def handle_ranges(x):
    """Convert a raw ``total_sqft`` entry to a single float.

    Parameters
    ----------
    x : str, int, float or None
        A plain number (``'1200'``), a range written as ``'low - high'``
        (``'2100 - 2850'``), an already-numeric value, or a missing value.

    Returns
    -------
    float or None
        The number itself, the midpoint of a two-sided range, or ``None``
        when the entry is missing or cannot be parsed (for example
        ``'34.46Sq. Meter'``).
    """
    if x is None:
        return None

    if not isinstance(x, str):
        # Already-numeric cells have no .split(); pass them through as floats.
        try:
            value = float(x)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that is not equal to itself.
        return None if value != value else value

    tokens = x.split('-')
    try:
        if len(tokens) == 2:
            # 'low - high' -> midpoint; float() tolerates the surrounding spaces.
            return (float(tokens[0]) + float(tokens[1])) / 2.0
        return float(x.strip())
    except ValueError:
        # Unit-suffixed or otherwise malformed entries are treated as missing.
        return None

# Normalise every raw 'total_sqft' entry to a float; entries that
# handle_ranges cannot parse come back as missing values.
df['total_sqft'] = df['total_sqft'].apply(
    handle_ranges
)

In [74]:
# Confirm the dtype of 'total_sqft' after conversion and see how many entries became missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12710 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   12710 non-null  object 
 1   location    12710 non-null  object 
 2   total_sqft  12668 non-null  float64
 3   bath        12710 non-null  float64
 4   balcony     12710 non-null  float64
 5   price       12710 non-null  float64
 6   bhk         12710 non-null  int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 794.4+ KB


In [75]:
# Discard rows that still contain a missing value in any column.
df = df.dropna(how='any')

##### Feature Engineering

In [76]:
# Scale 'price' by 100,000 before dividing by area.
# NOTE(review): presumably 'price' is quoted in lakhs (1 lakh = 100,000) — confirm against the data source.
scaled_price = df['price'] * 100000
df['price_per_sqft'] = scaled_price / df['total_sqft']

In [77]:
# Trim leading/trailing whitespace so the same place name is not counted as two locations.
df['location'] = df['location'].str.strip()

In [78]:
# Number of listings per location, most frequent first.
rows_per_location = df.groupby('location')['location'].agg('count')
location_stats = rows_per_location.sort_values(ascending=False)

In [79]:
# Locations with at most 10 listings (the comparison is inclusive, despite the variable name).
is_rare = location_stats <= 10
location_less_than_10 = location_stats[is_rare]
location_less_than_10

location
Nagappa Reddy Layout    10
Thyagaraja Nagar        10
Ganga Nagar             10
Naganathapura           10
Dairy Circle            10
                        ..
1 Giri Nagar             1
Kanakapura  Rod          1
Kanakapura Main Road     1
Kanakapura Road          1
whitefiled               1
Name: location, Length: 1013, dtype: int64

In [80]:
# Collapse every rare location into a single 'other' bucket.
# The rare names are the index labels of location_less_than_10.
in_rare_location = df['location'].isin(location_less_than_10.index)
df['location'] = df['location'].mask(in_rare_location, 'other')

In [81]:
# Preview the frame after adding 'price_per_sqft' and bucketing rare locations.
df.head()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Super built-up Area,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Plot Area,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615
2,Built-up Area,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556
3,Super built-up Area,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861
4,Super built-up Area,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0


##### Outlier Removal

In [82]:
# Keep only listings with more than 300 sqft per bedroom.
sqft_per_bhk = df['total_sqft'] / df['bhk']
df = df[sqft_per_bhk > 300]

In [83]:
# Remove the helper column so it does not end up in the exported dataset.
df.drop(columns=['price_per_sqft'], inplace=True)

In [84]:
# Names of the columns that still hold text (object dtype) and therefore need encoding.
text_only = df.select_dtypes(include=['object'])
object_columns = text_only.columns

In [85]:
# Show which columns are still object dtype.
object_columns

Index(['area_type', 'location'], dtype='object')

In [86]:
# Integer-encode each text column and record how every label maps to its code.
label_mappings = {}
for col in object_columns:
    le = LabelEncoder()

    # Store the codes under '<column>_le'; the original text column is dropped below.
    encoded = le.fit_transform(df[col])
    df[f'{col}_le'] = encoded

    # Transforming the fitted classes themselves yields the code assigned to each label.
    codes = le.transform(le.classes_)
    label_mappings[col] = dict(zip(le.classes_, codes))

    df.drop(columns=col, inplace=True)

In [87]:
# Display the label -> integer code lookup built for each encoded column.
label_mappings

{'area_type': {'Built-up  Area': 0,
  'Carpet  Area': 1,
  'Plot  Area': 2,
  'Super built-up  Area': 3},
 'location': {'1st Block Jayanagar': 0,
  '1st Phase JP Nagar': 1,
  '2nd Phase Judicial Layout': 2,
  '2nd Stage Nagarbhavi': 3,
  '5th Phase JP Nagar': 4,
  '6th Phase JP Nagar': 5,
  '7th Phase JP Nagar': 6,
  '8th Phase JP Nagar': 7,
  '9th Phase JP Nagar': 8,
  'AECS Layout': 9,
  'Abbigere': 10,
  'Akshaya Nagar': 11,
  'Ambalipura': 12,
  'Ambedkar Nagar': 13,
  'Amruthahalli': 14,
  'Anandapura': 15,
  'Ananth Nagar': 16,
  'Anekal': 17,
  'Anjanapura': 18,
  'Ardendale': 19,
  'Arekere': 20,
  'Attibele': 21,
  'BEML Layout': 22,
  'BTM 2nd Stage': 23,
  'BTM Layout': 24,
  'Babusapalaya': 25,
  'Badavala Nagar': 26,
  'Balagere': 27,
  'Banashankari': 28,
  'Banashankari Stage II': 29,
  'Banashankari Stage III': 30,
  'Banashankari Stage V': 31,
  'Banashankari Stage VI': 32,
  'Banaswadi': 33,
  'Banjara Layout': 34,
  'Bannerghatta': 35,
  'Bannerghatta Road': 36,
  'B

In [88]:
# Preview the frame after the text columns were replaced by their '_le' codes.
df.head()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,area_type_le,location_le
0,1056.0,2.0,1.0,39.07,2,3,77
1,2600.0,5.0,3.0,120.0,4,2,59
2,1440.0,2.0,3.0,62.0,3,0,220
3,1521.0,3.0,1.0,95.0,3,3,155
4,1200.0,2.0,1.0,51.0,2,3,147


In [89]:
# Write the processed frame to disk.
# NOTE(review): to_csv writes the row index by default, and the index is no longer
# contiguous after the dropna/filter steps; it will come back as an unnamed extra
# column on reload unless consumers pass index_col=0 — consider index=False.
df.to_csv('final_dataset.csv')