In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [36]:
data = pd.read_csv('https://raw.githubusercontent.com/sarthak230605/ML-projects/refs/heads/main/Bengaluru_House_Data.csv')

In [37]:
data.head()


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [38]:
data.shape

(13320, 9)

In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [40]:
for column in data.columns :
  print (data[column].value_counts())
  print("*"*20)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
16-Oct               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
location
Whitefield                         540
Sarjapur  Road                     399
Electronic City                    302
Kanakpura Road                     273
Thanisandra                        234
                                  ... 
3rd Stage Raja Rajeshwari Nagar      1
Chuchangatta Colony                  1
Electronic City Phase 1,             1
Chikbasavanapura                     1
Abshot Layout                        1
Name: count, Length: 1305, dtype: int64
********************
siz

### now we will check for null values


In [41]:
data.isna().sum()

Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,73
balcony,609
price,0


### Now we will drop the colums which has many null values, like society and balcony and area type and avalibilty are also useless for us

In [42]:
data.drop(columns=['area_type','availability','society','balcony'],inplace=True) #society and balcony doesn't afeect our prediction so it is of no use

In [43]:
data.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [44]:
data['location'].value_counts()

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Whitefield,540
Sarjapur Road,399
Electronic City,302
Kanakpura Road,273
Thanisandra,234
...,...
3rd Stage Raja Rajeshwari Nagar,1
Chuchangatta Colony,1
"Electronic City Phase 1,",1
Chikbasavanapura,1


In [45]:
data['size'].value_counts()

Unnamed: 0_level_0,count
size,Unnamed: 1_level_1
2 BHK,5199
3 BHK,4310
4 Bedroom,826
4 BHK,591
3 Bedroom,547
1 BHK,538
2 Bedroom,329
5 Bedroom,297
6 Bedroom,191
1 Bedroom,105


### filling null values

In [46]:
data['location'] = data['location'].fillna('Sarjapur road')  #because sarjapur is most occuring

In [47]:
data['size'] = data['size'].fillna('2 BHK')

In [48]:
data['bath'] = data['bath'].fillna(data['bath'].median())

In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


### Correcting bedroom vs bhk problem

In [50]:
data['bhk'] = data['size'].str.split().str.get(0).astype(int)

### finding outliers in data

In [51]:
data[data.bhk > 20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [52]:
data['total_sqft'].unique() #tell all unique values

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

### we will correct the range problem, we dont want range

In [53]:
# we will define a function
def convertRange(x):

   temp=x.split('-')
   if len(temp) == 2:
     return (float(temp[0]) + float(temp[1]))/2
   try:
     return float(x)
   except:
     return None



In [54]:
data['total_sqft']=data['total_sqft'].apply(convertRange)

In [55]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk
count,13274.0,13320.0,13320.0,13320.0
mean,1559.626694,2.688814,112.565627,2.802778
std,1238.405258,1.338754,148.971674,1.294496
min,1.0,1.0,8.0,1.0
25%,1100.0,2.0,50.0,2.0
50%,1276.0,2.0,72.0,3.0
75%,1680.0,3.0,120.0,3.0
max,52272.0,40.0,3600.0,43.0


In [56]:
data['location'].value_counts()

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Whitefield,540
Sarjapur Road,399
Electronic City,302
Kanakpura Road,273
Thanisandra,234
...,...
Mango Garden Layout,1
Milk Colony,1
"Basnashankari,6th stage,",1
Near ullas theater,1


In [57]:
data['loaction'] = data['location'].apply(lambda x: x.strip())  #to remove extra spaces from side of any word because, it will consider same location as diffrent if extra spaces are present

In [58]:
location_count= data['location'].value_counts()
print(location_count)

location
Whitefield                  540
Sarjapur  Road              399
Electronic City             302
Kanakpura Road              273
Thanisandra                 234
                           ... 
Mango Garden Layout           1
Milk Colony                   1
Basnashankari,6th stage,      1
Near ullas theater            1
N R Layout                    1
Name: count, Length: 1306, dtype: int64


In [59]:
location_count_less_10 = location_count[location_count <= 10]
location_count_less_10

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Dairy Circle,10
BTM 1st Stage,10
Ganga Nagar,10
Nagappa Reddy Layout,10
Dodsworth Layout,10
...,...
Mango Garden Layout,1
Milk Colony,1
"Basnashankari,6th stage,",1
Near ullas theater,1


In [60]:
data['location'] = data['location'].apply(lambda x: 'other' if x in location_count_less_10 else x) #name location which are 10 or less in number we name as other

In [61]:
data['location'].value_counts()

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
other,2901
Whitefield,540
Sarjapur Road,399
Electronic City,302
Kanakpura Road,273
...,...
Tindlu,11
Marsur,11
2nd Phase Judicial Layout,11
Thyagaraja Nagar,11


In [62]:
data.describe() # here it shows minimum sq feet is 1, which is like not possible

Unnamed: 0,total_sqft,bath,price,bhk
count,13274.0,13320.0,13320.0,13320.0
mean,1559.626694,2.688814,112.565627,2.802778
std,1238.405258,1.338754,148.971674,1.294496
min,1.0,1.0,8.0,1.0
25%,1100.0,2.0,50.0,2.0
50%,1276.0,2.0,72.0,3.0
75%,1680.0,3.0,120.0,3.0
max,52272.0,40.0,3600.0,43.0


In [63]:
(data['total_sqft'] / data['bhk']).describe() #after this we will our minmun threshhold to be 300

Unnamed: 0,0
count,13274.0
mean,575.074878
std,388.205175
min,0.25
25%,473.333333
50%,552.5
75%,625.0
max,26136.0


In [64]:
data = data[((data['total_sqft'] / data['bhk']) >= 300)]
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk
count,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838
std,1261.271296,1.077938,152.077329,0.976678
min,300.0,1.0,8.44,1.0
25%,1116.0,2.0,49.0,2.0
50%,1300.0,2.0,70.0,3.0
75%,1700.0,3.0,115.0,3.0
max,52272.0,16.0,3600.0,16.0


In [65]:
data.shape

(12530, 7)

In [66]:
data.price_per_sqft.describe()

AttributeError: 'DataFrame' object has no attribute 'price_per_sqft'