In [172]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)

In [173]:
df1 = pd.read_csv('bengaluru_house_prices.csv')
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [174]:
df1.shape

(13320, 9)

'Area Type' Column Values

In [175]:
df1.groupby('area_type')['area_type'].agg('count')

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

'Availability' Column Values

In [176]:
df1.groupby('availability')['availability'].agg('count')

availability
14-Jul                      1
14-Nov                      1
15-Aug                      1
15-Dec                      1
15-Jun                      1
                        ...  
22-Mar                      3
22-May                     10
22-Nov                      2
Immediate Possession       16
Ready To Move           10581
Name: availability, Length: 81, dtype: int64

'Society' Column Values

In [177]:
df1.groupby('society')['society'].agg('count')

society
3Codeli    2
7 ise P    1
A idse     2
A rtsai    1
ACersd     1
          ..
Zonce E    2
Zostaa     3
i1ncyRe    1
i1odsne    1
i1rtsCo    3
Name: society, Length: 2688, dtype: int64

Distribution of Null Values

In [178]:
df1.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

'Balcony' Column Values

In [179]:
df1.groupby('balcony')['balcony'].agg('count')

balcony
0.0    1029
1.0    4897
2.0    5113
3.0    1672
Name: balcony, dtype: int64

In [180]:
df1.groupby('area_type')['area_type'].agg('count')

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [181]:
df1.drop(['availability','society'], axis='columns', inplace=True)
df1.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [182]:
df2 = df1.dropna(subset=df1.columns.difference(['balcony']))
df2.isnull().sum()

area_type       0
location        0
size            0
total_sqft      0
bath            0
balcony       536
price           0
dtype: int64

In [183]:
df2['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [184]:
df2['bhk'] = df2['size'].apply(lambda x : int(x.split(' ')[0]))
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['bhk'] = df2['size'].apply(lambda x : int(x.split(' ')[0]))


Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


In [185]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [186]:
df2[~df2['total_sqft'].apply(is_float)].head(30)

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk
30,Super built-up Area,Yelahanka,4 BHK,2100 - 2850,4.0,0.0,186.0,4
122,Super built-up Area,Hebbal,4 BHK,3067 - 8156,4.0,0.0,477.0,4
137,Super built-up Area,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,0.0,54.005,2
165,Super built-up Area,Sarjapur,2 BHK,1145 - 1340,2.0,0.0,43.49,2
188,Super built-up Area,KR Puram,2 BHK,1015 - 1540,2.0,0.0,56.8,2
410,Super built-up Area,Kengeri,1 BHK,34.46Sq. Meter,1.0,0.0,18.5,1
549,Super built-up Area,Hennur Road,2 BHK,1195 - 1440,2.0,0.0,63.77,2
648,Built-up Area,Arekere,9 Bedroom,4125Perch,9.0,,265.0,9
661,Super built-up Area,Yelahanka,2 BHK,1120 - 1145,2.0,0.0,48.13,2
672,Built-up Area,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,0.0,445.0,4


In [187]:
import re

# Conversion function
def convert_to_sqft(value):
    try:
        # Check if the value is a range
        if '-' in value:
            parts = value.split('-')
            return (float(parts[0]) + float(parts[1])) / 2
        
        # Check for different units and convert to square feet
        if 'Sq. Meter' in value:
            match = re.findall(r"\d+\.?\d*", value)
            return float(match[0]) * 10.7639 if match else np.nan
        if 'Perch' in value:
            match = re.findall(r"\d+\.?\d*", value)
            return float(match[0]) * 272.25 if match else np.nan
        if 'Sq. Yards' in value:
            match = re.findall(r"\d+\.?\d*", value)
            return float(match[0]) * 9 if match else np.nan
        if 'Acres' in value:
            match = re.findall(r"\d+\.?\d*", value)
            return float(match[0]) * 43560 if match else np.nan
        
        # Assume the value is in square feet if no unit is specified
        return float(value)
    except Exception as e:
        print(f"Error processing value: {value}, Error: {e}")
        return np.nan


# Apply the conversion function to 'total_sqft' column
df2['total_sqft'] = df2['total_sqft'].apply(lambda x: convert_to_sqft(str(x)))

# Now you can access the DataFrame directly
df2.head()

Error processing value: 3Cents, Error: could not convert string to float: '3Cents'
Error processing value: 24Guntha, Error: could not convert string to float: '24Guntha'
Error processing value: 1500Cents, Error: could not convert string to float: '1500Cents'
Error processing value: 1Grounds, Error: could not convert string to float: '1Grounds'
Error processing value: 38Guntha, Error: could not convert string to float: '38Guntha'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['total_sqft'] = df2['total_sqft'].apply(lambda x: convert_to_sqft(str(x)))


Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2


In [188]:
df2[~df2['total_sqft'].apply(is_float)]

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk


In [189]:
df2.loc[648]

area_type     Built-up  Area
location             Arekere
size               9 Bedroom
total_sqft        1123031.25
bath                     9.0
balcony                  NaN
price                  265.0
bhk                        9
Name: 648, dtype: object

In [190]:
df2.isnull().sum()

area_type       0
location        0
size            0
total_sqft      5
bath            0
balcony       536
price           0
bhk             0
dtype: int64

In [192]:
df2.dropna(subset=['total_sqft'], inplace=True)
df2.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.dropna(subset=['total_sqft'], inplace=True)


area_type       0
location        0
size            0
total_sqft      0
bath            0
balcony       536
price           0
bhk             0
dtype: int64