# Import Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib

In [4]:
import seaborn as sns
import re
import sys
from time import sleep

# Load dataset

In [6]:
# Load the raw listings; the path is relative, so the CSV must sit in the working directory.
df=pd.read_csv("Bengaluru_House_Data.csv")

In [7]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [9]:
# Dtypes and non-null counts per column; note that total_sqft is read as object (text), not a number.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
area_type       13320 non-null object
availability    13320 non-null object
location        13319 non-null object
size            13304 non-null object
society         7818 non-null object
total_sqft      13320 non-null object
bath            13247 non-null float64
balcony         12711 non-null float64
price           13320 non-null float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [10]:
# (rows, columns) of the raw frame.
df.shape

(13320, 9)

# Data Cleaning

In [13]:
# Number of missing values in each column of the raw frame.
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [15]:
# Work on a copy without area_type, society, balcony and availability;
# the raw frame `df` is left untouched.
data = df.drop(columns=['area_type', 'society', 'balcony', 'availability'])
data.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [16]:
# Missing values that remain in the columns that were kept.
data.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [18]:
# Discard every row that has a missing value in any remaining column,
# then re-count to confirm nothing is left.
data=data.dropna()
data.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [29]:
# Distinct size labels: each starts with a count followed by a unit word (BHK, Bedroom, RK).
data['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [22]:
def rm_string(val):
    """Extract the leading integer from a size label such as ``'2 BHK'``.

    Parameters
    ----------
    val : object
        A size label whose first whitespace-separated token is an integer
        (e.g. ``'4 Bedroom'``), or a missing value (float NaN).

    Returns
    -------
    int or float
        The leading integer, or ``np.nan`` when ``val`` is a missing value.

    Raises
    ------
    ValueError
        If the label is empty or its first token is not an integer literal.
    """
    text = str(val)
    # A float NaN stringifies to 'nan'; pass it through as a missing value.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    if text == 'nan':
        return np.nan
    # split() with no argument ignores leading and repeated whitespace,
    # so ' 3 BHK' yields '3' rather than an empty first token.
    tokens = text.split()
    if not tokens:
        raise ValueError("empty size label: {!r}".format(val))
    return int(tokens[0])

In [30]:
# Add a BHK column holding the numeric part of each size label, then list its distinct values.
data['BHK'] = data['size'].apply(rm_string)
data['BHK'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [32]:
# Inspect listings reporting more than 10 bedrooms.
data[data.BHK>10]

Unnamed: 0,location,size,total_sqft,bath,price,BHK
459,1 Giri Nagar,11 BHK,5000,9.0,360.0,11
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
1768,1 Ramamurthy Nagar,11 Bedroom,1200,11.0,170.0,11
3379,1Hanuman Nagar,19 BHK,2000,16.0,490.0,19
3609,Koramangala Industrial Layout,16 BHK,10000,16.0,550.0,16
3853,1 Annasandrapalya,11 Bedroom,1200,6.0,150.0,11
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43
4916,1Channasandra,14 BHK,1250,15.0,125.0,14
6533,Mysore Road,12 Bedroom,2232,6.0,300.0,12
7979,1 Immadihalli,11 BHK,6000,12.0,150.0,11


In [33]:
# Distinct raw total_sqft values; they are strings and not all of them are plain numbers.
data['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

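Some `total_sqft` entries are not plain numbers: they are ranges such as `1133 - 1384` (or, as the next cell shows, areas given in other units), so they must be converted before the column can be treated as numeric.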

In [35]:
#find non float values from total_sqft
def find_range_values(val):
    """Return the entries of ``val`` that ``float()`` cannot convert.

    Parameters
    ----------
    val : iterable
        Values to check, e.g. a pandas Series of raw ``total_sqft`` strings.

    Returns
    -------
    list
        The non-convertible entries, in their original order (duplicates kept).
    """
    range_val = []
    for x in val:
        try:
            float(x)
        # Catch only conversion failures so that KeyboardInterrupt and
        # SystemExit are not silently swallowed while scanning a long column.
        except (TypeError, ValueError, OverflowError):
            range_val.append(x)
    return range_val

# List every raw total_sqft entry that is not a plain number (ranges and values with units).
find_range_values(data['total_sqft'])

['2100 - 2850',
 '3067 - 8156',
 '1042 - 1105',
 '1145 - 1340',
 '1015 - 1540',
 '34.46Sq. Meter',
 '1195 - 1440',
 '4125Perch',
 '1120 - 1145',
 '3090 - 5002',
 '1160 - 1195',
 '1000Sq. Meter',
 '1115 - 1130',
 '1100Sq. Yards',
 '520 - 645',
 '1000 - 1285',
 '650 - 665',
 '633 - 666',
 '5.31Acres',
 '30Acres',
 '1445 - 1455',
 '884 - 1116',
 '850 - 1093',
 '716Sq. Meter',
 '547.34 - 827.31',
 '580 - 650',
 '3425 - 3435',
 '1804 - 2273',
 '3630 - 3800',
 '4000 - 5249',
 '1500Sq. Meter',
 '142.61Sq. Meter',
 '1574Sq. Yards',
 '1250 - 1305',
 '670 - 980',
 '1005.03 - 1252.49',
 '3630 - 3800',
 '1004 - 1204',
 '361.33Sq. Yards',
 '645 - 936',
 '2710 - 3360',
 '2830 - 2882',
 '596 - 804',
 '1255 - 1863',
 '1300 - 1405',
 '117Sq. Yards',
 '934 - 1437',
 '980 - 1030',
 '2249.81 - 4112.19',
 '1070 - 1315',
 '3040Sq. Meter',
 '500Sq. Yards',
 '2806 - 3019',
 '613 - 648',
 '704 - 730',
 '1210 - 1477',
 '3369 - 3464',
 '1125 - 1500',
 '167Sq. Meter',
 '1076 - 1199',
 '381 - 535',
 '524 - 894',
 

In [57]:
def convert_range_val(val):
    """Convert a numeric string or a ``'low - high'`` range to a float.

    Parameters
    ----------
    val : object
        Usually a string such as ``'1000'`` or ``'1133 - 1384'``.

    Returns
    -------
    float or object
        ``float(val)`` for a plain number, the midpoint for a two-part range,
        and ``val`` itself, unchanged, when neither interpretation works.
    """
    # Try the plain-number reading first so that values containing a '-'
    # that are still valid floats ('-5', '1e-5') are not mistaken for ranges.
    try:
        return float(val)
    except (TypeError, ValueError):
        pass

    if isinstance(val, str):
        bounds = val.split('-')
        if len(bounds) == 2:
            try:
                low, high = float(bounds[0]), float(bounds[1])
            except ValueError:
                # Looks like a range but a bound is not numeric ('100-sqft').
                return val
            return (low + high) / 2
    return val

# Spot checks: a plain number, a range (midpoint), and an unparseable value (returned unchanged).
print(convert_range_val('1000'))
print(convert_range_val('1000-2000'))
print(convert_range_val('100sqft.'))

1000.0
1500.0
100sqft.


acres to sqft : 43560 * acres<br>
sq Meters to sqft : 10.764 * sq.meters<br>
perch to sqft : 272.25 * perch<br>
sqYards to sqft : 9 * sqYards<br>
Grounds to sqft : 2400 * ground<br>
Cents to sqft : 435.6 * cent<br>
gunta to sqft : 1089 * gunta<br>

In [59]:
#convert acres to sqft
def acres_to_sqft(x):
    """Return the area ``x``, given in acres, in square feet (factor 43560)."""
    sqft_per_acre = 43560
    return x * sqft_per_acre

#convert sq.meters to sqft
def sqmt_to_sqft(x):
    """Return the area ``x``, given in square meters, in square feet (factor 10.764)."""
    sqft_per_sqmt = 10.764
    return x * sqft_per_sqmt

#convert perch to sqft
def perch_to_sqft(x):
    """Return the area ``x``, given in perches, in square feet (factor 272.25)."""
    sqft_per_perch = 272.25
    return x * sqft_per_perch

#convert sq.yards to sqft
def sqyards_to_sqft(x):
    """Return the area ``x``, given in square yards, in square feet (factor 9)."""
    sqft_per_sqyard = 9
    return x * sqft_per_sqyard

#convert grounds to sqft
def grounds_to_sqft(x):
    """Return the area ``x``, given in grounds, in square feet (factor 2400)."""
    sqft_per_ground = 2400
    return x * sqft_per_ground

#convert gunta to sqft
def gunta_to_sqft(x):
    """Return the area ``x``, given in gunta, in square feet (factor 1089)."""
    sqft_per_gunta = 1089
    return x * sqft_per_gunta

#convert cents to sqft
def cents_to_sqft(x):
    """Return the area ``x``, given in cents, in square feet (factor 435.6)."""
    sqft_per_cent = 435.6
    return x * sqft_per_cent

In [66]:
def clean_sqft(val):
    """Convert one raw ``total_sqft`` entry to a number of square feet.

    Three input shapes are handled:

    * a plain number (``'1056'``) is returned as a ``float``;
    * a range (``'1133 - 1384'``) becomes its midpoint, obtained from
      ``convert_range_val`` and rounded to one decimal;
    * a number followed by a unit (``'34.46Sq. Meter'``, ``'5.31Acres'``,
      ``'4125Perch'``, ``'Sq. Yards'``, ``'Grounds'``, ``'Guntha'``,
      ``'Cents'``) is converted with the matching ``*_to_sqft`` helper.
      Square-meter and cent results are rounded to one decimal.

    Parameters
    ----------
    val : str or number
        The raw cell value.

    Returns
    -------
    float
        The area in square feet.

    Raises
    ------
    ValueError
        If ``val`` is a string that is not a number, a parsable range, or a
        number with one of the recognised units.
    TypeError
        If ``val`` is of a type ``float()`` does not accept (e.g. ``None``).
    """
    try:
        return float(val)
    except ValueError:
        pass  # not a plain number; fall through to range / unit handling

    if "-" in val:
        midpoint = convert_range_val(val)
        # convert_range_val can hand the text back unchanged when it cannot
        # parse it; report that instead of failing inside round().
        if isinstance(midpoint, str):
            raise ValueError("cannot parse range in total_sqft value: {!r}".format(val))
        return round(midpoint, 1)

    # Take the first number including its fractional part, so that
    # '34.46Sq. Meter' is read as 34.46 and not truncated to 34.
    number = re.search(r"\d+(?:\.\d+)?|\.\d+", val)
    if number is None:
        raise ValueError("no numeric amount in total_sqft value: {!r}".format(val))
    amount = float(number.group())

    if "Acres" in val:
        return acres_to_sqft(amount)
    if "Sq. Meter" in val:
        return round(sqmt_to_sqft(amount), 1)
    if "Perch" in val:
        return perch_to_sqft(amount)
    if "Sq. Yards" in val:
        return sqyards_to_sqft(amount)
    if "Grounds" in val:
        return grounds_to_sqft(amount)
    if "Guntha" in val:
        return gunta_to_sqft(amount)
    if "Cents" in val:
        return round(cents_to_sqft(amount), 1)

    # Fail with a clear message rather than an unbound-variable error.
    raise ValueError("unrecognised unit in total_sqft value: {!r}".format(val))

In [67]:
# Spot checks: a range (midpoint) and a value given in perches.
print(clean_sqft('1000-2000'))
print(clean_sqft('1000Perch'))

1500.0
272250.0


In [69]:
# Store the converted areas in a new column; the raw total_sqft column is kept for now.
data['total_sqft_clean'] = data['total_sqft'].apply(clean_sqft)

In [70]:
# Sanity check: an empty list means every cleaned value can be converted to float.
find_range_values(data['total_sqft_clean'])

[]

In [71]:
# The raw text columns are superseded by BHK and total_sqft_clean, so remove them.
data = data.drop(columns=['size', 'total_sqft'])

In [73]:
# Preview the cleaned frame: location, bath, price, BHK and total_sqft_clean.
data.head()

Unnamed: 0,location,bath,price,BHK,total_sqft_clean
0,Electronic City Phase II,2.0,39.07,2,1056.0
1,Chikka Tirupathi,5.0,120.0,4,2600.0
2,Uttarahalli,2.0,62.0,3,1440.0
3,Lingadheeranahalli,3.0,95.0,3,1521.0
4,Kothanur,2.0,51.0,2,1200.0


## Create a CSV file from the cleaned data

In [80]:
# Write the cleaned frame to the working directory.
# NOTE(review): the row index is written too (to_csv default), so the file gets an
# unnamed first column; pass index=False if downstream readers do not expect it — confirm.
data.to_csv('after_data_cleaning.csv')