In [1]:
import pandas as pd
import numpy as np
import re

In [3]:
# Lift pandas' display truncation: full cell text, every column, every row.
for option_name in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [4]:
# Load the cleaned listings; the relative path is resolved against the current working directory.
df = pd.read_csv('gurgaon_properties_cleaned_v1.csv')

In [5]:
# Count rows that exactly repeat an earlier row (counted only; nothing is dropped here).
df.duplicated().sum()

121

In [7]:
# Show a single row to inspect the raw text columns.
df.head(1)
# Focus is on: areaWithType, additionalRoom, agePossession, furnishDetails, features

Unnamed: 0,property_type,society,sector,price_cr,price_per_sqft,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,floorNum,facing,agePossession,nearbyLocations,furnishDetails,features
0,flat,experion the heartsong,sector 108,3.0,18392.0,1631.0,Super Built up area 2779(258.18 sq.m.)Built Up area: 2204.25 sq.ft. (204.78 sq.m.)Carpet area: 1631.07 sq.ft. (151.53 sq.m.),4,5,3+,servant room,2.0,South-West,1 to 5 Year Old,"['Galleria 108 Mall', 'Dwarka Expressway', 'Central Peripheral Road', 'The Shikshiyan School', 'Manipal Hospital', 'Indira Gandhi International Airport', 'Vivanta New Delhi, Dwarka', 'SkyJumper Trampoline Park', 'Fun N Food Village']","['1 Water Purifier', '8 Fan', '1 Fridge', '1 Exhaust Fan', '1 Dining Table', '4 Geyser', '1 Stove', '70 Light', '7 AC', '2 TV', '1 Modular Kitchen', '1 Chimney', '1 Curtains', '4 Bed', '5 Wardrobe', '1 Sofa', '1 Washing Machine', '1 Microwave']","['Water purifier', 'Security / Fire Alarm', 'Feng Shui / Vaastu Compliant', 'Intercom Facility', 'Lift(s)', 'High Ceiling Height', 'Maintenance Staff', 'False Ceiling Lighting', 'Water Storage', 'Separate entry for servant room', 'No open drainage around', 'Bank Attached Property', 'Internet/wi-fi connectivity', 'Recently Renovated', 'Visitor Parking', 'Swimming Pool', 'Park', 'Security Personnel', 'Natural Light', 'Airy Rooms', 'Spacious Interiors', 'Waste Disposal', 'Rain Water Harvesting', 'Shopping Centre', 'Fitness Centre / GYM', 'Club house / Community Center']"


In [10]:
# Fixed seed so the same five rows are displayed on every Restart & Run All.
df.sample(5, random_state=42)[['price_cr','area','areaWithType']]

Unnamed: 0,price_cr,area,areaWithType
2035,2.2,1549.0,Carpet area: 1549 (143.91 sq.m.)
2360,1.47,904.0,Carpet area: 84.01
1069,1.0,1100.0,Super Built up area 1100(102.19 sq.m.)
272,1.1,900.0,Carpet area: 900 (83.61 sq.m.)
77,2.0,2149.0,Super Built up area 2149(199.65 sq.m.)Built Up area: 1900 sq.ft. (176.52 sq.m.)Carpet area: 1650 sq.ft. (153.29 sq.m.)


- There are three area categories (carpet area, super built-up area, built-up area), plus plot area for houses.
- We will extract each one and store it in a separate column.

In [18]:
def get_super_built_up_area(text):
    """Extract the super built-up area figure from an ``areaWithType`` string.

    Parameters
    ----------
    text : str
        Raw description, e.g. ``'Super Built up area 2779(258.18 sq.m.)'``.
        Any non-string value (``None``, or the float ``NaN`` pandas uses for
        a missing cell) is treated as "no area present".

    Returns
    -------
    float or None
        The number that directly follows ``'Super Built up area '``, exactly
        as written in the text (no unit conversion), or ``None`` when the
        label is absent.
    """
    # re.search raises TypeError on non-strings, so a missing cell must be
    # filtered out before matching.
    if not isinstance(text, str):
        return None
    match = re.search(r'Super Built up area (\d+\.?\d*)', text)
    return float(match.group(1)) if match else None

In [19]:
def get_area(text, area_type):
    """Extract the figure that follows ``'<area_type>:'`` in an ``areaWithType`` string.

    Used for the labels written with a colon, such as ``'Built Up area'`` and
    ``'Carpet area'``.

    Parameters
    ----------
    text : str
        Raw description, e.g. ``'Carpet area: 1549 (143.91 sq.m.)'``.
        Any non-string value (``None`` or ``NaN``) is treated as "no area
        present".
    area_type : str
        The label to look for. It is matched literally and case-sensitively,
        so ``'Built Up area'`` does not match inside ``'Super Built up area'``.

    Returns
    -------
    float or None
        The number after the label and colon, exactly as written in the text
        (no unit conversion), or ``None`` when the label is absent.
    """
    # re.search raises TypeError on non-strings (e.g. NaN from a missing cell).
    if not isinstance(text, str):
        return None
    # Escape the label so characters such as '(' or '.' in it are matched
    # literally instead of being interpreted as regex syntax.
    pattern = re.escape(area_type) + r'\s*:\s*(\d+\.?\d*)'
    match = re.search(pattern, text)
    return float(match.group(1)) if match else None

In [20]:
def convert_to_sqft(text, area_value):
    """Return ``area_value`` in square feet, using the sq.m. figure quoted in ``text``.

    The description usually follows a stated figure with its square-metre
    equivalent in parentheses, e.g. ``'2779(258.18 sq.m.)'`` or
    ``'2204.25 sq.ft. (204.78 sq.m.)'``. That equivalent is used to check the
    stated figure:

    * if the two agree (allowing for rounding), the stated figure is already
      in square feet and is returned unchanged;
    * if they disagree, the stated figure was written in some other unit, so
      the square-metre figure converted to square feet is returned instead.

    When ``text`` has no parenthesised sq.m. figure for ``area_value`` there
    is nothing to check against and ``area_value`` is returned as is.

    Parameters
    ----------
    text : str
        The ``areaWithType`` description the value was extracted from.
    area_value : float or None
        The figure previously extracted from ``text``. ``None`` and ``NaN``
        (missing) are passed straight through.

    Returns
    -------
    float or None
        The area in square feet, or the missing marker that was passed in.
    """
    SQM_TO_SQFT = 10.7639      # conversion factor from sq.m. to sqft
    ROUNDING_TOLERANCE = 0.01  # relative slack for rounding in the listing text

    # Once the extracted value has been stored in a DataFrame column a missing
    # entry arrives as NaN rather than None, so both must be checked.
    if area_value is None or pd.isna(area_value):
        return area_value
    if not isinstance(text, str):
        return area_value

    # Match the figure as it is written in the text rather than re-formatting
    # the float: 2779.0 formats as '2779.0', which never appears in '2779('.
    pattern = (
        r'(?<![\d.])(\d+\.?\d*)'            # stated figure (not the tail of a longer number)
        r'(?:\s*sq\.?\s*[A-Za-z]+\.?)?'     # optional unit, e.g. "sq.ft."
        r'\s*\(\s*(\d+\.?\d*)\s*sq\.m\.\)'  # "(258.18 sq.m.)"
    )
    for stated, sq_m in re.findall(pattern, text):
        if float(stated) != area_value:
            continue  # a different area figure in the same description
        converted = float(sq_m) * SQM_TO_SQFT
        if abs(converted - area_value) <= ROUNDING_TOLERANCE * abs(area_value):
            return area_value  # already square feet; keep the exact stated figure
        return converted
    return area_value

In [22]:
# One extractor per output column; each column is first filled with the raw
# figure parsed from areaWithType and then passed through convert_to_sqft.
area_extractors = {
    'super_built_up_area': get_super_built_up_area,
    'built_up_area': lambda text: get_area(text, 'Built Up area'),
    'carpet_area': lambda text: get_area(text, 'Carpet area'),
}

for column, extract in area_extractors.items():
    # Step 1: pull the stated figure out of the description.
    df[column] = df['areaWithType'].apply(extract)
    # Step 2: row-wise, because the conversion needs both the text and the figure.
    df[column] = df.apply(
        lambda row, column=column: convert_to_sqft(row['areaWithType'], row[column]),
        axis=1,
    )

In [23]:
# Preview only the first rows: display.max_rows is None, so a bare frame would render every row.
df[['price_cr','property_type','area','areaWithType','super_built_up_area','built_up_area','carpet_area']].head(10)

Unnamed: 0,price_cr,property_type,area,areaWithType,super_built_up_area,built_up_area,carpet_area
0,3.0,flat,1631.0,Super Built up area 2779(258.18 sq.m.)Built Up area: 2204.25 sq.ft. (204.78 sq.m.)Carpet area: 1631.07 sq.ft. (151.53 sq.m.),2779.0,2204.25,1631.07
1,1.37,flat,1579.0,Super Built up area 1578(146.6 sq.m.)Carpet area: 1538 sq.ft. (142.88 sq.m.),1578.0,,1538.0
2,1.62,flat,1161.0,Super Built up area 1950(181.16 sq.m.)Carpet area: 1161 sq.ft. (107.86 sq.m.),1950.0,,1161.0
3,0.95,flat,1956.0,Super Built up area 1956(181.72 sq.m.),1956.0,,
4,0.25,flat,481.0,Built Up area: 481 (44.69 sq.m.),,481.0,
5,10.0,flat,4072.0,Super Built up area 4072(378.3 sq.m.)Built Up area: 3000 sq.ft. (278.71 sq.m.)Carpet area: 2800 sq.ft. (260.13 sq.m.),4072.0,3000.0,2800.0
6,0.92,flat,1822.0,Super Built up area 1822(169.27 sq.m.)Carpet area: 1400 sq.ft. (130.06 sq.m.),1822.0,,1400.0
7,2.75,house,2228.0,Built Up area: 2228 (206.99 sq.m.),,2228.0,
8,4.21,flat,3557.0,Super Built up area 3557(330.46 sq.m.),3557.0,,
9,0.95,house,1300.0,Built Up area: 1300 (120.77 sq.m.),,1300.0,
