In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

from scipy.stats import ttest_ind, ks_2samp

# Display all columns
pd.set_option('display.max_columns', None)

In [2]:
# Load the stage 5a property data.
# A forward slash keeps this relative path valid on Windows, macOS and Linux;
# a backslash is only treated as a directory separator on Windows.
df = pd.read_csv('Output Files/stage_5a_property_data.csv')
df.head(3)

Unnamed: 0,Possession Status,Availability Starts From,Floor No,Commercial,Developer,Price (Crores),Landmark,Covered Area,Society,sqft Price (INR),Carpet Area,Locality,Unit of Carpet Area,Furnished Type,Bathrooms,Facing,Transaction Type,Type of Property,City,Bedrooms,Posted By,Total Floors,isPrimeLocationProperty,Unit of Covered Area,Property Lifespan,Balconies,Power Back Up,Lift,Rain Water Harvesting,Club House,Swimming Pool,Gymnasium,Park,Parking,Security,Water Storage,Private Terrace/Garden,Vaastu Compliant,Service/Goods Lift,Air Conditioned,Visitor Parking,Intercom Facility,Maintenance Staff,Waste Disposal,Laundry Service,Internet/Wi-Fi Connectivity,DTH Television Facility,RO Water System,Banquet Hall,Bar/Lounge,Cafeteria/Food Court,Conference Room,Piped Gas,Jogging and Strolling Track,Outdoor Tennis Courts,Mansion,Downtown,Skyline View,Wrap Around Balcony,Island Kitchen Layout,Full Glass Wall,House help accommodation,Concierge Services,Garden View,Marble flooring,Well Furnished,Modular Kitchen,Helipad,Private pool,Private Jaccuzi,Mini Cinema Theatre,Golf Course,Infinity Swimming Pool,Pool with temperature control,Sea facing,Skydeck,Earth quake resistant,Theme based Architectures,Health club with Steam / Jaccuzi,Large Clubhouse,Large Open space,Fingerprint Access,Grand Entrance lobby,Private Garage,Fireplace,Wine Cellar,Sky Villa,Water Front,Hilltop,Smart Home,Barbeque space,Kids play area,Library,Puja Room,Study,House Help Room,Store Room,Aerobics Room,Canopy Walk,Coffee Lounge & Restaurants,Dance Studio,Event Space & Amphitheatre,Flower Gardens,Guest Accommodation,Indoor Squash & Badminton Courts,Kids Play Area,Kids Play Pool With Water Slides,Library And Business Centre,Meditation Area,Multipurpose Courts,Recreational Pool,Rentable CommuniPfty Space,Retail Boulevard (Retail Shops),Cycling & Jogging Track,Fire Fighting Equipment,Aesthetically designed landscape garden,latitude,longitude,geocoded_address,Area,Price_bin,Floor Level
0,Ready to Move,Immediately,20.0,Y,Tata Housing Development Company Ltd.,0.63,Rajoli naka,763.571943,Tata Amantra,8250.696031,579.0,Kalyan West,Sq-ft,Unfurnished,2.0,East,New Property,Apartment,Thane,2,Agent,34.0,Y,Sq-ft,New construction,2.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,19.240353,73.125279,"Kalyan West, Kalyan-Dombivli, Kalyan Taluka, T...",Kalyan West,Low,High rise (> 10)
1,Ready to Move,Immediately,18.0,N,Sai Satyam Developers,0.54,This property has reputed scholl ints vicinity.,850.0,Sai Satyam Homes,6352.0,585.0,Kalyan West,Sq-ft,Unfurnished,2.0,East,New Property,Apartment,Thane,2,Agent,18.0,Y,Sq-ft,New construction,2.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,19.240353,73.125279,"Kalyan West, Kalyan-Dombivli, Kalyan Taluka, T...",Kalyan West,Low,High rise (> 10)
2,Under Construction,Dec 25,5.0,N,Birla Estates,0.9,Shahad is one of the attractive locations to o...,1050.0,Birla Vanya,8571.0,815.0,Kalyan West,Sq-ft,Unfurnished,2.0,East,New Property,Apartment,Thane,2,Agent,27.0,Y,Sq-ft,Less than 5 years,3.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,19.240353,73.125279,"Kalyan West, Kalyan-Dombivli, Kalyan Taluka, T...",Kalyan West,Medium,Mid rise (5 to 10)


In [3]:
# Check shape: `df.shape` is a (rows, columns) tuple for the frame loaded above
df.shape

(6280, 122)

-----------------

# Advanced Spatial Feature Engineering

In [4]:
# Hand-entered Points of Interest (POIs).
# Every keyword below becomes a string key mapped to a (latitude, longitude)
# tuple in decimal degrees; entries keep the order in which they are written.
# NOTE(review): several pairs are rounded to two decimals (0.01 deg is roughly
# 1 km) -- confirm that precision is acceptable for the distance features.
pois = dict(
    csia_airport=(19.0896, 72.8656),
    thane_station=(19.1860, 72.9754),
    andheri_station=(19.1197, 72.8464),
    csmt_station=(18.9401, 72.8355),
    nariman_point=(18.9261, 72.8223),
    andheri_east_comm=(19.1149, 72.8600),
    marine_drive=(18.9437, 72.8243),
    navi_mumbai_airport=(18.9800, 73.0500),
    vashi_station=(19.0684, 72.9984),
    sanjay_gandhi_np=(19.2300, 72.8600),
    phoenix_mall_kurla=(19.0800, 72.8850),
    kokilaben_hospital=(19.1200, 72.8300),
    dh_ambani_school=(19.0600, 72.8600),
)

In [5]:
def haversine_distance(lat1, lon1, lat2, lon2):
    '''
    Calculate the great-circle distance between two points on the earth
    (specified in decimal degrees) using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in decimal degrees.
        Scalars, NumPy arrays and pandas Series may be mixed; the arithmetic
        is element-wise, so a whole column can be measured against one point.

    Returns
    -------
    float or array-like
        Distance(s) in kilometers, rounded to 2 decimal places. NaN inputs
        propagate to NaN outputs.
    '''
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Floating-point error can push `a` marginally outside [0, 1] (for example
    # for near-antipodal points), which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of earth in kilometers.

    # np.round handles scalars, ndarrays and Series alike.
    return np.round(c * r, 2)

In [6]:
# Add one `dist_to_<poi>_km` column per POI: the distance from every
# property's (latitude, longitude) to that POI, computed column-wise.
for poi_name, poi_coords in pois.items():
    poi_lat, poi_lon = poi_coords
    df[f'dist_to_{poi_name}_km'] = haversine_distance(
        df['latitude'], df['longitude'], poi_lat, poi_lon
    )

In [7]:
# Show a few random rows to eyeball the new distance columns.
# A fixed random_state keeps the displayed rows identical across re-runs.
df.sample(4, random_state=42)

Unnamed: 0,Possession Status,Availability Starts From,Floor No,Commercial,Developer,Price (Crores),Landmark,Covered Area,Society,sqft Price (INR),Carpet Area,Locality,Unit of Carpet Area,Furnished Type,Bathrooms,Facing,Transaction Type,Type of Property,City,Bedrooms,Posted By,Total Floors,isPrimeLocationProperty,Unit of Covered Area,Property Lifespan,Balconies,Power Back Up,Lift,Rain Water Harvesting,Club House,Swimming Pool,Gymnasium,Park,Parking,Security,Water Storage,Private Terrace/Garden,Vaastu Compliant,Service/Goods Lift,Air Conditioned,Visitor Parking,Intercom Facility,Maintenance Staff,Waste Disposal,Laundry Service,Internet/Wi-Fi Connectivity,DTH Television Facility,RO Water System,Banquet Hall,Bar/Lounge,Cafeteria/Food Court,Conference Room,Piped Gas,Jogging and Strolling Track,Outdoor Tennis Courts,Mansion,Downtown,Skyline View,Wrap Around Balcony,Island Kitchen Layout,Full Glass Wall,House help accommodation,Concierge Services,Garden View,Marble flooring,Well Furnished,Modular Kitchen,Helipad,Private pool,Private Jaccuzi,Mini Cinema Theatre,Golf Course,Infinity Swimming Pool,Pool with temperature control,Sea facing,Skydeck,Earth quake resistant,Theme based Architectures,Health club with Steam / Jaccuzi,Large Clubhouse,Large Open space,Fingerprint Access,Grand Entrance lobby,Private Garage,Fireplace,Wine Cellar,Sky Villa,Water Front,Hilltop,Smart Home,Barbeque space,Kids play area,Library,Puja Room,Study,House Help Room,Store Room,Aerobics Room,Canopy Walk,Coffee Lounge & Restaurants,Dance Studio,Event Space & Amphitheatre,Flower Gardens,Guest Accommodation,Indoor Squash & Badminton Courts,Kids Play Area,Kids Play Pool With Water Slides,Library And Business Centre,Meditation Area,Multipurpose Courts,Recreational Pool,Rentable CommuniPfty Space,Retail Boulevard (Retail Shops),Cycling & Jogging Track,Fire Fighting Equipment,Aesthetically designed landscape garden,latitude,longitude,geocoded_address,Area,Price_bin,Floor 
Level,dist_to_csia_airport_km,dist_to_thane_station_km,dist_to_andheri_station_km,dist_to_csmt_station_km,dist_to_nariman_point_km,dist_to_andheri_east_comm_km,dist_to_marine_drive_km,dist_to_navi_mumbai_airport_km,dist_to_vashi_station_km,dist_to_sanjay_gandhi_np_km,dist_to_phoenix_mall_kurla_km,dist_to_kokilaben_hospital_km,dist_to_dh_ambani_school_km
3042,Ready to Move,Immediately,20.0,N,Bombay Realty,8.25,G d ambekar rd,3250.0,Bombay Dyeing Two Icc,25384.0,2100.0,Dadar East,Sq-ft,Semi-Furnished,5.0,East,Resale,Apartment,Mumbai,4,Agent,65.0,Y,Sq-ft,Less than 5 years,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,19.016253,72.852227,"Dadar East, Katrak Road, Dadar Parsi Colony, F...",Dadar East,Very High,High rise (> 10),8.28,22.89,11.52,8.65,10.51,11.0,8.59,21.18,16.42,23.78,7.88,11.77,4.93
978,Ready to Move,Immediately,5.0,N,Unknown,1.9,Near megamall,850.0,Vaibhav Palace,22353.0,646.0,Anand Nagar,Sq-ft,Semi-Furnished,2.0,North-East,Resale,Apartment,Mumbai,2,Owner,8.0,N,Sq-ft,5 to 10 years,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,19.099383,72.911593,"Anand nagar, N Ward, Zone 6, Mumbai Suburban, ...",Dahisar East,High,Mid rise (5 to 10),4.95,11.73,7.21,19.43,21.43,5.69,19.59,19.69,9.75,15.5,3.53,8.87,6.97
4141,Ready to Move,Immediately,23.0,N,Aksee Real Estate,5.99,Matunga w,1500.0,Shreeji Heights,39933.333333,1200.0,Matunga West,Sq-ft,Furnished,3.0,East,Resale,Apartment,Mumbai,3,Agent,31.0,Y,Sq-ft,Less than 5 years,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,19.031706,72.84081,"Union Bank of India, Takandas H. Kataria Marg,...",Matunga West,Very High,High rise (> 10),6.94,22.23,9.8,10.2,11.9,9.47,9.94,22.73,17.06,22.14,7.1,9.88,3.74
2949,Ready to Move,Immediately,35.0,N,Bombay Realty,9.0,"Dadar naigaon, dadar east, mumbai.",2200.0,Bombay Dyeing Two Icc,40909.0,2070.0,Dadar East,Sq-ft,Semi-Furnished,4.0,North-East,New Property,Apartment,Mumbai,4,Agent,57.0,Y,Sq-ft,New construction,3.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,19.016253,72.852227,"Dadar East, Katrak Road, Dadar Parsi Colony, F...",Dadar East,Very High,High rise (> 10),8.28,22.89,11.52,8.65,10.51,11.0,8.59,21.18,16.42,23.78,7.88,11.77,4.93


In [15]:
# Check shape again: the column count should now include one
# `dist_to_<poi>_km` column for every entry in `pois`
df.shape

(6280, 135)

In [16]:
# Persist the frame (now carrying the POI distance columns) as the stage 5b
# CSV; the row index is left out of the file.
df.to_csv(
    path_or_buf='Output Files//stage_5b_property_data.csv',
    index=False,
)

#### Centroid of each unique area

In [8]:
# Compute the centroid of each Area as the arithmetic mean of the latitude
# and longitude of the properties assigned to it; `Area` is restored as a
# regular column by reset_index().
area_centroids = (
    df.groupby('Area')[['latitude', 'longitude']]
    .mean()
    .reset_index()
)

# Sample a few centroids for a sanity check. A fixed random_state keeps the
# displayed rows identical across re-runs.
area_centroids.sample(8, random_state=42)

Unnamed: 0,Area,latitude,longitude
51,Virar East,19.054999,72.869203
31,Matunga West,19.034569,72.844469
44,Sion West,19.047497,72.863874
0,Agripada,18.97504,72.825056
12,Goregaon East,19.164369,72.860054
36,Nalasopara East,19.059122,72.866982
18,Kalyan East,19.225662,73.118251
34,Mumbai Central,18.970676,72.820463


In [12]:
# Persist the per-Area centroid table as a CSV; the row index is left out
# of the file.
area_centroids.to_csv(
    path_or_buf='Output Files//area_centroids.csv',
    index=False,
)