In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import joblib

import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE: this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

#### Create Society-Level profiles

In [2]:
# Load the property data produced by the previous stage.
# Forward slashes are used as the separator so the relative path resolves on
# Windows, macOS and Linux alike (a backslash is a separator only on Windows).
df = pd.read_csv('Output Files/stage_5b_property_data.csv')
df.head()

Unnamed: 0,Possession Status,Availability Starts From,Floor No,Commercial,Developer,Price (Crores),Landmark,Covered Area,Society,sqft Price (INR),...,dist_to_csmt_station_km,dist_to_nariman_point_km,dist_to_andheri_east_comm_km,dist_to_marine_drive_km,dist_to_navi_mumbai_airport_km,dist_to_vashi_station_km,dist_to_sanjay_gandhi_np_km,dist_to_phoenix_mall_kurla_km,dist_to_kokilaben_hospital_km,dist_to_dh_ambani_school_km
0,Ready to Move,Immediately,20.0,Y,Tata Housing Development Company Ltd.,0.63,Rajoli naka,763.571943,Tata Amantra,8250.696031,...,45.19,47.27,31.16,45.7,30.01,23.31,27.87,30.9,33.78,34.33
1,Ready to Move,Immediately,18.0,N,Sai Satyam Developers,0.54,This property has reputed scholl ints vicinity.,850.0,Sai Satyam Homes,6352.0,...,45.19,47.27,31.16,45.7,30.01,23.31,27.87,30.9,33.78,34.33
2,Under Construction,Dec 25,5.0,N,Birla Estates,0.9,Shahad is one of the attractive locations to o...,1050.0,Birla Vanya,8571.0,...,45.19,47.27,31.16,45.7,30.01,23.31,27.87,30.9,33.78,34.33
3,Ready to Move,Immediately,8.0,Y,Godrej Properties,0.495,Majiwada metro station,561.0,Godrej Nirvaan,8824.0,...,45.19,47.27,31.16,45.7,30.01,23.31,27.87,30.9,33.78,34.33
4,Ready to Move,Immediately,16.0,N,Tycoons Group,0.657,"Close to tree house school, dmart, kdmc garden...",1067.0,Tycoons Solitaire Sapphire,6157.0,...,45.19,47.27,31.16,45.7,30.01,23.31,27.87,30.9,33.78,34.33


In [3]:
# Create the society location map: one row per distinct City / Area / Society combination

location_cols = ['City', 'Area', 'Society']

# Keep only rows where all three location fields are present, then de-duplicate
society_location_map = df[location_cols].dropna().drop_duplicates()

# Save the map to a CSV file for the app to use.
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash is a separator only on Windows).
society_location_map.to_csv('Output Files/Society_Recommendation_System_Files/society_location_map.csv', index=False)

In [4]:
# Discard the columns that are not needed for building the recommendation system.
# 'City' and 'Area' are removed here as well; they were already captured in
# society_location_map above.
columns_to_drop = [
    'Possession Status', 'Availability Starts From', 'Floor No',
    'Developer', 'Landmark', 'sqft Price (INR)',
    'Locality', 'Unit of Carpet Area', 'Furnished Type',
    'Facing', 'Transaction Type', 'Type of Property',
    'isPrimeLocationProperty', 'City', 'Posted By',
    'Total Floors', 'Unit of Covered Area', 'Property Lifespan',
    'geocoded_address', 'Area', 'Price_bin',
    'Floor Level',
]
df = df.drop(columns=columns_to_drop)

In [5]:
# Encode yes/no flag columns as floats (1.0 / 0.0) so they can be averaged during aggregation
binary_flag_map = {'Y': 1, 'Yes': 1, 'N': 0, 'No': 0}

for col in ['Commercial']:
    if col not in df.columns:
        continue  # column absent: nothing to convert
    # Swap the text flags for 1/0, then cast the whole column to float
    df[col] = df[col].replace(binary_flag_map).astype(float)

In [6]:
# Names of all numeric columns; these are the features averaged per society below
columns_to_aggregate = list(df.select_dtypes(include='number').columns)

In [7]:
# Bucket the listings by society once and reuse the grouping for both aggregations
listings_by_society = df.groupby('Society')

# A society's profile is the mean of every numeric feature across its listings
society_profiles = listings_by_society[columns_to_aggregate].mean()

# Record how many listings contributed to each society's profile
society_profiles['listing_count'] = listings_by_society.size()

# Display
society_profiles.head()

Unnamed: 0_level_0,Commercial,Price (Crores),Covered Area,Carpet Area,Bathrooms,Bedrooms,Power Back Up,Lift,Rain Water Harvesting,Club House,...,dist_to_nariman_point_km,dist_to_andheri_east_comm_km,dist_to_marine_drive_km,dist_to_navi_mumbai_airport_km,dist_to_vashi_station_km,dist_to_sanjay_gandhi_np_km,dist_to_phoenix_mall_kurla_km,dist_to_kokilaben_hospital_km,dist_to_dh_ambani_school_km,listing_count
Society,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Central Avenue,0.0,4.11,855.263158,650.0,2.0,2.0,1.0,1.0,1.0,1.0,...,16.82,4.92,14.86,24.92,17.07,17.21,5.14,4.84,3.12,1
111 Hyde Park,0.0,2.8,1250.0,700.0,2.0,2.0,1.0,1.0,1.0,1.0,...,29.12,7.92,27.15,30.64,19.85,4.91,12.17,7.84,14.02,1
127 Raj Home,0.0,0.788524,884.588235,567.529412,2.0,1.529412,1.0,1.0,1.0,1.0,...,17.900588,7.626471,16.004706,21.259412,13.439412,18.133529,4.89,9.380588,3.737059,17
19 North,0.0,0.95,632.5,402.5,2.0,1.5,1.0,1.0,1.0,1.0,...,23.145,8.515,21.205,27.085,18.12,11.46,8.99,8.945,8.72,2
25 South,0.368421,16.926395,3140.467315,2201.210526,4.263158,3.789474,1.0,1.0,1.0,1.0,...,9.475789,11.921579,7.518421,22.616842,18.131579,24.586316,9.236842,12.308421,6.093684,19


In [8]:
# Dump the (unscaled) society_profiles for the app to use.
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash is a separator only on Windows).
joblib.dump(society_profiles, 'Output Files/Society_Recommendation_System_Files/society_profiles.joblib')


['Output Files\\Society_Recommendation_System_Files\\society_profiles.joblib']

Scale `society_profiles` so that every feature carries equal weight in the recommendation engine:

In [9]:
# listing_count is metadata rather than a feature, so it is left out of the scaled matrix
features_to_scale = society_profiles.drop(labels=['listing_count'], axis='columns')

In [10]:
# Create the scaler and fit it to the society features in one step
# (fit() returns the fitted scaler itself, so no separate reassignment is needed)
scaler = StandardScaler().fit(features_to_scale)

# Save the fitted scaler for recommendation on the app.
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash is a separator only on Windows).
joblib.dump(scaler, 'Output Files/Society_Recommendation_System_Files/standardScaler.joblib')

# Transform the data with the fitted scaler
scaled_features = scaler.transform(features_to_scale)

In [11]:
# Wrap the scaled array in a DataFrame, restoring the original feature names
# as columns and the society names as the index
society_profiles_scaled = pd.DataFrame(
    scaled_features,
    columns=features_to_scale.columns,
    index=features_to_scale.index,
)

# Dump for the app to use.
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash is a separator only on Windows).
joblib.dump(society_profiles_scaled, 'Output Files/Society_Recommendation_System_Files/society_profiles_scaled.joblib')

['Output Files\\Society_Recommendation_System_Files\\society_profiles_scaled.joblib']