In [3]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast

In [4]:
df = pd.read_csv('appartments.csv').drop(22)
# 22th row is not an appartment just a data error

In [5]:
df.head(5)

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."


In [6]:
df.iloc[0].NearbyLocations

"['Bajghera Road', 'Palam Vihar Halt', 'DPSG Palam Vihar', 'Park Hospital', 'Gurgaon Railway Station']"

In [7]:
df.iloc[0].LocationAdvantages

"{'Bajghera Road': '800 Meter', 'Palam Vihar Halt': '2.5 KM', 'DPSG Palam Vihar': '3.1 KM', 'Park Hospital': '3.1 KM', 'Gurgaon Railway Station': '4.9 KM', 'The NorthCap University': '5.4 KM', 'Dwarka Expy': '1.2 KM', 'Hyatt Place Gurgaon Udyog Vihar': '7.7 KM', 'Dwarka Sector 21, Metro Station': '7.2 KM', 'Pacific D21 Mall': '7.4 KM', 'Indira Gandhi International Airport': '14.7 KM', 'Hamoni Golf Camp': '6.2 KM', 'Fun N Food Waterpark': '8.8 KM', 'Accenture DDC5': '9 KM'}"

Note -> Nearby location is the subset of LocationAdvantages

In [8]:
df.iloc[0].PriceDetails

"{'2 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,370 sq.ft.', 'price-range': '₹ 2 - 2.4 Cr'}, '3 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,850 - 2,050 sq.ft.', 'price-range': '₹ 2.25 - 3.59 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '2,600 sq.ft.', 'price-range': '₹ 3.24 - 4.56 Cr'}}"

In [9]:
df.iloc[0].PropertySubName

'2, 3, 4 BHK Apartment in Sector 113, Gurgaon'

Note -> PropertySubname is subset of PriceDetails

In [10]:
df.iloc[0].TopFacilities

"['Swimming Pool', 'Salon', 'Restaurant', 'Spa', 'Cafeteria', 'Sun Deck', '24x7 Security', 'Club House', 'Gated Community']"

## Note:
 - one recommender system only on location advantage column,
 - Second on price details column
 - third recommender system on top facilities
 - Will add these three results and then willl give output
 - 
Why not only one recommender system?
- We can give weights to these recommender systems based on the similarity
- Let say if Location thing is more similar then that should have bigger weight
- also if user wants recommendation based on specific thing like pricing then we will increase pricing weight

# Recommender System 1
Based on Top facilities

In [11]:
df[['PropertyName','TopFacilities']]

Unnamed: 0,PropertyName,TopFacilities
0,Smartworld One DXP,"['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."
...,...,...
242,DLF Princeton Estate,"['Swimming Pool', 'Medical Centre', 'Laundry',..."
243,Pyramid Urban Homes 2,"['Shopping Centre', 'Community Hall', '24x7 Se..."
244,Satya The Hermitage,"['Bus Shelter', 'Swimming Pool', 'Business Lou..."
245,BPTP Spacio,"['Swimming Pool', 'Card Room', 'Piped Gas', 'P..."


In [12]:
# We will convert the topfacilities list into string then we will do text vectorization
# For similarity we can use dot product or euclidean distance of the vectors (kind of Top 5 nearest vector)

In [13]:
df[['PropertyName','TopFacilities']]['TopFacilities'][0]

"['Swimming Pool', 'Salon', 'Restaurant', 'Spa', 'Cafeteria', 'Sun Deck', '24x7 Security', 'Club House', 'Gated Community']"

In [14]:
# list is inside string

# Function to extract list
def extract_list(s):
    return re.findall(r"'(.*?)'", s)
df['TopFacilities'] = df['TopFacilities'].apply(extract_list)    

In [15]:
df[['PropertyName','TopFacilities']]['TopFacilities'][0]

['Swimming Pool',
 'Salon',
 'Restaurant',
 'Spa',
 'Cafeteria',
 'Sun Deck',
 '24x7 Security',
 'Club House',
 'Gated Community']

In [16]:
# Converting list to string
df['FacilitiesStr'] = df['TopFacilities'].apply(' '.join)

In [17]:
df['FacilitiesStr'][0]

'Swimming Pool Salon Restaurant Spa Cafeteria Sun Deck 24x7 Security Club House Gated Community'

In [18]:
# Now we will vectorize these string
# Strategies -> BOF, Tfidf, word2vec

# We will use tfidf
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', ngram_range = (1,2))

In [19]:
tfidf_matrix = tfidf_vectorizer.fit_transform(df['FacilitiesStr'])

In [20]:
# our vectorizer is representing every string in 953 dimensions
tfidf_matrix.toarray()[0]

array([0.        , 0.        , 0.        , 0.18809342, 0.18809342,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [21]:
# Generally In high dimension angular distance is better than euclidean distance
cosine_sim1 = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [22]:
# Gave all the similarity
cosine_sim1.shape

(246, 246)

In [23]:
cosine_sim1[0]

array([1.        , 0.01095159, 0.        , 0.01084196, 0.03680554,
       0.09606236, 0.01298851, 0.11694113, 0.08581866, 0.18383703,
       0.0867874 , 0.38885046, 0.05456836, 0.01060269, 0.2795812 ,
       0.09962026, 0.04464395, 0.        , 0.07993835, 0.01409962,
       0.0285075 , 0.12384884, 0.05425096, 0.01123476, 0.10737203,
       0.03793557, 0.01535356, 0.01028889, 0.02808416, 0.10018755,
       0.01137482, 0.21113653, 0.0142793 , 0.01129043, 0.10627372,
       0.03785414, 0.0937186 , 0.0525833 , 0.1835331 , 0.03549845,
       0.05483513, 0.02642675, 0.        , 0.01058227, 0.03182732,
       0.1457341 , 0.03045638, 0.00985505, 0.05257774, 0.11627835,
       0.01963182, 0.03610692, 0.04770997, 0.        , 0.12505784,
       0.11581035, 0.19094326, 0.09816112, 0.03080674, 0.04080013,
       0.02763568, 0.0847239 , 0.09457611, 0.32781666, 0.24934736,
       0.08049929, 0.01141942, 0.03395175, 0.15377243, 0.09399471,
       0.02681204, 0.01067862, 0.17078732, 0.05827849, 0.03868

In [24]:
def recommend_properties(property_name, cosine_sim = cosine_sim1):
    # get index of the property that matches the name
    idx = df.index[df['PropertyName'] == property_name].tolist()[0]

    # get the pairwise similarity scores with that property
    sim_scores = list(enumerate(cosine_sim1[idx]))

    # Sort the properties based in the similarity scores
    sim_scores = sim_scores[1:6]

    # get the property indices
    property_indices = [i[0] for i in sim_scores]

    recommendations_df = pd.DataFrame({
    'PropertyName':df['PropertyName'].iloc[property_indices],
    'SimilarityScore':sim_scores
    })
    # Return the top 10 most similar properties
    return recommendations_df

In [25]:
recommend_properties('Smartworld One DXP')

Unnamed: 0,PropertyName,SimilarityScore
1,M3M Crown,"(1, 0.010951589538864515)"
2,Adani Brahma Samsara Vilasa,"(2, 0.0)"
3,Sobha City,"(3, 0.010841961977798461)"
4,Signature Global City 93,"(4, 0.03680553752999976)"
5,Whiteland The Aspen,"(5, 0.09606235618398755)"


# Recommender System 2
Price Details

In [26]:
df[['PropertyName','PriceDetails']]['PriceDetails'][7]

"{'3 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,347 - 2,600 sq.ft.', 'price-range': '₹ 4.05 - 8.88 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,692 - 3,100 sq.ft.', 'price-range': '₹ 4.89 - 10.59 Cr'}, '5 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '2,275 - 4,100 sq.ft.', 'price-range': '₹ 6 - 14 Cr'}}"

# Strategy
- We will vectorize it
- We will make few columns like Building type 3bhk, area 3bhk, price 3bhk, buildingtype 4bhk, area 4bhk,
- Basically bhk wise details in columns so that it can become a row
- 247 rows, Columns - 6
- OHE on categorical column to make vector

In [27]:
import pandas as pd
import json

# Load the dataset
df_appartments = pd.read_csv('appartments.csv').drop(22)

# Function to parse and extract the required features from the PriceDetails column
def refined_parse_modified_v2(detail_str):
    try:
        details = json.loads(detail_str.replace("'", "\""))
    except:
        return {}

    extracted = {}
    for bhk, detail in details.items():
        # Extract building type
        extracted[f'building type_{bhk}'] = detail.get('building_type')

        # Parsing area details
        area = detail.get('area', '')
        area_parts = area.split('-')
        if len(area_parts) == 1:
            try:
                value = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area low {bhk}'] = value
                extracted[f'area high {bhk}'] = value
            except:
                extracted[f'area low {bhk}'] = None
                extracted[f'area high {bhk}'] = None
        elif len(area_parts) == 2:
            try:
                extracted[f'area low {bhk}'] = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area high {bhk}'] = float(area_parts[1].replace(',', '').replace(' sq.ft.', '').strip())
            except:
                extracted[f'area low {bhk}'] = None
                extracted[f'area high {bhk}'] = None

        # Parsing price details
        price_range = detail.get('price-range', '')
        price_parts = price_range.split('-')
        if len(price_parts) == 2:
            try:
                extracted[f'price low {bhk}'] = float(price_parts[0].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                extracted[f'price high {bhk}'] = float(price_parts[1].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                if 'L' in price_parts[0]:
                    extracted[f'price low {bhk}'] /= 100
                if 'L' in price_parts[1]:
                    extracted[f'price high {bhk}'] /= 100
            except:
                extracted[f'price low {bhk}'] = None
                extracted[f'price high {bhk}'] = None

    return extracted
# Apply the refined parsing and generate the new DataFrame structure
data_refined = []

for _, row in df_appartments.iterrows():
    features = refined_parse_modified_v2(row['PriceDetails'])
    
    # Construct a new row for the transformed dataframe
    new_row = {'PropertyName': row['PropertyName']}
    
    # Populate the new row with extracted features
    for config in ['1 BHK', '2 BHK', '3 BHK', '4 BHK', '5 BHK', '6 BHK', '1 RK', 'Land']:
        new_row[f'building type_{config}'] = features.get(f'building type_{config}')
        new_row[f'area low {config}'] = features.get(f'area low {config}')
        new_row[f'area high {config}'] = features.get(f'area high {config}')
        new_row[f'price low {config}'] = features.get(f'price low {config}')
        new_row[f'price high {config}'] = features.get(f'price high {config}')
    
    data_refined.append(new_row)

df_final_refined_v2 = pd.DataFrame(data_refined).set_index('PropertyName')


In [28]:
df_final_refined_v2['building type_Land'] = df_final_refined_v2['building type_Land'].replace({'':'Land'})

In [29]:
df_final_refined_v2

Unnamed: 0_level_0,building type_1 BHK,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,building type_2 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,...,building type_1 RK,area low 1 RK,area high 1 RK,price low 1 RK,price high 1 RK,building type_Land,area low Land,area high Land,price low Land,price high Land
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,,,,,,Apartment,1370.0,1370.0,2.0000,2.40,...,,,,,,,,,,
M3M Crown,,,,,,,,,,,...,,,,,,,,,,
Adani Brahma Samsara Vilasa,,,,,,,,,,,...,,,,,,Land,500.0,4329.0,2.05,41.13
Sobha City,,,,,,Apartment,1381.0,1692.0,1.5500,3.21,...,,,,,,,,,,
Signature Global City 93,,,,,,Independent Floor,981.0,1118.0,0.9301,1.06,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,,,,,,Apartment,964.0,964.0,,,...,,,,,,,,,,
Pyramid Urban Homes 2,Apartment,335.0,398.0,23.45,0.2786,Apartment,500.0,625.0,,,...,,,,,,,,,,
Satya The Hermitage,,,,,,Apartment,1450.0,1450.0,,,...,,,,,,,,,,
BPTP Spacio,,,,,,Apartment,1000.0,1079.0,,,...,,,,,,,,,,


In [30]:
df['PriceDetails'][0]

"{'2 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,370 sq.ft.', 'price-range': '₹ 2 - 2.4 Cr'}, '3 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,850 - 2,050 sq.ft.', 'price-range': '₹ 2.25 - 3.59 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '2,600 sq.ft.', 'price-range': '₹ 3.24 - 4.56 Cr'}}"

In [31]:
df_final_refined_v2.iloc[0]

building type_1 BHK         None
area low 1 BHK               NaN
area high 1 BHK              NaN
price low 1 BHK              NaN
price high 1 BHK             NaN
building type_2 BHK    Apartment
area low 2 BHK            1370.0
area high 2 BHK           1370.0
price low 2 BHK              2.0
price high 2 BHK             2.4
building type_3 BHK    Apartment
area low 3 BHK            1850.0
area high 3 BHK           2050.0
price low 3 BHK             2.25
price high 3 BHK            3.59
building type_4 BHK    Apartment
area low 4 BHK            2600.0
area high 4 BHK           2600.0
price low 4 BHK             3.24
price high 4 BHK            4.56
building type_5 BHK         None
area low 5 BHK               NaN
area high 5 BHK              NaN
price low 5 BHK              NaN
price high 5 BHK             NaN
building type_6 BHK         None
area low 6 BHK               NaN
area high 6 BHK              NaN
price low 6 BHK              NaN
price high 6 BHK             NaN
building t

In [32]:
categorical_columns = df_final_refined_v2.select_dtypes(include = ['object']).columns.tolist()

In [33]:
categorical_columns

['building type_1 BHK',
 'building type_2 BHK',
 'building type_3 BHK',
 'building type_4 BHK',
 'building type_5 BHK',
 'building type_6 BHK',
 'building type_1 RK',
 'building type_Land']

In [34]:
ohe_df = pd.get_dummies(df_final_refined_v2, columns = categorical_columns, drop_first = True)

In [35]:
ohe_df.fillna(0,inplace = True)

In [36]:
ohe_df

Unnamed: 0_level_0,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,area low 3 BHK,area high 3 BHK,...,building type_2 BHK_Independent Floor,building type_2 BHK_Service Apartment,building type_3 BHK_Independent Floor,building type_3 BHK_Service Apartment,building type_3 BHK_Villa,building type_4 BHK_Independent Floor,building type_4 BHK_Villa,building type_5 BHK_Independent Floor,building type_5 BHK_Villa,building type_6 BHK_Villa
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,0.0,0.0,0.00,0.0000,1370.0,1370.0,2.0000,2.40,1850.0,2050.0,...,False,False,False,False,False,False,False,False,False,False
M3M Crown,0.0,0.0,0.00,0.0000,0.0,0.0,0.0000,0.00,1605.0,2170.0,...,False,False,False,False,False,False,False,False,False,False
Adani Brahma Samsara Vilasa,0.0,0.0,0.00,0.0000,0.0,0.0,0.0000,0.00,1800.0,3150.0,...,False,False,True,False,False,True,False,False,False,False
Sobha City,0.0,0.0,0.00,0.0000,1381.0,1692.0,1.5500,3.21,1711.0,2343.0,...,False,False,False,False,False,False,False,False,False,False
Signature Global City 93,0.0,0.0,0.00,0.0000,981.0,1118.0,0.9301,1.06,1235.0,1530.0,...,True,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,0.0,0.0,0.00,0.0000,964.0,964.0,0.0000,0.00,1127.0,1127.0,...,False,False,False,False,False,False,False,False,False,False
Pyramid Urban Homes 2,335.0,398.0,23.45,0.2786,500.0,625.0,0.0000,0.00,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
Satya The Hermitage,0.0,0.0,0.00,0.0000,1450.0,1450.0,0.0000,0.00,1991.0,1991.0,...,False,False,False,False,False,False,False,False,False,False
BPTP Spacio,0.0,0.0,0.00,0.0000,1000.0,1079.0,0.0000,0.00,1225.0,1865.0,...,False,False,False,False,False,False,False,False,False,False


In [37]:
from sklearn.preprocessing import StandardScaler

# initialize the scaler
scaler = StandardScaler()

# Apply the scaler to the entire dataframe
ohe_df_normalized = pd.DataFrame(scaler.fit_transform(ohe_df), columns = ohe_df.columns, index = ohe_df.index)

In [38]:
ohe_df_normalized.head()

Unnamed: 0_level_0,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,area low 3 BHK,area high 3 BHK,...,building type_2 BHK_Independent Floor,building type_2 BHK_Service Apartment,building type_3 BHK_Independent Floor,building type_3 BHK_Service Apartment,building type_3 BHK_Villa,building type_4 BHK_Independent Floor,building type_4 BHK_Villa,building type_5 BHK_Independent Floor,building type_5 BHK_Villa,building type_6 BHK_Villa
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,-0.252266,-0.169584,-0.105197,-0.082332,1.223499,1.020101,-0.173712,1.158423,0.553787,0.370864,...,-0.28931,-0.063888,-0.372678,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888
M3M Crown,-0.252266,-0.169584,-0.105197,-0.082332,-0.893541,-0.89666,-0.283546,-0.387986,0.293086,0.472749,...,-0.28931,-0.063888,-0.372678,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888
Adani Brahma Samsara Vilasa,-0.252266,-0.169584,-0.105197,-0.082332,-0.893541,-0.89666,-0.283546,-0.387986,0.500583,1.304803,...,-0.28931,-0.063888,2.683282,-0.063888,-0.171139,3.924283,-0.236208,-0.111111,-0.216353,-0.063888
Sobha City,-0.252266,-0.169584,-0.105197,-0.082332,1.240497,1.47061,-0.198425,1.680336,0.405879,0.619632,...,-0.28931,-0.063888,-0.372678,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888
Signature Global City 93,-0.252266,-0.169584,-0.105197,-0.082332,0.622383,0.667529,-0.232468,0.295011,-0.100626,-0.070634,...,3.456497,-0.063888,2.683282,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888


In [39]:
from sklearn.metrics.pairwise import cosine_similarity

# compute the cosine similarity matrix
cosine_sim2 = cosine_similarity(ohe_df_normalized)

In [40]:
cosine_sim2.shape

(246, 246)

In [41]:
# Recommender system 2
def recommend_properties_with_scores(property_name, top_n = 247):
    # get the similarity scored for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim2[ohe_df_normalized.index.get_loc(property_name)]))

    # Sort properties based in the similarity scores
    sorted_scores = sorted(sim_scores, key = lambda x:x[1], reverse = True)
    # Get the indices and scores of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1:top_n + 1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n + 1]]
    # Retrieve the names of the top properties using the indices
    top_properties = ohe_df_normalized.index[top_indices].tolist()
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
    'PropertyName' : top_properties,
    'SimilarityScore':top_scores
    })
    return recommendations_df
# test the recommender function suing a property name
recommend_properties_with_scores('M3M Golf Hills')

Unnamed: 0,PropertyName,SimilarityScore
0,AIPL The Peaceful Homes,0.955462
1,Smartworld One DXP,0.954670
2,Unitech Escape,0.953092
3,M3M Capital,0.951156
4,BPTP Terra,0.943128
...,...,...
240,Golden Park,-0.522391
241,Satya Merano Greens,-0.523660
242,ROF Normanton Park,-0.525129
243,BPTP Green Oaks,-0.525286


# Recommender System 3
Based on location advantage

Strategy:
- We will make a vector having all landmarks and in rows we will keep the distance of the appartment from the landmark in that column
- Null if that is not there

In [42]:
df.head()

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities,FacilitiesStr
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Salon, Restaurant, Spa, Cafete...",Swimming Pool Salon Restaurant Spa Cafeteria S...
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","[Bowling Alley, Mini Theatre, Manicured Garden...",Bowling Alley Mini Theatre Manicured Garden Sw...
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"[Terrace Garden, Gazebo, Fountain, Amphitheatr...",Terrace Garden Gazebo Fountain Amphitheatre Pa...
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Volley Ball Court, Aerobics Ce...",Swimming Pool Volley Ball Court Aerobics Centr...
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"[Mini Theatre, Doctor on Call, Concierge Servi...",Mini Theatre Doctor on Call Concierge Service ...


In [43]:
df[['PropertyName','LocationAdvantages']]['LocationAdvantages'][0]

"{'Bajghera Road': '800 Meter', 'Palam Vihar Halt': '2.5 KM', 'DPSG Palam Vihar': '3.1 KM', 'Park Hospital': '3.1 KM', 'Gurgaon Railway Station': '4.9 KM', 'The NorthCap University': '5.4 KM', 'Dwarka Expy': '1.2 KM', 'Hyatt Place Gurgaon Udyog Vihar': '7.7 KM', 'Dwarka Sector 21, Metro Station': '7.2 KM', 'Pacific D21 Mall': '7.4 KM', 'Indira Gandhi International Airport': '14.7 KM', 'Hamoni Golf Camp': '6.2 KM', 'Fun N Food Waterpark': '8.8 KM', 'Accenture DDC5': '9 KM'}"

In [44]:
def distance_to_meters(distance_str):
    try:
        if 'Km' in distance_str or 'KM' in distance_str:
            return float(distance_str.split()[0]) * 1000
        elif 'Meter' in distance_str or 'meter' in distance_str:
            return float(distance_str.split()[0])
        else:
            return None
    except:
        return None

In [45]:
# Extract distances from each location
location_matrix = {}
for index, row in df.iterrows():
    distances = {}
    for location, distance in ast.literal_eval(row['LocationAdvantages']).items():
        distances[location] = distance_to_meters(distance)
    location_matrix[index] = distances
#Convert the dictionary to a dataframe
location_df = pd.DataFrame.from_dict(location_matrix, orient = 'index')

# Display the first few rows
location_df.head()

Unnamed: 0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Dwarka Expy,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
0,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,1200.0,7700.0,7200.0,7400.0,...,,,,,,,,,,
25,550.0,,,,,6700.0,3800.0,,,7500.0,...,,,,,,,,,,
37,5300.0,,,,2500.0,8800.0,,,,,...,,,,,,,,,,
69,1500.0,,,,6500.0,6700.0,5100.0,,,8200.0,...,,,,,,,,,,
9,,,,5500.0,,,,,,,...,,,,,,,,,,


In [46]:
location_df.index = df.PropertyName

In [47]:
location_df.columns[10:50]

Index(['Indira Gandhi International Airport', 'Hamoni Golf Camp',
       'Fun N Food Waterpark', 'Accenture DDC5', 'DPSG Palam Vihar Gurugram',
       'Park Hospital, Palam Vihar', 'Palam Vihar Halt Railway Station',
       'Dwarka Sector 21 Metro Station', 'Dwarka Expressway',
       'Fun N Food Water Park', 'Tau DeviLal Sports Complex', 'Hyatt Place',
       'Altrade Business Centre', 'AIPL Business Club Sector 62',
       'Heritage Xperiential Learning School', 'CK Birla Hospital',
       'Paras Trinity Mall Sector 63', 'Rapid Metro Station Sector 56',
       'De Adventure Park', 'Golf Course Ext Rd',
       'DoubleTree by Hilton Hotel Gurgaon',
       'KIIT College of Engineering Sohna Road', 'Mehrauli-Gurgaon Road',
       'Nirvana Rd', 'TERI Golf Course', 'The Shikshiyan School', 'WTC Plaza',
       'Luxus Haritma Resort', 'BSF Golf Course', 'Rions Hospital', 'Gurgaon',
       'Dwarka Sector 21', 'Nehru Stadium', 'Fun N Food WaterPark',
       'IGI Airport', 'Vasant Kunj', 'Prana

# Observation
- Too many repetitions
- Like indira gandhi airport & IGI Airport
- We will have to remove these duplicates

In [48]:
# Can't replace NAN value with zero as it will make the distance zero from that locality
location_df.fillna(54000, inplace = True)
# NaN means the society is far so we have replaced the NaN with maxm value

In [49]:
location_df

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Dwarka Expy,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,1200.0,7700.0,7200.0,7400.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
M3M Crown,550.0,54000.0,54000.0,54000.0,54000.0,6700.0,3800.0,54000.0,54000.0,7500.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Adani Brahma Samsara Vilasa,5300.0,54000.0,54000.0,54000.0,2500.0,8800.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Sobha City,1500.0,54000.0,54000.0,54000.0,6500.0,6700.0,5100.0,54000.0,54000.0,8200.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Signature Global City 93,54000.0,54000.0,54000.0,5500.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Pyramid Urban Homes 2,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Satya The Hermitage,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
BPTP Spacio,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0


In [50]:
# Scaling
from sklearn.preprocessing import StandardScaler
# nitlize the scaler
scaler = StandardScaler()

# Apply the scaler to the entire dataframe
location_df_normalized = pd.DataFrame(scaler.fit_transform(location_df), columns = location_df.columns, index = location_df.index)

In [51]:
location_df_normalized

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Dwarka Expy,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,-7.960979,-15.652476,-15.652476,-3.149592,-2.966108,-3.147217,-3.726615,-10.231739,-15.652476,-6.023233,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
M3M Crown,-7.998993,0.063888,0.063888,0.328277,0.368941,-3.054053,-3.529275,0.090308,0.063888,-6.009941,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
Adani Brahma Samsara Vilasa,-7.276720,0.063888,0.063888,0.328277,-3.129124,-2.903557,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
Sobha City,-7.854539,0.063888,0.063888,0.328277,-2.857430,-3.054053,-3.430606,0.090308,0.063888,-5.916893,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
Signature Global City 93,0.128476,0.063888,0.063888,-2.985606,0.368941,0.335688,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
Pyramid Urban Homes 2,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
Satya The Hermitage,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
BPTP Spacio,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888


In [52]:
cosine_sim3 = cosine_similarity(location_df_normalized)

In [53]:
cosine_sim3.shape

(246, 246)

In [54]:
cosine_sim3

array([[ 1.        ,  0.11228075,  0.08647048, ..., -0.01073953,
        -0.06042898, -0.06042898],
       [ 0.11228075,  1.        ,  0.13141512, ..., -0.00854903,
        -0.01763808, -0.01763808],
       [ 0.08647048,  0.13141512,  1.        , ..., -0.01142363,
        -0.02363121, -0.02363121],
       ...,
       [-0.01073953, -0.00854903, -0.01142363, ...,  1.        ,
         0.04377593,  0.04377593],
       [-0.06042898, -0.01763808, -0.02363121, ...,  0.04377593,
         1.        ,  1.        ],
       [-0.06042898, -0.01763808, -0.02363121, ...,  0.04377593,
         1.        ,  1.        ]], shape=(246, 246))

In [55]:
def recommend_properties_with_scores(property_name, top_n = 247):
    
    # Cosine_sim_matrix = cosine_sim1 + cosine_sim2 + 10 * cosine_sim3
    cosine_sim_matrix = cosine_sim3
    
    #Get the similarity score for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim_matrix[location_df_normalized.index.get_loc(property_name)]))
    
    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    
    # Get the indices and cors of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1: top_n +1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n + 1]]
    # retrieve the names of the top properties using the indices
    top_properties = location_df_normalized.index[top_indices].tolist()
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName':top_properties,
        'SimilarityScore':top_scores
    })
    return recommendations_df
# Test the recommender function using a property name
recommend_properties_with_scores('Ireo Victory Valley')

Unnamed: 0,PropertyName,SimilarityScore
0,Lion Infra Green Valley,0.260393
1,International City by SOBHA Phase 2,0.175374
2,Emaar MGF Emerald Floors Premier,0.172598
3,Bestech Park View Grand Spa,0.160124
4,Shree Vardhman Flora,0.130741
...,...,...
240,DLF Princeton Estate,-0.067193
241,Pyramid Urban Homes 2,-0.067193
242,BPTP Spacio,-0.067193
243,SS The Coralwood,-0.067193


### Introducting Weights of 3 recommendations

In [56]:
# Weighted Cosine matrix
cosine_sim_weighted = 10 * cosine_sim1 + 5* cosine_sim2 + 2* cosine_sim3

In [57]:
def recommend_properties_with_scores(property_name, top_n = 247):
    
    
    cosine_sim_matrix = cosine_sim_weighted
    
    #Get the similarity score for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim_matrix[location_df_normalized.index.get_loc(property_name)]))
    
    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    
    # Get the indices and cors of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1: top_n +1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n + 1]]
    # retrieve the names of the top properties using the indices
    top_properties = location_df_normalized.index[top_indices].tolist()
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName':top_properties,
        'SimilarityScore':top_scores
    })
    return recommendations_df
# Test the recommender function using a property name
recommend_properties_with_scores('Ireo Victory Valley')

Unnamed: 0,PropertyName,SimilarityScore
0,Pioneer Urban Presidia,8.131084
1,Ambience Creacions,7.675505
2,DLF The Crest,7.098810
3,Silverglades The Melia,6.815078
4,Pioneer Araya,6.479646
...,...,...
240,JMS The Nation,-3.683914
241,Shree Vardhman City,-3.694471
242,Vatika Aspiration,-3.747447
243,JMS Prime Land,-3.770557


### We can take input from user let say that what is their preference and accordingly we can give weights
### or based on their used filters or data they have searched (obviously then we need advanced website)