In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
df = pd.read_csv('../data/external/recommendar_system.csv')

In [4]:
df.drop_duplicates(inplace=True)

In [5]:
amenities_df = df[['society','top_amenties', 'regions']].drop_duplicates()

In [31]:
amenities_df.drop_duplicates(subset=['society'], keep='first', inplace=True)


In [33]:
amenities_df = amenities_df.reset_index().drop(columns=['index'])

In [34]:
amenities_df['society'].nunique()

3328

In [35]:
amenities_df.to_csv('../data/external/amenities_df.csv', index=False)

In [36]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import joblib

# Define a named function to replace the lambda
def prepare_text_input(x):
    # Ensure the input is a Series of strings, even if it's a single row DataFrame
    if isinstance(x, pd.DataFrame) and 'top_amenties' in x.columns:
        return x['top_amenties'].astype(str)
    else:
        return x.astype(str)


# Wrap TfidfVectorizer inside a pipeline to reshape input
tfidf_pipeline = Pipeline([
    # Use the named function instead of lambda
    ('prepare_text', FunctionTransformer(prepare_text_input, validate=False)),
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2)))
])

# Build ColumnTransformer
transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), ['regions']),
        # The tfidf_pipeline now expects the DataFrame and will handle the 'top_amenties' column internally
        ('tfidf', tfidf_pipeline, ['top_amenties'])
    ],
    remainder='passthrough'
)

# Assuming amenities_df is already loaded and preprocessed as in your notebook
# Refit the transformer with the updated pipeline
transformer.fit(amenities_df[['top_amenties', 'regions']].drop_duplicates())

# Save the transformer to a file
joblib.dump(transformer, '../data/external/amenities_transformer.pkl')

['../data/external/amenities_transformer.pkl']

In [38]:
transformer = joblib.load('../data/external/amenities_transformer.pkl')

In [40]:
amenities_df[['top_amenties', 'regions']].shape

(3328, 2)

In [None]:
amenities_df

In [44]:
transform_df = transformer.transform(amenities_df[['top_amenties', 'regions']].drop_duplicates())

In [45]:
def recommend_by_top_amenities(society, region,df, top_n=10):
    # Get the row for the specified society and region from the *original* dataframe
    amenities_row = df.loc[(df['society'] == society) & (df['regions'] == region), ['top_amenties', 'regions']].head(1)

    if amenities_row.empty:
        print("Amenities not found for the given society and region.")
        # Return None or an empty DataFrame if no match is found
        return pd.DataFrame(columns=['PropertyName', 'SimilarityScore'])

    else:
        # Ensure we are working with the amenities_df structure that the transformer was fitted on
        # (which is derived from df and has 'top_amenties' and 'regions' columns)
        # Select the relevant columns from the *original* df row
        input_data = amenities_row[['top_amenties', 'regions']]

        # Transform the input data row to get its feature vector
        # Use the already fitted transformer (assuming it was fitted correctly on amenities_df)
        input_vector = transformer.transform(input_data)

        # Calculate cosine similarity with the entire transformed dataset
        # Ensure input_vector has the correct shape (1, n_features) which transformer.transform on a single row should provide
        sim_scores = cosine_similarity(transform_df, input_vector)

        # sim_scores will be a 2D array (n_samples, 1), convert to 1D list of tuples (index, score)
        sim_scores = list(enumerate(sim_scores.flatten()))

        # Sort by similarity score in descending order
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the top N results, excluding the input society itself (which will have a score of 1.0)
        # We start from index 1 to exclude the query item
        top_results = sim_scores[1:top_n+1]

        # Extract indices and scores
        top_indices = [i[0] for i in top_results]
        top_scores = [i[1] for i in top_results]

        # Get the corresponding society names from the *amenities_df* DataFrame
        # since transform_df corresponds to the rows of amenities_df (after dropping duplicates)
        top_societies = amenities_df['society'].iloc[top_indices].tolist()
        top_regions = amenities_df['regions'].iloc[top_indices].tolist()

        # Create the recommendations DataFrame
        recommendations_df = pd.DataFrame({
            'PropertyName': top_societies,
            'SimilarityScore': top_scores,
            'Region': top_regions
        })

        return recommendations_df[recommendations_df['Region'] == region].head()

In [46]:
recommend_by_top_amenities('kolte patil life republic', 'pune',amenities_df)

Unnamed: 0,PropertyName,SimilarityScore,Region
0,garve akshara serenity,0.795091,pune
3,gera greensville skyvillas,0.755181,pune
4,tennessee,0.748578,pune
5,merlin ventana,0.728365,pune
6,brahma suncity,0.728217,pune


In [14]:
def config_society(df):
    # Filter only needed building types and flat configs to reduce size early


    # Group by society and composite key, then aggregate
    grouped = df.groupby(['society', 'composite', 'regions']).agg(
        min_area=('carpet_area', 'min'),
        max_area=('carpet_area', 'max'),
        min_price=('price', 'min'),
        max_price=('price', 'max')
    ).reset_index()

    # Pivot to get each feature as a column
    wide = grouped.pivot(index=['society', 'regions'], columns='composite')

    # Flatten MultiIndex columns
    wide.columns = ['_'.join(col).strip() for col in wide.columns.values]
    wide.reset_index(inplace=True)
    wide.rename(columns={'society': 'PropertyName'}, inplace=True)
    wide.set_index('PropertyName', inplace=True)

    return wide


In [56]:
data_refined = config_society(df)

In [59]:
data_refined = data_refined[~data_refined.index.duplicated(keep='first')]

In [57]:
len(set(data_refined.index))

3328

In [54]:
data_refined.shape

(3348, 53)

In [60]:
data_refined.fillna(0, inplace=True)

In [61]:
data_refined.to_csv('../data/external/properties_df.csv', index=False)

In [21]:
from sklearn.preprocessing import StandardScaler

In [62]:
code_to_scaler = list(data_refined.columns[1:])

In [63]:
transformer_properties = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), ['regions']),
        ('scaler', StandardScaler(), code_to_scaler)
    ],
    remainder='passthrough'
)

In [64]:
transformer_properties.fit(data_refined)

In [65]:
transform_df1 = transformer_properties.transform(data_refined)

In [66]:
transform_df1.shape

(3328, 61)

In [67]:
joblib.dump(transformer_properties, '../data/external/transformer_properties.pkl')

['../data/external/transformer_properties.pkl']

In [None]:
def distance_to_meters(distance_str):
    try:
        if 'Km' in distance_str or 'KM' in distance_str:
            return float(distance_str.split()[0]) * 1000
        elif 'Meter' in distance_str or 'meter' in distance_str:
            return float(distance_str.split()[0])
        else:
            return None
    except:
        return None

In [315]:
# Extract distances for each location
import ast
location_matrix = {}
for index, row in df1.iterrows():
    distances = {}
    for location, distance in ast.literal_eval(row['nearbylocation']).items():
        distances[location] = distance_to_meters(distance)
    location_matrix[index] = distances

# Convert the dictionary to a dataframe
location_df = pd.DataFrame.from_dict(location_matrix, orient='index')

# Display the first few rows
location_df.head()

Unnamed: 0,Akshara International School,Alard College Of Pharmacy,Sunflower Public School,Saibalaji International Institute Of Management,Crimson Anisha Global School,City One Mall,Prime City Centre,Xion Mall,Premier Plaza Mall,Marunji Hospital,...,Oyster English School,Shreeyash Multi-Speciality Hospital,Spc Gurukul School And Sun Bright School And College,Shwas Multi Speciality Hospital,Manas Hospital,Clara Global School,Krome Mall,Weikfield It Citi Info Park,Homeschool International,Yog Multispeciality Hospital And Research Centre
4393,3100.0,600.0,1400.0,1300.0,1100.0,9200.0,5200.0,3800.0,8900.0,100.0,...,,,,,,,,,,
4846,3100.0,600.0,1400.0,1300.0,1100.0,9200.0,5200.0,3800.0,8900.0,100.0,...,,,,,,,,,,
4524,2500.0,,,,,,,2600.0,,,...,,,,,,,,,,
4646,2500.0,,,,,,,2600.0,,,...,,,,,,,,,,
4379,2500.0,,,,,,,2600.0,,,...,,,,,,,,,,


In [253]:
location_df.loc[81].reset_index().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1160 entries, 0 to 1159
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   index   1160 non-null   object 
 1   81      12 non-null     float64
dtypes: float64(1), object(1)
memory usage: 18.3+ KB


In [250]:
df1.loc[81]['nearbylocation']

"{'School of Engineering and Technology Jain University Bangalore': '5.9 Km', 'International Academy of Management Entrepreneurship': '3.5 Km', 'Institute Of Chartered Financial Analysts Of India': '13.0 Km', 'Dav Public School': '5.6 Km', 'Bangalore International Academy': '10.7 Km', 'Tanishq Jewellery': '4.4 Km', 'Royalmart Supermarket': '4.5 Km', 'METRO Wholesale': '10.0 Km', 'DMart': '4.7 Km', 'Cloudnine Hospital Kanakapura Road': '5.0 Km', ' St. John\\'S Health Centre': '4.0 Km ', 'Kalyani Magnum It Park': '10.0 Km'}"

In [196]:
location_df.columns[10:50]

Index(['Bommasandra Industrial Area', 'Electronic City Tech Park',
       'School of Engineering and Technology Jain University Bangalore',
       'International Academy of Management Entrepreneurship',
       'Institute Of Chartered Financial Analysts Of India',
       'Dav Public School', 'Bangalore International Academy',
       'Tanishq Jewellery', 'Royalmart Supermarket', 'METRO Wholesale',
       'DMart', 'Cloudnine Hospital Kanakapura Road',
       '"St. John'S Health Centre', 'Kalyani Magnum It Park',
       'The Oxford College Of Pharmacy', 'Presidency English School',
       'Green Line Yelachenahalli Metro Station', 'Belandur Railway Station',
       'Gopalan Innovation Mall', 'Vega City Mall', 'Royal Meenakshi Mall',
       'Amr Tech Park', 'Sjb Institute Of Technology',
       'Rns International School', 'Rns Institute Of Technology',
       'Global Academy Of Technology', 'New Horizon Public School',
       'Purple Line Nayandahalli Metro Station',
       'Jnana Bharathi 

In [316]:
location_df.index = df1.society

In [224]:
location_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2222 entries, sri sai nandana royal to parimala trinity
Columns: 1160 entries, Ebenezer International School to Embassy Tech Village
dtypes: float64(1160)
memory usage: 19.7+ MB


In [317]:
location_df.fillna(54000,inplace=True)

In [318]:
from sklearn.preprocessing import StandardScaler
# Initialize the scaler
scaler = StandardScaler()

# Apply the scaler to the entire dataframe
location_df_normalized = pd.DataFrame(scaler.fit_transform(location_df), columns=location_df.columns, index=location_df.index)

In [319]:
location_df_normalized

Unnamed: 0_level_0,Akshara International School,Alard College Of Pharmacy,Sunflower Public School,Saibalaji International Institute Of Management,Crimson Anisha Global School,City One Mall,Prime City Centre,Xion Mall,Premier Plaza Mall,Marunji Hospital,...,Oyster English School,Shreeyash Multi-Speciality Hospital,Spc Gurukul School And Sun Bright School And College,Shwas Multi Speciality Hospital,Manas Hospital,Clara Global School,Krome Mall,Weikfield It Citi Info Park,Homeschool International,Yog Multispeciality Hospital And Research Centre
society,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pride park astra,-2.795244,-14.106736,-14.106736,-14.106736,-14.106736,-2.160477,-14.106736,-1.819539,-3.685849,-14.106736,...,0.050063,0.050063,-0.112509,-0.112509,-0.112509,0.050063,0.050063,0.050063,0.050063,0.050063
gurukrupa residency,-2.795244,-14.106736,-14.106736,-14.106736,-14.106736,-2.160477,-14.106736,-1.819539,-3.685849,-14.106736,...,0.050063,0.050063,-0.112509,-0.112509,-0.112509,0.050063,0.050063,0.050063,0.050063,0.050063
premia,-2.832439,0.070888,0.070888,0.070888,0.070888,0.419438,0.070888,-1.876087,0.257966,0.070888,...,0.050063,0.050063,-0.112509,-0.112509,-0.112509,0.050063,0.050063,0.050063,0.050063,0.050063
shree siddhivinayak angan,-2.832439,0.070888,0.070888,0.070888,0.070888,0.419438,0.070888,-1.876087,0.257966,0.070888,...,0.050063,0.050063,-0.112509,-0.112509,-0.112509,0.050063,0.050063,0.050063,0.050063,0.050063
devarshi complex,-2.832439,0.070888,0.070888,0.070888,0.070888,0.419438,0.070888,-1.876087,0.257966,0.070888,...,0.050063,0.050063,-0.112509,-0.112509,-0.112509,0.050063,0.050063,0.050063,0.050063,0.050063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
godrej park greens,0.360159,0.070888,0.070888,0.070888,0.070888,0.419438,0.070888,0.546054,0.257966,0.070888,...,0.050063,0.050063,8.888194,8.888194,8.888194,0.050063,0.050063,0.050063,0.050063,0.050063
kute fortune,0.360159,0.070888,0.070888,0.070888,0.070888,0.419438,0.070888,0.546054,0.257966,0.070888,...,0.050063,0.050063,8.888194,8.888194,8.888194,0.050063,0.050063,0.050063,0.050063,0.050063
godrej forest grove,0.360159,0.070888,0.070888,0.070888,0.070888,0.419438,0.070888,0.546054,0.257966,0.070888,...,0.050063,0.050063,8.888194,8.888194,8.888194,0.050063,0.050063,0.050063,0.050063,0.050063
yashada dreamsong,0.360159,0.070888,0.070888,0.070888,0.070888,0.419438,0.070888,0.546054,0.257966,0.070888,...,0.050063,0.050063,8.888194,8.888194,8.888194,0.050063,0.050063,0.050063,0.050063,0.050063


In [320]:
cosine_sim3 = cosine_similarity(location_df_normalized)

In [321]:
cosine_sim3.shape

(400, 400)

In [322]:
def recommend_properties_with_scores(property_name, top_n=401):
    
    cosine_sim_matrix =  20*cosine_sim2 + 8*cosine_sim3
    # cosine_sim_matrix = cosine_sim3
    
    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim_matrix[location_df_normalized.index.get_loc(property_name)]))
    
    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices and scores of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1:top_n+1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n+1]]
    
    # Retrieve the names of the top properties using the indices
    top_properties = location_df_normalized.index[top_indices].tolist()
    
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName': top_properties,
        'SimilarityScore': top_scores
    })
    
    return recommendations_df

# Test the recommender function using a property name
recommend_properties_with_scores('godrej nurture')

Unnamed: 0,PropertyName,SimilarityScore
0,nyati emerald,27.263939
1,aldea espanola phase 2,26.793984
2,nandan prospera gold,26.558089
3,regency astra,24.903412
4,majestique towers,20.293406
...,...,...
394,fortune east,-18.648641
395,kolte-patil three jewels phase 2,-18.666389
396,nakshatra i land,-18.689756
397,prasun adara,-18.717747


In [298]:
df1['society'].value_counts()

society
godrej nurture                   3
purva atmosphere                 2
kalpataru estate                 2
paranjape broadway               1
shree sonigara signature park    1
                                ..
konark exotica                   1
bhandari swaraj                  1
kimaya                           1
nyati elan                       1
mirchandani bellagio             1
Name: count, Length: 400, dtype: int64

In [300]:
df1[df1['society']  == 'godrej nurture'][['society','nearbylocation']]

Unnamed: 0,society,nearbylocation
4256,godrej nurture,"{'ISBR Business School Bangalore': '0.4 Km', '..."
4257,godrej nurture,"{'Sharda University': '5.8 Km', 'K R Managalam..."
4258,godrej nurture,"{'Akshara International School': '7.5 Km', 'Xi..."


In [301]:
df1['num_locations'] = df1['nearbylocation'].apply(lambda x: len(x))
df1 = df1.sort_values('num_locations', ascending=False).drop_duplicates(subset='society', keep='first')
df1.drop(columns='num_locations', inplace=True)


In [302]:
df1.shape

(400, 11)