# Content Based Filtering

This notebook will mainly focus on the main algorithm of content based filtering for apartment recommendation given the preprocessing of data is completed.

In [33]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [34]:
#importing apartments dataframe
apartments_df = pd.read_csv("Datasets/Cleaned_House_Rent_Dataset.csv")
apartments_df.head()

Unnamed: 0,Apartment_id,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,1,18-05-2022,2,10000,1100.0,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2,13-05-2022,2,20000,800.0,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,3,16-05-2022,2,17000,1000.0,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,5,09-05-2022,2,7500,850.0,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner
4,6,29-04-2022,2,7000,600.0,Ground out of 1,Super Area,Thakurpukur,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner


In [35]:
# We will drop columns that will not be needed
apartments_df.drop(["Posted On","Floor", "Area Locality","Point of Contact", "Tenant Preferred"], axis=1, inplace=True)
apartments_df.head()

Unnamed: 0,Apartment_id,BHK,Rent,Size,Area Type,City,Furnishing Status,Bathroom
0,1,2,10000,1100.0,Super Area,Kolkata,Unfurnished,2
1,2,2,20000,800.0,Super Area,Kolkata,Semi-Furnished,1
2,3,2,17000,1000.0,Super Area,Kolkata,Semi-Furnished,1
3,5,2,7500,850.0,Carpet Area,Kolkata,Unfurnished,1
4,6,2,7000,600.0,Super Area,Kolkata,Unfurnished,2


In [36]:
#One-hot encoding the Size column into 11 new columns based on range of values.
for i in range(6):
    col_name = "size_"+str(i*500)+"_"+str((i+1)*500)
    apartments_df[col_name] = 0
    apartments_df.loc[( (apartments_df['Size'] >= (i*500)) & (apartments_df['Size'] < (i+1)*500) ), col_name] = 1

apartments_df['size_above_3000'] = 0
apartments_df.loc[(apartments_df['Size'] >= 3000) , "size_above_3000"] = 1

apartments_df.drop("Size",axis = 1,inplace = True)

In [37]:
#One-hot encoding the Rent column into 7 new columns based on range of values.
for i in range(10):
    col_name = "rent_"+str(i*5000)+"_"+str((i+1)*5000)
    apartments_df[col_name] = 0
    # apartments_df.loc[apartments_df['Size'] >= (i * 500) and apartments_df['Size'] < ((i+1) * 500),col_name] = 1
    apartments_df.loc[( (apartments_df['Rent'] >= (i*5000)) & (apartments_df['Rent'] < (i+1)*5000) ), col_name] = 1

apartments_df['rent_above_50000'] = 0
apartments_df.loc[(apartments_df['Rent'] >= 50000) , "rent_above_50000"] = 1

apartments_df.drop("Rent",axis = 1,inplace = True)

In [38]:
#One-hot encoding the Bathroom column
for x in apartments_df['Bathroom'].unique():
    apartments_df['Bathrooms_' + str(x)] = 0
    apartments_df.loc[apartments_df['Bathroom'] == x,'Bathrooms_' + str(x)] = 1

apartments_df.drop("Bathroom",axis = 1,inplace = True)

In [39]:
#One-hot encoding BHK column
for x in apartments_df['BHK'].unique():
    apartments_df['BHK_' + str(x)] = 0
    apartments_df.loc[apartments_df['BHK'] == x,'BHK_' + str(x)] = 1

apartments_df.drop("BHK",axis = 1,inplace = True)

In [40]:
#One-hot encoding the Area Type column
for x in apartments_df['Area Type'].unique():
    apartments_df[x] = 0
    apartments_df.loc[apartments_df['Area Type'] == x,x] = 1

apartments_df.drop("Area Type",axis = 1,inplace = True)

In [41]:
#One-hot encoding the Furnishing Status column
for x in apartments_df['Furnishing Status'].unique():
    apartments_df[x] = 0
    apartments_df.loc[apartments_df['Furnishing Status'] == x,x] = 1

apartments_df.drop("Furnishing Status",axis = 1,inplace = True)

In [42]:
apartments_df.head()

Unnamed: 0,Apartment_id,City,size_0_500,size_500_1000,size_1000_1500,size_1500_2000,size_2000_2500,size_2500_3000,size_above_3000,rent_0_5000,...,BHK_3,BHK_6,BHK_4,BHK_5,Super Area,Carpet Area,Built Area,Unfurnished,Semi-Furnished,Furnished
0,1,Kolkata,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,2,Kolkata,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,3,Kolkata,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,5,Kolkata,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
4,6,Kolkata,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


### Ratings Dataset

In [43]:
rating_df = pd.read_csv("Datasets/Cleaned_Tenants_Review_Dataset.csv")
rating_df.head()

Unnamed: 0,tenant_id,Apartment_id,rating
0,1,2731,10.0
1,1,2762,9.0
2,1,2479,6.0
3,1,43,10.0
4,2,2652,9.0


In [44]:
rating_df.shape

(1678, 3)

### Recommendation System

In [45]:
# Use the random library to generate a random user id
import random
# # Set random seed (for reproducibility)
random.seed(10)

# Pick a random id from the ratings dataset
user = random.randint(rating_df["tenant_id"].min(), rating_df["tenant_id"].max())
user

34

In [47]:
city_preferred = input("Enter a city: ")

Enter a city:  Hyderabad


In [48]:
city_preferred

'Hyderabad'

In [49]:
user_df = rating_df[rating_df["tenant_id"]==user]

# Reset the indexes
user_df.reset_index(drop=True, inplace=True)
# Drop the columns that are not needed
user_df = user_df.drop("tenant_id", axis=1)
user_df = user_df.sort_values("Apartment_id")
user_df

Unnamed: 0,Apartment_id,rating
2,67,7.0
0,247,10.0
1,766,10.0


In [50]:
#get the columns of the apartment the selected user has previously stayed from apartments dataset
user_apartments_df = apartments_df[apartments_df["Apartment_id"].isin(user_df["Apartment_id"])]
user_apartments_df

Unnamed: 0,Apartment_id,City,size_0_500,size_500_1000,size_1000_1500,size_1500_2000,size_2000_2500,size_2500_3000,size_above_3000,rent_0_5000,...,BHK_3,BHK_6,BHK_4,BHK_5,Super Area,Carpet Area,Built Area,Unfurnished,Semi-Furnished,Furnished
59,67,Kolkata,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
226,247,Kolkata,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
710,766,Mumbai,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [51]:
# Sort the user_apartments_df by the apartment_id's so that the rows correspond to the same apartment in the user's rated dataframe
user_apartments_df = user_apartments_df.sort_values("Apartment_id")
user_apartments_df.reset_index(drop=True, inplace=True)
user_apartments_df

Unnamed: 0,Apartment_id,City,size_0_500,size_500_1000,size_1000_1500,size_1500_2000,size_2000_2500,size_2500_3000,size_above_3000,rent_0_5000,...,BHK_3,BHK_6,BHK_4,BHK_5,Super Area,Carpet Area,Built Area,Unfurnished,Semi-Furnished,Furnished
0,67,Kolkata,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,247,Kolkata,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,766,Mumbai,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


Remark: Drop the apartments in the user's rated dataframe that are not present in usert_apartments_df if needed

In [52]:
user_apartments_matrix = user_apartments_df.drop(["Apartment_id", "City"], axis=1)
user_apartments_matrix.shape

(3, 38)

With content based filtering, we create a matrix of the attributes of apartments that the user stayed. We then create weights for each attribute where the higher the weight, the more likely the user enjoys that feature of apartment. To do this, we use the dot product between a matrix and a vector: the attribute matrix and the ratings of each of the apartment the user stayed (vector).

In [53]:
# Vector 
user_df["rating"]

2     7.0
0    10.0
1    10.0
Name: rating, dtype: float64

In [54]:
# Dot product
weights = user_apartments_matrix.transpose().dot(user_df["rating"])

weights

size_0_500          17.0
size_500_1000       10.0
size_1000_1500       0.0
size_1500_2000       0.0
size_2000_2500       0.0
size_2500_3000       0.0
size_above_3000      0.0
rent_0_5000          0.0
rent_5000_10000     27.0
rent_10000_15000     0.0
rent_15000_20000     0.0
rent_20000_25000     0.0
rent_25000_30000     0.0
rent_30000_35000     0.0
rent_35000_40000     0.0
rent_40000_45000     0.0
rent_45000_50000     0.0
rent_above_50000     0.0
Bathrooms_2         10.0
Bathrooms_1         17.0
Bathrooms_3          0.0
Bathrooms_5          0.0
Bathrooms_4          0.0
Bathrooms_6          0.0
Bathrooms_7          0.0
Bathrooms_10         0.0
BHK_2               10.0
BHK_1               17.0
BHK_3                0.0
BHK_6                0.0
BHK_4                0.0
BHK_5                0.0
Super Area          17.0
Carpet Area         10.0
Built Area           0.0
Unfurnished         17.0
Semi-Furnished      10.0
Furnished            0.0
dtype: float64

In [55]:
# Set the index of the dataframe to the apartments_df
recommendation_table = apartments_df[apartments_df['City'] == city_preferred].set_index("Apartment_id")
recommendation_table

Unnamed: 0_level_0,City,size_0_500,size_500_1000,size_1000_1500,size_1500_2000,size_2000_2500,size_2500_3000,size_above_3000,rent_0_5000,rent_5000_10000,...,BHK_3,BHK_6,BHK_4,BHK_5,Super Area,Carpet Area,Built Area,Unfurnished,Semi-Furnished,Furnished
Apartment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3879,Hyderabad,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
3880,Hyderabad,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
3881,Hyderabad,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3882,Hyderabad,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,1,0
3883,Hyderabad,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4742,Hyderabad,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4743,Hyderabad,0,0,0,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
4744,Hyderabad,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
4745,Hyderabad,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0


In [56]:
recommendation_series = (recommendation_table * weights).sum(axis=1) / weights.sum()
recommendation_series.head()

Apartment_id
3879    0.123457
3880    0.123457
3881    0.333333
3882    0.123457
3883    0.438272
dtype: object

In [57]:
# Sort in descending order
recommendations = recommendation_series.sort_values(ascending=False)
recommendations.head(10)

Apartment_id
3926    0.691358
4062    0.691358
4619    0.691358
3920    0.691358
4432    0.691358
4606    0.691358
4109    0.691358
4710    0.691358
4637    0.691358
4638    0.691358
dtype: object

In [61]:
# Find the top 10 Apartments in the recommendations in the apartments dataset and put it in a new dataframe
recommendations_df = apartments_df.loc[apartments_df["Apartment_id"].isin(recommendations.head(10).keys())]
# Set the index of the dataframe to the anime ids
recommendations_df.set_index("Apartment_id", inplace=True)
# Use loc and the Apartment_id ids of the top 10 apartment recommendations to preserve the order and output that to the user
recommendations_df

Unnamed: 0_level_0,City,size_0_500,size_500_1000,size_1000_1500,size_1500_2000,size_2000_2500,size_2500_3000,size_above_3000,rent_0_5000,rent_5000_10000,...,BHK_3,BHK_6,BHK_4,BHK_5,Super Area,Carpet Area,Built Area,Unfurnished,Semi-Furnished,Furnished
Apartment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3920,Hyderabad,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
3926,Hyderabad,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
4062,Hyderabad,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
4109,Hyderabad,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
4432,Hyderabad,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
4606,Hyderabad,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
4619,Hyderabad,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
4637,Hyderabad,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
4638,Hyderabad,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
4710,Hyderabad,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0


### Comparing Results with previous stays

In [64]:
# Previous apartments the user stayed in.
testing_df = pd.read_csv("Datasets/Cleaned_House_Rent_Dataset.csv")
testing_df[((testing_df['Apartment_id'] == 67) | (testing_df['Apartment_id'] == 247) | (testing_df['Apartment_id'] == 766) )]

Unnamed: 0,Apartment_id,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
59,67,02-07-2022,1,5000,300.0,Ground out of 2,Carpet Area,Kasba,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
226,247,06-05-2022,2,7000,600.0,2 out of 2,Super Area,"Park Circus, Ballygunge",Kolkata,Semi-Furnished,Bachelors/Family,2,Contact Owner
710,766,27-06-2022,1,7500,250.0,1 out of 7,Super Area,Chinmay Gaurang Infrastructure Old Panvel,Mumbai,Unfurnished,Bachelors/Family,1,Contact Owner


In [65]:
# top 3 recommended apartments
testing_df[((testing_df['Apartment_id'] == 117) | (testing_df['Apartment_id'] == 333) | (testing_df['Apartment_id'] == 399) )]

Unnamed: 0,Apartment_id,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
104,117,30-05-2022,1,5000,20.0,2 out of 3,Super Area,Dakshineswar,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
305,333,10-06-2022,1,6500,400.0,1 out of 1,Super Area,"Vivek Nagar, Garfa",Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
369,399,03-06-2022,1,6000,400.0,1 out of 2,Super Area,Thakurpukur,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
