Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [2]:
city = pd.read_excel('Data/City.xlsx')

In [3]:
city.head()

Unnamed: 0,City ID,City,Country,Climate
0,C-1,Tokyo,Japan,Humid subtropical
1,C-2,Sydney,Australia,Temperate
2,C-3,Buenos Aires,Argentina,Humid subtropical
3,C-4,Cairo,Egypt,Desert
4,C-5,Berlin,Germany,Oceanic


In [4]:
city.shape

(200, 4)

In [5]:
user = pd.read_excel('Data/User.xlsx')

In [6]:
user.head()

Unnamed: 0,UserID,Nationality
0,U-1,Brazil
1,U-2,Japan
2,U-3,Canada
3,U-4,Brazil
4,U-5,Italy


In [7]:
user.shape

(200, 2)

In [8]:
ratings = pd.read_excel('Data/Rating.xlsx')

In [9]:
ratings.head()

Unnamed: 0,User ID,City ID,City Name,Travel Rating
0,U-1,C-2,Sydney,5
1,U-2,C-17,Hyderabad,4
2,U-3,C-1,Tokyo,5
3,U-4,C-8,New York City,5
4,U-5,C-17,Hyderabad,2


In [10]:
ratings.shape

(200, 4)

In [11]:
#Count how many total rating each user gave

ratings['User ID'].value_counts()

User ID
U-49     4
U-2      3
U-74     3
U-63     3
U-55     3
        ..
U-38     1
U-47     1
U-45     1
U-48     1
U-103    1
Name: count, Length: 104, dtype: int64

In [12]:
#Taking users who have gave a total no.of rating > 1
#The value of 1 can be changed below

x = ratings['User ID'].value_counts() > 1

In [13]:
x[x].shape

(79,)

In [14]:
#Knowing the user ID's

y = x[x].index

In [15]:
y

Index(['U-49', 'U-2', 'U-74', 'U-63', 'U-55', 'U-21', 'U-64', 'U-18', 'U-71',
       'U-68', 'U-26', 'U-46', 'U-44', 'U-39', 'U-81', 'U-40', 'U-86', 'U-51',
       'U-59', 'U-53', 'U-52', 'U-54', 'U-50', 'U-57', 'U-56', 'U-1', 'U-65',
       'U-58', 'U-60', 'U-83', 'U-85', 'U-77', 'U-78', 'U-79', 'U-80', 'U-76',
       'U-75', 'U-73', 'U-70', 'U-72', 'U-69', 'U-66', 'U-67', 'U-89', 'U-88',
       'U-41', 'U-43', 'U-97', 'U-3', 'U-4', 'U-5', 'U-6', 'U-101', 'U-102',
       'U-100', 'U-99', 'U-13', 'U-14', 'U-15', 'U-16', 'U-17', 'U-20', 'U-98',
       'U-23', 'U-93', 'U-42', 'U-90', 'U-37', 'U-36', 'U-35', 'U-91', 'U-92',
       'U-94', 'U-95', 'U-96', 'U-32', 'U-25', 'U-24', 'U-84'],
      dtype='object', name='User ID')

In [16]:
#Removing users who have rated less then 2 times (total)

ratings = ratings[ratings['User ID'].isin(y)]

In [17]:
ratings.head()

Unnamed: 0,User ID,City ID,City Name,Travel Rating
0,U-1,C-2,Sydney,5
1,U-2,C-17,Hyderabad,4
2,U-3,C-1,Tokyo,5
3,U-4,C-8,New York City,5
4,U-5,C-17,Hyderabad,2


In [18]:
ratings.shape

(175, 4)

In [19]:
#Merging with City dataset

ratings_with_place = ratings.merge(city, on = "City ID")

In [20]:
ratings_with_place.head()

Unnamed: 0,User ID,City ID,City Name,Travel Rating,City,Country,Climate
0,U-1,C-2,Sydney,5,Sydney,Australia,Temperate
1,U-2,C-17,Hyderabad,4,Hyderabad,India,\nPleasant
2,U-3,C-1,Tokyo,5,Tokyo,Japan,Humid subtropical
3,U-4,C-8,New York City,5,New York City,USA,Humid continental
4,U-5,C-17,Hyderabad,2,Hyderabad,India,\nPleasant


In [21]:
ratings_with_place.shape

(171, 7)

In [22]:
#Finding out how many times each city is rated

num_rating = ratings_with_place.groupby('City')['Travel Rating'].count().reset_index()

In [23]:
num_rating.rename(columns = {'Travel Rating' : 'Total no of Ratings'}, inplace = True)

In [24]:
num_rating.head()

Unnamed: 0,City,Total no of Ratings
0,Abu Dhabi,2
1,Agra,3
2,Aspen,3
3,Athens,3
4,Auli,3


In [25]:
num_rating.shape

(66, 2)

In [26]:
#Merging with ratings_with_place dataset

final_rating = ratings_with_place.merge(num_rating, on = "City")

In [27]:
final_rating.head()

Unnamed: 0,User ID,City ID,City Name,Travel Rating,City,Country,Climate,Total no of Ratings
0,U-1,C-2,Sydney,5,Sydney,Australia,Temperate,3
1,U-2,C-17,Hyderabad,4,Hyderabad,India,\nPleasant,4
2,U-3,C-1,Tokyo,5,Tokyo,Japan,Humid subtropical,2
3,U-4,C-8,New York City,5,New York City,USA,Humid continental,1
4,U-5,C-17,Hyderabad,2,Hyderabad,India,\nPleasant,4


In [28]:
final_rating.shape

(171, 8)

In [29]:
#Removing cities which have been not rated more then 2 times
#The value of 2 can be changed below

final_rating = final_rating[final_rating['Total no of Ratings'] > 2]

In [30]:
final_rating.head()

Unnamed: 0,User ID,City ID,City Name,Travel Rating,City,Country,Climate,Total no of Ratings
0,U-1,C-2,Sydney,5,Sydney,Australia,Temperate,3
1,U-2,C-17,Hyderabad,4,Hyderabad,India,\nPleasant,4
4,U-5,C-17,Hyderabad,2,Hyderabad,India,\nPleasant,4
5,U-6,C-11,Paris,5,Paris,France,Oceanic,3
6,U-2,C-54,Bengaluru,3,Bengaluru,India,\nModerate,3


In [31]:
final_rating.shape

(121, 8)

In [32]:
city_pivot = final_rating.pivot_table(columns = 'User ID', index = 'City', values = 'Travel Rating')

In [33]:
city_pivot

User ID,U-1,U-100,U-101,U-102,U-13,U-14,U-15,U-16,U-17,U-18,...,U-90,U-91,U-92,U-93,U-94,U-95,U-96,U-97,U-98,U-99
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agra,,,,,,,,,,,...,3.0,,,5.0,,,3.0,,,
Aspen,,,,,,,,,,,...,,,,,,,,,,
Athens,,5.0,,,,,,,,,...,5.0,,,,,,,,,5.0
Auli,,,,,,,,,,,...,,,,,,,,,,
Babylon,,,,4.0,,,,,,,...,,,,,4.0,5.0,,,,
Banff,,,,,,,,,,,...,,,,,,,,,,
Bengaluru,,,,,2.0,,,3.0,,,...,,,,,,,,,,
Boracay,,,,,,,,,,,...,,,,,,,,,,
Cairo,,4.0,,,,,,,,,...,,4.0,,,,,,,5.0,
Cape Town,,,,,,,,,,,...,,,,,,,,,,


In [34]:
city_pivot.shape

(37, 74)

In [35]:
city_pivot.fillna(0, inplace = True)

In [36]:
city_pivot

User ID,U-1,U-100,U-101,U-102,U-13,U-14,U-15,U-16,U-17,U-18,...,U-90,U-91,U-92,U-93,U-94,U-95,U-96,U-97,U-98,U-99
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,5.0,0.0,0.0,3.0,0.0,0.0,0.0
Aspen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Athens,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
Auli,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Babylon,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.0
Banff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bengaluru,0.0,0.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Boracay,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cairo,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
Cape Town,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
from scipy.sparse import csr_matrix

In [38]:
city_sparse = csr_matrix(city_pivot)

In [39]:
city_sparse

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 121 stored elements and shape (37, 74)>

In [40]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm = 'brute')

In [41]:
model.fit(city_sparse)

In [42]:
#n_neighbors value can be changed
#n_neighbors gives how many values(Cities) you want in recommendation

distance, suggestion = model.kneighbors(city_pivot.iloc[9,:].values.reshape(1,-1), n_neighbors = 4)

In [43]:
distance

array([[0.        , 6.70820393, 7.61577311, 8.36660027]])

In [44]:
suggestion

array([[ 9, 36, 26,  7]], dtype=int64)

In [45]:
for i in range(len(suggestion)):
    print(city_pivot.index[suggestion[i]])

Index(['Cape Town', 'Visakhapatnam', 'Miami Beach', 'Boracay'], dtype='object', name='City')


In [46]:
city_pivot.index[5]

'Banff'

In [47]:
city_name = city_pivot.index

In [48]:
#Testing

def recommend_city(city_name):
    city_id = np.where(city_pivot.index == city_name)[0][0]
    distance, suggestion = model.kneighbors(city_pivot.iloc[city_id,:].values.reshape(1,-1), n_neighbors = 4)

    for i in range(len(suggestion)):
        cities = city_pivot.index[suggestion[i]]
        for j in cities:
            print(j)

In [49]:
city_name = 'Hyderabad'
recommend_city(city_name)

Hyderabad
Bengaluru
Boracay
Visakhapatnam


In [50]:
# Measure computation time for the recommendation
def measure_recommendation_time(city_name):
    start_time = time.time()  # Start time
    recommendations = recommend_city(city_name)  # Call the recommendation function
    end_time = time.time()  # End time
    elapsed_time = end_time - start_time  # Calculate elapsed time
    print(f"Query processing time: {elapsed_time:.4f} seconds")

In [51]:
# Example usage
measure_recommendation_time('Hyderabad')

Hyderabad
Bengaluru
Boracay
Visakhapatnam
Query processing time: 0.0079 seconds


In [52]:
import pickle
pickle.dump(model, open('artifacts/model.pkl', 'wb'))
pickle.dump(city_name, open('artifacts/city_name.pkl', 'wb'))
pickle.dump(final_rating, open('artifacts/final_rating.pkl', 'wb'))
pickle.dump(city_pivot, open('artifacts/city_pivot.pkl', 'wb'))

In [56]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

# Create a mask for the ratings
train_data, test_data = train_test_split(final_rating, test_size=0.2, random_state=42)

# Create a pivot table for the training data
train_pivot = train_data.pivot_table(columns='User ID', index='City', values='Travel Rating')
train_pivot.fillna(0, inplace=True)

# Create a sparse matrix for the training data
train_sparse = csr_matrix(train_pivot)

# Train the model on the training data
model.fit(train_sparse)

In [57]:
# Function to evaluate the model
def evaluate_model(test_data):
    test_pivot = test_data.pivot_table(columns='User ID', index='City', values='Travel Rating')
    test_pivot.fillna(0, inplace=True)
    
    # Calculate the mean absolute error
    total_error = 0
    count = 0
    
    for city in test_pivot.index:
        if city in train_pivot.index:
            city_index = train_pivot.index.get_loc(city)
            distances, indices = model.kneighbors(train_sparse[city_index], n_neighbors=1)  # Get the nearest neighbor
            
            # Get the predicted rating
            predicted_rating = train_pivot.values[indices.flatten()[0]].mean()  # Mean rating of the nearest neighbor
            
            # Get the actual rating from the test set
            actual_rating = test_pivot.loc[city].mean()  # Mean rating for the city in the test set
            
            # Calculate the error
            total_error += abs(predicted_rating - actual_rating)
            count += 1
            
    mae = total_error / count if count > 0 else 0
    return mae

In [58]:
# Evaluate the model
mae = evaluate_model(test_data)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.08734587929915641
