In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/772.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m409.6/772.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m768.0/772.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset,Reader
from surprise import SVD
from surprise.model_selection import train_test_split,cross_validate
from surprise import accuracy


In [4]:
csv_file_path_1 = '/content/drive/MyDrive/Major_Project/City.csv'
csv_file_path_2 = '/content/drive/MyDrive/Major_Project/Places.csv'


df_city= pd.read_csv(csv_file_path_1)
df_places= pd.read_csv(csv_file_path_2)

# Display the first few rows of the DataFrame
print(df_city.head())

          City  Ratings Ideal_duration  Best_time_to_visit  \
0       Manali      4.5         02-Apr        October-June   
1   Leh Ladakh      4.6         05-Jul         JulyOctober   
2        Coorg      4.2         02-Mar      September-June   
3      Andaman      4.5         04-Jun       October-March   
4  Lakshadweep      4.0         04-Jun  September-February   

                                           City_desc  
0  [' One of the most popular hill stations in Hi...  
1  [" Ladakh is a union territory in the Kashmir ...  
2  [' Located amidst imposing mountains in Karnat...  
3  [' Replete with turquoise blue water beaches a...  
4  [" Formerly known as Laccadive Islands, Laksha...  


In [5]:
df_city.shape

(100, 5)

In [6]:
df_city.isnull().sum()

City                   0
Ratings                0
Ideal_duration         0
Best_time_to_visit    72
City_desc              0
dtype: int64

In [7]:
btv = df_city['Best_time_to_visit']
df_city_mod = df_city.drop(['Ideal_duration','Best_time_to_visit','Ratings'],axis=1)
df_city_mod

Unnamed: 0,City,City_desc
0,Manali,[' One of the most popular hill stations in Hi...
1,Leh Ladakh,"["" Ladakh is a union territory in the Kashmir ..."
2,Coorg,[' Located amidst imposing mountains in Karnat...
3,Andaman,[' Replete with turquoise blue water beaches a...
4,Lakshadweep,"["" Formerly known as Laccadive Islands, Laksha..."
...,...,...
95,Pushkar,[' Pushka is a small temple town located aroun...
96,Chittorgarh,"[' Located in South-Eastern Rajasthan, Chittor..."
97,Nahan,"[' Away from the buzzing crowd of the city, Na..."
98,Lavasa,"["" Known as India's newest hill station, the L..."


In [8]:
tfidf = TfidfVectorizer(stop_words="english")

tfidf_matrix = tfidf.fit_transform(df_city_mod['City_desc'])

print(tfidf_matrix.shape)

(100, 3721)


In [9]:
print(tfidf.get_feature_names_out())

['000' '050' '067' ... 'zones' 'zoo' 'zorbing']


In [10]:
custom_token_pattern = r'\b[a-zA-Z]+\b'
tfidf_vectorizer = TfidfVectorizer(token_pattern=custom_token_pattern,stop_words="english")

# Fit and transform the data
tfidf_matrix = tfidf_vectorizer.fit_transform(df_city_mod['City_desc'])
print(tfidf_matrix.shape)

(100, 3612)


In [11]:
tfidf_vectorizer.get_feature_names_out()

array(['aap', 'aarti', 'aati', ..., 'zones', 'zoo', 'zorbing'],
      dtype=object)

In [12]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
def recommend_destination(inp_dest, cosine_sim=cosine_sim):

    idx = df_city.loc[df_city_mod['City'].str.lower() == inp_dest.lower()].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[0:20]
    dest_indices = [i[0] for i in sim_scores]
    rec = list(df_city_mod['City'][i] for i in dest_indices)
    return rec

In [58]:
already_visited = 'Tirupati'
content_based_recommendations = recommend_destination(already_visited)
print(content_based_recommendations)

['Tirupati', 'Bhubaneswar', 'Rameshwaram', 'Ranthambore', 'Vrindavan', 'Jammu', 'Amritsar', 'Delhi', 'Varanasi', 'Haridwar', 'Mathura', 'Vaishno Devi', 'Udaipur', 'Puri', 'Madurai', 'Kasauli', 'Bodh Gaya', 'Visakhapatnam', 'Pushkar', 'Shirdi']


In [59]:
csv_file_path_3 = '/content/drive/MyDrive/Major_Project/Utility_Matrices/Utility_Matrix_5000_0.8.csv'
df_user = pd.read_csv(csv_file_path_3,index_col=0)
df_user.head()

Unnamed: 0_level_0,Manali,Leh Ladakh,Coorg,Andaman,Lakshadweep,Goa,Udaipur,Srinagar,Gangtok,Munnar,...,Bikaner,Shimoga (Shivamogga),Hogenakkal,Gir National Park,Kasauli,Pushkar,Chittorgarh,Nahan,Lavasa,Poovar
Users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_0,0.0,1.0,4.0,3.0,5.0,1.0,3.0,3.0,0.0,0.0,...,0.0,2.0,3.0,1.0,2.0,2.0,5.0,4.0,4.0,4.0
user_1,2.0,2.0,3.0,5.0,1.0,4.0,5.0,5.0,4.0,2.0,...,0.0,1.0,4.0,1.0,2.0,3.0,2.0,3.0,3.0,0.0
user_2,4.0,4.0,3.0,5.0,2.0,4.0,4.0,3.0,5.0,3.0,...,1.0,4.0,0.0,1.0,1.0,3.0,1.0,5.0,4.0,0.0
user_3,4.0,2.0,1.0,2.0,3.0,1.0,2.0,1.0,5.0,5.0,...,1.0,3.0,5.0,1.0,1.0,3.0,0.0,1.0,4.0,0.0
user_4,1.0,0.0,1.0,1.0,5.0,5.0,1.0,3.0,4.0,1.0,...,3.0,5.0,3.0,2.0,4.0,0.0,5.0,2.0,3.0,2.0


In [60]:
df_cntnt_rec = df_user.loc[:, [rec for rec in content_based_recommendations]]
df_cntnt_rec

Unnamed: 0_level_0,Tirupati,Bhubaneswar,Rameshwaram,Ranthambore,Vrindavan,Jammu,Amritsar,Delhi,Varanasi,Haridwar,Mathura,Vaishno Devi,Udaipur,Puri,Madurai,Kasauli,Bodh Gaya,Visakhapatnam,Pushkar,Shirdi
Users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
user_0,2.0,2.0,5.0,3.0,3.0,2.0,3.0,4.0,1.0,3.0,1.0,0.0,3.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0
user_1,0.0,2.0,0.0,5.0,5.0,3.0,5.0,4.0,1.0,1.0,2.0,5.0,5.0,5.0,3.0,2.0,2.0,3.0,3.0,2.0
user_2,2.0,0.0,0.0,3.0,0.0,3.0,4.0,2.0,4.0,5.0,4.0,4.0,4.0,0.0,0.0,1.0,2.0,5.0,3.0,5.0
user_3,5.0,4.0,3.0,5.0,2.0,0.0,4.0,5.0,5.0,4.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,4.0,3.0,4.0
user_4,2.0,5.0,1.0,1.0,2.0,2.0,1.0,4.0,1.0,4.0,1.0,5.0,1.0,3.0,3.0,4.0,0.0,5.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user_4995,1.0,1.0,1.0,2.0,0.0,4.0,1.0,5.0,0.0,5.0,1.0,0.0,4.0,2.0,4.0,1.0,1.0,1.0,0.0,1.0
user_4996,3.0,2.0,1.0,2.0,1.0,1.0,5.0,4.0,2.0,1.0,0.0,0.0,0.0,4.0,1.0,4.0,4.0,0.0,3.0,0.0
user_4997,0.0,5.0,3.0,4.0,3.0,5.0,2.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0,4.0,0.0,3.0,5.0
user_4998,0.0,2.0,5.0,0.0,2.0,2.0,2.0,3.0,3.0,5.0,5.0,3.0,2.0,5.0,1.0,0.0,5.0,4.0,2.0,1.0




> Users who liked {already_visited}



In [61]:
df_cntnt_rec_liked = df_cntnt_rec.loc[df_cntnt_rec[already_visited] >= 3]
df_cntnt_rec_liked

Unnamed: 0_level_0,Tirupati,Bhubaneswar,Rameshwaram,Ranthambore,Vrindavan,Jammu,Amritsar,Delhi,Varanasi,Haridwar,Mathura,Vaishno Devi,Udaipur,Puri,Madurai,Kasauli,Bodh Gaya,Visakhapatnam,Pushkar,Shirdi
Users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
user_3,5.0,4.0,3.0,5.0,2.0,0.0,4.0,5.0,5.0,4.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,4.0,3.0,4.0
user_6,4.0,4.0,4.0,1.0,1.0,2.0,0.0,2.0,5.0,0.0,1.0,2.0,4.0,5.0,0.0,0.0,1.0,5.0,4.0,1.0
user_10,5.0,4.0,4.0,3.0,3.0,4.0,0.0,0.0,3.0,2.0,2.0,1.0,5.0,0.0,5.0,0.0,3.0,2.0,0.0,4.0
user_12,5.0,5.0,5.0,5.0,0.0,0.0,3.0,0.0,5.0,2.0,5.0,3.0,5.0,4.0,0.0,1.0,2.0,4.0,3.0,3.0
user_13,4.0,5.0,4.0,5.0,3.0,0.0,0.0,5.0,4.0,2.0,1.0,1.0,5.0,2.0,2.0,0.0,0.0,1.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user_4983,5.0,3.0,3.0,5.0,1.0,0.0,1.0,5.0,5.0,4.0,4.0,4.0,4.0,1.0,0.0,4.0,3.0,3.0,0.0,1.0
user_4987,5.0,0.0,5.0,2.0,2.0,1.0,0.0,1.0,0.0,2.0,2.0,1.0,4.0,2.0,1.0,3.0,5.0,3.0,0.0,3.0
user_4990,4.0,4.0,0.0,5.0,5.0,5.0,0.0,4.0,5.0,0.0,3.0,2.0,4.0,0.0,4.0,0.0,3.0,4.0,2.0,2.0
user_4991,5.0,2.0,3.0,3.0,3.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,5.0,0.0


In [62]:
df_cntnt_rec_mod = df_cntnt_rec_liked.reset_index().melt(id_vars='Users', var_name='location', value_name='raw_rating')
df_cntnt_rec_mod.columns = ['user', 'location', 'ratings']

df_cntnt_rec_mod.head()

Unnamed: 0,user,location,ratings
0,user_3,Tirupati,5.0
1,user_6,Tirupati,4.0
2,user_10,Tirupati,5.0
3,user_12,Tirupati,5.0
4,user_13,Tirupati,4.0


In [63]:
reader = Reader(rating_scale=(1, 5))

surprise_data = Dataset.load_from_df(df_cntnt_rec_mod[['user', 'location', 'ratings']], reader)
surprise_data

<surprise.dataset.DatasetAutoFolds at 0x788a12bfa440>

In [64]:
trainset, testset = train_test_split(surprise_data, test_size=.2)
trainset

<surprise.trainset.Trainset at 0x788a1281ec20>

In [65]:
svd = SVD(n_factors=200,n_epochs=50,lr_all = 0.0001)

# Train the algorithm on the training set
svd.fit(trainset)

# Make predictions on the test set
predictions = svd.test(testset)

# Compute and print RMSE (Root Mean Squared Error)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 1.7176
MAE:  1.4951


In [76]:
user_id = 'user_32'
not_visited_by_user = df_cntnt_rec_liked.loc[user_id][df_cntnt_rec_liked.loc[user_id]==0]
not_visited_by_user

Vrindavan    0.0
Haridwar     0.0
Udaipur      0.0
Kasauli      0.0
Name: user_32, dtype: float64

In [77]:
not_visited_by_user.index

Index(['Vrindavan', 'Haridwar', 'Udaipur', 'Kasauli'], dtype='object')

In [78]:
preds=[]
for nv in not_visited_by_user.index:
  pred = svd.predict(user_id,nv)
  preds.append(pred)
preds


[Prediction(uid='user_32', iid='Vrindavan', r_ui=None, est=2.4452420563581083, details={'was_impossible': False}),
 Prediction(uid='user_32', iid='Haridwar', r_ui=None, est=2.154667319040978, details={'was_impossible': False}),
 Prediction(uid='user_32', iid='Udaipur', r_ui=None, est=2.412778667944183, details={'was_impossible': False}),
 Prediction(uid='user_32', iid='Kasauli', r_ui=None, est=2.42717240582532, details={'was_impossible': False})]

In [79]:
sorted_predictions = sorted(preds, key=lambda x: x.est, reverse=True)
sorted_iids = [pred.iid for pred in sorted_predictions]
sorted_iids

['Vrindavan', 'Kasauli', 'Udaipur', 'Haridwar']

In [80]:
final_pd = pd.DataFrame({'Recommendations based on users who liked {}'.format(already_visited):sorted_iids})
final_pd

Unnamed: 0,Recommendations based on users who liked Tirupati
0,Vrindavan
1,Kasauli
2,Udaipur
3,Haridwar
