In [66]:
from google.colab import drive
drive.mount('/content/drive')

import csv
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.metrics import pairwise_distances

from gensim.models import KeyedVectors
import gensim.downloader as api
w2v = api.load("glove-twitter-25")
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('wordnet')

import itertools
import warnings

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## User input

In [67]:
user_input = ['Pune','Bangalore','New York','Delhi']

# user_input = ['Kota','Varanasi','Chennai','Kathmandu']

# user_input = ['Paris',	'Sicilì'	,'Amsterdam'	,'London']

# user_input = ['Prague', 'Paris', 'Rome']

# user_input = ['Miami', 'Hawaii', 'Brighton']

## Item-based collaborative filtering






In [68]:
df = pd.read_csv('/content/drive/MyDrive/RS_Endterm_Dataset/user_data.csv')

n_users = len(df)
cities = pd.unique(pd.Series(df.values.ravel()).dropna())

user_df = pd.DataFrame(0, index = np.arange(n_users), columns = cities)

for i in range(n_users):
    user_df.loc[i, pd.unique(df.iloc[i].dropna())] = 1

def item_similarity(rating_df):
    sparse_df = sparse.csr_matrix(rating_df)
    simi_matrix = cosine_similarity(sparse_df.transpose())
    simi_df = pd.DataFrame(data=simi_matrix, index=rating_df.columns, columns=rating_df.columns)
    return simi_df


simi_df = item_similarity(user_df)
simi_df.reset_index(inplace=True)

cities_score = {}

for city in user_input:
  top = simi_df[city].nlargest(5)
  for idx in top.index:
    if cities[idx] not in user_input:
      if cities[idx] in cities_score:
        cities_score[cities[idx]] += top[idx]
      else:
        cities_score[cities[idx]] = top[idx]

## Item-based collaborative filtering output

In [69]:
sorted_cities_score = dict(sorted(cities_score.items(), key=lambda x: x[1], reverse = True))
sorted_cities_score

{'Kochi': 0.5477225575051663,
 'Chiapas': 0.5,
 'Mumbai': 0.44453426372657867,
 'Bareilly': 0.4392723441568558,
 'Hengelo': 0.31622776601683794,
 'McLeod Ganj': 0.31622776601683794,
 'Plano': 0.31622776601683794,
 'Chennai': 0.21128856368212917,
 'Varanasi': 0.20333142711013955,
 'Hoi An': 0.07715167498104596,
 'Nice': 0.07142857142857142}

## User-based collaborative filtering

In [70]:
matrix = user_df.to_numpy()
data_items = pd.DataFrame(matrix)

def jaccard_binary(x,y):
    """A function for finding the similarity between two binary vectors"""
    intersection = np.logical_and(x, y)
    union = np.logical_or(x, y)
    similarity = intersection.sum() / float(union.sum())
    return similarity

onehot_user = pd.DataFrame(0, index = np.arange(1), columns = cities)
onehot_user.loc[0, user_input] = 1

sim = [0 for i in range(matrix.shape[0])]
for user in range(matrix.shape[0]):
  sim[user] = jaccard_binary(matrix[user],onehot_user.to_numpy())


cities_score = {}

for i in range(matrix.shape[1]):
    for j in range(matrix.shape[0]):
         if cities[i] not in user_input:
            if(matrix[j][i]==1):
                if cities[i] in cities_score:
                    cities_score[cities[i]] += sim[j]
                else:
                    cities_score[cities[i]] = sim[j]

sorted_cities_score = dict(sorted(cities_score.items(), key=lambda x: x[1], reverse = True))

## User-based collaborative filtering output

In [71]:
top_cities = dict(itertools.islice(sorted_cities_score.items(), 10))
top_cities

{'Mumbai': 2.437359862359862,
 'Bangkok': 1.8366994116994118,
 'Goa': 1.4122766122766124,
 'Tokyo': 1.3987234987234987,
 'Berlin': 1.386025086025086,
 'Kochi': 1.3456099456099455,
 'London': 1.1746503496503498,
 'San Francisco': 1.1440226440226442,
 'Chiang Mai': 1.1079115329115328,
 'Paris': 0.966122766122766}

## Content-based filtering (using word2vec embeddings)

In [72]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

df = pd.read_csv('/content/drive/MyDrive/RS_Endterm_Dataset/city_data.csv', encoding="latin-1", header=None)
temp_df = df[1].str.split(',', expand=True)
temp_df = temp_df.iloc[:, 0:3]
df = pd.concat([df.drop(columns=1), temp_df], axis=1)
df.columns = ['city', 'f1', 'f2', 'f3']

df['f1_vec'] = df.loc[:, 'f1'].apply(lambda x: w2v[str(x).lower().strip()] if str(x).lower().strip() in w2v.key_to_index else np.zeros(25))
df['f2_vec'] = df.loc[:, 'f2'].apply(lambda x: w2v[str(x).lower().strip()] if str(x).lower().strip() in w2v.key_to_index else np.zeros(25))
df['f3_vec'] = df.loc[:, 'f3'].apply(lambda x: w2v[str(x).lower().strip()] if str(x).lower().strip() in w2v.key_to_index else np.zeros(25))

df

Unnamed: 0,city,f1,f2,f3,f1_vec,f2_vec,f3_vec
0,Paris,Culture,History,Fashion,"[0.073361, -0.45021, -1.4036, -0.17344, 0.8417...","[1.2712, 0.24266, 0.22238, -0.661, 0.44286, -0...","[-1.0618, -0.74601, -0.28375, -1.2646, 1.2012,..."
1,Sicilì,Beaches,Architecture,Cuisine,"[-1.487, -0.85212, 0.28377, -0.77153, -1.1792,...","[-0.89621, -1.0202, -0.16677, -0.97225, 1.0081...","[-1.0173, -0.92534, -1.1711, 0.27051, 0.20621,..."
2,Amsterdam,Nightlife,Museums,Canals,"[-0.6397, 0.77052, -0.40369, -1.4419, 0.19529,...","[-0.39368, -0.19749, 0.13158, -0.93891, -0.301...","[0.45164, -0.90879, 0.49922, 1.043, -1.3982, -..."
3,London,History,Culture,Museums,"[1.2712, 0.24266, 0.22238, -0.661, 0.44286, -0...","[0.073361, -0.45021, -1.4036, -0.17344, 0.8417...","[-0.39368, -0.19749, 0.13158, -0.93891, -0.301..."
4,Istanbul,History,Culture,Architecture,"[1.2712, 0.24266, 0.22238, -0.661, 0.44286, -0...","[0.073361, -0.45021, -1.4036, -0.17344, 0.8417...","[-0.89621, -1.0202, -0.16677, -0.97225, 1.0081..."
...,...,...,...,...,...,...,...
2972,Cogolin,Beaches,Marinas,Culture,"[-1.487, -0.85212, 0.28377, -0.77153, -1.1792,...","[0.47938, -0.38707, 0.82684, 1.0394, -1.29, 0....","[0.073361, -0.45021, -1.4036, -0.17344, 0.8417..."
2973,Monòver,Wine,Caves,Architecture,"[-1.5527, 0.26848, -0.2768, 0.37951, 0.33574, ...","[-0.70318, -1.1499, 0.18135, -0.07528, -1.2421...","[-0.89621, -1.0202, -0.16677, -0.97225, 1.0081..."
2974,Kampot Province,Pepper,Caves,Nature,"[-0.73487, -0.19175, 0.38061, 0.92036, -0.5294...","[-0.70318, -1.1499, 0.18135, -0.07528, -1.2421...","[-1.2603, -1.2661, -1.0679, 0.42772, 0.27031, ..."
2975,Kaaawa,Beaches,Hiking,Scenery,"[-1.487, -0.85212, 0.28377, -0.77153, -1.1792,...","[-1.8731, 0.45734, 0.11424, -0.81883, -0.5047,...","[-1.3802, -1.6205, 0.1559, -0.026147, 0.5655, ..."


In [73]:
# find similar cities for given city
def similar(city):

    with warnings.catch_warnings():

        warnings.simplefilter('ignore')

        i = df.loc[df['city'] == city].index[0] # find index of city

        fs = df.loc[i, ['f1', 'f2', 'f3']] # find features of city

        f_vecs = df.loc[i, ['f1_vec', 'f2_vec', 'f3_vec']] # find feature vectors of city

        temp_df = df.loc[df['f1'].isin(fs) | df['f2'].isin(fs) | df['f3'].isin(fs)] # filter cities with common features
        # temp_df = df

        temp_df['score'] = 0 # assign score of zero to each candidate city

        for index, row in temp_df.iterrows():

            score = 0
            for v1 in f_vecs:
                for v2 in row[['f1_vec', 'f2_vec', 'f3_vec']]:
                    score += cosine_similarity([v1], [v2])

            temp_df.loc[index, 'score'] = score[0, 0]

        return temp_df.sort_values(by=['score'], ascending=False).head(5)

In [74]:
city = 'Miami'
pd.concat([df.loc[df['city'] == city], similar(city)])

Unnamed: 0,city,f1,f2,f3,f1_vec,f2_vec,f3_vec,score
47,Miami,Beaches,Nightlife,Shopping,"[-1.487, -0.85212, 0.28377, -0.77153, -1.1792,...","[-0.6397, 0.77052, -0.40369, -1.4419, 0.19529,...","[-0.87153, 0.0038353, 0.62383, -0.94453, 0.257...",
2715,Kemer,Beaches,Nightlife,Resorts,"[-1.487, -0.85212, 0.28377, -0.77153, -1.1792,...","[-0.6397, 0.77052, -0.40369, -1.4419, 0.19529,...","[-1.4483, 0.079, 0.16322, -1.2798, -1.1629, 0....",6.374631
2871,Vama,Beaches,Resorts,Nightlife,"[-1.487, -0.85212, 0.28377, -0.77153, -1.1792,...","[-1.4483, 0.079, 0.16322, -1.2798, -1.1629, 0....","[-0.6397, 0.77052, -0.40369, -1.4419, 0.19529,...",6.374631
1182,Copacabana,Beaches,Nightlife,Carnival,"[-1.487, -0.85212, 0.28377, -0.77153, -1.1792,...","[-0.6397, 0.77052, -0.40369, -1.4419, 0.19529,...","[-1.1886, 0.27673, 0.57007, -0.98416, -0.53367...",6.363844
47,Miami,Beaches,Nightlife,Shopping,"[-1.487, -0.85212, 0.28377, -0.77153, -1.1792,...","[-0.6397, 0.77052, -0.40369, -1.4419, 0.19529,...","[-0.87153, 0.0038353, 0.62383, -0.94453, 0.257...",6.237411
360,Seminyak,Beaches,shopping,nightlife,"[-1.487, -0.85212, 0.28377, -0.77153, -1.1792,...","[-0.87153, 0.0038353, 0.62383, -0.94453, 0.257...","[-0.6397, 0.77052, -0.40369, -1.4419, 0.19529,...",6.237411


In [75]:
# find similar cities for given cities
def similar(cities):

    with warnings.catch_warnings():
        
        warnings.simplefilter('ignore')

        temp_df = df.loc[df['city'].isin(cities)] # dataframe with input cities

        fs = temp_df.loc[:, ['f1', 'f2', 'f3']] # find features of input cities

        f_vecs = temp_df.loc[:, ['f1_vec', 'f2_vec', 'f3_vec']] # find feature vectors of input cities

        fs = pd.unique(fs.values.ravel('K')) # find unique features pertaining to these cities

        f_vecs = f_vecs.values.ravel('K') # get the feature vectors of these cities

        temp_df = df.loc[df['f1'].isin(fs) | df['f2'].isin(fs) | df['f3'].isin(fs)] # get new cities having these features
        # temp_df = df

        temp_df['score'] = 0 # assign score of zero to each new city

        for index, row in temp_df.iterrows():

            score = 0
            for v1 in f_vecs:
                for v2 in row[['f1_vec', 'f2_vec', 'f3_vec']]:
                    score += cosine_similarity([v1], [v2])

            temp_df.loc[index, 'score'] = score[0, 0]

        return temp_df.sort_values(by=['score'], ascending=False).head(5)

## Content-based filtering output

In [76]:
pd.concat([df.loc[df['city'].isin(user_input)], similar(user_input)])

Unnamed: 0,city,f1,f2,f3,f1_vec,f2_vec,f3_vec,score
70,Bangalore,Technology,Startups,Nightlife,"[0.4979, 0.071279, -0.91832, -0.9668, 0.92633,...","[0.37877, 1.0845, -1.2008, -0.72173, 0.79357, ...","[-0.6397, 0.77052, -0.40369, -1.4419, 0.19529,...",
71,Delhi,History,Culture,Food,"[1.2712, 0.24266, 0.22238, -0.661, 0.44286, -0...","[0.073361, -0.45021, -1.4036, -0.17344, 0.8417...","[-0.57889, 0.52137, -0.17892, 0.2232, 0.61681,...",
741,Pune,Education,IT,History,"[0.39388, 0.78487, -1.3961, -0.60382, 1.3577, ...","[0.16758, 0.21434, -0.093086, 0.16379, -0.6000...","[1.2712, 0.24266, 0.22238, -0.661, 0.44286, -0...",
1901,New York,Landmarks,museums,shopping,"[-0.21958, -0.5955, 0.39141, -0.58904, -0.1537...","[-0.39368, -0.19749, 0.13158, -0.93891, -0.301...","[-0.87153, 0.0038353, 0.62383, -0.94453, 0.257...",
1839,Darmstadt,Science,Arts,Culture,"[1.0617, -0.05314, -0.31337, -0.36208, 0.72722...","[-0.2521, -0.14943, -0.28174, -0.96072, 0.8877...","[0.073361, -0.45021, -1.4036, -0.17344, 0.8417...",25.173904
2709,Walldorf,Business,Culture,Parks,"[0.077507, 0.80513, -1.1954, -0.78154, 1.3877,...","[0.073361, -0.45021, -1.4036, -0.17344, 0.8417...","[-1.0595, 0.073, 0.63337, -0.39179, -0.40072, ...",24.869329
473,Fremont,Technology,education,history,"[0.4979, 0.071279, -0.91832, -0.9668, 0.92633,...","[0.39388, 0.78487, -1.3961, -0.60382, 1.3577, ...","[1.2712, 0.24266, 0.22238, -0.661, 0.44286, -0...",24.854847
1853,Greensboro,Arts,Culture,History,"[-0.2521, -0.14943, -0.28174, -0.96072, 0.8877...","[0.073361, -0.45021, -1.4036, -0.17344, 0.8417...","[1.2712, 0.24266, 0.22238, -0.661, 0.44286, -0...",24.82312
77,Boston,History,Culture,Education,"[1.2712, 0.24266, 0.22238, -0.661, 0.44286, -0...","[0.073361, -0.45021, -1.4036, -0.17344, 0.8417...","[0.39388, 0.78487, -1.3961, -0.60382, 1.3577, ...",24.804701
