In [None]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')

# Split the data into two separate columns
df[['place_name', 'city']] = df['place_name'].str.split(' - ', expand=True)
# Print the updated DataFrame
df.head(5)


# add the budget and keywords features to the dataset
df['budget'] = df['budget'].astype(float)
df['keywords'] = df['keywords'].apply(lambda x: ' '.join(x))
df['features'] = df['keywords'] + ' ' + df['budget'].astype(str)

# create a tf-idf matrix for the features column
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['features'])
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Construct a reverse map of indices and place names
indices = pd.Series(df.index, index=df['place_name']).drop_duplicates()
def get_similar_places(place_name, keywords=[], budget=np.inf, cosine_sim=cosine_sim, df=df):
    # Get the index of the place that matches the name
    idx = indices[place_name]

    # Get the pairwise similarity scores of all places with that place
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the places based on the average rating
    sim_scores = sorted(sim_scores, key=lambda x: df['average_rating'].iloc[x[0]], reverse=True)

    # Get the indices of the top 10 most similar places
    place_indices = [x[0] for x in sim_scores[1:11]]

    # Filter the places based on keywords and budget
    filtered_places = df.iloc[place_indices][df['budget'] <= budget]
    if keywords:
        filtered_places = filtered_places[filtered_places['keywords'].apply(lambda x: any(keyword in x for keyword in keywords))]

    # Select the relevant columns and return the filtered places sorted by average rating
    return filtered_places[['place_name', 'place_id', 'average_rating']].sort_values(by='average_rating', ascending=False)
get_similar_places('The Sunken City of Heracleion')

In [None]:
# define the recommendation function
def get_recommendations(place_name, city=None, budget=None, cosine_sim=cosine_sim):
    # filter the dataset based on the city and budget
    df_filtered = df.copy()

    if city:
        df_filtered = df_filtered[df_filtered['city'] == city]

    if budget:
        df_filtered = df_filtered[df_filtered['budget'] <= budget]

    # add the budget and keywords features to the filtered dataset
    df_filtered['budget'] = df_filtered['budget'].astype(float)
    df_filtered['keywords'] = df_filtered['keywords'].apply(lambda x: ' '.join(x))
    df_filtered['features'] = df_filtered['keywords'] + ' ' + df_filtered['budget'].astype(str)

    # create a tf-idf matrix for the features column in the filtered dataset
    tfidf_matrix_filtered = tfidf.transform(df_filtered['features'])

    # compute the pairwise similarity scores for the features
    sim_scores = list(enumerate(cosine_sim[indices[place_name]]))

    # sort the places based on the similarity scores and average rating
    sim_scores = sorted(sim_scores, key=lambda x: (x[1], df_filtered.loc[x[0], 'average_rating']), reverse=True)
    # get the indices of the top 10 most similar places
    sim_indices = [i for i, _ in sim_scores[1:11]]

    # return the names of the top 10 most similar places
    return df_filtered.loc[sim_indices, ['place_name', 'budget', 'average_rating', 'keywords']]

# example usage
get_recommendations('The Sunken City of Heracleion')

In [None]:
# define the recommendation function
def get_recommendations(place_name, city=None, budget=None, cosine_sim=cosine_sim):
    # filter the dataset based on the city and budget
    df_filtered = df.copy()

    if city:
        df_filtered = df_filtered[df_filtered['city'] == city]

    if budget:
        df_filtered = df_filtered[df_filtered['budget'] <= budget]

    # add the budget and keywords features to the filtered dataset
    df_filtered['budget'] = df_filtered['budget'].astype(float)
    df_filtered['keywords'] = df_filtered['keywords'].apply(lambda x: ' '.join(x))
    df_filtered['features'] = df_filtered['keywords'] + ' ' + df_filtered['budget'].astype(str)

    # create a tf-idf matrix for the features column in the filtered dataset
    tfidf_matrix_filtered = tfidf.transform(df_filtered['features'])

    # compute the pairwise similarity scores for the features
    sim_scores = list(enumerate(cosine_sim[indices[place_name]]))

    # sort the places based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get the indices of the top 10 most similar places
    sim_indices = [i for i, _ in sim_scores[1:11]]

    # return the names of the top 10 most similar places
    return df_filtered.loc[sim_indices, ['place_name', 'budget', 'average_rating', 'keywords']]

# example usage
get_recommendations('The Sunken City of Heracleion')

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# load data
places_df = pd.read_csv("/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv")
users_df = pd.read_csv("/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/Interests.csv")

# create mapping between place names and integer IDs
place2id = {place: i for i, place in enumerate(places_df['place_name'].unique())}

# convert place names to integer IDs
places_df['place_id'] = places_df['place_name'].apply(lambda x: place2id[x])

# convert keywords to bag-of-words vectors
places_df['keywords'] = places_df['keywords'].apply(lambda x: ' '.join(x))
vectorizer = CountVectorizer()
place_keyword_vectors = vectorizer.fit_transform(places_df['keywords'].values)

# compute cosine similarity matrix between place keyword vectors
place_similarity = cosine_similarity(place_keyword_vectors)

# convert user interests to binary vector
def get_user_vector(user_interests):
    user_vector = [0] * len(place2id)
    for interest in user_interests:
        if interest in place2id:
            user_vector[place2id[interest]] = 1
    return user_vector

# get recommendations for each user
n_recommendations = 5
user_place_ratings = []
for i, row in users_df.iterrows():
    user_id = row['User ID']
    user_vector = get_user_vector(row.values[1:])
    place_ratings = {}
    for j, place_name in enumerate(place2id.keys()):
        place_vector = place_similarity[j]
        rating = (user_vector @ place_vector) / sum(place_vector)
        place_ratings[place_name] = rating
    for place_name, rating in sorted(place_ratings.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]:
        user_place_ratings.append((user_id, place_name, rating))

# print recommendations
for user_id, place_name, rating in user_place_ratings:
    print(f"User {user_id} might like {place_name} (rating: {rating})")

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [21]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# load data
places_df = pd.read_csv("/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv")
users_df = pd.read_csv("/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/Interests.csv")

# create mapping between interests and integer IDs
interests = set()
for col in users_df.columns[1:]:
    interests |= set(users_df[col].unique())
interests = sorted(list(interests))
interest2id = {interest: i for i, interest in enumerate(interests)}

# create mapping between place names and integer IDs
place2id = {place: i for i, place in enumerate(places_df['place_name'].unique())}

# convert interests and place names to integer IDs
for col in users_df.columns[1:]:
    users_df[col] = users_df[col].apply(lambda x: interest2id[x])
places_df['place_id'] = places_df['place_name'].apply(lambda x: place2id[x])

# split data into training and testing sets
train_size = 0.8
train_users = users_df.sample(frac=train_size, random_state=42)
test_users = users_df.drop(train_users.index)

# train KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(train_users.iloc[:, 1:])

# get recommendations for each user in test set
n_neighbors = 5
user_place_ratings = []
for i, row in test_users.iterrows():
    user_id = i
    user_interests = row[1:]
    distances, indices = knn.kneighbors([user_interests], n_neighbors=n_neighbors)
    for j in indices[0]:
        place_id = train_users.iloc[j]['place_id']
        rating = 1.0 - distances[0][j]
        user_place_ratings.append((user_id, place_id, rating))

# map place IDs back to their names
id2place = {i: place for place, i in place2id.items()}
recommendations = []
for user_id, place_id, rating in user_place_ratings:
    recommendations.append((user_id, id2place[place_id], rating))

# print recommendations
for user_id, place_name, rating in recommendations:
    print(f"User {user_id} might like {place_name} (rating: {rating})")



KeyError: 'place_id'

In [19]:
import pandas as pd
import numpy as np
from libreco.data import DatasetPure, DataInformation
from libreco.algorithms import LightGCN
from libreco.evaluation import evaluate

# load data
data = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')


# create user and item ids
unique_users = data['place_id'].unique()
unique_items = data['keywords'].apply(pd.Series).stack().unique()
user2id = {old: new for new, old in enumerate(unique_users)}
item2id = {old: new for new, old in enumerate(unique_items)}
data['user_id'] = data['place_id'].map(user2id)
data['item_id'] = data['keywords'].apply(lambda x: [item2id[i] for i in x])
data = data.explode('item_id').reset_index(drop=True)

# split data
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)

# build dataset
train_data, data_info = DatasetPure.build_trainset(train_data)
test_data = DatasetPure.build_testset(test_data)

# initialize model
lightgcn = LightGCN(
    task="ranking",
    data_info=data_info,
    loss_type="bpr",
    embed_size=16,
    n_epochs=3,
    lr=1e-3,
    batch_size=2048,
    num_neg=1,
    device="cuda",
)

# fit model
lightgcn.fit(
    train_data,
    neg_sampling=True,
    verbose=2,
)

# do evaluation on test data
evaluate(
    model=lightgcn,
    data=test_data,
    neg_sampling=True,
    metrics=["roc_auc", "precision", "recall", "ndcg"],
)

# recommend places for a user
user_id = user2id['El Gezira Sporting Club - Cairo']
item_ids = np.arange(len(unique_items))
recommendations = lightgcn.recommend_user(user=user_id, n_rec=7, item_ids=item_ids)

# map item ids to item names
id2item = {new: old for old, new in item2id.items()}
recommendations = [id2item[i] for i in recommendations]

print(recommendations)

ModuleNotFoundError: No module named 'libreco'

In [18]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

# Define file paths
places_file = '/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv'
interests_file = '/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/Interests.csv'

# Load data
with open(places_file) as f:
    places = pd.read_csv(f)
with open(interests_file) as f:
    interests = pd.read_csv(f)

# One-hot encode user interests
mlb = MultiLabelBinarizer()
user_features = pd.DataFrame(mlb.fit_transform(interests.iloc[:, 1:].values),
                             columns=mlb.classes_,
                             index=interests.iloc[:, 0].values)

# Compute cosine similarity between user features and place features
places_features = places.drop(['place_id', 'place_name', 'place_type', 'popularity', 'rating_count', 'average_rating', 'budget', 'keywords'], axis=1)
similarity_matrix = cosine_similarity(user_features, places_features)

# Recommend top-N places for each user
N = 5
for i in range(similarity_matrix.shape[0]):
    user_id = interests.iloc[i, 0]
    similar_places_indices = similarity_matrix[i].argsort()[::-1][:N]
    similar_places = places.iloc[similar_places_indices].nlargest(N, 'popularity')
    print(f"Recommendations for User {user_id}:")
    print(similar_places[['place_name', 'popularity', 'average_rating']])
    print()

ValueError: Found array with 0 feature(s) (shape=(1000, 0)) while a minimum of 1 is required by check_pairwise_arrays.

In [17]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

# Load data
places = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')
interests = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/Interests.csv')

# One-hot encode user interests
mlb = MultiLabelBinarizer()
user_features = pd.DataFrame(mlb.fit_transform(interests.iloc[:, 1:].values),
                             columns=mlb.classes_,
                             index=interests.iloc[:, 0].values)

# Compute cosine similarity between user features and place features
places_features = places.drop(['place_id', 'place_name', 'place_type', 'popularity', 'rating_count', 'average_rating', 'budget', 'keywords'], axis=1)
similarity_matrix = cosine_similarity(user_features, places_features)

# Recommend top-N places for each user
N = 5
for i in range(similarity_matrix.shape[0]):
    user_id = interests.iloc[i, 0]
    similar_places_indices = similarity_matrix[i].argsort()[::-1][:N]
    similar_places = places.iloc[similar_places_indices]
    print(f"Recommendations for User {user_id}:")
    print(similar_places[['place_name', 'popularity', 'average_rating']])
    print()


ValueError: Found array with 0 feature(s) (shape=(1000, 0)) while a minimum of 1 is required by check_pairwise_arrays.

In [16]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load data set
places = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')
interests = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/Interests.csv')

# Convert keywords to binary features
keywords = set()
for k in places['keywords']:
    keywords.update(eval(k))
for k in keywords:
    places[k] = places['keywords'].apply(lambda x: int(k in eval(x)))

# Convert interests to binary features
interests = interests.drop('User ID', axis=1)
for k in keywords:
    interests[k] = interests.apply(lambda x: int(k in x.values), axis=1)

# Compute user-item similarity matrix
places_features = places.drop(['place_id', 'place_name', 'place_type', 'popularity', 'rating_count', 'average_rating', 'budget', 'keywords'], axis=1)
user_features = interests.values
similarity_matrix = cosine_similarity(user_features, places_features)

# Recommend top-N places for each user
N = 5
for i in range(len(interests)):
    user_id = i + 1
    user_similarities = similarity_matrix[i]
    user_top_N = places.iloc[user_similarities.argsort()[::-1][:N]]
    print(f"Recommendations for User {user_id}:")
    print(user_top_N[['place_id', 'place_name', 'place_type']])


ValueError: could not convert string to float: 'Fashion'

In [15]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the CSV files into Pandas DataFrames
df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')
users_df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/Interests.csv')

# Compute the TF-IDF matrix for the keywords column in the df DataFrame
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['keywords'])

# Perform Singular Value Decomposition (SVD) on the TF-IDF matrix
svd = TruncatedSVD(n_components=30, random_state=42)
svd_matrix = svd.fit_transform(tfidf_matrix)

# Compute the item-item similarity matrix using the cosine similarity
item_item_sim = cosine_similarity(svd_matrix)

# Define a function to get the top N recommended items for a user
def get_recommendations(user_id, N=10):
    # Get the user's interests from the users_df DataFrame
    interests = users_df.iloc[user_id-1][1:].values.tolist()

    # Get the user's ratings for all items in the df DataFrame
    user_ratings = []
    for _, row in df.iterrows():
        if row['place_id'] in trainset.ur[user_id]:
            user_ratings.append((row['place_id'], model.predict(user_id, row['place_id']).est))

    # Sort the user's ratings in descending order
    user_ratings.sort(key=lambda x: x[1], reverse=True)

    # Get the top N items that the user has not rated yet
    recommended_items = []
    for item_id, _ in user_ratings:
        if item_id not in trainset.ur[user_id]:
            recommended_items.append((item_id, item_item_sim[df[df['place_id']==item_id].index[0]],))

        if len(recommended_items) >= N:
            break

    # Sort the recommended items by their similarity to the user's interests
    recommended_items.sort(key=lambda x: sum([x[1][df[df['place_id']==item_id].index[0]] for item_id in trainset.ur[user_id]]), reverse=True)

    # Return the top N recommended items
    return df.loc[[item_id for item_id, _ in recommended_items], ['place_name', 'average_rating']]

# Test the function by getting the top 5 recommended items for user 1
recommended_items = get_recommendations(1, N=5)
print(recommended_items)

NameError: name 'trainset' is not defined

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Read the CSV files into Pandas DataFrames
df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')
users_df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/Interests.csv')

# Compute the TF-IDF matrix for the keywords column in the df DataFrame
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['keywords'])

# Compute the SVD matrix for dimensionality reduction
svd = TruncatedSVD(n_components=100)
svd_matrix = svd.fit_transform(tfidf_matrix)

# Compute the item-item similarity matrix using the cosine similarity
item_item_sim = cosine_similarity(svd_matrix)

# Define a function to get the top N recommended items for a user
def get_recommendations(user_id, N=10):
    # Get the user's interests from the users_df DataFrame
    interests = users_df.iloc[user_id-1][1:].values.tolist()

    # Get the user's ratings for all items in the df DataFrame
    user_ratings = []
    for _, row in df.iterrows():
        user_ratings.append((row['place_id'], row['average_rating'], item_item_sim[df[df['place_id']==row['place_id']].index[0]],))

    # Sort the user's ratings in descending order of average rating
    user_ratings.sort(key=lambda x: x[1], reverse=True)

    # Get the top N items that the user has not rated yet
    recommended_items = []
    for item_id, _, similarity in user_ratings:
        if item_id not in users_df.iloc[user_id-1][1:].index:
            recommended_items.append((item_id, similarity))

        if len(recommended_items) >= N:
            break

    # Sort the recommended items by their similarity to the user's interests
    recommended_items.sort(key=lambda x: sum([x[1][df[df['place_id']==item_id].index[0]] for item_id in users_df.iloc[user_id-1][1:].index]), reverse=True)

    # Return the top N recommended items
    return df.loc[[item_id for item_id, _ in recommended_items[:N]], ['place_name', 'average_rating']]

# Test the function by getting the top 5 recommended items for user 1
recommended_items = get_recommendations(1, N=5)
print(recommended_items)

ValueError: n_components(100) must be <= n_features(30).

In [11]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')

# Convert the DataFrame to a Surprise Dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['place_id', 'place_name', 'average_rating']], reader)

# Split the data into a training set and a test set
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train the SVD model on the training set
model = SVD()
model.fit(trainset)

# Get the predictions for the test set
predictions = model.test(testset)

# Compute the accuracy of the predictions
accuracy = sum([1 for pred in predictions if round(pred.est) == pred.r_ui]) / len(predictions) * 100
print(f'Collaborative filtering accuracy: {accuracy:.2f}%')

# Compute the item-item similarity matrix using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['keywords'])
item_item_sim = cosine_similarity(tfidf_matrix)

# Define a function to get the top N similar items
def get_similar_items(item_id, N=10):
    item_idx = df[df['place_id'] == item_id].index[0]
    sim_scores = list(enumerate(item_item_sim[item_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:N+1]
    item_indices = [x[0] for x in sim_scores]
    return df.iloc[item_indices][['place_name', 'average_rating']]

# Test the function by getting the top 5 similar items to item 358
similar_items = get_similar_items(358, N=5)
print(similar_items)

ModuleNotFoundError: No module named 'surprise'

In [12]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the CSV files into Pandas DataFrames
df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')
users_df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/Interests.csv')

# Convert the DataFrame to a Surprise Dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['place_id', 'place_name', 'average_rating']], reader)

# Split the data into a training set and a test set
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train the SVD model on the training set
model = SVD()
model.fit(trainset)

# Define a function to get the top N recommended items for a user
def get_recommendations(user_id, N=10):
    # Get the user's interests from the users_df DataFrame
    interests = users_df.iloc[user_id-1][1:].values.tolist()

    # Compute the TF-IDF matrix for the keywords column in the df DataFrame
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['keywords'])

    # Compute the item-item similarity matrix using the cosine similarity
    item_item_sim = cosine_similarity(tfidf_matrix)

    # Get the user's ratings for all items in the df DataFrame
    user_ratings = []
    for _, row in df.iterrows():
        if row['place_id'] in trainset.ur[user_id]:
            user_ratings.append((row['place_id'], model.predict(user_id, row['place_id']).est))

    # Sort the user's ratings in descending order
    user_ratings.sort(key=lambda x: x[1], reverse=True)

    # Get the top N items that the user has not rated yet
    recommended_items = []
    for item_id, _ in user_ratings:
        if item_id not in trainset.ur[user_id]:
            recommended_items.append((item_id, item_item_sim[df[df['place_id']==item_id].index[0]],))

        if len(recommended_items) >= N:
            break

    # Sort the recommended items by their similarity to the user's interests
    recommended_items.sort(key=lambda x: sum([x[1][df[df['place_id']==item_id].index[0]] for item_id in trainset.ur[user_id]]), reverse=True)

    # Return the top N recommended items
    return df.loc[[item_id for item_id, _ in recommended_items], ['place_name', 'average_rating']]

# Test the function by getting the top 5 recommended items for user 1
recommended_items = get_recommendations(1, N=5)
print(recommended_items)

ModuleNotFoundError: No module named 'surprise'

In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')

# Split the data into two separate columns
df[['place_name', 'city']] = df['place_name'].str.split(' - ', expand=True)

# Add the budget and keywords features to the dataset
df['budget'] = df['budget'].astype(float)
df['keywords'] = df['keywords'].apply(lambda x: ' '.join(x))
df['features'] = df['keywords'] + ' ' + df['budget'].astype(str)

# Create a tf-idf matrix for the features column
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['features'])

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map of indices and place names
indices = pd.Series(df.index, index=df['place_name']).drop_duplicates()

def get_similar_places(place_name, keywords=[], budget=np.inf, cosine_sim=cosine_sim, df=df):
    # Get the index of the place that matches the name
    idx = indices[place_name]

    # Get the pairwise similarity scores of all places with that place
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the places based on the average rating
    sim_scores = sorted(sim_scores, key=lambda x: df['average_rating'].iloc[x[0]], reverse=True)

    # Get the indices of the top 10 most similar places
    place_indices = [x[0] for x in sim_scores[1:11]]

    # Filter the places based on keywords and budget
    filtered_places = df.iloc[place_indices][df['budget'] <= budget]
    if keywords:
        filtered_places = filtered_places[filtered_places['keywords'].apply(lambda x: any(keyword in x for keyword in keywords))]

    # Select the relevant columns and return the filtered places sorted by average rating
    return filtered_places[['place_name', 'place_id', 'average_rating']].sort_values(by='average_rating', ascending=False)

# Define a function to calculate the accuracy score
def accuracy_score(place, method='top_k', k=10):
    # Get the recommended places for the test place
    recommended_places = get_similar_places(place)

    # Get the actual similar places for the test place
    actual_places = df[df['similar_places'].str.contains(place)][['place_name', 'average_rating']].sort_values(by='average_rating', ascending=False)[:k]

    # Calculate the intersection of the recommended places and actual places
    common_places = pd.merge(recommended_places, actual_places, on='place_name')

    # Calculate the precision and recall scores
    precision = len(common_places) / len(recommended_places)
    recall = len(common_places) / len(actual_places)

    # Calculate the F1 score
    f1_score = 2 * precision * recall / (precision + recall)

    # Return the precision, recall, and F1 score
    return precision, recall, f1_score

# Calculate the accuracy score for a test place
place = 'The Sunken City of Heracleion'
precision, recall, f1_score = accuracy_score(place)
print(f"Accuracy score for {place}:")
print(f"Precision: {precision:.2f}")

  filtered_places = df.iloc[place_indices][df['budget'] <= budget]


KeyError: 'similar_places'

In [3]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')

# Split the data into two separate columns
df[['place_name', 'city']] = df['place_name'].str.split(' - ', expand=True)

# add the budget and keywords features to the dataset
df['budget'] = df['budget'].astype(float)
df['keywords'] = df['keywords'].apply(lambda x: ' '.join(x))
df['features'] = df['keywords'] + ' ' + df['budget'].astype(str)

# create a tf-idf matrix for the features column
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Construct a reverse map of indices and place names
indices = pd.Series(df.index, index=df['place_name']).drop_duplicates()

def get_similar_places(place_name, keywords=[], budget=np.inf, cosine_sim=cosine_sim, df=df):
    # Get the index of the place that matches the name
    if place_name not in indices:
        raise ValueError(f'Place name "{place_name}" not found in dataset')
    idx = indices[place_name]

    # Get the pairwise similarity scores of all places with that place
    sim_scores = list(enumerate(cosine_sim[idx]))
    if not sim_scores:
        raise ValueError(f'No similar places found for "{place_name}"')

    # Sort the places based on the average rating
    sim_scores = sorted(sim_scores, key=lambda x: df['average_rating'].iloc[x[0]], reverse=True)

    # Get the indices of the top 10 most similar places
    place_indices = [x[0] for x in sim_scores[1:11]]

    # Filter the places based on keywords and budget
    filtered_places = df.iloc[place_indices][df['budget'] <= budget]
    if keywords:
        filtered_places = filtered_places[filtered_places['keywords'].apply(lambda x: any(keyword in x for keyword in keywords))]

    # Select the relevant columns and return the filtered places sorted by average rating
    return filtered_places[['place_name', 'place_id', 'average_rating']].sort_values(by='average_rating', ascending=False)

In [4]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import accuracy_score

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')

# Split the data into two separate columns
df[['place_name', 'city']] = df['place_name'].str.split(' - ', expand=True)

# Add the budget and keywords features to the dataset
df['budget'] = df['budget'].astype(float)
df['keywords'] = df['keywords'].apply(lambda x: ' '.join(x))
df['features'] = df['keywords'] + ' ' + df['budget'].astype(str)

# Create a tf-idf matrix for the features column
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['features'])

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map of indices and place names
indices = pd.Series(df.index, index=df['place_name']).drop_duplicates()

def get_similar_places(place_name, keywords=[], budget=np.inf, cosine_sim=cosine_sim, df=df):
    # Get the index of the place that matches the name
    idx = indices[place_name]

    # Get the pairwise similarity scores of all places with that place
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the places based on the average rating
    sim_scores = sorted(sim_scores, key=lambda x: df['average_rating'].iloc[x[0]], reverse=True)

    # Get the indices of the top 10 most similar places
    place_indices = [x[0] for x in sim_scores[1:11]]

    # Filter the places based on keywords and budget
    filtered_places = df.iloc[place_indices][df['budget'] <= budget]
    if keywords:
        filtered_places = filtered_places[filtered_places['keywords'].apply(lambda x: any(keyword in x for keyword in keywords))]

    # Select the relevant columns and return the filtered places sorted by average rating
    return filtered_places[['place_name', 'place_id', 'average_rating']].sort_values(by='average_rating', ascending=False)

# Function to check the accuracy of the recommendations
def check_accuracy(place_name, expected_result, keywords=[], budget=np.inf, cosine_sim=cosine_sim, df=df):
    # Get the recommended places
    recommended_places = get_similar_places(place_name, keywords, budget, cosine_sim, df)

    # Get the actual place names from the expected result
    actual_places = df[df['place_name'].isin(expected_result)][['place_name', 'average_rating']]

    # Calculate the accuracy score
    accuracy = accuracy_score(recommended_places['place_name'], actual_places['place_name'])

    # Print the accuracy score
    print(f"Accuracy score for {place_name}: {accuracy:.2f}")

In [6]:
# Get the list of place names to test
test_places = ['The Sunken City of Heracleion', 'Some other place', 'Another place']

# Create an empty list to store the accuracy scores
accuracy_scores = []

# Iterate over the test places
for place in test_places:
    # Get the recommended places for the test place
    recommended_places = get_similar_places(place)
    
    # Get the actual similar places for the test place
    actual_places = df[df['similar_places'].str.contains(place)][['place_name', 'average_rating']].sort_values(by='average_rating', ascending=False)[:10]
    
    # Calculate the intersection of the recommended places and actual places
    intersection = recommended_places.merge(actual_places, on='place_name')
    
    # Calculate the accuracy score
    accuracy_score = intersection.shape[0] / 10
    
    # Append the accuracy score to the list
    accuracy_scores.append(accuracy_score)

# Calculate the mean accuracy score
mean_accuracy_score = np.mean(accuracy_scores)

print(f'Mean accuracy score: {mean_accuracy_score:.2f}')


  filtered_places = df.iloc[place_indices][df['budget'] <= budget]


KeyError: 'similar_places'

In [5]:
# Example usage
check_accuracy('The Sunken City of Heracleion', ['The Pyramids of Giza', 'The Valley of the Kings', 'The Great Sphinx'], ['history', 'ancient'], budget=500)

Accuracy score for The Sunken City of Heracleion: nan


  filtered_places = df.iloc[place_indices][df['budget'] <= budget]
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [8]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/GraduationProject/Code/GraduationProject/DataSets/places_data.csv')

# Split the data into two separate columns
df[['place_name', 'city']] = df['place_name'].str.split(' - ', expand=True)

# add the budget and keywords features to the dataset
df['budget'] = df['budget'].astype(float)
df['keywords'] = df['keywords'].apply(lambda x: ' '.join(x))
df['features'] = df['keywords'] + ' ' + df['budget'].astype(str)

# create a tf-idf matrix for the features column
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Construct a reverse map of indices and place names
indices = pd.Series(df.index, index=df['place_name']).drop_duplicates()

def get_similar_places(place_name, keywords=[], budget=np.inf, cosine_sim=cosine_sim, df=df):
    # Get the index of the place that matches the name
    if place_name not in indices:
        raise ValueError(f'Place name "{place_name}" not found in dataset')
    idx = indices[place_name]

    # Get the pairwise similarity scores of all places with that place
    sim_scores = list(enumerate(cosine_sim[idx]))
    if not sim_scores:
        raise ValueError(f'No similar places found for "{place_name}"')
    print(f'Similar places for "{place_name}": {sim_scores}')

    # Sort the places based on the average rating
    sim_scores = sorted(sim_scores, key=lambda x: df['average_rating'].iloc[x[0]], reverse=True)
    print(f'Sorted places for "{place_name}": {sim_scores}')

    # Get the indices of the top 10 most similar places
    place_indices = [x[0] for x in sim_scores[1:11]]

    # Filter the places based on keywords and budget
    filtered_places = df.iloc[place_indices][df['budget'] <= budget]
    if keywords:
        filtered_places = filtered_places[filtered_places['keywords'].apply(lambda x: any(keyword in x for keyword in keywords))]

    # Select the relevant columns and return the filtered places sorted by average rating
    return filtered_places[['place_name', 'place_id', 'average_rating']].sort_values(by='average_rating', ascending=False)

In [None]:
import pandas as pd

# Create a sample DataFrame with the data you provided
data = {'place': }
df = pd.DataFrame(data)

# Split the 'place' column into two columns using '-' as the separator
df[['place_name', 'city']] = df['place'].str.split('-', n=1, expand=True)

# Strip leading and trailing whitespaces from the 'city' column
df['city'] = df['city'].str.strip()

# Print the updated DataFrame
print(df)


In [None]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/Graduation Project/DS/places csv.csv')

# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
df['place_name'] = df['place_name'].fillna('')

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['place_name'])

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map of indices and place names
indices = pd.Series(df.index, index=df['place_name']).drop_duplicates()

# Define a function that takes in place name as input and outputs the top 10 most similar places sorted by rating
def get_similar_places(place_name, cosine_sim=cosine_sim, df=df):
    # Get the index of the place that matches the title
    idx = indices[place_name]

    # Get the pairwise similarity scores of all places with that place
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the places based on the rating
    sim_scores = sorted(sim_scores, key=lambda x: df['rating'].iloc[x[0]], reverse=True)

    # Get the indices of the top 10 most similar places
    place_indices = [x[0] for x in sim_scores[1:11]]

    # Return the top 10 most similar places sorted by rating
    return df.iloc[place_indices].sort_values(by='rating', ascending=False)

# Test the function with some sample inputs
similar_places = get_similar_places('Rooftop Lounge & Bar, Alexandria')
print(similar_places)

similar_places = get_similar_places('Wunder Garten, Alexandria')
print(similar_places)

In [None]:
import pandas as pd 
import numpy as np 
df1=pd.read_csv('/Users/rewanabdelqader/Collage/Semster 8/Graduation Project/DS/places csv.csv')
df1.head(5)

place_id	place_name	rating	popularity	rating_count	average_rating	budget	keywords
0	1	Montaza Palace Gardens	2	174.313947	400	4.0	300	['cafe', 'dinner', 'lunch', 'coffee', 'brunch'...
1	2	Exit Games Egypt - Cairo	2	170.926290	200	5.0	100	['cafe', 'dinner', 'pastries', 'breakfast', 't...
2	3	Roasting House - New Cairo, Cairo	1	142.719149	500	5.0	300	['tea', 'cafe', 'lunch', 'desserts']
3	4	Kharga Oasis	3	184.004065	300	3.5	100	['coffee']
4	5	The Secret Chambers Egypt - Cairo	2	156.373940	500	4.0	300	['dinner', 'tea', 'desserts', 'lunch', 'coffee..
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
df1['place_name'] = df1['place_name'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df1['place_name'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
#Construct a reverse map of indices and movie titles
indices = pd.Series(df1.index, index=df1['place_name']).drop_duplicates()
# Function that takes in place title as input and outputs most similar movies
def get_recommendations(place_name, cosine_sim=cosine_sim):
    # Get the index of the place that matches the title
    idx = indices[place_name]

    # Get the pairwsie similarity scores of all places with that place
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar place
    sim_scores = sim_scores[1:11]

    # Get the place indices
    places_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df1['place_name'].iloc[places_indices]
    get_recommendations('Rooftop Lounge & Bar, Alexandria')
    181       Pegasus Lounge Bar, Alexandria
128    Sky Roof Bar & Lounge, Alexandria
303                      9 Lounge, Cairo
155                       The Bar, Cairo
399            The Lodge Bar, Alexandria
191       Coffee Lounge - Zamalek, Cairo
40                     Buddha-Bar, Cairo
126           Mojo Lounge & Grill, Cairo
70                   I Bistro Bar, Cairo
25                       Roof Bar, Cairo
Name: place_name, dtype: object
get_recommendations('Wunder Garten, Alexandria')
95       Escape It Egypt - Alexandria
275         Escape Egypt - Alexandria
278       The Room Egypt - Alexandria
471                    Alexandria Zoo
462        The Key Egypt - Alexandria
309          Rooms Egypt - Alexandria
280            Alexandria City Center
351              Cap d'Or, Alexandria
36     Escape Room Egypt - Alexandria
185       Breakout Egypt - Alexandria
Name: place_name, dtype: object

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load user data
user_data = pd.read_csv('/Users/rewanabdelqader/Collage/Semster8/Graduation_Project/DS/Fake Data/User Data.csv')

# Create user profile vector
tfidf_vectorizer = TfidfVectorizer()
user_profile = tfidf_vectorizer.fit_transform(user_data.values.astype('U'))

# Load hangout data
hangout_data = pd.read_csv('hangout_data.csv')


# Create hangout profile vector
hangout_profile = tfidf_vectorizer.fit_transform(hangout_data['description'])

# Calculate similarity between user profile and hangout profile
cosine_similarities = cosine_similarity(user_profile, hangout_profile)

# Get hangout recommendations for each user
hangout_recommendations = {}
for i, row in user_data.iterrows():
    user_id = row['user_id']
    similarity_scores = list(enumerate(cosine_similarities[i]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[1:6] # Top 5 similar hangouts
    hangout_indices = [i[0] for i in similarity_scores]
    hangout_recommendations[user_id] = list(hangout_data.iloc[hangout_indices]['hangout_name'])

# Print hangout recommendations for each user
for user_id, recommendations in hangout_recommendations.items():
    print(f"Recommendations for user {user_id}:")
    print(', '.join(recommendations))
    print()


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load user data
user_interests = pd.read_csv('user_interests.csv')
user_behavior = pd.read_csv('user_behavior.csv')
user_photos = pd.read_csv('user_photos.csv')

# Combine user data into a single dataframe
user_data = pd.concat([user_interests, user_behavior, user_photos], axis=1)

# Clean user data
user_data = user_data.fillna('')

# Create user profile vector
tfidf_vectorizer = TfidfVectorizer()
user_profile = tfidf_vectorizer.fit_transform(user_data.values.astype('U'))

# Load destination data
destination_data = pd.read_csv('destination_data.csv')

# Clean destination data
destination_data = destination_data.fillna('')

# Create destination profile vector
destination_profile = tfidf_vectorizer.transform(destination_data.values.astype('U'))

# Calculate cosine similarity between user profile and destination profiles
similarity_scores = cosine_similarity(user_profile, destination_profile)

# Create a list of recommended destinations for each user
recommendations = []
for i in range(len(user_data)):
    top_destinations = np.argsort(similarity_scores[i])[::-1][:10]
    recommendations.append(list(destination_data.iloc[top_destinations]['destination_name']))

# Save recommendations to a file
with open('user_recommendations.csv', 'w') as f:
    f.write('user_id,recommendations\n')
    for i in range(len(user_data)):
        f.write(f'{i+1},"{", ".join(recommendations[i])}"\n')


In [None]:
# create a new tf-idf matrix for the features column
tfidf_matrix_features = tfidf.fit_transform(df['features'])

# compute the cosine similarity matrix
cosine_sim_features = linear_kernel(tfidf_matrix_features, tfidf_matrix_features)

# create a reverse map of indices and place names
indices = pd.Series(df.index, index=df['place_name'])

# define the recommendation function
def get_recommendations(place_name, cosine_sim_name=cosine_sim_name, cosine_sim_features=cosine_sim_features):
    # get the index of the place that matches the name
    idx = indices[place_name]

    # compute the pairwise similarity scores for the name and features
    sim_scores_name = list(enumerate(cosine_sim_name[idx]))
    sim_scores_features = list(enumerate(cosine_sim_features[idx]))

    # combine the two similarity scores by taking their average
    sim_scores = [(i, (score_name + score_feat) / 2)
                  for (i, score_name), (_, score_feat)
                  in zip(sim_scores_name, sim_scores_features)]

    # sort the places based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get the indices of the top 10 most similar places
    sim_indices = [i for i, _ in sim_scores[1:11]]

    # return the names of the top 10 most similar places
    return df.loc[sim_indices, 'place_name']

# example usage
get_recommendations('The Sunken City of Heracleion')

In [None]:
def get_recommendations(place_name, city=None, budget=None, cosine_sim=cosine_sim):
    # filter the dataset based on the city and budget
    df_filtered = df.copy()

    if city:
        df_filtered = df_filtered[df_filtered['city'] == city]

    if budget:
        df_filtered = df_filtered[df_filtered['budget'] <= budget]

    # add the budget and keywords features to the filtered dataset
    df_filtered['budget'] = df_filtered['budget'].astype(float)
    df_filtered['keywords'] = df_filtered['keywords'].apply(lambda x: ' '.join(x))
    df_filtered['features'] = df_filtered['keywords'] + ' ' + df_filtered['budget'].astype(str)

    # create a tf-idf matrix for the features column in the filtered dataset
    tfidf_matrix_filtered = tfidf.transform(df_filtered['features'])

    # compute the pairwise similarity scores for the features
    sim_scores = list(enumerate(cosine_sim[indices[place_name]]))

    # sort the places based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get the indices of the top 10 most similar places
    sim_indices = [i for i, _ in sim_scores[1:11]]

    # filter the places based on the number of similar keywords
    similar_keywords = set(df.loc[indices[place_name], 'keywords'])
    recommended_indices = []
    for index in sim_indices:
        if len(set(df_filtered.loc[index, 'keywords']).intersection(similar_keywords)) >= 3:
            recommended_indices.append(index)

    # return the names of the recommended places
    return df_filtered.loc[recommended_indices, ['place_name', 'budget', 'average_rating', 'keywords']]
get_recommendations('The Sunken City of Heracleion')