In [14]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the CSV files
user_df = pd.read_csv('bookings.csv')
package_df = pd.read_csv('packages.csv')
review_df = pd.read_csv('reviews.csv')

# Clean column names (strip extra spaces if any)
user_df.columns = user_df.columns.str.strip()
package_df.columns = package_df.columns.str.strip()
review_df.columns = review_df.columns.str.strip()

# Rename package_id in user_df and review_df to avoid conflicts during merge
user_df = user_df.rename(columns={'package_id': 'user_package_id'})
review_df = review_df.rename(columns={'package_id': 'review_package_id'})

# Merging the datasets on user_email first
merged_df = pd.merge(user_df, review_df, on='user_email')

# Now merge with the package data to get package names
merged_df = pd.merge(merged_df, package_df[['package_id', 'name']], left_on='review_package_id', right_on='package_id')

# Creating the user-item matrix with ratings
user_item_matrix = merged_df.pivot_table(index='user_email', columns='name', values='rating', aggfunc='mean').fillna(0)

# Cosine Similarity Calculation
user_item_matrix_scaled = StandardScaler().fit_transform(user_item_matrix)
cosine_sim = cosine_similarity(user_item_matrix_scaled)

# Create a DataFrame for cosine similarity matrix
cosine_sim_df = pd.DataFrame(cosine_sim, index=user_item_matrix.index, columns=user_item_matrix.index)

# Get similar users to a target user (e.g., 'user1@example.com')
target_user = 'user2@example.com'
similar_users = cosine_sim_df[target_user].sort_values(ascending=False)

# Create a dictionary to store package recommendations based on similar users' ratings
package_recommendations = {}

# Iterate through the similar users
for user in similar_users.index[1:]:  # Skipping the target user itself
    user_ratings = user_item_matrix.loc[user]
    
    # Consider packages that the user has rated highly (e.g., rating > 3)
    for package, rating in user_ratings[user_ratings > 3].items():
        if package not in package_recommendations:
            package_recommendations[package] = []
        package_recommendations[package].append(rating)

# Average ratings for each package and sort by highest average rating
package_avg_ratings = {package: sum(ratings)/len(ratings) for package, ratings in package_recommendations.items()}
sorted_recommendations = sorted(package_avg_ratings.items(), key=lambda x: x[1], reverse=True)

# Get the top 10 recommended packages based on average rating
top_10_packages = [package for package, _ in sorted_recommendations[:10]]

# Fetch details of the top 10 recommended packages
recommended_packages_details = package_df[package_df['name'].isin(top_10_packages)]

# Print top 10 recommended packages
print("Top 10 Recommended Packages for", target_user)
print(recommended_packages_details)


Top 10 Recommended Packages for user2@example.com
    package_id        name           location    price  duration
4            5   Package 5            Chitwan  11160.0        13
5            6   Package 6            Lumbini  27381.0         5
6            7   Package 7            Chitwan   9184.0         8
8            9   Package 9          Kathmandu  17523.0        14
19          20  Package 20            Lumbini  32661.0         4
27          28  Package 28          Kathmandu  18286.0         8
40          41  Package 41          Kathmandu   8045.0         7
43          44  Package 44            Chitwan  11742.0         5
44          45  Package 45            Pokhara  30321.0         6
49          50  Package 50  Annapurna Circuit  28104.0        11


In [13]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the CSV files
user_df = pd.read_csv('bookings.csv')
package_df = pd.read_csv('packages.csv')
review_df = pd.read_csv('reviews.csv')

# Clean column names (strip extra spaces if any)
user_df.columns = user_df.columns.str.strip()
package_df.columns = package_df.columns.str.strip()
review_df.columns = review_df.columns.str.strip()

# Rename package_id in user_df and review_df to avoid conflicts during merge
user_df = user_df.rename(columns={'package_id': 'user_package_id'})
review_df = review_df.rename(columns={'package_id': 'review_package_id'})

# Merging the datasets on user_email first
merged_df = pd.merge(user_df, review_df, on='user_email')

# Now merge with the package data to get package names
merged_df = pd.merge(merged_df, package_df[['package_id', 'name']], left_on='review_package_id', right_on='package_id')

# Creating the user-item matrix with ratings
user_item_matrix = merged_df.pivot_table(index='user_email', columns='name', values='rating', aggfunc='mean').fillna(0)

# Cosine Similarity Calculation
user_item_matrix_scaled = StandardScaler().fit_transform(user_item_matrix)
cosine_sim = cosine_similarity(user_item_matrix_scaled)
cosine_sim_df = pd.DataFrame(cosine_sim, index=user_item_matrix.index, columns=user_item_matrix.index)

# Pearson Correlation Calculation
user_correlation = user_item_matrix.transpose().corr(method='pearson')
user_correlation_df = pd.DataFrame(user_correlation, index=user_item_matrix.index, columns=user_item_matrix.index)

# Hybrid Similarity Calculation (weighted combination)
cosine_weight = 0.7
pearson_weight = 0.1
hybrid_similarity_df = (cosine_sim_df * cosine_weight) + (user_correlation_df * pearson_weight)

# Get similar users to a target user (e.g., 'user2@example.com')
target_user = 'user13@example.com'
similar_users = hybrid_similarity_df[target_user].sort_values(ascending=False)

# Create a dictionary to store package recommendations based on similar users' ratings
package_recommendations = {}

# Iterate through the similar users
for user in similar_users.index[1:]:  # Skipping the target user itself
    user_ratings = user_item_matrix.loc[user]
    
    # Consider packages that the user has rated highly (e.g., rating > 3)
    for package, rating in user_ratings[user_ratings > 3].items():
        if package not in package_recommendations:
            package_recommendations[package] = []
        package_recommendations[package].append(rating)

# Average ratings for each package and sort by highest average rating
package_avg_ratings = {package: sum(ratings) / len(ratings) for package, ratings in package_recommendations.items()}
sorted_recommendations = sorted(package_avg_ratings.items(), key=lambda x: x[1], reverse=True)

# Get the top 10 recommended packages based on average rating
top_10_packages = [package for package, _ in sorted_recommendations[:10]]

# Fetch details of the top 10 recommended packages
recommended_packages_details = package_df[package_df['name'].isin(top_10_packages)]

# Print top 10 recommended packages
print("Top 10 Recommended Packages for", target_user)
print(recommended_packages_details)


Top 10 Recommended Packages for user13@example.com
    package_id        name           location    price  duration
4            5   Package 5            Chitwan  11160.0        13
5            6   Package 6            Lumbini  27381.0         5
6            7   Package 7            Chitwan   9184.0         8
8            9   Package 9          Kathmandu  17523.0        14
19          20  Package 20            Lumbini  32661.0         4
27          28  Package 28          Kathmandu  18286.0         8
40          41  Package 41          Kathmandu   8045.0         7
43          44  Package 44            Chitwan  11742.0         5
44          45  Package 45            Pokhara  30321.0         6
49          50  Package 50  Annapurna Circuit  28104.0        11
