In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load preprocessed dataset
df = pd.read_csv(r"C:\Tourism\Tourism_preprocessed_data.csv")

# ==============================
# STEP 1: CONTENT-BASED SECTION
# ==============================

# Numeric features for content-based filtering
numeric_cols = [
    "AttractionPopularity", "IsTopAttraction", "RegionPopularity",
    "IsCulturalAttraction", "CityAvgRating", "CountryAvgRating",
    "AttractionRatingStd", "AttractionAvgRating", "AttractionRatingDeviation",
    "AttractionSeasonality", "PopularityRatio"
]

# Ensure numeric format and fill NaNs
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Group by attraction and average the features
attraction_features = df.groupby("Attraction")[numeric_cols].mean().reset_index()
attraction_features[numeric_cols] = attraction_features[numeric_cols].fillna(0)

# Normalize
scaler = MinMaxScaler()
normalized = scaler.fit_transform(attraction_features[numeric_cols])
similarity_matrix = cosine_similarity(normalized)

# Attraction list
attraction_names = attraction_features["Attraction"].tolist()

# Content-based recommendation function
def content_based_recommendations(attraction_name, top_n=5):
    if attraction_name not in attraction_names:
        return []
    idx = attraction_names.index(attraction_name)
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, score in similarity_scores[1:top_n+1]]
    return [attraction_names[i] for i in top_indices]

# ==================================
# STEP 2: COLLABORATIVE FILTERING
# ==================================

# Create user-item matrix from ratings
user_item_matrix = df.pivot_table(index="UserId", columns="Attraction", values="Rating")
user_item_matrix.fillna(0, inplace=True)

# Compute similarity between attractions based on ratings
item_similarity = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)

# Collaborative recommendation function
def collaborative_recommendations(user_id, top_n=5):
    if user_id not in user_item_matrix.index:
        return []
    
    user_ratings = user_item_matrix.loc[user_id]
    scores = item_similarity_df.dot(user_ratings).div(item_similarity_df.sum(axis=1))
    scores = scores.sort_values(ascending=False)
    
    already_visited = user_ratings[user_ratings > 0].index.tolist()
    recommended = [attr for attr in scores.index if attr not in already_visited]
    
    return recommended[:top_n]

# ================================
# STEP 3: HYBRID RECOMMENDATIONS
# ================================

def hybrid_recommendations(user_id, attraction_name, top_n=5):
    content_recs = set(content_based_recommendations(attraction_name, top_n=10))
    collaborative_recs = set(collaborative_recommendations(user_id, top_n=10))

    # Combine both
    hybrid = list(content_recs.intersection(collaborative_recs))
    
    # Fallback: if no common recommendations, use union
    if not hybrid:
        hybrid = list(content_recs.union(collaborative_recs))
    
    return hybrid[:top_n]

# ===============================
# EXAMPLE USAGE
# ===============================

sample_user = df["UserId"].iloc[0]  # Replace with real user if needed
sample_attraction = df["Attraction"].iloc[0]  # Replace with real attraction

print(f"\n🔁 Hybrid Recommendations for user '{sample_user}' based on attraction '{sample_attraction}':")
for rec in hybrid_recommendations(user_id=sample_user, attraction_name=sample_attraction):
    print("👉", rec)



🔁 Hybrid Recommendations for user '70456' based on attraction 'Sacred Monkey Forest Sanctuary':
👉 Tegalalang Rice Terrace
👉 Kuta Beach - Bali
👉 Uluwatu Temple
👉 Seminyak Beach
👉 Tegenungan Waterfall


In [3]:
def precision_at_k(recommended, relevant, k):
    recommended_k = recommended[:k]
    hits = len(set(recommended_k) & set(relevant))
    return hits / k

def recall_at_k(recommended, relevant, k):
    recommended_k = recommended[:k]
    hits = len(set(recommended_k) & set(relevant))
    return hits / len(relevant) if relevant else 0


In [11]:
liked = set(df[(df['UserId'] == sample_user) & (df['Rating'] >= 4)]['Attraction'])
print(f"User {sample_user} liked these attractions:", liked)

top_attraction = df[df['UserId'] == sample_user].sort_values(by='Rating', ascending=False)['Attraction'].iloc[0]
recs = hybrid_recommendations(sample_user, top_attraction, top_n=10)
print("Hybrid recommendations:", recs)


User 70456 liked these attractions: {'Sacred Monkey Forest Sanctuary'}
Hybrid recommendations: ['Merapi Volcano', 'Tegalalang Rice Terrace', 'Tegenungan Waterfall', 'Seminyak Beach', 'Uluwatu Temple', 'Sanur Beach', 'Kuta Beach - Bali', 'Nusa Dua Beach', 'Tanah Lot Temple', 'Waterbom Bali']


In [4]:
def precision_at_k(recommended, relevant, k):
    recommended_k = recommended[:k]
    relevant_set = set(relevant)
    hit_count = sum([1 for r in recommended_k if r in relevant_set])
    return hit_count / k

def recall_at_k(recommended, relevant, k):
    recommended_k = recommended[:k]
    relevant_set = set(relevant)
    hit_count = sum([1 for r in recommended_k if r in relevant_set])
    return hit_count / len(relevant_set) if len(relevant_set) > 0 else 0


In [5]:
def hybrid_recommendations_weighted(user_id, attraction_name, top_n=10, alpha=0.5):
    # Content-based scores (similarity to given attraction)
    if attraction_name not in attraction_names:
        return []
    idx = attraction_names.index(attraction_name)
    content_scores = similarity_matrix[idx]
    
    # Collaborative scores (weighted sum of item similarity and user ratings)
    if user_id not in user_item_matrix.index:
        return []
    user_ratings = user_item_matrix.loc[user_id]
    collaborative_scores = item_similarity_df.dot(user_ratings).div(item_similarity_df.sum(axis=1))
    
    # Normalize both scores between 0 and 1
    content_scores_norm = (content_scores - content_scores.min()) / (content_scores.max() - content_scores.min())
    collaborative_scores_norm = (collaborative_scores - collaborative_scores.min()) / (collaborative_scores.max() - collaborative_scores.min())
    
    # Align collaborative scores order with content scores attractions
    collaborative_scores_norm = collaborative_scores_norm.reindex(attraction_names).fillna(0).values
    
    # Combine scores with weight alpha
    hybrid_scores = alpha * content_scores_norm + (1 - alpha) * collaborative_scores_norm
    
    # Remove the queried attraction itself
    hybrid_scores[idx] = -1
    
    # Get top_n recommendations
    top_indices = np.argsort(hybrid_scores)[::-1][:top_n]
    recommendations = [attraction_names[i] for i in top_indices]
    
    return recommendations


In [6]:
# Sample test users (you can filter or sample as needed)
test_users = df['UserId'].unique()[:20]

k = 5
precisions = []
recalls = []

for user in test_users:
    user_data = df[df['UserId'] == user]
    relevant_attractions = user_data[user_data['Rating'] >= 4]['Attraction'].tolist()  # consider rating >=4 as relevant
    
    if len(relevant_attractions) == 0:
        continue
    
    # Pick a random attraction from user's liked list as seed
    seed_attraction = relevant_attractions[0]
    
    recommended = hybrid_recommendations_weighted(user, seed_attraction, top_n=k, alpha=0.5)
    
    prec = precision_at_k(recommended, relevant_attractions, k)
    rec = recall_at_k(recommended, relevant_attractions, k)
    
    precisions.append(prec)
    recalls.append(rec)

print(f"Avg Precision@{k}: {np.mean(precisions):.3f}")
print(f"Avg Recall@{k}: {np.mean(recalls):.3f}")


Avg Precision@5: 0.129
Avg Recall@5: 0.216


In [7]:
import pickle

model_objects = {
    "attraction_names": attraction_names,
    "similarity_matrix": similarity_matrix,
    "user_item_matrix": user_item_matrix,
    "item_similarity_df": item_similarity_df,
    "scaler": scaler
}

with open(r"C:\Tourism\hybrid_recommender_model.pkl", "wb") as f:
    pickle.dump(model_objects, f)
