In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
class TourismRecommender:
    """
    Hybrid recommender for tourist attractions.

    Combines user-based collaborative filtering (cosine similarity between
    users' rating vectors) with content-based filtering (cosine similarity
    between one-hot encoded attraction attributes plus scaled visit-count and
    mean-rating statistics). The ``preprocess_data`` / ``build_*`` methods
    populate the instance attributes and return ``self`` so they can be
    chained; the ``recommend_*`` methods read those attributes.
    """

    # Rating treated as "neutral" when centring ratings for content-based
    # scoring: ratings above it pull a prediction up, ratings below push it down.
    # NOTE(review): presumably the midpoint of a 1-5 rating scale - confirm
    # against the dataset.
    NEUTRAL_RATING = 3

    # Smallest candidate pool requested from each sub-recommender by
    # recommend_hybrid before the two rankings are blended.
    HYBRID_MIN_CANDIDATES = 10

    def __init__(self):
        # users x attractions rating matrix (0 = no rating); set by preprocess_data
        self.user_item_matrix = None
        # user x user cosine similarities; set by build_collaborative_model
        self.user_similarity_matrix = None
        # attraction x attraction cosine similarities of rating columns;
        # set by build_collaborative_model
        self.item_similarity_matrix = None
        # one row per attraction: dummy-encoded attributes + visit_count + Rating;
        # set by preprocess_data, numeric columns rescaled by build_content_model
        self.attraction_features = None
        # attraction x attraction cosine similarities of attraction_features;
        # set by build_content_model
        self.content_similarity_matrix = None
        # one row per user; set by build_user_profiles
        self.user_profiles = None
        # never assigned by any method of this class
        self.attraction_profiles = None

    def preprocess_data(self, df):
        """
        Preprocess the tourism dataset for the recommendation system.

        Builds ``self.user_item_matrix`` and ``self.attraction_features``.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the columns ``UserId``, ``AttractionId``, ``Rating``,
            ``AttractionType``, ``Continent``, ``Region`` and ``Country``.

        Returns
        -------
        TourismRecommender
            ``self``, to allow method chaining.
        """
        print("Preprocessing data...")

        # Users as rows, attractions as columns, ratings as values. Repeated
        # (user, attraction) pairs are averaged (pivot_table's default
        # aggregation); pairs with no rating become 0.
        self.user_item_matrix = df.pivot_table(
            index='UserId',
            columns='AttractionId',
            values='Rating',
            fill_value=0
        )

        # Descriptive attributes are taken from the first row seen for each
        # attraction.
        # NOTE(review): build_user_profiles reads Continent/Country from the
        # same frame as *user* demographics - confirm these columns really
        # describe the attraction.
        attraction_data = df.drop_duplicates('AttractionId')[
            ['AttractionId', 'AttractionType', 'Continent', 'Region', 'Country']
        ]

        # One-hot encode the categorical attributes for content-based filtering
        encoded = pd.get_dummies(
            attraction_data,
            columns=['AttractionType', 'Continent', 'Region', 'Country'],
            drop_first=False
        ).set_index('AttractionId')

        # Number of rating rows and mean rating per attraction
        attraction_stats = df.groupby('AttractionId').agg({
            'UserId': 'count',
            'Rating': 'mean'
        }).rename(columns={'UserId': 'visit_count'})

        # Merge the statistics into the feature table; anything missing becomes 0
        self.attraction_features = encoded.join(attraction_stats).fillna(0)

        print(f"Processed data: {self.user_item_matrix.shape[0]} users, {self.user_item_matrix.shape[1]} attractions")
        return self

    def build_collaborative_model(self):
        """
        Build the collaborative filtering similarity matrices.

        Computes cosine similarity between the rows of ``user_item_matrix``
        (user-user) and between its columns (item-item) and stores both as
        DataFrames labelled with the user / attraction ids. The user-user
        matrix has one row and one column per user, so its size grows
        quadratically with the number of users.

        Requires ``preprocess_data`` to have been called.

        Returns
        -------
        TourismRecommender
            ``self``, to allow method chaining.
        """
        print("Building collaborative filtering model...")

        user_ids = self.user_item_matrix.index
        attraction_ids = self.user_item_matrix.columns

        # User-user similarity, labelled for easier indexing
        self.user_similarity_matrix = pd.DataFrame(
            cosine_similarity(self.user_item_matrix),
            index=user_ids,
            columns=user_ids
        )

        # Item-item similarity (rating columns compared across users)
        self.item_similarity_matrix = pd.DataFrame(
            cosine_similarity(self.user_item_matrix.T),
            index=attraction_ids,
            columns=attraction_ids
        )

        print("Collaborative model built successfully")
        return self

    def build_content_model(self):
        """
        Build the content-based similarity matrix from attraction features.

        Min-max scales the ``visit_count`` and ``Rating`` columns of
        ``attraction_features`` (overwriting the stored values), then stores
        the attraction-attraction cosine similarity of the full feature table
        in ``content_similarity_matrix``.

        Requires ``preprocess_data`` to have been called.

        Returns
        -------
        TourismRecommender
            ``self``, to allow method chaining.
        """
        print("Building content-based filtering model...")

        # Bring the two numeric statistics onto the same 0-1 range as the
        # dummy columns so they do not dominate the cosine similarity.
        scaler = MinMaxScaler()
        numerical_cols = ['visit_count', 'Rating']
        self.attraction_features[numerical_cols] = scaler.fit_transform(self.attraction_features[numerical_cols])

        attraction_ids = self.attraction_features.index
        self.content_similarity_matrix = pd.DataFrame(
            cosine_similarity(self.attraction_features),
            index=attraction_ids,
            columns=attraction_ids
        )

        print("Content-based model built successfully")
        return self

    def build_user_profiles(self, df):
        """
        Build user profiles from rating history and per-user columns.

        Each profile row holds the user's ``Continent``, ``Country`` and
        ``user_avg_rating_before`` (first value found per user) followed by
        the user's mean rating per ``AttractionType`` and per ``Region``
        (0 where the user has no rating in that category).

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain ``UserId``, ``AttractionType``, ``Region``,
            ``Rating``, ``Continent``, ``Country`` and
            ``user_avg_rating_before``.

        Returns
        -------
        TourismRecommender
            ``self``, to allow method chaining.
        """
        print("Building user profiles...")

        # First available value of every column for each user
        user_data = df.groupby('UserId').first()

        # Mean rating the user gave to each attraction type
        user_preferences = df.groupby(['UserId', 'AttractionType'])['Rating'].mean().unstack(fill_value=0)

        # Mean rating the user gave within each region
        user_regions = df.groupby(['UserId', 'Region'])['Rating'].mean().unstack(fill_value=0)

        # Combine into one row per user; missing values become 0
        profiles = user_data[['Continent', 'Country', 'user_avg_rating_before']]
        profiles = profiles.join(user_preferences, how='left')
        profiles = profiles.join(user_regions, how='left')
        self.user_profiles = profiles.fillna(0)

        print(f"Built {len(self.user_profiles)} user profiles")
        return self

    def recommend_collaborative(self, user_id, n_recommendations=5, n_neighbors=10):
        """
        Generate recommendations using user-based collaborative filtering.

        The ``n_neighbors`` most similar users are selected. For every
        attraction the target user has not rated, each neighbour's rating
        (when > 0) is multiplied by that neighbour's similarity; the products
        are summed (``pred_rating``) and divided by the number of neighbours
        that rated the attraction (``count``) to give ``final_rating``.

        Parameters
        ----------
        user_id
            Label of the user in ``user_similarity_matrix``.
        n_recommendations : int
            Maximum number of rows returned.
        n_neighbors : int
            Number of most-similar users consulted.

        Returns
        -------
        pandas.DataFrame
            Indexed by attraction id with numeric columns ``pred_rating``,
            ``count`` and ``final_rating``, sorted by ``final_rating``
            descending. An empty DataFrame if the user is unknown or no
            neighbour rated an unseen attraction.
        """
        if user_id not in self.user_similarity_matrix.index:
            print(f"User {user_id} not found in training data")
            return pd.DataFrame()

        # Attractions the target user has not rated yet
        user_ratings = self.user_item_matrix.loc[user_id]
        unseen_items = user_ratings[~(user_ratings > 0)].index

        # Most similar users, excluding the user itself
        neighbours = (
            self.user_similarity_matrix[user_id]
            .sort_values(ascending=False)
            .drop(user_id)
            .head(n_neighbors)
        )

        # Neighbours' ratings restricted to the unseen attractions
        neighbour_ratings = self.user_item_matrix.loc[neighbours.index, unseen_items]
        was_rated = neighbour_ratings > 0

        # Similarity-weighted rating sum and number of contributing neighbours
        weighted_sum = (
            neighbour_ratings.where(was_rated, 0)
            .mul(neighbours, axis=0)
            .sum(axis=0)
        )
        rating_count = was_rated.sum(axis=0)

        recommendations = pd.DataFrame({
            'pred_rating': weighted_sum.astype(float),
            'count': rating_count,
        })
        # Only attractions that at least one neighbour rated are candidates
        recommendations = recommendations[recommendations['count'] > 0].copy()
        if recommendations.empty:
            return pd.DataFrame()

        recommendations['final_rating'] = recommendations['pred_rating'] / recommendations['count']

        return recommendations.sort_values('final_rating', ascending=False).head(n_recommendations)

    def recommend_content_based(self, user_id, n_recommendations=5):
        """
        Generate recommendations using content-based filtering.

        For every attraction the user has not rated, the prediction is
        ``NEUTRAL_RATING`` plus the similarity-weighted mean of the user's
        ratings centred on ``NEUTRAL_RATING``, using
        ``content_similarity_matrix`` for the weights. Attractions whose
        absolute similarities to the user's rated attractions sum to 0 are
        skipped.

        Parameters
        ----------
        user_id
            Label of the user in ``user_item_matrix``.
        n_recommendations : int
            Maximum number of rows returned.

        Returns
        -------
        pandas.DataFrame
            Indexed by attraction id with a ``pred_rating`` column, sorted
            descending. An empty DataFrame if the user is unknown or no
            attraction could be scored (for example when the user has
            already rated every attraction).
        """
        if user_id not in self.user_item_matrix.index:
            print(f"User {user_id} not found in training data")
            return pd.DataFrame()

        user_ratings = self.user_item_matrix.loc[user_id]
        rated_items = user_ratings[user_ratings > 0]

        # Candidates: every known attraction the user has not rated,
        # in feature-table order
        all_items = self.attraction_features.index
        candidates = all_items[~all_items.isin(rated_items.index)]

        # Only rated attractions present in the content model can contribute
        known_rated = rated_items[rated_items.index.isin(self.content_similarity_matrix.index)]

        similarities = self.content_similarity_matrix.loc[candidates, known_rated.index]
        centred_ratings = known_rated - self.NEUTRAL_RATING

        weighted_sum = similarities.mul(centred_ratings, axis=1).sum(axis=1)
        similarity_sum = similarities.abs().sum(axis=1)

        # Skip candidates with no similarity signal (also avoids dividing by 0)
        has_signal = similarity_sum > 0
        predicted = self.NEUTRAL_RATING + weighted_sum[has_signal] / similarity_sum[has_signal]

        # Nothing to rank: return an empty frame instead of sorting a
        # frame that has no 'pred_rating' column.
        if predicted.empty:
            return pd.DataFrame()

        recommendations = pd.DataFrame({'pred_rating': predicted})
        return recommendations.sort_values('pred_rating', ascending=False).head(n_recommendations)

    @staticmethod
    def _min_max_normalize(scores):
        """Rescale a Series to 0-1; a constant Series maps to 0.5 everywhere."""
        scores = scores.astype(float)
        low = scores.min()
        high = scores.max()
        if high > low:
            return (scores - low) / (high - low)
        return pd.Series(0.5, index=scores.index)

    def recommend_hybrid(self, user_id, n_recommendations=5, collab_weight=0.7):
        """
        Generate recommendations using a hybrid approach (collaborative + content).

        Requests a candidate pool from each sub-recommender, min-max
        normalises each pool's scores to 0-1 and blends them as
        ``cf_rating * cf_weight + cb_rating * cb_weight``. An attraction
        missing from the collaborative pool gets ``cf_rating`` 0; one missing
        from the content pool gets ``cb_rating`` 0 and ``cb_weight`` 0.

        Parameters
        ----------
        user_id
            Label of the user.
        n_recommendations : int
            Maximum number of rows returned.
        collab_weight : float
            Weight of the collaborative score; the content score is weighted
            ``1 - collab_weight``.

        Returns
        -------
        pandas.DataFrame
            Indexed by attraction id with columns ``cf_rating``,
            ``cf_weight``, ``cb_rating``, ``cb_weight`` and
            ``hybrid_rating``, sorted by ``hybrid_rating`` descending.
            If only one sub-recommender produced results, that
            sub-recommender's own frame (with its own columns) is returned
            instead; if neither did, an empty DataFrame.
        """
        # Ask for at least as many candidates as the caller wants back, so
        # requests for more than HYBRID_MIN_CANDIDATES items can be satisfied.
        pool_size = max(self.HYBRID_MIN_CANDIDATES, n_recommendations)

        cf_recs = self.recommend_collaborative(user_id, n_recommendations=pool_size)
        cb_recs = self.recommend_content_based(user_id, n_recommendations=pool_size)

        # Fall back to whichever recommender produced something
        if cf_recs.empty and cb_recs.empty:
            return pd.DataFrame()
        if cf_recs.empty:
            return cb_recs.head(n_recommendations)
        if cb_recs.empty:
            return cf_recs.head(n_recommendations)

        # Normalise to 0-1 so the two score scales can be combined fairly
        cf_scores = self._min_max_normalize(cf_recs['final_rating'])
        cb_scores = self._min_max_normalize(cb_recs['pred_rating'])

        # Collaborative candidates first, then content-only candidates
        cb_only = [item for item in cb_scores.index if item not in cf_scores.index]
        all_items = list(cf_scores.index) + cb_only
        in_content_pool = pd.Index(all_items).isin(cb_scores.index)

        hybrid_recs = pd.DataFrame(
            {
                'cf_rating': cf_scores.reindex(all_items).fillna(0.0).to_numpy(),
                'cf_weight': collab_weight,
                'cb_rating': cb_scores.reindex(all_items).fillna(0.0).to_numpy(),
                'cb_weight': [
                    (1 - collab_weight) if present else 0.0
                    for present in in_content_pool
                ],
            },
            index=all_items,
        )

        hybrid_recs['hybrid_rating'] = (
            (hybrid_recs['cf_rating'] * hybrid_recs['cf_weight']) +
            (hybrid_recs['cb_rating'] * hybrid_recs['cb_weight'])
        )

        return hybrid_recs.sort_values('hybrid_rating', ascending=False).head(n_recommendations)

    def get_attraction_details(self, attraction_ids, attraction_data):
        """
        Get details for recommended attractions.

        Parameters
        ----------
        attraction_ids : iterable
            Attraction ids to look up.
        attraction_data : pandas.DataFrame
            Must contain ``AttractionId``, ``Attraction``, ``AttractionType``,
            ``Country``, ``Region`` and ``Rating``.

        Returns
        -------
        pandas.DataFrame
            Every row of ``attraction_data`` whose ``AttractionId`` is in
            ``attraction_ids``, in ``attraction_data`` order (not in
            recommendation order), restricted to the columns listed above.
        """
        is_recommended = attraction_data['AttractionId'].isin(attraction_ids)
        detail_columns = ['AttractionId', 'Attraction', 'AttractionType', 'Country', 'Region', 'Rating']
        return attraction_data[is_recommended][detail_columns]

    def save_model(self, filepath):
        """
        Save the trained recommendation model.

        Pickles the whole instance, including every similarity matrix,
        to ``filepath``.
        """
        with open(filepath, 'wb') as f:
            pickle.dump(self, f)
        print(f"Model saved to {filepath}")

    @classmethod
    def load_model(cls, filepath):
        """
        Load a trained recommendation model.

        Can be called on the class (``TourismRecommender.load_model(path)``)
        or on an instance.

        Returns
        -------
        object
            Whatever object was pickled at ``filepath``.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file - only load model files that come from a trusted source.
        with open(filepath, 'rb') as f:
            model = pickle.load(f)
        print(f"Model loaded from {filepath}")
        return model

In [3]:
if __name__ == "__main__":
    # Read the combined tourism dataset from the Excel workbook
    df = pd.read_excel('C:/Users/Saima Modak/Capstone Projects/Tourism Analysis/Datasets/Final Dataset.xlsx')

    # Create the recommender, then fit every component in one fluent chain
    # (each builder method returns the recommender itself).
    recommender = TourismRecommender()
    (
        recommender
        .preprocess_data(df)
        .build_collaborative_model()
        .build_content_model()
        .build_user_profiles(df)
    )

    # Destination of the pickled, fully trained recommender
    model_path = "C:/Users/Saima Modak/Capstone Projects/Tourism Analysis/Models/Recommendation Model.pkl"

    # Persist the trained model to disk
    recommender.save_model(model_path)

Preprocessing data...
Processed data: 33530 users, 30 attractions
Building collaborative filtering model...
Collaborative model built successfully
Building content-based filtering model...
Content-based model built successfully
Building user profiles...
Built 33530 user profiles
Model saved to C:/Users/Saima Modak/Capstone Projects/Tourism Analysis/Models/Recommendation Model.pkl
