In [2]:
pip install faker


Collecting faker
  Downloading Faker-33.3.1-py3-none-any.whl.metadata (15 kB)
Collecting typing-extensions (from faker)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading Faker-33.3.1-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Installing collected packages: typing-extensions, faker
Successfully installed faker-33.3.1 typing-extensions-4.12.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
import random
import os
import csv
from faker import Faker

fake = Faker()

class DataGenerator:
    def __init__(self):
        self.users = []
        self.hotels = []
        self.activities = []
        self.packages = []
        self.bookings = []
        self.reviews = []

    def create_dummy_users(self, num_users=30):
        roles = {
            "hotel_owner": 10,        
            "activity_lister": 10,    
            "user": 30                
        }
        status_options = ["approved", "pending"]

        for role, count in roles.items():
            for i in range(count):
                email = f"{role}{i + 1}@example.com"
                status = status_options[i % 2] if role in ['hotel_owner', 'activity_lister'] else 'approved'

                user = {
                    'email': email,
                    'password': 'password123',
                    'role': role,
                    'is_approved': status
                }
                self.users.append(user)

        print(f"Dummy users created: {len(self.users)}")

    def create_dummy_hotels(self, num_hotels=20):
        hotel_names = [
            "Hotel Everest", "Mountain View Resort", "Lakeside Inn",
            "Himalayan Retreat", "Sunrise Hotel", "Luxury Escape",
            "Lama Guest House", "Budget Stay", "Urban Oasis", "Hilltop Haven", "Tranquil Stay", "Barahi Jungle Resort"
        ]
        locations = ["Kathmandu", "Pokhara", "Chitwan", "Lumbini", "Nagarkot", "Mustang", "Ghalegaun"]

        for _ in range(num_hotels):
            hotel = {
                'name': random.choice(hotel_names),
                'location': random.choice(locations),
                'price': random.randint(3000, 10000)
            }
            self.hotels.append(hotel)

        print(f"Dummy hotels created: {len(self.hotels)}")

    def create_dummy_activities(self, num_activities=20):
        activity_names = [
            "Paragliding", "Rafting", "Mountain Biking", "Jungle Safari",
            "Cultural Tour", "Trekking", "Hot Air Balloon Ride", "Bungee Jumping",
            "Zip Lining", "Rock Climbing", "Amusement Park", "Water Land"
        ]
        locations = ["Kathmandu", "Pokhara", "Chitwan", "Lumbini", "Nagarkot", "Mustang", "Ghalegaun"]

        for _ in range(num_activities):
            activity = {
                'name': random.choice(activity_names),
                'price': random.randint(1500, 5000),
                'location': random.choice(locations)
            }
            self.activities.append(activity)

        print(f"Dummy activities created: {len(self.activities)}")

    def create_dummy_packages(self, num_packages=50):
        locations = ["Kathmandu", "Pokhara", "Lumbini", "Everest Base Camp", "Chitwan", "Annapurna Circuit"]
        for i in range(num_packages):
            package = {
                'name': f"Package {i + 1}",
                'location': random.choice(locations),
                'description': 'Adventurous and peaceful trip',
                'base_price': random.randint(500, 3000),
                'duration': random.randint(3, 15),
                'availability': random.choice([True, False])
            }
            self.packages.append(package)

        print(f"Dummy packages created: {len(self.packages)}")

    def create_dummy_bookings(self, num_bookings=200):
        for _ in range(num_bookings):
            booking = {
                'user': random.choice(self.users),
                'package': random.choice(self.packages),
                'full_name': fake.name(),
                'phone_number': random.randint(9600000000, 9899999999),
                'additional_notes': fake.text(),
                'status': random.choice(['approved', 'pending', 'rejected']),
                'hotel': random.choice(self.hotels),
                'activity': random.choice(self.activities)
            }
            self.bookings.append(booking)

        print(f"Dummy bookings created: {len(self.bookings)}")

    def create_dummy_reviews(self, num_reviews=1000):
        for _ in range(num_reviews):
            review = {
                'user': random.choice(self.users),
                'package': random.choice(self.packages),
                'rating': random.randint(1, 5),
                'comment': random.choice([
                    "Amazing experience!",
                    "Could have been better.",
                    "Loved the trip, highly recommend!",
                    "Not worth the price.",
                    "Decent package overall."
                ])
            }
            self.reviews.append(review)

        print(f"Dummy reviews created: {len(self.reviews)}")

    def export_to_csv(self, filename='dummy_data.csv'):
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)

            # Write headers
            writer.writerow(["Type", "Name", "Details"])

            # Write users
            for user in self.users:
                writer.writerow(['User', user['email'], f"Role: {user['role']}, Status: {user['is_approved']}"])

            # Write hotels
            for hotel in self.hotels:
                writer.writerow(['Hotel', hotel['name'], f"Location: {hotel['location']}, Price: {hotel['price']}"])

            # Write activities
            for activity in self.activities:
                writer.writerow(['Activity', activity['name'], f"Location: {activity['location']}, Price: {activity['price']}"])

            # Write packages
            for package in self.packages:
                writer.writerow(['Package', package['name'], f"Location: {package['location']}, Price: {package['base_price']}, Duration: {package['duration']} days"])

            # Write bookings
            for booking in self.bookings:
                writer.writerow(['Booking', booking['full_name'], f"Package: {booking['package']['name']}, Hotel: {booking['hotel']['name']}, Activity: {booking['activity']['name']}"])

            # Write reviews
            for review in self.reviews:
                writer.writerow(['Review', review['user']['email'], f"Package: {review['package']['name']}, Rating: {review['rating']}, Comment: {review['comment']}"])

        print(f"Data exported to {filename}")

# Example usage
data_gen = DataGenerator()
data_gen.create_dummy_users()
data_gen.create_dummy_hotels()
data_gen.create_dummy_activities()
data_gen.create_dummy_packages()
data_gen.create_dummy_bookings()
data_gen.create_dummy_reviews()
data_gen.export_to_csv()


Dummy users created: 50
Dummy hotels created: 20
Dummy activities created: 20
Dummy packages created: 50
Dummy bookings created: 200
Dummy reviews created: 1000
Data exported to dummy_data.csv


In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

class RecommendationEngine:
    def __init__(self):
        self.cache_timeout = 3600  # 1 hour cache
        self.booking_weight = 0.7  # Weight for booking-based similarity
        self.review_weight = 0.3   # Weight for review-based similarity

    def get_user_preferences(self, user_id, ratings_data, bookings_data):
        """Get user's preferences based on both ratings and bookings"""
        user_ratings = ratings_data[ratings_data['user_id'] == user_id]
        user_bookings = bookings_data[bookings_data['user_id'] == user_id]

        ratings = {
            row['package_id']: row['rating'] 
            for _, row in user_ratings.iterrows()
        }

        bookings = {
            row['package_id']: self._get_booking_score(row['status'])
            for _, row in user_bookings.iterrows()
        }

        return ratings, bookings

    def _get_booking_score(self, status):
        """Convert booking status to numerical score"""
        status_scores = {
            'completed': 1.0,    
            'approved': 0.8,     
            'pending': 0.5,      
            'rejected': 0.2,     
            'cancelled': 0.3     
        }
        return status_scores.get(status.lower(), 0.5)

    def get_package_features(self, package, package_data):
        """Extract features from a package for content-based filtering"""
        package_info = package_data[package_data['package_id'] == package]
        
        if package_info.empty:
            return {}

        activities = package_info['activities'].values[0].split(',')
        hotels = package_info['hotels'].values[0].split(',')
        
        # Enhanced feature vector including popularity metrics
        features = {
            'location': package_info['location'].values[0],
            'price_range': self._get_price_range(package_info['base_price'].values[0]),
            'duration': package_info['duration'].values[0],
            'activities': ','.join(activities),
            'hotels': ','.join(hotels),
            'booking_rate': self._get_booking_rate(package),
            'completion_rate': self._get_completion_rate(package)
        }
        return features

    def _get_booking_rate(self, package):
        """Calculate booking rate for package"""
        package_bookings = bookings_data[bookings_data['package_id'] == package]
        total_views = len(package_bookings)
        if total_views == 0:
            return 0
        
        booking_count = package_bookings[package_bookings['status'].isin(['approved', 'completed'])].shape[0]
        return booking_count / total_views if total_views > 0 else 0

    def _get_completion_rate(self, package):
        """Calculate successful completion rate for package bookings"""
        package_bookings = bookings_data[bookings_data['package_id'] == package]
        total_bookings = package_bookings.shape[0]
        if total_bookings == 0:
            return 0
        
        completed_bookings = package_bookings[package_bookings['status'] == 'completed'].shape[0]
        return completed_bookings / total_bookings if total_bookings > 0 else 0

    def get_content_based_recommendations(self, user_id, ratings_data, bookings_data, package_data, num_recommendations=10):
        """Generate content-based recommendations using cosine similarity"""
        ratings, bookings = self.get_user_preferences(user_id, ratings_data, bookings_data)
        if not ratings and not bookings:
            return []

        # Get all packages and their features
        all_packages = package_data['package_id'].unique()
        package_features = {
            package: self.get_package_features(package, package_data) 
            for package in all_packages
        }

        # Convert features to numerical vectors
        df = pd.DataFrame.from_dict(package_features, orient='index')
        df_encoded = pd.get_dummies(df)

        # Calculate similarity between packages
        similarity_matrix = cosine_similarity(df_encoded)

        # Calculate weighted recommendations based on both ratings and bookings
        recommendations = {}
        for pkg_id in set(list(ratings.keys()) + list(bookings.keys())):
            try:
                pkg_idx = list(package_features.keys()).index(pkg_id)
                similarities = similarity_matrix[pkg_idx]
                
                # Combine rating and booking scores
                score = (
                    self.review_weight * ratings.get(pkg_id, 0) +
                    self.booking_weight * bookings.get(pkg_id, 0)
                )
                
                for idx, similarity in enumerate(similarities):
                    other_pkg_id = list(package_features.keys())[idx]
                    if other_pkg_id not in ratings and other_pkg_id not in bookings:
                        recommendations[other_pkg_id] = recommendations.get(other_pkg_id, 0) + (similarity * score)
            except ValueError:
                continue

        sorted_recommendations = sorted(
            recommendations.items(), 
            key=lambda x: x[1], 
            reverse=True
        )[:num_recommendations]

        return [pkg_id for pkg_id, score in sorted_recommendations]

    def get_collaborative_recommendations(self, user_id, ratings_data, bookings_data, num_recommendations=10):
        """Generate collaborative recommendations using Pearson correlation"""
        # Updated to avoid DeprecationWarning
        all_ratings = ratings_data.groupby('user_id').apply(
            lambda x: dict(zip(x['package_id'], x['rating']))
        ).reset_index(name='ratings').set_index('user_id')['ratings']

        all_bookings = bookings_data.groupby('user_id').apply(
            lambda x: dict(zip(x['package_id'], x['status'].apply(self._get_booking_score)))
        ).reset_index(name='bookings').set_index('user_id')['bookings']
        
        # Combine ratings and bookings for similarity calculation
        user_preferences = {}
        for user_id in set(list(all_ratings.keys()) + list(all_bookings.keys())):
            user_preferences[user_id] = {}
            for pkg_id in set(list(all_ratings.get(user_id, {}).keys()) + list(all_bookings.get(user_id, {}).keys())):
                user_preferences[user_id][pkg_id] = (
                    self.review_weight * all_ratings.get(user_id, {}).get(pkg_id, 0) +
                    self.booking_weight * all_bookings.get(user_id, {}).get(pkg_id, 0)
                )

        if user_id not in user_preferences:
            return []

        # Calculate similarity with other users
        user_similarities = {}
        user_prefs = user_preferences[user_id]
        
        for other_user_id, other_prefs in user_preferences.items():
            if other_user_id == user_id:
                continue

            common_packages = set(user_prefs.keys()) & set(other_prefs.keys())
            if len(common_packages) < 3:
                continue

            user_common_prefs = [user_prefs[pkg] for pkg in common_packages]
            other_common_prefs = [other_prefs[pkg] for pkg in common_packages]
            
            correlation, _ = pearsonr(user_common_prefs, other_common_prefs)
            if not np.isnan(correlation):
                user_similarities[other_user_id] = correlation

        # Get recommendations based on similar users
        recommendations = {}
        for other_user_id, similarity in user_similarities.items():
            if similarity <= 0:
                continue

            for package_id, score in user_preferences[other_user_id].items():
                if package_id not in user_prefs:
                    recommendations[package_id] = recommendations.get(package_id, 0) + (similarity * score)

        sorted_recommendations = sorted(
            recommendations.items(), 
            key=lambda x: x[1], 
            reverse=True
        )[:num_recommendations]

        return [pkg_id for pkg_id, score in sorted_recommendations]

    def get_popular_packages(self, package_data, num_recommendations=10):
        """Get popular packages based on bookings, ratings, and views"""
        popular_packages = package_data.groupby('package_id').apply(lambda x: {
            'avg_rating': x['rating'].mean(),
            'num_reviews': len(x),
            'num_bookings': len(x),
            'popularity_score': (x['rating'].mean() * 0.4) + (len(x) * 0.4) + (x['completion_rate'] * 0.2)
        })

        # Sort packages based on popularity score
        sorted_popular_packages = popular_packages.sort_values(by='popularity_score', ascending=False)
        return sorted_popular_packages.head(num_recommendations).index.tolist()

    def get_recommendations(self, user_id, ratings_data, bookings_data, package_data, num_recommendations=10):
        """Get hybrid recommendations combining different approaches"""
        if not user_id:
            return self.get_popular_packages(package_data, num_recommendations)

        ratings_count = ratings_data[ratings_data['user_id'] == user_id].shape[0]
        bookings_count = bookings_data[bookings_data['user_id'] == user_id].shape[0]
        
        if (ratings_count + bookings_count) < 3:
            return self.get_popular_packages(package_data, num_recommendations)

        # Get both types of recommendations
        content_based = set(self.get_content_based_recommendations(
            user_id, ratings_data, bookings_data, package_data, num_recommendations
        ))
        collaborative = set(self.get_collaborative_recommendations(
            user_id, ratings_data, bookings_data, num_recommendations
        ))

        # Combine recommendations
        hybrid_recommendations = list(content_based & collaborative)
        hybrid_recommendations.extend(list(content_based - collaborative))
        hybrid_recommendations.extend(list(collaborative - content_based))

        return hybrid_recommendations[:num_recommendations]


# Usage example:
engine = RecommendationEngine()
user_id = 1
# Assume ratings_data, bookings_data, and package_data are your input data frames
recommended_packages = engine.get_recommendations(user_id, ratings_data, bookings_data, package_data, num_recommendations=5)
print("Recommended Packages for User {}: {}".format(user_id, recommended_packages))


AttributeError: 'RecommendationEngine' object has no attribute '_get_price_range'

TypeError: RecommendationEngine.get_recommendations() missing 4 required positional arguments: 'user_id', 'ratings_data', 'bookings_data', and 'package_data'

In [19]:
# Example usage
if __name__ == "__main__":
    # Sample DataFrames
    ratings_data = pd.DataFrame([
        {'user_id': 1, 'package_id': 101, 'rating': 5},
        {'user_id': 1, 'package_id': 102, 'rating': 4},
        {'user_id': 2, 'package_id': 101, 'rating': 3},
        {'user_id': 2, 'package_id': 103, 'rating': 4},
        {'user_id': 3, 'package_id': 104, 'rating': 4}
    ])

    bookings_data = pd.DataFrame([
        {'user_id': 1, 'package_id': 101, 'status': 'completed'},
        {'user_id': 2, 'package_id': 101, 'status': 'approved'},
        {'user_id': 1, 'package_id': 103, 'status': 'pending'}
    ])

    package_data = pd.DataFrame([
        {'package_id': 101, 'activities': 'Trekking,Cultural Tour', 'hotels': 'Hotel A,Hotel B', 'location': 'Kathmandu', 'base_price': 200, 'duration': 5},
        {'package_id': 102, 'activities': 'Sightseeing', 'hotels': 'Hotel C,Hotel D', 'location': 'Pokhara', 'base_price': 150, 'duration': 4},
        {'package_id': 103, 'activities': 'Nature Tour', 'hotels': 'Hotel E', 'location': 'Chitwan', 'base_price': 180, 'duration': 3},
        {'package_id': 104, 'activities': 'Adventure, Jungle Safari', 'hotels': 'Hotel F', 'location': 'Lumbini', 'base_price': 220, 'duration': 6}
    ])

    recommender = RecommendationEngine()
    recommendations = recommender.get_recommendations(1, ratings_data, bookings_data, package_data)
    print("Recommended Packages for User 1:", recommendations)


Recommended Packages for User 1: [np.int64(104)]


  all_ratings = ratings_data.groupby('user_id').apply(lambda x: dict(zip(x['package_id'], x['rating'])))
  all_bookings = bookings_data.groupby('user_id').apply(lambda x: dict(zip(x['package_id'], x['status'].apply(self._get_booking_score))))
