<a href="https://colab.research.google.com/github/ritwikraha/ml-dataset-reviews/blob/main/notebooks/RecSys_Booking_RecTour_24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [None]:
# Download the NLTK data used by preprocess_text: tokenizer models and stop words.
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than 'punkt',
# so both are requested; a package name the installed NLTK does not know is
# reported by the downloader and skipped (download() returns False).
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Read the reviews CSV (looked up relative to the working directory) into `df`,
# the frame every later cell works on.
df = pd.read_csv(filepath_or_buffer='booking_reviews_dataset.csv')

# Preprocess text
_ENGLISH_STOP_WORDS = None  # filled on first use so the corpus is read only once


def _english_stop_words():
    """Return NLTK's English stop words as a set, building it on the first call only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalise one review string for TF-IDF vectorisation.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, tokenize with NLTK, remove English stop words, and re-join
    the remaining tokens with single spaces.

    Parameters
    ----------
    text : str or any
        Raw review text. Anything that is not a ``str`` (for example the
        ``NaN`` pandas uses for a missing value, or ``None``) is treated as
        an empty review.

    Returns
    -------
    str
        The cleaned, space-separated tokens; ``''`` for non-string input.
    """
    # Missing values reach this function as float NaN / None, which have no
    # .lower(); return an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()
    # Remove special characters (digits and punctuation are deleted, not
    # replaced by a space)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = _english_stop_words()
    return ' '.join(t for t in tokens if t not in stop_words)


In [None]:
# Combine positive and negative reviews into one text field per row.
# Both sides are filled with '' first: a NaN on either side would turn the
# whole concatenation into NaN, discarding the other half of the review and
# handing a float to preprocess_text.
df['combined_review'] = (
    df['review_positive'].fillna('') + ' ' + df['review_negative'].fillna('')
)
df['combined_review'] = df['combined_review'].apply(preprocess_text)

# Turn the cleaned review text into TF-IDF vectors, with the vocabulary
# capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['combined_review'])

# Pairwise cosine similarity of every review against every other review
# (omitting the second argument compares the matrix with itself).
# NOTE(review): the result has one row and one column per review and is not
# read by any cell visible here — confirm it is needed before running this on
# a large dataset.
cosine_sim = cosine_similarity(tfidf_matrix)

# Rescale the three numeric features with MinMaxScaler; the scaled values
# overwrite the original columns in `df`.
scaler = MinMaxScaler()
df[['review_score', 'review_helpful_votes', 'room_nights']] = scaler.fit_transform(
    df[['review_score', 'review_helpful_votes', 'room_nights']]
)

# Weighted score: 0.4 * review_score + 0.3 * review_helpful_votes + 0.3 * room_nights
df['weighted_score'] = (
    df['review_score'].mul(0.4)
    .add(df['review_helpful_votes'].mul(0.3))
    .add(df['room_nights'].mul(0.3))
)

In [None]:
def get_recommendations(accommodation_id, user_id, top_n=10):
    """Rank one accommodation's reviews for one user and return the top review IDs.

    Reads the module-level ``df``, which must already contain the columns
    ``accommodation_id``, ``user_id``, ``review_id``, ``review_score``,
    ``review_helpful_votes``, ``room_nights`` and ``weighted_score``.

    Ranking:
    * If the user has no rows in ``df``, reviews are ordered by
      ``weighted_score`` alone.
    * Otherwise each review gets
      ``0.7 * weighted_score + 0.3 * user_similarity``, where
      ``user_similarity`` is the cosine similarity between the review's three
      numeric features and the mean of those features over the user's rows.

    Parameters
    ----------
    accommodation_id : value compared against ``df['accommodation_id']``
    user_id : value compared against ``df['user_id']``
    top_n : int, default 10
        Maximum number of review IDs to return.

    Returns
    -------
    list
        Up to ``top_n`` ``review_id`` values, best first. Empty when the
        accommodation has no rows in ``df``; shorter than ``top_n`` when the
        accommodation has fewer reviews than that.
    """
    feature_cols = ['review_score', 'review_helpful_votes', 'room_nights']

    # Get reviews for the given accommodation. The .copy() makes this an
    # independent frame, so the score columns added below are not written to
    # a slice of the global df (which triggers SettingWithCopyWarning).
    accommodation_reviews = df[df['accommodation_id'] == accommodation_id].copy()

    if accommodation_reviews.empty:
        return []

    # Get the user's past reviews
    user_reviews = df[df['user_id'] == user_id]

    if user_reviews.empty:
        # If user has no reviews, return top reviews based on weighted score
        return (
            accommodation_reviews
            .sort_values('weighted_score', ascending=False)['review_id']
            .head(top_n)
            .tolist()
        )

    # Mean feature vector of the user's reviews, shaped (1, n_features)
    user_avg_features = (
        user_reviews[feature_cols].mean().to_numpy(dtype=float).reshape(1, -1)
    )

    # Similarity of every candidate review to that mean vector, computed in a
    # single call instead of one cosine_similarity call per row.
    accommodation_reviews['user_similarity'] = cosine_similarity(
        accommodation_reviews[feature_cols].to_numpy(dtype=float),
        user_avg_features,
    ).ravel()

    # Blend the global weighted score with the per-user similarity
    accommodation_reviews['final_score'] = (
        accommodation_reviews['weighted_score'] * 0.7 +
        accommodation_reviews['user_similarity'] * 0.3
    )

    # Sort and return top N review IDs
    return (
        accommodation_reviews
        .sort_values('final_score', ascending=False)['review_id']
        .head(top_n)
        .tolist()
    )


In [None]:
# Generate recommendations for all accommodation-user pairs
# NOTE(review): this walks the full cross product of accommodations and users,
# i.e. (#accommodations x #users) calls to get_recommendations — confirm that
# every combination is wanted rather than only the pairs that need a prediction.
accommodation_ids = df['accommodation_id'].unique()
user_ids = df['user_id'].unique()  # computed once instead of on every outer iteration

results = []
for accommodation_id in accommodation_ids:
    for user_id in user_ids:
        recommendations = get_recommendations(accommodation_id, user_id)
        # One row per pair: the two IDs followed by the recommended review IDs
        results.append([accommodation_id, user_id] + recommendations)


In [None]:
# Create the submission DataFrame: two ID columns followed by review_1 .. review_10
submission_columns = ['accommodation_id', 'user_id'] + [f'review_{i+1}' for i in range(10)]

# get_recommendations returns fewer than 10 IDs when an accommodation has fewer
# reviews. Pad short rows with None so every row matches the column count;
# otherwise DataFrame construction raises ValueError whenever no row happens to
# reach the full width.
padded_results = [
    list(row) + [None] * (len(submission_columns) - len(row))
    for row in results
]
submission_df = pd.DataFrame(padded_results, columns=submission_columns)

# Save the submission file
submission_df.to_csv('recommendation_submission.csv', index=False)

print("Recommendation system completed. Submission file saved as 'recommendation_submission.csv'.")