In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
from sklearn.neighbors import NearestNeighbors
import random
random.seed(10)

## Read in dataset

In [2]:
def read_first_n_lines(file_path, n=300000):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [next(f) for _ in range(n)]
    return pd.read_json(''.join(lines), lines=True)

In [3]:
clean_df = read_first_n_lines('merged_interaction_cleaned_data.json', 300000)
#display(clean_df.head())

## Step 1: Filter and Prepare Data

In [4]:
# Filter: keep only active users and popular businesses
user_counts = clean_df['user_id'].value_counts()
item_counts = clean_df['business_id'].value_counts()

cf_df = clean_df[
    clean_df['user_id'].isin(user_counts[user_counts >= 5].index) &
    clean_df['business_id'].isin(item_counts[item_counts >= 5].index)]
# Reset index
cf_df = cf_df.reset_index(drop=True)

This filters the original `clean_df` to create a new DataFrame `cf_df` that includes:

- Only active users: those who wrote at least 5 reviews.

- Only popular businesses: those with at least 5 reviews.


This is important because:

- Collaborative Filtering performs poorly on sparse data.

- Filtering out infrequent users/items helps create a denser user-item matrix, improving recommendation quality.

In [5]:
# Prepare business metadata (drop duplicates)
business_df = clean_df.drop_duplicates(subset='business_id')[['business_id', 'business_name', 'categories']].copy()
business_df['categories'] = business_df['categories'].fillna('').astype(str)

## Step 2: Collaborative Filtering (SVD)

In [6]:
# Create Surprise dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(cf_df[['user_id', 'business_id', 'user_rating']], reader)
trainset = data.build_full_trainset()

In [7]:
# Train SVD model
svd_model = SVD()
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x22273553dd0>

In [8]:
# Get all business IDs
all_business_ids = cf_df['business_id'].unique()

def recommend_cf(user_id, model, business_ids, cf_df, business_df, n=5):
    rated = set(cf_df[cf_df['user_id'] == user_id]['business_id'])
    predictions = [model.predict(user_id, bid) for bid in business_ids if bid not in rated]
    predictions.sort(key=lambda x: x.est, reverse=True)

    result = []
    for pred in predictions:
        bid = pred.iid
        match = business_df[business_df['business_id'] == bid]
        if not match.empty and pd.notna(match.iloc[0]['business_name']):
            business_info = match.iloc[0]
            result.append({
                'business_id': business_info['business_id'],
                'business_name': business_info['business_name'],
                'categories': business_info['categories'],
                'predicted_rating': round(pred.est, 2)
            })
        if len(result) == n:
            break  # Stop once we have n valid recommendations

    return pd.DataFrame(result)


In [9]:
# Choose a user from dataset
sample_user_id = cf_df['user_id'].iloc[100]

# Collaborative Filtering Top 5
print("Top 5 Collaborative Filtering Recommendations:")
cf_top5 = recommend_cf(sample_user_id, svd_model, all_business_ids, cf_df, business_df)

# Format column names and display as a table
cf_top5_pretty = cf_top5.rename(columns={
    'business_id': 'Business ID',
    'business_name': 'Business Name',
    'categories': 'Categories',
    'predicted_rating': 'Predicted Rating'
}).reset_index(drop=True)

cf_top5_pretty.index += 1  # Start index at 1

display(cf_top5_pretty)

Top 5 Collaborative Filtering Recommendations:


Unnamed: 0,Business ID,Business Name,Categories,Predicted Rating
1,kpK6SmxIiNoGPNhlEGiL4w,Trattoria Marcella,"['Italian', 'Local Flavor', 'Restaurants']",3.7
2,u_S84xiDCRJ3CV204CIYuQ,Silver In the City,"['Event Planning & Services', 'Gift Shops', 'H...",3.52
3,AzseSGgDC6bVtMPEYo1CNQ,Creole Creamery,"['Food', 'Ice Cream & Frozen Yogurt']",3.4
4,FHNIvNgh3fS7VZQq2Y3dsA,Cheu Noodle Bar,"['Asian Fusion', 'Tapas/Small Plates', 'Americ...",3.33
5,mzZ_WTb2zvyJMBkm84B-WA,Dorignac's Food Center,"['Donuts', 'Delis', 'Event Planning & Services...",3.32


`Predicted Rating` value is the model's prediction of how much a user would rate a business they haven't interacted with yet.

## Step 3: Build TF-IDF for Content-Based Filtering

In [10]:
#business_df['categories'] = business_df['categories'].fillna('').astype(str)
#business_df = business_df[business_df['categories'].str.strip() != '']

In [11]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(business_df['categories'])

# Fit model using cosine distance
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')
nn_model.fit(tfidf_matrix)

# Index map
business_idx = pd.Series(business_df.index, index=business_df['business_id']).drop_duplicates()

Cosine similarity matrix is too large to fit into memory. Trying to create a full pairwise similarity matrix between ~50,000 businesses, which results in over 2.5 billion values. Thus, using `NearestNeighbours` is a better option.

In [13]:
def content_based_top_5_nn(user_id, cf_df, business_df, tfidf_matrix, nn_model, business_idx, top_n=5):
    # Get user's highest-rated business
    user_ratings = cf_df[cf_df['user_id'] == user_id]
    if user_ratings.empty:
        return pd.DataFrame()

    top_rated = user_ratings.sort_values(by='user_rating', ascending=False).iloc[0]
    target_business_id = top_rated['business_id']

    if target_business_id not in business_idx:
        return pd.DataFrame()

    # Get TF-IDF vector for that business
    idx = business_idx[target_business_id]
    business_vector = tfidf_matrix[idx]

    # Find top similar businesses
    distances, indices = nn_model.kneighbors(business_vector, n_neighbors=top_n + 10)  # get extra to filter later
    similarities = 1 - distances.flatten()

    # Build results, excluding the target itself
    rec_indices = indices.flatten()[1:]
    sim_scores = similarities[1:]

    # Create DataFrame
    result = business_df.iloc[rec_indices][['business_id', 'business_name', 'categories']].copy()
    result['similarity_score'] = sim_scores.round(2)

    # Filter out businesses with missing or None business names
    result = result[result['business_name'].notna()]

    # Return top_n valid recommendations
    return result.head(top_n).reset_index(drop=True)


In [14]:
#sample_user_id = cf_df['user_id'].iloc[100]

print("Top 5 Content-Based Recommendations (NearestNeighbors):")
cbf_nn_top5 = content_based_top_5_nn(
    sample_user_id, cf_df, business_df, tfidf_matrix, nn_model, business_idx
)

# Pretty display
cbf_nn_top5_pretty = cbf_nn_top5.rename(columns={
    'business_id': 'Business ID',
    'business_name': 'Business Name',
    'categories': 'Categories',
    'similarity_score': 'Similarity Score'
})
cbf_nn_top5_pretty.index += 1
display(cbf_nn_top5_pretty)

Top 5 Content-Based Recommendations (NearestNeighbors):


Unnamed: 0,Business ID,Business Name,Categories,Similarity Score
1,AwmeLVLEfdFoCa0LaF0UYA,The Beer Store,"['Food', 'Beer', 'Wine & Spirits']",1.0
2,qrVCN6M272dDdEKPB1E44w,Moore Beverage,"['Food', 'Beer', 'Wine & Spirits']",1.0
3,9EgoCSRFfXUEGiJWFQLJ1w,Fine Wine & Good Spirits - Premium Collection,"['Beer', 'Wine & Spirits', 'Food']",1.0
4,mSAJEUM6soxsZ9Uk3kXlmQ,Brewer's Haven,"['Beer', 'Wine & Spirits', 'Food']",1.0
5,gM0HFwD_myHUAUSUN2boug,Crown Liquors,"['Food', 'Beer', 'Wine & Spirits']",1.0


- The similarity score in content-based filtering tells how similar a recommended business is to another business the user liked based on their categories.

- Score of 1.0 means perfect match

- Collaborative Filtering using matrix factorization (SVD) and Content-Based Filtering using TF-IDF on categories

Collaborative Filtering (SVD) 

- Description : Learns latent user & item features	

- Output: Top 5 recommended business_ids based on predicted rating





Content-Based Filtering

- Description : Uses TF-IDF on business categories

- Output: Top 5 similar businesses to the user's favorite


### Evaluation - CF

In [15]:
from collections import defaultdict
import numpy as np

def precision_f1_at_k(predictions, k=5, threshold=3.5):
    user_metrics = defaultdict(lambda: {'tp': 0, 'fp': 0, 'fn': 0})

    for uid, iid, true_r, est, _ in predictions:
        relevant = true_r >= threshold
        recommended = est >= threshold

        if recommended and relevant:
            user_metrics[uid]['tp'] += 1
        elif recommended and not relevant:
            user_metrics[uid]['fp'] += 1
        elif not recommended and relevant:
            user_metrics[uid]['fn'] += 1

    precisions = []
    f1_scores = []

    for uid in user_metrics:
        tp = user_metrics[uid]['tp']
        fp = user_metrics[uid]['fp']
        fn = user_metrics[uid]['fn']

        if tp + fp > 0:
            precision = tp / (tp + fp)
            precisions.append(precision)

            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            if precision + recall > 0:
                f1 = 2 * precision * recall / (precision + recall)
                f1_scores.append(f1)

    return round(np.mean(precisions), 4), round(np.mean(f1_scores), 4)

In [16]:
from surprise.model_selection import train_test_split

# Split dataset and retrain the model
trainset, testset = train_test_split(data, test_size=0.2)
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)

# Evaluate
prec, f1 = precision_f1_at_k(predictions, k=5)
print(f"Precision@5: {prec}")
print(f"F1@5: {f1}")

Precision@5: 0.6865
F1@5: 0.7977


68.65% of the items shown in the top-5 are relevant.
F1 balances precision and the chance of missing relevant ones.

### Evaluation - CB

In [17]:
def evaluate_cbf_precision_f1(user_id, cf_df, recommendations, threshold=3):
    
    # Get businesses the user rated as relevant (≥ threshold)
    relevant_businesses = set(cf_df[(cf_df['user_id'] == user_id) & (cf_df['user_rating'] >= threshold)]['business_id'])

    if not relevant_businesses:
        return None, None  # Can't evaluate without ground truth

    # Get recommended businesses
    rec_businesses = set(recommendations['business_id'])

    # Compute true positives, false positives, false negatives
    tp = len(rec_businesses & relevant_businesses)
    fp = len(rec_businesses - relevant_businesses)
    fn = len(relevant_businesses - rec_businesses)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return round(precision, 4), round(f1, 4)

In [18]:
sample_user = cf_df['user_id'].iloc[100]

cbf_top5 = content_based_top_5_nn(
    sample_user, cf_df, business_df, tfidf_matrix, nn_model, business_idx
)

cbf_prec, cbf_f1 = evaluate_cbf_precision_f1(sample_user, cf_df, cbf_top5)

if cbf_prec is not None:
    print(f"CBF Precision@5: {cbf_prec}")
    print(f"CBF F1@5: {cbf_f1}")
else:
    print("CBF Evaluation skipped: no relevant businesses found for this user.")

CBF Precision@5: 0.2
CBF F1@5: 0.25


High similarity means the model did its job, but low Precision@5 means the recommendations didn't align with the user’s actual history — often because of sparse ratings or strict thresholds.

Though similarity scores were high, Precision@5 was lower because many recommended businesses were not rated by the user.”