In [1]:
import os
import urllib
import zipfile

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from pathlib import Path

In [2]:
# URLS for download - code later:
# https://business.yelp.com/external-assets/files/Yelp-JSON.zip
# https://business.yelp.com/external-assets/files/Yelp-Photos.zip

In [3]:
datapath = Path("../data/Yelp-JSON/Yelp-JSON/yelp_dataset")

In [4]:
chunk_size = 10000  # Adjust based on available memory

reviews = []  # List to store chunks
for chunk in pd.read_json(datapath / "yelp_academic_dataset_review.json", lines=True, chunksize = chunk_size):
    reviews.append(chunk)

reviews = pd.concat(reviews, ignore_index=True) 

In [5]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 480.0+ MB


In [6]:
print(reviews.memory_usage(deep=True).sum() / 1e9, "GB")  # Print memory usage in GB

6.316121498 GB


In [7]:
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [8]:
users = []  # List to store chunks
for chunk in pd.read_json(datapath / "yelp_academic_dataset_user.json", lines=True, chunksize = chunk_size):
    users.append(chunk)

users = pd.concat(users, ignore_index=True) 

In [9]:
np.count_nonzero(users['user_id'].unique())

1987897

In [10]:
users['user_id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1987897 entries, 0 to 1987896
Series name: user_id
Non-Null Count    Dtype 
--------------    ----- 
1987897 non-null  object
dtypes: object(1)
memory usage: 15.2+ MB


In [11]:
restaurants = []  # List to store chunks
chunk_size = 10000  # Adjust based on available memory

for chunk in pd.read_json(datapath / "yelp_academic_dataset_business.json", lines = True, chunksize = chunk_size):
    restaurants.append(chunk)

restaurants = pd.concat(restaurants, ignore_index=True) 

In [12]:
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         127123 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 16.1+ MB


In [13]:
print(restaurants.memory_usage(deep=True).sum() / 1e9, "GB")  # Print memory usage in GB

0.190214213 GB


In [14]:
print(users.memory_usage(deep=True).sum() / 1e9, "GB")  # Print memory usage in GB

3.457424948 GB


- Since number of users >>> number of items, we will use item-item approach.

In [15]:
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [16]:
restaurants['categories'].values

array(['Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists',
       'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services',
       'Department Stores, Shopping, Fashion, Home & Garden, Electronics, Furniture Stores',
       ...,
       'Shopping, Jewelry, Piercing, Toy Stores, Beauty & Spas, Accessories, Fashion',
       'Fitness/Exercise Equipment, Eyewear & Opticians, Shopping, Sporting Goods, Bikes',
       'Beauty & Spas, Permanent Makeup, Piercing, Tattoo'], dtype=object)

In [17]:
restaurants['categories'] = restaurants['categories'].fillna('')

In [None]:
tokenizer = lambda x: [cat.strip() for cat in x.split(',')]
vectorizer = CountVectorizer(tokenizer=tokenizer, binary=True)
vectorized_restaurants = vectorizer.fit_transform(restaurants['categories'])
vectorized_rest_df = pd.DataFrame(vectorized_restaurants.toarray().astype(np.float32), columns=vectorizer.get_feature_names_out())
vectorized_rest_df.index = restaurants['business_id'].values
vectorized_rest_df.index.name = None
row_norms = np.linalg.norm(vectorized_rest_df.values, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1
vectorized_rest_df[:] = vectorized_rest_df.values / row_norms



In [27]:
print(vectorized_rest_df.memory_usage(deep=True).sum() / 1e9, "GB")  # Print memory usage in GB

0.800893142 GB


In [20]:
# csmatrix = cosine_similarity(vectorized_restaurants)
# Can't compute cosine similarity matrix. It's just way too big to fit in RAM

In [28]:
ratings = reviews[['user_id', 'business_id', 'stars']]

In [None]:

class CollaborativeRecommender:

    def __init__(self, ratings, restaurants, restaurant_embeddings):
        self.ratings = ratings
        self.restaurants = restaurants
        self.restaurant_embeddings = restaurant_embeddings


    def generate_recommendations(self, user):
        """
        Args: user id e.g. Pns2l4eNsfO8kk83dixA6A
        Returns:
        top_rated_restaurant: e.g. w3giBYDmPWWnsNq5Sr2KQA
        most_similar_restaurant_ids: e.g. ['iZVfWpijwWyX_WR7hQyG9A', '-qjKoIo4tvWc6yF5DYVveg',
        'Kd1M6yXCpyhyqOYw-PPM6Q', 'tu2x5W3D7K1WMdtGi1J9Bw',
        'In47HN_pJzDdIyxYmDxCKw', 'vVN9HVQ_GTbfBt_Z0mS37w',
        '4T6snSpDCi0dQal8W_39zQ', '5vIOqHKIQWdmV-XNVmH0NQ',
        'JXvCRLxCDB5NbzRLoa6zWg', '3IC1K9FZ0Q1iMYMkHhkcBw']
        """
        # Get top rated restaurant by user
        user_ratings = self.ratings.loc[self.ratings['user_id']==user]
        user_ratings = user_ratings.sort_values(by='stars',axis=0,ascending=False)
        top_rated_restaurant = user_ratings.iloc[0,:]['business_id']
        # top_rated_restaurant_name = self.restaurants.loc[self.restaurants['business_id']==top_rated_restaurant,'name'].values[0]
        # Find most similar restaurants to the user's top rated restaurant
        cosine_similarity = (vectorized_rest_df.values @ vectorized_rest_df.loc['Pns2l4eNsfO8kk83dixA6A'].to_numpy()[:, np.newaxis]).flatten()
        top_indices_similar = np.argsort(cosine_similarity)[::-1]

        # Get 10 most similar movies excluding the movie itself
        most_similar = top_indices_similar[1:11]
        most_similar_restaurant_ids = vectorized_rest_df.index[most_similar].values
        return top_rated_restaurant, most_similar_restaurant_ids
    
recommender = CollaborativeRecommender(ratings, restaurants, vectorized_restaurants.toarray().astype(np.float64))
recommender.generate_recommendations("mh_-eMZ6K5RLWhZyISBhwA")

w3giBYDmPWWnsNq5Sr2KQA


('w3giBYDmPWWnsNq5Sr2KQA',
 array(['iZVfWpijwWyX_WR7hQyG9A', '-qjKoIo4tvWc6yF5DYVveg',
        'Kd1M6yXCpyhyqOYw-PPM6Q', 'tu2x5W3D7K1WMdtGi1J9Bw',
        'In47HN_pJzDdIyxYmDxCKw', 'vVN9HVQ_GTbfBt_Z0mS37w',
        '4T6snSpDCi0dQal8W_39zQ', '5vIOqHKIQWdmV-XNVmH0NQ',
        'JXvCRLxCDB5NbzRLoa6zWg', '3IC1K9FZ0Q1iMYMkHhkcBw'], dtype=object))

In [60]:
cosine_similarity = (vectorized_rest_df.values @ vectorized_rest_df.loc['Pns2l4eNsfO8kk83dixA6A'].to_numpy()[:, np.newaxis]).flatten()
top_indices_similar = np.argsort(cosine_similarity)[::-1]
most_similar = top_indices_similar[1:11]
string_ids = vectorized_rest_df.index[[1, 4, 0]]
print(string_ids.values)

['mpf3x-BjTdTEA3yCZrAYPw' 'mWMc6_wTdE0EUBKIGXDVfA'
 'Pns2l4eNsfO8kk83dixA6A']


In [65]:
restaurants[restaurants['business_id'].isin(string_ids.values)]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [55]:
vectorized_rest_df

Unnamed: 0,Unnamed: 1,& probates,3d printing,acai bowls,accessories,accountants,acne treatment,active life,acupuncture,addiction medicine,...,wine tasting room,wine tours,wineries,women's clothing,workers compensation law,wraps,yelp events,yoga,ziplining,zoos
Pns2l4eNsfO8kk83dixA6A,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.408248,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mpf3x-BjTdTEA3yCZrAYPw,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tUFrWirKiKi_TAnsVWINQQ,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MTSW4McQd7CbVtyjqoe9mw,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mWMc6_wTdE0EUBKIGXDVfA,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IUQopTMmYQG-qRtBk-8QnA,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c8GjPIOTGVmIemT7j5_SyQ,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
_QAMST-NrQobXduilWEqSw,0.0,0.0,0.0,0.0,0.377964,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mtGm22y5c2UHNXDFAjaPNw,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
vectorized_rest_df.values.shape

(150346, 1312)

In [None]:
restaurants['categories'].loc[1]

'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services'

In [None]:
p = 'Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists'
cats = p.lower().split(',')
print(cats)
vectorized_rest_df.loc[1].sum()

['doctors', ' traditional chinese medicine', ' naturopathic/holistic', ' acupuncture', ' health & medical', ' nutritionists']


np.int64(5)

In [30]:
vectorized_rest_df

Unnamed: 0,Unnamed: 1,& probates,3d printing,acai bowls,accessories,accountants,acne treatment,active life,acupuncture,addiction medicine,...,wine tasting room,wine tours,wineries,women's clothing,workers compensation law,wraps,yelp events,yoga,ziplining,zoos
Pns2l4eNsfO8kk83dixA6A,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.408248,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mpf3x-BjTdTEA3yCZrAYPw,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tUFrWirKiKi_TAnsVWINQQ,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MTSW4McQd7CbVtyjqoe9mw,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mWMc6_wTdE0EUBKIGXDVfA,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IUQopTMmYQG-qRtBk-8QnA,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c8GjPIOTGVmIemT7j5_SyQ,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
_QAMST-NrQobXduilWEqSw,0.0,0.0,0.0,0.0,0.377964,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mtGm22y5c2UHNXDFAjaPNw,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
vectorized_rest_df.loc['Pns2l4eNsfO8kk83dixA6A']

               0.0
& probates     0.0
3d printing    0.0
acai bowls     0.0
accessories    0.0
              ... 
wraps          0.0
yelp events    0.0
yoga           0.0
ziplining      0.0
zoos           0.0
Name: Pns2l4eNsfO8kk83dixA6A, Length: 1312, dtype: float32