In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports

In [None]:
# importing required libraries
import pandas as pd
from collections import Counter 
import math
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from geopy.distance import geodesic
import plotly.graph_objects as go

## Reading Processed Restaurants Review and Business Datasets

In [None]:
# loading data from CSV
business=pd.read_csv('drive/MyDrive/DIC/business_final.csv')

In [None]:
rest_review=pd.read_csv('drive/MyDrive/DIC/rest_review.csv')

### Merging and cleaning un-necessary columns

In [None]:
# merging rest review with business 
rest_review=rest_review.merge(business,on='business_id',how='inner')


In [None]:
rest_review=rest_review.drop('Unnamed: 0_x',axis=1)

In [None]:
rest_review.columns# num_tags

Index(['review_id', 'user_id', 'business_id', 'stars_x', 'date', 'text',
       'useful', 'funny', 'cool', 'Unnamed: 0_y', 'name', 'address', 'city',
       'state', 'postal_code', 'latitude', 'longitude', 'stars_y',
       'review_count', 'is_open', 'categories', 'monday', 'tuesday',
       'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'cuisines'],
      dtype='object')

## Grouping all reviews for a business

In [None]:
# grouping business_id using groupby on rest_review
grouped_bus=rest_review.groupby('business_id')['text'].agg(lambda x:str(x))
grouped_bus=pd.DataFrame(grouped_bus)
grouped_bus=grouped_bus.reset_index()

In [None]:
# dropping non-significant columns
business=business.drop(['Unnamed: 0','monday','tuesday','wednesday','thursday','friday','saturday','sunday','is_open'],axis=1)

In [None]:
business.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'categories',
       'cuisines'],
      dtype='object')

##  Merging grouped_business with business dataframe

In [None]:
# meging business with grouped_bus
business=business.merge(grouped_bus,how='inner')

In [None]:
grouped_bus.iloc[0]['text']

"1529036    Hands down best local bar in Las Vegas! My bar...\n1529037    I'm changing my review to zero stars so far th...\n1529038    We went here to watch the World Series game. Y...\n1529039    My husband & I decided to go have a couple dri...\n1529040    Great local BAR! Love this place, I walk in an...\n                                 ...                        \n1529139    This location is not the greatest; however, it...\n1529140    Everytime we drive by this place on our way to...\n1529141    Great little place. It can be a little bit bus...\n1529142    My wife ordered the breakfast burrito.\\nWas ju...\n1529143    Great food, great service and great atmosphere...\nName: text, Length: 108, dtype: object"

In [None]:
business.drop_duplicates( subset='business_id', keep="first", inplace=True)
business=business.reset_index(drop=True)

## **Content Based Recommender System**

### 1. Recommending Restaurants similar to ones Users highly rated

We started by creating a TF-IDF matrix taking features - categories, cusisines, text and stars_scaled(normalized stars).
Then we applied Cosine Similarity and based on the similarity matrix onbtained, we then created a method that takes user_id as input and checks for business rated above 3 by the user. This method gives us the business_id, which we then use to get similar businesses. These businesses are then recommended to the user.

#### Removing stop words from text

In [None]:
# downloading stopwords nltk
data = business
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


#### Selecting Features for training and Pre-processing for fit

In [None]:
# selecting feature
features = ["categories", "cuisines", "text","stars_scaled"]
data['stars_scaled'] = (data['stars'] - data['stars'].min()) / (data['stars'].max() - data['stars'].min())
# Combining features into single column
data["combined_features"] = data[features].apply(lambda x: " ".join(x.dropna().astype(str)), axis=1)

#### Calculating TF-IDF Matrix

In [None]:
# initializing vectorizer and converting our combined features to vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data["combined_features"])

#### Applying Cosine Similarity

In [None]:
# applying cosine similarty for the matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import pairwise

euclidean_dist = pairwise_distances(tfidf_matrix, metric='euclidean')

TypeError: ignored

In [None]:
set_list = set(data["combined_features"].str.split().sum().tolist())
jaccard_sim = pairwise.jaccard_similarity_score(set_list)

TypeError: ignored

#### Reommend and Evaluate Functions

In [None]:
# creating class contentrec with input arguments cosine_sim, review data, business data
class contentrecc():
    def __init__(self,cosine_sim,data,bus,top_n=5):
        self.cosine_sim=cosine_sim
        self.data=data
        self.bus=bus
        self.top_n=top_n

    # creating a function get_recommendations
    def get_recommendations(self,bid ):
        data=self.bus
        top_res=[]
        # looping over the business_ids
        for i in bid:
            # Get the index of the business that matches the title
            idx = data[data['business_id'] == i].index[0]
            sim_scores = list(enumerate(self.cosine_sim[idx]))
            # sim_s=[i for i in sim_scores if i[0] in list(result.index)]
            sim_s = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            #print(sim_scores)
            top_n_scores = sim_s[1 : self.top_n + 1]
            top_n_indices = [i[0] for i in top_n_scores]
            # appending name and business_ids of top recommended restaurts to top_res lst
            top_res.append(data.iloc[top_n_indices][["name",'business_id','cuisines']])
        return top_res
    
    # creating function fit
    def fit(self,uid):
        data= self.data
        # taking only businesses that user has rated more than or equal to 3
        user_data=data[(data['user_id'] == uid) & (data['stars_x'] >= 3)]
        user_data=user_data.sort_values(by='stars_x', ascending=False)
        bid=user_data['business_id'].tolist()[:2]
        output=self.get_recommendations(bid)
        return output

    def precision(self, uid, top_n):
      data = self.data
      recommended_businesses = []
      for m in self.fit(uid):
          for n in m['cuisines']:
              recommended_businesses.append(n)      
          
      user_data = data[(data['user_id'] == uid) & (data['stars_x'] >= 3)]
      actual_businesses = set(user_data['cuisines'].str.lower().tolist())

      tp = len(set(recommended_businesses).intersection(actual_businesses))
      fp = top_n - tp
      if (tp+fp == 0) or tp==0: 
        return 0
      precision = tp / (tp+fp)
      return precision

    def recall(self, uid):
      data = self.data
      recommended_businesses = []
      for m in self.fit(uid):
          for n in m['cuisines']:
              recommended_businesses.append(n)

      user_data = data[(data['user_id'] == uid) & (data['stars_x'] >= 3)]
      # if len(user_data)==0:
      #   return 0
      actual_businesses = set(user_data['cuisines'].str.lower().tolist())

      tp = len(set(recommended_businesses).intersection(actual_businesses))
      fn = len(actual_businesses - set(recommended_businesses))
      
      if tp+fn == 0:
        return 0.5
      recall = tp / (tp + fn)
      return recall

#### Recommendations

In [None]:
con=contentrecc(euclidean_dist,rest_review,business, 5)
a=con.fit('EYkzxQlabF-XBVD_EB0maw')
a

[                                name             business_id  \
 2704  "House of Blues - Music Venue"  IzEHkReZTds6ipOaCq2G6g   
 3822                          "IHOP"  V0GHF8CkhOovDeQKCn7YQg   
 2802           "A Pig In a Fur Coat"  OoI3WUe4Xl86d53avy3oVA   
 1883                   "TGI Fridays"  XPma1KatL8CYQ-Ym4TWvsg   
 2200                 "Pizza Factory"  0ajTatYvxoFd7-C22r6L4w   
 
                     cuisines  
 2704          american (new)  
 3822  american (traditional)  
 2802           mediterranean  
 1883          american (new)  
 2200  american (traditional)  ,
                                       name             business_id  \
 2817                    "Hellriegel's Inn"  mJbrw3Y3NOjCKj2I5t8YsA   
 5322               "Troon North Golf Club"  CKIGXGlSweovWDljQyhSHA   
 2780                            "Ocotillo"  ElWzx5_fU8S2G45OnM-HpA   
 5777            "Luna's Deli & Restaurant"  jNWyePugCw1J_O-s04JtIQ   
 4988  "The Kitchen Counter at Whole Foods"  s_7R8WNq6LzCMxk

In [None]:
con=contentrecc(cosine_sim,rest_review,business, 5)
a=con.fit('EYkzxQlabF-XBVD_EB0maw')
a

[                            name             business_id cuisines
 3705                 "Taco Time"  S6V63OzrCaj22BQJeKBDCw  mexican
 6150         "Durango Taco Shop"  nt2-Zk4FmGY2SYSDBI0gHw  mexican
 10506  "Chicago Style Taco Shop"  9t8PEynZGVCG1EjOQr1xBA  mexican
 8793                "Taco Shack"  db12Hn9hdoE-Ne4_NsVKSw  mexican
 1996                  "Del Taco"  Fho2P7OP80-yChCRxkK7Hw  mexican,
                  name             business_id cuisines
 1284    "Asian Grill"  4sn2a0QzxTmAO3-mxi1s7g  chinese
 8316   "Asian Island"  Xfod0UWr_9B-TOM6qxVIwQ  chinese
 1828   "Asian Garden"  IIWz96zl3D8thYIxv3-5mA     thai
 8193       "Yes Cafe"  Y6Vw0FF8Qz9XD8fY4Z9Vfg  chinese
 9972  "Asian Cuisine"  uno5w5Zkhwn-yFkJgFuYOw  chinese]

#### Sampling Test Set (due to computational constraints)

In [None]:
test_set=rest_review.sample(frac=.00001, random_state=69)

#### Calculating Precision, Recall and F1-Score for Test Dataset

In [None]:
top_n=5
p=0
r=0
pre=[]
rc=[]
# iterating test set
for i,row in test_set.iterrows():
  # 
  model=contentrecc(euclidean_dist,rest_review,business,50)
  uid=row['user_id']
  fit=model.fit(uid)
  a=model.precision(uid, top_n)
  b=model.recall(uid)
  pre.append(a)
  rc.append(b)
  p+=a
  r+=b

precision=p/len(test_set)
recall=r/len(test_set)
f1_score = 2 * (( precision* recall) / (precision + recall))
print(f'precision:{precision},recall:{recall},f1-score:{f1_score}')

precision:0.2823529411764706,recall:0.5294117647058824,f1-score:0.3682864450127877


#### Plots 

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(pre))), y=pre,
                    mode='lines+markers',
                    name='Precision'))
fig.add_trace(go.Scatter(x=list(range(len(rc))), y=rc,
                    mode='lines+markers',
                    name='Recall'))

# add axis labels and title
fig.update_layout(title='Precision and Recall',
                   xaxis_title='Test Dataset',
                   yaxis_title='Score')

# show plot
fig.show()

### 2. Recommending restaurants in the Demographic vicinity based on their past preferences 

 

1.   Created a TF-idf matrix taking features - categories, cusisines, text and stars_scaled(normalized stars).
2.    Applied Cosine Similarity to the feature vectors
3. Created a method that takes user_id, and location of user (latitude and longitude) as input and checks for business rated above 3 by the user. 
4. We then use these businesses similar to these businesses in the user's vicinity.

In [None]:
# Creating class contentrec_loc taking input arguments cosine_sim, latitude, longitude, restaurant review, business
class contentrec_loc():
    def __init__(self,cosine_sim,lat,lon,data,bus):
#         self.uid=uid
        self.lat=lat
        self.lon=lon
        self.cosine_sim=cosine_sim
        self.data=data
        self.bus=bus

    def get_recommendations(self,bid, cosine_sim=cosine_sim , top_n=20):
        data=self.bus
        top_lst=[]
        top_dict={}
        for i in bid:
            target_lat=self.lat
            target_long=self.lon
            distances = []
            # creating a column[Distance] in data which has the distances between restaurants and our location
            for j in range(len(data)):
                lat = data.iloc[j]['latitude']
                long = data.iloc[j]['longitude']
                dist = geodesic((float(target_lat), float(target_long)), (lat, long)).miles
                distances.append(dist)
            data['distance'] = distances

            # Sort the DataFrame by distance and return the top results
            result = data.sort_values('distance').head(30)
            # Get the index of the business that matches the title
            idx = data[data['business_id'] == i].index[0]
            sim_scores = list(enumerate(cosine_sim[idx]))
            sim_s=[i for i in sim_scores if i[0] in list(result.index)]
            sim_s = sorted(sim_s, key=lambda x: x[1], reverse=True)
            top_n_scores = sim_s[1 : top_n + 1]
            top_n_indices = [i[0] for i in top_n_scores]
            
            # appending name and business_ids of tep recommended restaurts to top_res dictionry
            top_lst=data.iloc[top_n_indices][["name","business_id","distance","latitude",'longitude']]

            top_dict['name']=data.iloc[top_n_indices]["name"]
            top_dict['business_id']=data.iloc[top_n_indices]["business_id"]
            top_dict['distance']=data.iloc[top_n_indices]["distance"]
            
        return [top_dict],top_lst
    
    def fit(self,uid):
        data= self.data
        user_data=data[(data['user_id'] == uid) & (data['stars_x'] >= 3)]
        user_data=user_data.sort_values(by='stars_x', ascending=False)
        bid=user_data['business_id'].tolist()[:1]
        output,rec=self.get_recommendations(bid)
        return output,pd.DataFrame(rec)



In [None]:
model=contentrec_loc(cosine_sim,36.066914,-115.170848,rest_review,business)
output,rec=model.fit('jFTfEhwPuBsWXydhcEbqAQ')
output

[{'name': 7681                              "Home Plate Grill & Bar"
  1272                       "Crazy Pita Rotisserie & Grill"
  475                   "Double Helix Wine & Whiskey Lounge"
  8213                           "Lazy Dog Restaurant & Bar"
  3                                "Flight Deck Bar & Grill"
  5587                          "Kabuki Japanese Restaurant"
  5029                               "FUWA Teppanyaki Grill"
  6496                            "La Bella Napoli Pizzeria"
  9594     "Tommy Bahama Restaurant | Bar | Store - Las V...
  1539                                 "Blue Martini Lounge"
  11842                                 "Hot Dog On A Stick"
  6948                                      "Bonefish Grill"
  11467                                     "Pot Liquor CAS"
  3485                                          "Yard House"
  11346                                             "LV bar"
  7444                              "17 South Booze & Bites"
  9862          

In [None]:
rec=rec.sort_values('distance').head(10)
rec

Unnamed: 0,name,business_id,distance,latitude,longitude
3,"""Flight Deck Bar & Grill""",Pd52CjgyEU3Rb8co6QfTPw,3.6e-05,36.066914,-115.170848
11842,"""Hot Dog On A Stick""",OwG2nkJrDoP-WGcU9tvkUA,0.082288,36.066499,-115.172226
11346,"""LV bar""",VWnMgGRexZMGFs7IrQR_9Q,0.194822,36.067581,-115.17423
4804,"""Fleming's Prime Steakhouse""",FYqFfaxVRW6pdviONXIoDw,0.244567,36.069568,-115.173747
8213,"""Lazy Dog Restaurant & Bar""",rdE9gg0WB7Z8kRytIMSapg,0.279334,36.070146,-115.173856
7540,"""Brio Tuscan Grille""",UYh1N1xQhOh-a7nX92xPzQ,0.297009,36.068152,-115.17593
6948,"""Bonefish Grill""",Q5olb1x6FGk2oLAlc9p5Lg,0.304058,36.070043,-115.174675
9594,"""Tommy Bahama Restaurant | Bar | Store - Las V...",uUEKndnFbHD8mVkGiirtpA,0.357552,36.068497,-115.176931
6496,"""La Bella Napoli Pizzeria""",y1esDY2vFyaTMf1YlRT7ZA,0.38187,36.06816,-115.177495
1539,"""Blue Martini Lounge""",UCpUOtvqR-NBWBNVMzJleA,0.387608,36.068371,-115.177536


In [None]:
data = [
    # First trace represents the recommended locations
    go.Scattermapbox(
        lat=rec["latitude"],
        lon=rec["longitude"],
        mode="markers",
        marker=dict(size=14, color=["red"]),
        text=[f"{name}<br>Distance: {distance:.4f} km" for name, distance in zip(rec["name"], rec["distance"])],
        hoverinfo="text",
    ),
    # Second trace represents the user's location
    go.Scattermapbox(
        lat=[36.066914],
        lon=[-115.170848],
        mode="markers",
        marker=dict(size=14, color="blue"),
        text=["Users' Location"]
    )
]

layout = go.Layout(
    autosize=True,
    hovermode="closest",
    # Setting the mapbox API access token and the center, zoom level, and style of the map
    mapbox=dict(
        accesstoken="pk.eyJ1IjoicGdvZGh3YW4iLCJhIjoiY2xnYjh1cjd5MW96NjNmbzhnZGNza3dycCJ9.yHzDlveagH2d0ibjgpDHPQ",
        center=dict(lat=rec["latitude"].mean(), lon=rec["longitude"].mean()),
        style="streets",
        zoom=15,
    ),
)

go.Figure(data=data, layout=layout).show()

### 3. Content based Recommendation using text input



1.  We start by selecting our features. Then we combine them and transform them into vectors using TFIDF vectorizer. 
2.  Created function recommend takes input a text and transforms into a vector. 
3. After that we find the similarity using cosine similarity and get indices of businesses.
4. Using indices we find the 'categories', 'cuisines','text', 'stars' and finally sort them by stars



In [None]:
class ContentBasedRS:
    def __init__(self, data):
        self.df = data
        # Fill in any missing values in the 'text' column with empty strings
        self.df['text'] = self.df['text'].fillna('')
        # Combine the values from 'text', 'categories', and 'cuisines' columns into a single column
        features=['text','categories','cuisines']
        self.df["combined_features"] = data[features].apply(lambda x: " ".join(x.dropna().astype(str)), axis=1)
        # Vectorize the text data using TfidfVectorizer
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.vectorizer.fit_transform(self.df["combined_features"])

    # Recommend the top_n items based on the similarity between the input text and the items in the dataset
    def recommend(self, text, top_n=5):
        # Vectorize the input text
        text_tfidf = self.vectorizer.transform([text])
        # Calculate the cosine similarity between the input text and the items in the dataset
        similarity_scores = cosine_similarity(text_tfidf, self.tfidf_matrix)[0]
        # Get the indices of the top_n items with the highest similarity scores
        top_indices = similarity_scores.argsort()[::-1][:top_n]
        # Get the output data for the top_n items, filtering out items with star ratings below 3 and sorting by star rating
        output_data=self.df.iloc[top_indices][['business_id', 'categories', 'cuisines','text', 'stars']]
        output_data=output_data[output_data['stars'] >= 3]
        output_data=output_data.sort_values(by='stars', ascending=False)
        return output_data.head(5)

In [None]:
rec=ContentBasedRS(business)
text='i need best pizza in downtown'
rec.recommend(text)

Unnamed: 0,business_id,categories,cuisines,text,stars
72,NlBifQZCNUAnbuo_wKIVpg,pizza;italian;restaurants,italian,1642946 Absolutely best wings area. orde... 16...,4.5
1548,ZL-4ejIO3DCgSsdRWwsX-A,pizza;restaurants;italian,italian,1011125 Easily best slice pizza downtown Cle.....,4.0
370,sByhyN_S2FgNnPl5pqY0Ow,restaurants;pizza;italian;salad,italian,1589362 Good quality great price. can't beat ....,4.0
1499,egLgkvwupLaUXhwnWnvp7A,restaurants;italian;pizza,italian,"1499277 pizza Charlotte, ... 1499278 yelpers f...",4.0
4814,a5l5JLVC0_lkz9QNOc5mHg,restaurants;italian;pizza,italian,1219983 traveled Olmsted area 6-8 times ... 12...,4.0


In [None]:
text='night life club with burgers'
rec.recommend(text)

Unnamed: 0,business_id,categories,cuisines,text,stars
2402,afuh98hZmRUPS_H2wl6y4g,american (new);southern;restaurants,american (new),"1681364 Fish, chicken, burgers, fries, even fu...",5.0
11183,RUnq8jOzySCpRa5X64FgJQ,music venues;bars;nightlife;arts & entertainme...,american (traditional),"524384 perfect sized music venue, opi... 52438...",4.0
11710,RrKfYuyKqBn3wKBZNBuFSw,bars;restaurants;nightlife;american (tradition...,american (traditional),1681951 place nice inside server catalina... 1...,4.0
5307,0jMWCioUATOJgMpsPTyC7g,bars;american (new);nightlife;american (tradit...,american (new),1323298 Awesome place. Jenn Carlos took care m...,3.5
11543,9e9AW5qdxhWNrK2uhNCkAQ,bars;american (new);restaurants;nightlife,american (new),897338 Club Tavern decent place play trivia .....,3.0


In [None]:
text='indian buffet'
rec.recommend(text)

Unnamed: 0,business_id,categories,cuisines,text,stars
8452,prFx0Kg92PfPkoU8OY4KxA,buffets;restaurants;vegan;gluten-free;indian,indian,"1591634 Fabulous buffet exotic treats, sp... 1...",4.5
6620,6gPcuK1mTlJDb3SG7d0q7A,event planning & services;indian;restaurants;v...,indian,"1619012 tried weekend lunch buffet here, nd th...",4.5
8312,49sbCG7F2jvqJsfG6-N8Rw,restaurants;indian,indian,1602901 FINALLY Indian food restaurant o... 16...,4.5
11349,gTlDDzDEHyDQ6iwjNhpI6A,indian;restaurants,indian,608638 fav Indian buffet town. Eaten lunch her...,4.5
894,P2uPCEBP2VZsApnvEHCgag,indian;restaurants,indian,"523089 dinner yet, lunch bu... 523090 One best...",4.0


## References

https://medium.com/mlearning-ai/restaurant-recommendation-system-based-on-the-content-in-reviews-dfc3351004db
