In [None]:
!pip install bertopic
!pip install contractions
!pip install transformers
!pip install flair

import contractions
import gensim
import locale
import logging
import nltk
import re
import spacy
import spacy.cli
import warnings
import pandas as pd
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from flair.data import Sentence
from flair.models import TARSClassifier
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
from umap import UMAP

spacy.cli.download("en_core_web_md")
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('omw-1.4')

pd.set_option('display.max_colwidth', None)

In [None]:
df = pd.read_table("Restaurant_Reviews.tsv")

In [None]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1
4,The selection on the menu was great and so were the prices.,1


In [None]:
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

# Data Preparation

In [None]:
def fix_contractions(text):
    expanded_words = []   
    for word in text.split():
        expanded_words.append(contractions.fix(word))  
   
    return ' '.join(expanded_words)

def clean(text):
    return re.sub(r"\s+", " ", re.sub(r"[^\sA-Za-z0-9]", "", re.sub(r'[^\w\s]', ' ', text))).lower()
    # return " ".join(map(spellchecker.correction, re.sub(r"\s+", " ", text).lower().split()))

def tokenize(text):
    return word_tokenize(text)

STOP_WORDS = set(stopwords.words("english"))
STOP_WORDS.remove("not")
STOP_WORDS.remove("no")

def filter_stopwords(tokenized_text):
    return list(filter(lambda x: x not in STOP_WORDS, tokenized_text))

def lemmatize(filtered_text):
    return list(map(WordNetLemmatizer().lemmatize, filtered_text))

# def lemmatize_custom(sentence):
#     return " ".join(list(map(WordNetLemmatizer().lemmatize, filter_stopword(word_tokenize(sentence)))))

def stemming(lemmatized_text):
    return " ".join(list(map(SnowballStemmer("english").stem, lemmatized_text)))

def preprocess(text):
    pipeline = [fix_contractions, clean, tokenize, filter_stopwords, lemmatize]
    for process in pipeline:
        text = process(text)

    return " ".join(text)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.5/104.5 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.5/287.5 KB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
df["cleaned_review"] = df["Review"].apply(preprocess)

In [None]:
positive_df = df[df["Liked"] == 1]
negative_df = df[df["Liked"] == 0]

# Model

## LDA

In [None]:
#Source: https://github.com/marcmuon/nlp_yelp_review_unsupervised/tree/master/notebooks

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def bigrams(words, bi_min = 15):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def get_corpus(df, column):
    words = list(sent_to_words(df[column]))
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[word] for word in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below = 10)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    
    return corpus, id2word, bigram

In [None]:
positive_corpus, positive_id2word, positive_bigram = get_corpus(positive_df, "cleaned_review")
negative_corpus, negative_id2word, negative_bigram = get_corpus(negative_df, "cleaned_review")

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda_positive = gensim.models.ldamulticore.LdaMulticore(
                           corpus = positive_corpus,
                           num_topics = 8, 
                           id2word = positive_id2word,
                           per_word_topics = True)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda_negative = gensim.models.ldamulticore.LdaMulticore(
                           corpus = negative_corpus,
                           num_topics = 8, 
                           id2word = negative_id2word,
                           per_word_topics = True)



In [None]:
lda_positive.print_topics(10, num_words = 15)

[(0,
  '0.171*"great" + 0.080*"food" + 0.070*"place" + 0.070*"friendly" + 0.070*"restaurant" + 0.059*"service" + 0.043*"staff" + 0.043*"really" + 0.038*"nice" + 0.027*"good" + 0.027*"always" + 0.027*"one" + 0.022*"not" + 0.022*"go" + 0.022*"love"'),
 (1,
  '0.241*"good" + 0.190*"food" + 0.089*"service" + 0.045*"delicious" + 0.045*"like" + 0.039*"love" + 0.032*"pretty" + 0.026*"place" + 0.026*"friendly" + 0.026*"even" + 0.020*"time" + 0.020*"experience" + 0.013*"great" + 0.013*"nice" + 0.013*"best"'),
 (2,
  '0.099*"good" + 0.086*"service" + 0.067*"great" + 0.060*"amazing" + 0.053*"nice" + 0.047*"food" + 0.047*"not" + 0.047*"fresh" + 0.040*"also" + 0.034*"pizza" + 0.034*"price" + 0.034*"fantastic" + 0.027*"menu" + 0.027*"atmosphere" + 0.027*"go"'),
 (3,
  '0.208*"great" + 0.076*"place" + 0.076*"made" + 0.058*"not" + 0.058*"pizza" + 0.048*"service" + 0.048*"like" + 0.039*"price" + 0.029*"really" + 0.029*"time" + 0.029*"server" + 0.029*"experience" + 0.020*"good" + 0.020*"food" + 0.020*"l

In [None]:
lda_negative.print_topics(10, num_words = 15)

[(0,
  '0.254*"not" + 0.098*"ever" + 0.074*"minute" + 0.074*"no" + 0.050*"food" + 0.050*"worst" + 0.049*"get" + 0.038*"never" + 0.038*"would" + 0.038*"got" + 0.038*"restaurant" + 0.038*"bland" + 0.026*"like" + 0.026*"one" + 0.025*"disappointed"'),
 (1,
  '0.287*"not" + 0.128*"back" + 0.090*"really" + 0.052*"table" + 0.040*"food" + 0.040*"one" + 0.040*"get" + 0.040*"going" + 0.027*"place" + 0.027*"ever" + 0.027*"go" + 0.027*"restaurant" + 0.021*"much" + 0.021*"like" + 0.014*"service"'),
 (2,
  '0.135*"came" + 0.135*"bland" + 0.091*"food" + 0.069*"not" + 0.069*"good" + 0.047*"service" + 0.047*"like" + 0.047*"bad" + 0.047*"minute" + 0.025*"back" + 0.025*"time" + 0.025*"would" + 0.025*"one" + 0.025*"never" + 0.025*"table"'),
 (3,
  '0.123*"worst" + 0.108*"service" + 0.092*"like" + 0.078*"food" + 0.077*"bad" + 0.071*"go" + 0.062*"eat" + 0.047*"no" + 0.047*"one" + 0.039*"back" + 0.039*"ever" + 0.032*"not" + 0.032*"would" + 0.032*"slow" + 0.024*"never"'),
 (4,
  '0.228*"not" + 0.196*"food" + 

In [None]:
positive_topic_vec = []
for i in range(len(positive_df)):
    top_topics = lda_positive.get_document_topics(positive_corpus[i], minimum_probability = 0.0)
    topic_values = sorted(top_topics, key = lambda x: x[1])[-1]
    positive_topic_vec += [topic_values]

positive_topic_set = list(map(lambda x: x if x[1] - 0.33333335 > 0 else (4, 0), positive_topic_vec))
positive_topic = list(map(lambda x: x[0], positive_topic_set))
positive_topic_values = list(map(lambda x: x[1], positive_topic_set))

In [None]:
positive_df["topic"] = positive_topic
positive_df["topic_prob"] = positive_topic_values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_df["topic"] = positive_topic
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_df["topic_prob"] = positive_topic_values


In [None]:
positive_df

Unnamed: 0,Review,Liked,cleaned_review,topic,topic_prob
0,Wow... Loved this place.,1,wow loved place,7,0.708167
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1,stopped late may bank holiday rick steve recommendation loved,2,0.561919
4,The selection on the menu was great and so were the prices.,1,selection menu great price,3,0.780826
8,The fries were great too.,1,fry great,3,0.562255
9,A great touch.,1,great touch,3,0.562239
...,...,...,...,...,...
899,"Overall, a great experience.",1,overall great experience,3,0.708163
901,Their regular toasted bread was equally satisfying with the occasional pats of butter... Mmmm...!,1,regular toasted bread equally satisfying occasional pat butter mmmm,4,0.000000
907,The chips and sals a here is amazing!!!!!!!!!!!!!!!!!!!,1,chip sals amazing,2,0.562127
909,This is my new fav Vegas buffet spot.,1,new fav vega buffet spot,6,0.562434


## BERTopic

### Variation 1

#### Positive

In [None]:
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine')
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
model_bert = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, top_n_words=4, umap_model=umap_model)
topics, probs = model_bert.fit_transform(list(positive_df["cleaned_review"]))

In [None]:
model_bert.get_representative_docs()

{-1: ['really vega fine dining used right menu handed lady no price listed',
  'service excellent price pretty reasonable considering vega located inside crystal shopping mall aria',
  'love authentic mexican food want whole bunch interesting yet delicious meat choose need try place'],
 0: ['also served hot bread butter home made potato chip bacon bit top original good',
  'good selection food including massive meatloaf sandwich crispy chicken wrap delish tuna melt tasty burger',
  'loved mussel cooked wine reduction duck tender potato dish delicious'],
 1: ['server nice even though looked little overwhelmed need stayed professional friendly end',
  'staff always super friendly helpful especially cool bring two small boy baby',
  'staff super nice quick even crazy crowd downtown jury lawyer court staff'],
 2: ['continue come lady night andddd date night highly recommend place anyone area',
  'pro large seating area nice bar area great simple drink menu best brick oven pizza homemade do

In [None]:
model_bert.visualize_topics()

In [None]:
model_bert.visualize_barchart()

In [None]:
model_bert.get_document_info(list(positive_df["cleaned_review"]))

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,wow loved place,2,2_place_area_recommend_eat,place - area - recommend - eat,1.000000,False
1,stopped late may bank holiday rick steve recommendation loved,-1,-1_pizza_menu_price_selection,pizza - menu - price - selection,0.000000,False
2,selection menu great price,-1,-1_pizza_menu_price_selection,pizza - menu - price - selection,0.000000,False
3,fry great,0,0_steak_delicious_fresh_taste,steak - delicious - fresh - taste,1.000000,False
4,great touch,-1,-1_pizza_menu_price_selection,pizza - menu - price - selection,0.000000,False
...,...,...,...,...,...,...
495,overall great experience,-1,-1_pizza_menu_price_selection,pizza - menu - price - selection,0.000000,False
496,regular toasted bread equally satisfying occasional pat butter mmmm,-1,-1_pizza_menu_price_selection,pizza - menu - price - selection,0.000000,False
497,chip sals amazing,0,0_steak_delicious_fresh_taste,steak - delicious - fresh - taste,0.870653,False
498,new fav vega buffet spot,4,4_vega_breakfast_buffet_better,vega - breakfast - buffet - better,1.000000,False


In [None]:
model_bert.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,185,-1_pizza_menu_price_selection
1,0,132,0_steak_delicious_fresh_taste
2,1,62,1_service_staff_friendly_super
3,2,50,2_place_area_recommend_eat
4,3,21,3_food_service_little_joy
5,4,19,4_vega_breakfast_buffet_better
6,5,18,5_soon_assure_definitely_wait
7,6,13,6_sooooo_performed_beat_cool


In [None]:
# Visualize term rank decrease
model_bert.visualize_term_rank()

In [None]:
# Visualize similarity using heatmap
model_bert.visualize_heatmap()

In [None]:
model_bert.hierarchical_topics(positive_df["cleaned_review"])

100%|██████████| 6/6 [00:00<00:00, 108.26it/s]


Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
5,12,service_place_food_steak,"[0, 1, 2, 3, 4, 5, 6]",9,vega_breakfast_buffet_definitely,11,service_place_food_steak,1.017564
4,11,service_place_food_steak,"[0, 1, 2, 3, 6]",8,place_steak_delicious_fresh,10,service_friendly_staff_server,0.922584
3,10,service_friendly_staff_server,"[1, 3, 6]",7,service_friendly_staff_server,6,sooooo_performed_beat_cool,0.913393
2,9,vega_breakfast_buffet_definitely,"[4, 5]",5,soon_assure_definitely_wait,4,vega_breakfast_buffet_better,0.8805
1,8,place_steak_delicious_fresh,"[0, 2]",2,place_area_recommend_eat,0,steak_delicious_fresh_taste,0.871997
0,7,service_friendly_staff_server,"[1, 3]",1,service_staff_friendly_super,3,food_service_little_joy,0.850281


In [None]:
from scipy.cluster import hierarchy as sch

linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = model_bert.hierarchical_topics(positive_df["cleaned_review"], linkage_function=linkage_function)

100%|██████████| 6/6 [00:00<00:00, 105.35it/s]


In [None]:
model_bert.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
tree = model_bert.get_topic_tree(hierarchical_topics)
print(tree)

.
├─■──sooooo_performed_beat_cool ── Topic: 6
└─service_place_food_steak
     ├─service_place_food_steak
     │    ├─service_friendly_staff_server
     │    │    ├─■──service_staff_friendly_super ── Topic: 1
     │    │    └─■──food_service_little_joy ── Topic: 3
     │    └─place_steak_delicious_fresh
     │         ├─■──place_area_recommend_eat ── Topic: 2
     │         └─■──steak_delicious_fresh_taste ── Topic: 0
     └─vega_breakfast_buffet_definitely
          ├─■──vega_breakfast_buffet_better ── Topic: 4
          └─■──soon_assure_definitely_wait ── Topic: 5



#### Negative

In [None]:
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine')
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
model_bert = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, top_n_words=3, umap_model=umap_model)
topics, probs = model_bert.fit_transform(list(negative_df["cleaned_review"]))

In [None]:
model_bert.get_representative_docs()

{-1: ['ambiance not feel like buffet setting douchey indoor garden tea biscuit',
  'owner really really need quit soooooo cheap let wrap freaking sandwich two paper not one',
  'immediately said wanted talk manager not want talk guy shot fireball behind bar'],
 0: ['much good food vega feel cheated wasting eating opportunity going rice company',
  'hilarious yummy christmas eve dinner remember biggest fail entire trip u',
  'paying 7 85 hot dog fry look like came kid meal wienerschnitzel not idea good meal'],
 1: ['high hope place since burger cooked charcoal grill unfortunately taste fell flat way flat',
  'burger absolutely no flavor meat totally bland burger overcooked no charcoal flavor',
  'hot dish not hot cold dish close room temp watched staff prepare food bare hand no glove everything deep fried oil'],
 2: ['guess known place would suck inside excalibur not use common sense',
  'bland not liking place number reason not want waste time bad reviewing leave',
  'point friend basi

In [None]:
model_bert.visualize_topics()

In [None]:
model_bert.visualize_barchart()

In [None]:
model_bert.get_document_info(list(positive_df["cleaned_review"]))

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,wow loved place,8,8_dirt_gross_market,dirt - gross - market,0.753846,False
1,stopped late may bank holiday rick steve recommendation loved,1,1_flavor_burger_meat,flavor - burger - meat,1.000000,False
2,selection menu great price,-1,-1_sushi_vega_bad,sushi - vega - bad,0.000000,False
3,fry great,1,1_flavor_burger_meat,flavor - burger - meat,0.920881,False
4,great touch,1,1_flavor_burger_meat,flavor - burger - meat,0.647890,False
...,...,...,...,...,...,...
495,overall great experience,1,1_flavor_burger_meat,flavor - burger - meat,1.000000,False
496,regular toasted bread equally satisfying occasional pat butter mmmm,0,0_meal_food_quality,meal - food - quality,0.975766,False
497,chip sals amazing,-1,-1_sushi_vega_bad,sushi - vega - bad,0.000000,False
498,new fav vega buffet spot,-1,-1_sushi_vega_bad,sushi - vega - bad,0.000000,False


In [None]:
model_bert.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,109,-1_sushi_vega_bad
1,0,90,0_meal_food_quality
2,1,79,1_flavor_burger_meat
3,2,48,2_place_waste_money
4,3,47,3_going_probably_coming
5,4,39,4_minute_waited_hour
6,5,27,5_service_server_slow
7,6,20,6_mistake_disappointed_experience
8,7,15,7_waiter_rude_management
9,8,14,8_dirt_gross_market


### Variation 2

In [None]:
nlp = spacy.load('en_core_web_md', exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])

topic_model = BERTopic(embedding_model=nlp, top_n_words=3)
topics, probs = topic_model.fit_transform(positive_df["cleaned_review"])

fig = topic_model.visualize_topics()
fig.show()

In [None]:
topic_model.visualize_barchart()

In [None]:
topic_model.get_document_info(list(positive_df["cleaned_review"]))

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,wow loved place,5,5_place_amazing_town,place - amazing - town,1.000000,False
1,stopped late may bank holiday rick steve recommendation loved,0,0_not_back_time,not - back - time,1.000000,False
2,selection menu great price,2,2_service_food_price,service - food - price,0.828998,False
3,fry great,1,1_chicken_delicious_good,chicken - delicious - good,0.954802,False
4,great touch,-1,-1_great_best_good,great - best - good,0.000000,False
...,...,...,...,...,...,...
495,overall great experience,-1,-1_great_best_good,great - best - good,0.000000,False
496,regular toasted bread equally satisfying occasional pat butter mmmm,1,1_chicken_delicious_good,chicken - delicious - good,1.000000,False
497,chip sals amazing,1,1_chicken_delicious_good,chicken - delicious - good,0.683544,False
498,new fav vega buffet spot,-1,-1_great_best_good,great - best - good,0.000000,False


In [None]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,156,-1_great_best_good
1,0,142,0_not_back_time
2,1,74,1_chicken_delicious_good
3,2,38,2_service_food_price
4,3,21,3_service_friendly_attentive
5,4,20,4_nice_restaurant_clean
6,5,18,5_place_amazing_town
7,6,18,6_food_good_compliment
8,7,13,7_star_soup_100


## Zero-Shot

### Transformers

https://medium.com/grabngoinfo/zero-shot-topic-modeling-with-deep-learning-using-python-a895d2d0c773

#### Single Label

In [None]:
classifier = pipeline(task="zero-shot-classification", 
                      model="facebook/bart-large-mnli",
                      device=0) 

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Define the candidate labels 
candidate_labels = ["place", "services", "food", "staff", "waiter"]

# Set the hyppothesis template
hypothesis_template = "The topic of this review is {}."

# Prediction results
single_topic_prediction = classifier(list(positive_df["cleaned_review"]), candidate_labels, hypothesis_template=hypothesis_template)

# Save the output as a dataframe
single_topic_prediction = pd.DataFrame(single_topic_prediction)

# Take a look at the data
single_topic_prediction.head()

Unnamed: 0,sequence,labels,scores
0,wow loved place,"[place, food, services, staff, waiter]","[0.9027376770973206, 0.039525922387838364, 0.03709842637181282, 0.01576852798461914, 0.004869346506893635]"
1,stopped late may bank holiday rick steve recommendation loved,"[services, waiter, staff, food, place]","[0.3909590244293213, 0.2565339505672455, 0.13662995398044586, 0.13621096312999725, 0.07966616749763489]"
2,selection menu great price,"[food, place, services, waiter, staff]","[0.7900527119636536, 0.10980317741632462, 0.06313919275999069, 0.02756166271865368, 0.009443264454603195]"
3,fry great,"[food, services, waiter, place, staff]","[0.9117928147315979, 0.03843012824654579, 0.02959602326154709, 0.012543514370918274, 0.0076375799253582954]"
4,great touch,"[services, staff, place, food, waiter]","[0.5739902257919312, 0.17066626250743866, 0.1107017770409584, 0.08041735738515854, 0.06422434747219086]"


In [None]:
# The column for the predicted topic
single_topic_prediction['predicted_topic'] = single_topic_prediction['labels'].apply(lambda x: x[0])

# The column for the score of predi ted topic
single_topic_prediction['predicted_topic_score'] = single_topic_prediction['scores'].apply(lambda x: x[0])

# Take a look at the data
single_topic_prediction.head()

Unnamed: 0,sequence,labels,scores,predicted_topic,predicted_topic_score
0,wow loved place,"[place, food, services, staff, waiter]","[0.9027376770973206, 0.039525922387838364, 0.03709842637181282, 0.01576852798461914, 0.004869346506893635]",place,0.902738
1,stopped late may bank holiday rick steve recommendation loved,"[services, waiter, staff, food, place]","[0.3909590244293213, 0.2565339505672455, 0.13662995398044586, 0.13621096312999725, 0.07966616749763489]",services,0.390959
2,selection menu great price,"[food, place, services, waiter, staff]","[0.7900527119636536, 0.10980317741632462, 0.06313919275999069, 0.02756166271865368, 0.009443264454603195]",food,0.790053
3,fry great,"[food, services, waiter, place, staff]","[0.9117928147315979, 0.03843012824654579, 0.02959602326154709, 0.012543514370918274, 0.0076375799253582954]",food,0.911793
4,great touch,"[services, staff, place, food, waiter]","[0.5739902257919312, 0.17066626250743866, 0.1107017770409584, 0.08041735738515854, 0.06422434747219086]",services,0.57399


In [None]:
single_topic_prediction

Unnamed: 0,sequence,labels,scores,predicted_topic,predicted_topic_score
0,wow loved place,"[place, food, services, staff, waiter]","[0.9027376770973206, 0.039525922387838364, 0.03709842637181282, 0.01576852798461914, 0.004869346506893635]",place,0.902738
1,stopped late may bank holiday rick steve recommendation loved,"[services, waiter, staff, food, place]","[0.3909590244293213, 0.2565339505672455, 0.13662995398044586, 0.13621096312999725, 0.07966616749763489]",services,0.390959
2,selection menu great price,"[food, place, services, waiter, staff]","[0.7900527119636536, 0.10980317741632462, 0.06313919275999069, 0.02756166271865368, 0.009443264454603195]",food,0.790053
3,fry great,"[food, services, waiter, place, staff]","[0.9117928147315979, 0.03843012824654579, 0.02959602326154709, 0.012543514370918274, 0.0076375799253582954]",food,0.911793
4,great touch,"[services, staff, place, food, waiter]","[0.5739902257919312, 0.17066626250743866, 0.1107017770409584, 0.08041735738515854, 0.06422434747219086]",services,0.573990
...,...,...,...,...,...
495,overall great experience,"[services, place, staff, food, waiter]","[0.6046612858772278, 0.22488664090633392, 0.08542779833078384, 0.05037461966276169, 0.03464965149760246]",services,0.604661
496,regular toasted bread equally satisfying occasional pat butter mmmm,"[food, services, place, waiter, staff]","[0.9808177947998047, 0.0076177045702934265, 0.003864046186208725, 0.0038579916581511497, 0.0038424204103648663]",food,0.980818
497,chip sals amazing,"[food, services, staff, place, waiter]","[0.9092738032341003, 0.062062911689281464, 0.01079279463738203, 0.009367921389639378, 0.008502510376274586]",food,0.909274
498,new fav vega buffet spot,"[food, place, services, waiter, staff]","[0.8745837211608887, 0.11451338231563568, 0.0072356341406702995, 0.002139533869922161, 0.0015277615748345852]",food,0.874584


In [None]:
bert_topic = model_bert.get_document_info(list(positive_df["cleaned_review"]))[["Document", "Top_n_words"]]

bert_topic.columns = ["sequence", "subtopic"]

In [None]:
single_topic_prediction[["sequence", "predicted_topic"]].join(bert_topic, lsuffix = "l")

Unnamed: 0,sequencel,predicted_topic,sequence,subtopic
0,wow loved place,place,wow loved place,place - area - recommend - eat
1,stopped late may bank holiday rick steve recommendation loved,services,stopped late may bank holiday rick steve recommendation loved,pizza - menu - price - selection
2,selection menu great price,food,selection menu great price,pizza - menu - price - selection
3,fry great,food,fry great,steak - delicious - fresh - taste
4,great touch,services,great touch,pizza - menu - price - selection
...,...,...,...,...
495,overall great experience,services,overall great experience,pizza - menu - price - selection
496,regular toasted bread equally satisfying occasional pat butter mmmm,food,regular toasted bread equally satisfying occasional pat butter mmmm,pizza - menu - price - selection
497,chip sals amazing,food,chip sals amazing,steak - delicious - fresh - taste
498,new fav vega buffet spot,food,new fav vega buffet spot,vega - breakfast - buffet - better


#### Multi Label

In [None]:
# Prediction results
multi_topic_prediction = classifier(list(positive_df["cleaned_review"]), candidate_labels, hypothesis_template=hypothesis_template, multi_label=True)

# Save the output in a dataframe
multi_topic_prediction = pd.DataFrame(multi_topic_prediction)

# Take a look at the data
multi_topic_prediction.head()

Unnamed: 0,sequence,labels,scores
0,wow loved place,"[place, food, services, staff, waiter]","[0.9973077774047852, 0.4749133288860321, 0.39162465929985046, 0.21157842874526978, 0.017929434776306152]"
1,stopped late may bank holiday rick steve recommendation loved,"[services, food, staff, waiter, place]","[0.6085400581359863, 0.04878292232751846, 0.024505766108632088, 0.01964750699698925, 0.014239303767681122]"
2,selection menu great price,"[food, place, services, waiter, staff]","[0.9765596389770508, 0.22808271646499634, 0.13489174842834473, 0.005593220237642527, 0.0006310648168437183]"
3,fry great,"[food, services, waiter, place, staff]","[0.997489333152771, 0.464260458946228, 0.04853450134396553, 0.015775399282574654, 0.005298870149999857]"
4,great touch,"[services, staff, place, food, waiter]","[0.8843616843223572, 0.26677367091178894, 0.0845634713768959, 0.07716678082942963, 0.01724456064403057]"


In [None]:
# Threshold probability
threshold = 0.9

# Expand the lists
multi_topic_prediction = multi_topic_prediction.set_index('sequence').apply(pd.Series.explode).reset_index()

# Filter by threshold
multi_topic_prediction2 = multi_topic_prediction[multi_topic_prediction['scores'] >= threshold]

# Take a look at the data
multi_topic_prediction2.head()

Unnamed: 0,sequence,labels,scores
0,wow loved place,place,0.997308
10,selection menu great price,food,0.97656
15,fry great,food,0.997489
25,service prompt,services,0.976481
30,tried cape cod ravoli chicken cranberry mmmm,food,0.999313


In [None]:
multi_topic_prediction2[multi_topic_prediction2.duplicated(subset=['sequence'],keep=False)]

Unnamed: 0,sequence,labels,scores
65,hole wall great mexican street taco friendly staff,food,0.998399
66,hole wall great mexican street taco friendly staff,staff,0.993162
67,hole wall great mexican street taco friendly staff,place,0.951504
68,hole wall great mexican street taco friendly staff,services,0.946905
95,first visit hiro delight,place,0.986916
...,...,...,...
2425,go place gyro,food,0.99839
2426,go place gyro,place,0.983013
2460,want first say server great perfect service,services,0.99502
2461,want first say server great perfect service,waiter,0.950733


### Flair

In [None]:
classifier2 = TARSClassifier.load('tars-base')
cleaned_review_sentences = list(map(lambda x: Sentence(x), positive_df["cleaned_review"]))

2023-02-21 13:41:49,941 https://nlp.informatik.hu-berlin.de/resources/models/tars-base/tars-base-v8.pt not found in cache, downloading to /tmp/tmp5_xs7yo9


100%|██████████| 438064585/438064585 [00:39<00:00, 11140482.79B/s]

2023-02-21 13:42:29,804 copying /tmp/tmp5_xs7yo9 to cache at /root/.flair/models/tars-base-v8.pt





2023-02-21 13:42:31,287 removing temp file /tmp/tmp5_xs7yo9
2023-02-21 13:42:31,348 loading file /root/.flair/models/tars-base-v8.pt


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
classes = ["place", "services", "food", "staff", "waiter"]

classifier2.predict_zero_shot(cleaned_review_sentences, classes)

In [None]:
cleaned_review_sentences

[Sentence: "wow loved place" → place (0.9961),
 Sentence: "stopped late may bank holiday rick steve recommendation loved",
 Sentence: "selection menu great price" → food (0.6886); waiter (0.7198),
 Sentence: "fry great",
 Sentence: "great touch",
 Sentence: "service prompt" → services (0.8826); staff (0.7671),
 Sentence: "tried cape cod ravoli chicken cranberry mmmm" → food (0.9389),
 Sentence: "highly recommended",
 Sentence: "food amazing" → food (0.9491),
 Sentence: "service also cute" → services (0.7476); staff (0.5766),
 Sentence: "could care le interior beautiful",
 Sentence: "performed" → services (0.9065); staff (0.5177),
 Sentence: "right red velvet cake ohhh stuff good",
 Sentence: "hole wall great mexican street taco friendly staff" → place (0.9444); services (0.7362); staff (0.8837),
 Sentence: "also combo like burger fry beer 23 decent deal" → food (0.6851); waiter (0.7041),
 Sentence: "found place accident could not happier",
 Sentence: "overall like place lot" → place (0

In [None]:
cleaned_review_sentences[0].labels[0].value

'place'

In [None]:
cleaned_review_sentences[0].labels[0].score

0.9961339235305786