In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn import metrics

In [2]:
df = pd.read_csv('trip_advisor_reviews_2.csv')
df

Unnamed: 0,_id,review,lem_review,sw_review,mod_review,new_review,new_lem_review
0,65496e562042bfa2da026b54,having lived in the bay area for year i regret...,lived in the bay area for year i regret not f...,lived bay area year regret finding place soone...,lived bay area year regret finding place soone...,lived bay area year regret finding place soone...,live bay area year regret find place treasure ...
1,65496e562042bfa2da026b55,long time diner first time reviewer ive been c...,long time diner first time reviewer been comi...,long time diner time reviewer coming year impr...,long time diner time reviewer coming year impr...,long time diner time reviewer coming year impr...,long time diner time reviewer come year impres...
2,65496e562042bfa2da026b56,the view the staff the ceviche and the fish ch...,the view the staff the and the fish chip were...,view staff fish chip year favorite place escap...,view staff fish chip year favorite place escap...,view staff fish chip year favorite place escap...,view staff fish chip year favorite place escap...
3,65496e562042bfa2da026b57,my brother in law love fish and chip we heard ...,my brother in law love fish and chip we that ...,brother law love fish chip treasure island are...,brother law love fish chip treasure island are...,brother law love fish chip treasure island are...,brother law love fish chip treasure island are...
4,65496e562042bfa2da026b58,the food is always great i always start with t...,the food is always great i always start with t...,food great start chowder rest menu delicious p...,food great start chowder rest menu delicious p...,food great start chowder rest menu delicious p...,food great start chowder rest menu delicious g...
...,...,...,...,...,...,...,...
3412,65496e5b2042bfa2da0278af,if your looking for great food the best servic...,if your looking for great food the best servic...,great food service great bartender restaurant ...,great food service great bartender restaurant ...,great food service great bartender restaurant ...,great food service great bartender restaurant ...
3413,65496e5b2042bfa2da0278b0,if you come alone you won t feel alone for ver...,if you come alone you won t feel alone for ver...,feel long great place feel local great salmon ...,feel long great place feel local great salmon ...,feel long great place feel local great salmon ...,feel great place feel local great salmon amazi...
3414,65496e5b2042bfa2da0278b1,rather than spend an hour in grid lock of the ...,rather than spend an hour in grid lock of the ...,spend hour grid lock main drag weekend street ...,spend hour grid lock main drag weekend street ...,spend hour grid lock main drag weekend street ...,spend hour grid lock main drag weekend street ...
3415,65496e5b2042bfa2da0278b2,good solid old school mexican food at a very f...,good solid old school food at a very fair pri...,good solid school food fair price good enchila...,good solid school food fair price good enchila...,good solid school food fair price good enchila...,good solid school food fair price good enchila...


In [3]:
df.drop(columns=['review', 'lem_review', 'sw_review', 'mod_review', 'new_review'], inplace=True)
df.rename(columns={'new_lem_review': 'review'}, inplace=True)

In [4]:
df.review.isna().sum()

4

In [5]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,_id,review
0,65496e562042bfa2da026b54,live bay area year regret find place treasure ...
1,65496e562042bfa2da026b55,long time diner time reviewer come year impres...
2,65496e562042bfa2da026b56,view staff fish chip year favorite place escap...
3,65496e562042bfa2da026b57,brother law love fish chip treasure island are...
4,65496e562042bfa2da026b58,food great start chowder rest menu delicious g...
...,...,...
3408,65496e5b2042bfa2da0278af,great food service great bartender restaurant ...
3409,65496e5b2042bfa2da0278b0,feel great place feel local great salmon amazi...
3410,65496e5b2042bfa2da0278b1,spend hour grid lock main drag weekend street ...
3411,65496e5b2042bfa2da0278b2,good solid school food fair price good enchila...


**Removal of Some Other Stopwords**

In [6]:
custom_stopwords = ['excellent', 'outstanding', 'superb', 'fantastic', 'terrific', 'marvelous', 'wonderful',
                    'exceptional', 'admirable', 'splendid', 'poor', 'inferior', 'subpar', 'mediocre', 'lousy',
                    'terrible', 'awful', 'horrible', 'abysmal', 'dismal', 'overwhelming', 'exemplary',
                    'extraordinary', 'remarkable', 'unparalleled', 'exceptional', 'unsurpassed', 'superlative',
                    'peerless', 'incomparable', 'atrocious', 'dreadful', 'deplorable', 'appalling', 'catastrophic',
                    'abominable', 'monstrous', 'detestable', 'reprehensible', 'unbearable', 'good', 'bad', 'great',
                    'eatery', 'diner', 'bistro', 'cafe', 'brasserie', 'tavern', 'cafeteria', 'grill', 'pub',
                    'trattoria', 'location', 'spot', 'venue', 'area', 'site', 'locale', 'setting', 'region',
                    'space', 'position', 'restaurant', 'place', 'sommeli', 'cha', 'second', 'minute', 'hour', 'day', 'week',
                    'fortnight', 'month', 'year', 'decade', 'century', 'millennium', 'moment', 'quarter', 'half',
                    'nighttime', 'midnight', 'noon', 'future', 'era', 'epoch', 'age', 'period', 'interval', 'schedule',
                    'calendar', 'clock', 'watch', 'stopwatch', 'timer', 'chronometer', 'timepiece', 'o\'clock', 'a.m.',
                    'p.m.', 'yesterday', 'today', 'tomorrow', 'dawn', 'dusk', 'zone', 'daylight', 'lunar', 'solar',
                    'calendar', 'leap', 'gregorian', 'julian', 'sidereal', 'equinox', 'solstice', 'century', 'millennium', 'san']


df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in custom_stopwords]))
df

Unnamed: 0,_id,review
0,65496e562042bfa2da026b54,live bay regret find treasure island unique vi...
1,65496e562042bfa2da026b55,long time time reviewer come impress fish chip...
2,65496e562042bfa2da026b56,view staff fish chip favorite escape city enjo...
3,65496e562042bfa2da026b57,brother law love fish chip treasure island rum...
4,65496e562042bfa2da026b58,food start chowder rest menu delicious enjoy a...
...,...,...
3408,65496e5b2042bfa2da0278af,food service bartender burrito margarita famil...
3409,65496e5b2042bfa2da0278b0,feel feel local salmon amazing prawn choose ha...
3410,65496e5b2042bfa2da0278b1,spend grid lock main drag weekend street inter...
3411,65496e5b2042bfa2da0278b2,solid school food fair price enchilada inspire...


# Modelling

**Training the Vectorizer**

In [7]:
review = df.review.to_list()
review

['live bay regret find treasure island unique view waterfront golden gate bridge bay bridge super lay incredible food drink outrageous view degree water view floor ceiling window incredible poke fresh amazing hostess order house special truffle fry fish chip',
 'long time time reviewer come impress fish chip dish visit fish chowder large delicious chunk fish vegetable melt mouth cauliflower fish brownie eat worth trip',
 'view staff fish chip favorite escape city enjoy island life',
 'brother law love fish chip treasure island rumor accurate portion huge delightfully crispy haddock moist juicy inside potato authentic truffle fry fish chowder experience repeat',
 'food start chowder rest menu delicious enjoy afternoon walk island perimeter hot cold desert head come landscape moon bay gal show beautiful check pet friendly check bay episode discover napa',
 'rate high fish chip flavorless mushy potato bland cut big taste husband poke bowl beautiful hard recreate spectacular bland service 

In [8]:
doc_label = [doc[:25]+"..." for doc in review]

In [9]:
vectorizer = CountVectorizer()
doc_word = vectorizer.fit_transform(review)
columns = vectorizer.get_feature_names_out()
pd.DataFrame(doc_word.toarray(), index=doc_label, columns=columns)

Unnamed: 0,abalone,ability,absinthe,absolute,absurd,abundance,abundant,abusive,accelerate,accent,...,yuck,yummy,zabaglione,zephyr,zesty,zinfandel,zip,zoo,zoom,zucchini
live bay regret find trea...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
long time time reviewer c...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
view staff fish chip favo...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
brother law love fish chi...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
food start chowder rest m...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
food service bartender bu...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
feel feel local salmon am...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
spend grid lock main drag...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
solid school food fair pr...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Training the NMF Model**

In [10]:
nmf_model = NMF(n_components=10)
doc_topic = nmf_model.fit_transform(doc_word)

In [11]:
nmf_df = pd.DataFrame(doc_topic.round(10),
             index = doc_label,
             columns = ["Topic 0","Topic 1","Topic 2","Topic 3","Topic 4","Topic 5","Topic 6","Topic 7","Topic 8","Topic 9"])
nmf_df

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
live bay regret find trea...,0.029891,0.186850,0.000000,0.170157,0.025560,0.110707,0.008395,0.022574,0.008912,0.000000
long time time reviewer c...,0.000000,0.006723,0.000000,0.000000,0.013501,0.092925,0.356194,0.205070,0.000000,0.000000
view staff fish chip favo...,0.000217,0.003343,0.013398,0.000000,0.005299,0.098786,0.000000,0.004717,0.005284,0.000000
brother law love fish chi...,0.000000,0.000000,0.000000,0.019886,0.004897,0.054857,0.001274,0.053414,0.103759,0.025396
food start chowder rest m...,0.013648,0.161599,0.071113,0.000000,0.000000,0.151528,0.002433,0.022309,0.001696,0.024915
...,...,...,...,...,...,...,...,...,...,...
food service bartender bu...,0.000000,0.161410,0.003391,0.000000,0.136931,0.007454,0.000000,0.000000,0.128972,0.000000
feel feel local salmon am...,0.019705,0.013044,0.100136,0.004638,0.000000,0.175862,0.000000,0.031875,0.039402,0.000000
spend grid lock main drag...,0.015704,0.011687,0.066481,0.041356,0.158942,0.026679,0.000000,0.032521,0.053715,0.016011
solid school food fair pr...,0.000000,0.334145,0.007583,0.000000,0.028969,0.000000,0.000000,0.000000,0.131429,0.028766


In [12]:
topic_word = pd.DataFrame(nmf_model.components_.round(3),
             index = ["Topic 0","Topic 1","Topic 2","Topic 3","Topic 4","Topic 5","Topic 6","Topic 7","Topic 8","Topic 9"],
             columns = columns)
topic_word

Unnamed: 0,abalone,ability,absinthe,absolute,absurd,abundance,abundant,abusive,accelerate,accent,...,yuck,yummy,zabaglione,zephyr,zesty,zinfandel,zip,zoo,zoom,zucchini
Topic 0,0.0,0.007,0.0,0.016,0.0,0.0,0.0,0.005,0.013,0.0,...,0.0,0.005,0.003,0.0,0.0,0.0,0.059,0.0,0.008,0.0
Topic 1,0.001,0.0,0.0,0.019,0.005,0.003,0.0,0.0,0.001,0.0,...,0.002,0.0,0.0,0.001,0.0,0.0,0.0,0.002,0.0,0.0
Topic 2,0.025,0.0,0.002,0.064,0.003,0.0,0.005,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.015,0.0,0.0,0.0,0.0
Topic 3,0.002,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.062,0.0,0.0,0.0,0.003,0.008,0.0,0.0,0.031
Topic 4,0.0,0.039,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.0,...,0.0,0.0,0.003,0.0,0.006,0.0,0.0,0.0,0.0,0.0
Topic 5,0.0,0.0,0.015,0.0,0.0,0.0,0.004,0.0,0.0,0.008,...,0.0,0.107,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.002
Topic 6,0.0,0.0,0.013,0.0,0.0,0.008,0.002,0.0,0.0,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Topic 7,0.0,0.0,0.0,0.027,0.01,0.0,0.002,0.0,0.0,0.013,...,0.002,0.033,0.0,0.0,0.009,0.0,0.044,0.001,0.0,0.018
Topic 8,0.004,0.001,0.032,0.011,0.0,0.001,0.0,0.004,0.011,0.003,...,0.0,0.0,0.01,0.0,0.012,0.0,0.0,0.0,0.004,0.012
Topic 9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007,0.0,0.0,...,0.0,0.019,0.0,0.0,0.0,0.002,0.0,0.0,0.0,0.0


**Displaying Top 10 Keywords for Each Topic**

In [13]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("Topic ", ix)
        else:
            print("Topic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [14]:
display_topics(nmf_model, columns, 10)

Topic  0
table, seat, wait, people, reservation, book, server, dining, tell, main
Topic  1
food, amazing, eat, recommend, drink, price, people, quality, friendly, atmosphere
Topic  2
wine, menu, chef, visit, list, glass, taste, dining, evening, wife
Topic  3
order, wait, seat, bar, waiter, drink, salad, fry, server, chicken
Topic  4
service, price, attentive, visit, dinner, view, fresh, quality, atmosphere, tasty
Topic  5
delicious, staff, nice, friendly, dinner, pizza, salad, recommend, atmosphere, bar
Topic  6
time, eat, visit, long, reservation, love, seat, crab, lunch, leave
Topic  7
dish, menu, flavor, taste, crab, eat, shrimp, sauce, chicken, noodle
Topic  8
experience, meal, server, dining, main, feel, wife, review, waiter, starter
Topic  9
breakfast, egg, coffee, hotel, toast, cold, morning, potato, benedict, bacon


**Topics and Topic Numbers**

In [15]:
nmf_topic_num = np.argmax(doc_topic, axis=1)
nmf_topic_names = ["Topic " + str(topic) for topic in nmf_topic_num]

df['topic_num'] = nmf_topic_num
df['topic'] = nmf_topic_names
df

Unnamed: 0,_id,review,topic_num,topic
0,65496e562042bfa2da026b54,live bay regret find treasure island unique vi...,1,Topic 1
1,65496e562042bfa2da026b55,long time time reviewer come impress fish chip...,6,Topic 6
2,65496e562042bfa2da026b56,view staff fish chip favorite escape city enjo...,5,Topic 5
3,65496e562042bfa2da026b57,brother law love fish chip treasure island rum...,8,Topic 8
4,65496e562042bfa2da026b58,food start chowder rest menu delicious enjoy a...,1,Topic 1
...,...,...,...,...
3408,65496e5b2042bfa2da0278af,food service bartender burrito margarita famil...,1,Topic 1
3409,65496e5b2042bfa2da0278b0,feel feel local salmon amazing prawn choose ha...,5,Topic 5
3410,65496e5b2042bfa2da0278b1,spend grid lock main drag weekend street inter...,4,Topic 4
3411,65496e5b2042bfa2da0278b2,solid school food fair price enchilada inspire...,1,Topic 1


In [16]:
df.topic.value_counts()

Topic 1    1093
Topic 5     540
Topic 6     315
Topic 8     302
Topic 4     271
Topic 3     228
Topic 2     204
Topic 7     197
Topic 9     153
Topic 0     110
Name: topic, dtype: int64

**Assigning Topic Labels**

In [47]:
topic_labels = {
    0: 'Efficient Dining Service',
    1: 'Quality Food and Atmosphere',
    2: 'Wine and Culinary Delights',
    3: 'Ordering and Waiting',
    4: 'Attentive Dining Experience',
    5: 'Delicious Dinner Atmosphere',
    6: 'Timely Dining Experience',
    7: 'Flavorful Menu Options',
    8: 'Overall Dining Experience',
    9: 'Hotel Breakfast Delights'
}

df['topic'] = df['topic_num'].map(topic_labels)
df

Unnamed: 0,_id,review,topic_num,topic
0,65496e562042bfa2da026b54,live bay regret find treasure island unique vi...,1,Quality Food and Atmosphere
1,65496e562042bfa2da026b55,long time time reviewer come impress fish chip...,6,Timely Dining Experience
2,65496e562042bfa2da026b56,view staff fish chip favorite escape city enjo...,5,Delicious Dinner Atmosphere
3,65496e562042bfa2da026b57,brother law love fish chip treasure island rum...,8,Overall Dining Experience
4,65496e562042bfa2da026b58,food start chowder rest menu delicious enjoy a...,1,Quality Food and Atmosphere
...,...,...,...,...
3408,65496e5b2042bfa2da0278af,food service bartender burrito margarita famil...,1,Quality Food and Atmosphere
3409,65496e5b2042bfa2da0278b0,feel feel local salmon amazing prawn choose ha...,5,Delicious Dinner Atmosphere
3410,65496e5b2042bfa2da0278b1,spend grid lock main drag weekend street inter...,4,Attentive Dining Experience
3411,65496e5b2042bfa2da0278b2,solid school food fair price enchilada inspire...,1,Quality Food and Atmosphere


**Comparing Topics with the Original and Preprocessed Documents**

In [18]:
reviews_full = pd.read_json('reviews.json', lines=True)
df['full_review'] = reviews_full.review
df

Unnamed: 0,_id,review,topic_num,topic,full_review
0,65496e562042bfa2da026b54,live bay regret find treasure island unique vi...,1,Quality Food and Atmosphere,Having lived in the Bay Area for 30+ years I r...
1,65496e562042bfa2da026b55,long time time reviewer come impress fish chip...,6,Timely Dining Experience,"Long time diner, first time reviewer. I’ve bee..."
2,65496e562042bfa2da026b56,view staff fish chip favorite escape city enjo...,5,Delicious Dinner Atmosphere,"The view, the staff, the ceviche and the fish ..."
3,65496e562042bfa2da026b57,brother law love fish chip treasure island rum...,8,Overall Dining Experience,My brother-in-law loves Fish and Chips. We he...
4,65496e562042bfa2da026b58,food start chowder rest menu delicious enjoy a...,1,Quality Food and Atmosphere,The food is always great! I always start with ...
...,...,...,...,...,...
3408,65496e5b2042bfa2da0278af,food service bartender burrito margarita famil...,1,Quality Food and Atmosphere,My wife and I were here on a business trip and...
3409,65496e5b2042bfa2da0278b0,feel feel local salmon amazing prawn choose ha...,5,Delicious Dinner Atmosphere,We dined here two nights in a row on our stay ...
3410,65496e5b2042bfa2da0278b1,spend grid lock main drag weekend street inter...,4,Attentive Dining Experience,Best Clam Chowder!\nThe chowder here is so del...
3411,65496e5b2042bfa2da0278b2,solid school food fair price enchilada inspire...,1,Quality Food and Atmosphere,"TravelGirl, We are so glad you were able to di..."


In [19]:
df.full_review[0]

'Having lived in the Bay Area for 30+ years I regret not finding out about this place sooner. Treasure Island has such unique views of the SF skyline, from the waterfront to the Golden Gate Bridge all the way around to the Bay Bridge, \n\nMersea...has a super laid-back vibe with incredible food and drinks and the most outrageous view of the SF skyline,  a full 270 degree water view with floor to ceiling windows. \n\nDaniel makes an incredible Manhattan, and the poke was super fresh.  Meesun was an amazing hostess and made sure we ordered all the house specials…Truffle fries, Fish and Chips etc etc. Mmmmm!! We will definitely be back soon!More'

In [20]:
df.full_review[2]

"The view, the staff, the ceviche and the fish & chips were the best I've encountered in years! My new favorite place to escape Marin and the city and enjoy a moment of island life!"

In [21]:
df.full_review[3410]

'Best Clam Chowder!\nThe chowder here is so delicious. It is a thinner soup, unlike the thick heavy versions elsewhere. The dining area is rustic and traditional. Definitely a place to have a more elevated experience. Saying that, there is outdoor space to enjoy a...casual lunch or evening too. The staff is attentive and the variety of seafood dishes are abundant. I love coming here with family, friends or even business.More'

In [22]:
df.sample(10)

Unnamed: 0,_id,review,topic_num,topic,full_review
181,65496e562042bfa2da026c09,phenomenal meal event train marathon meal atte...,8,Overall Dining Experience,"This place is phenomenal. It isn’t a meal, it’..."
3295,65496e5b2042bfa2da02783e,night know eat garden service beat nectarine s...,4,Attentive Dining Experience,Beware of scam for events — the manager Sam st...
984,65496e562042bfa2da026f33,dinner sumptuous service attentive food layer ...,8,Overall Dining Experience,Dreaming to visit and taste those masterpiece ...
2400,65496e5a2042bfa2da0274bf,delighted hear meal crab house prime rib glad ...,8,Overall Dining Experience,My husband and i went to ChaChaCha last night ...
639,65496e562042bfa2da026dd4,big mistake stop eat holiday holiday eat eat w...,3,Ordering and Waiting,"The service was attentive, paced perfectly for..."
2767,65496e5a2042bfa2da02762e,wife feel pizza staff friendly pizza delicious...,5,Delicious Dinner Atmosphere,We stopped by for pizza after seeing the Paint...
384,65496e562042bfa2da026cd5,kind review rating experience service peter vi...,4,Attentive Dining Experience,Stopped for lunch on our way to Sonoma. Peter ...
707,65496e562042bfa2da026e1b,fish chip eat discovery princess fisherman wha...,1,Quality Food and Atmosphere,Another amazing dining experience at the bar. ...
741,65496e562042bfa2da026e3d,emma happy entire group meal apologize small s...,8,Overall Dining Experience,I have no idea why this has the ratings it doe...
658,65496e562042bfa2da026de7,memorable meal west coast visit celebrate birt...,5,Delicious Dinner Atmosphere,I have been here before and the food remains v...


In [23]:
df.full_review[707]

"Another amazing dining experience at the bar. Attentive warm service. Perfectly prepared steaks (Bone in Ribeye) with Harris' scalloped potatoes. Family has been coming here for year and is a must whenever we are in SF."

In [24]:
df.full_review[639]

'The service was attentive, paced perfectly for us and friendly. Food tasted amazingly good, fresh hot and delicious accompaniments. $90 covered 2 good burgers, side order of 2 sets of fries and onion rings with sodas. Really enjoyed it.'

In [25]:
df.sample(5)

Unnamed: 0,_id,review,topic_num,topic,full_review
2143,65496e5a2042bfa2da0273bd,early dinner walk pier nice food minor issue q...,1,Quality Food and Atmosphere,"Food was good , service excellent very good ni..."
863,65496e562042bfa2da026eb9,drink service bar amazing clam chowder view at...,4,Attentive Dining Experience,The food was great however we felt like we wer...
2092,65496e5a2042bfa2da02738a,dinner evening daughter experience real food c...,1,Quality Food and Atmosphere,This small but very lively place serves excell...
2121,65496e5a2042bfa2da0273a7,trip hotel knob hill live piano player furnish...,4,Attentive Dining Experience,Stepping into Big 4 is like stepping into anot...
621,65496e562042bfa2da026dc2,food average local live nearby expect long wai...,3,Ordering and Waiting,The duck leg was too dry and some of the musse...


In [26]:
df.full_review[621]

"The duck leg was too dry and some of the mussels were off. The beef bourguignon was the only good dish of the three we ordered. The wine (a GSM) was good and went well with the food. But we won't be back - mostly...because we emphasized how picky we are about mussels and were assured they were super fresh, came in that day.More"

In [27]:
df.sample(5)

Unnamed: 0,_id,review,topic_num,topic,full_review
3405,65496e5b2042bfa2da0278ac,food southern popular local food service waite...,1,Quality Food and Atmosphere,This restaurant has huge open windows that loo...
160,65496e562042bfa2da026bf4,smell urine entire offer whiff sanctuary smell...,5,Delicious Dinner Atmosphere,Smelled like urine in the entire restaurant- w...
3362,65496e5b2042bfa2da027881,person trip staff courteous attentive atmosphe...,8,Overall Dining Experience,Probably the best Asian cuisine I’ve eaten. Ma...
896,65496e562042bfa2da026edb,breakfast food taste presentation service tour...,1,Quality Food and Atmosphere,Low key neighborhood kind of pizza place. Chi...
1165,65496e592042bfa2da026fe8,worth food staff friendly helpful crowded affo...,1,Quality Food and Atmosphere,My husband and I visited here while in San Fra...


# Conclusion

**Encapsulating All Preprocessing Steps in One Function**

In [26]:
import re
import string
import emoji
import langid
import chardet
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import spacy

nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')+["0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz", "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't",
                                         "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "a's", "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "ain't", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "aren't", "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", "but", "by", "c'mon", "c's", "came", "can", "can't", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn't", "course", "currently", "definitely", "described", "despite", "did", "didn't", "different", "do", "does", "doesn't", "doing", "don't", "done", "down", "downwards", "during", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "far", "few", "fifth", "first", "five", "followed", "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "had", "hadn't", "happens", "hardly", "has", "hasn't", "have", "haven't", "having", "he", "he's", "hello", "help", "hence", "her", "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "i'd", "i'll", "i'm", "i've", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll", "it's", "its", "itself", "just", "keep", "keeps", "kept", "know", "known", "knows", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "let's", "like", "liked", "likely", "little", "look", "looking", "looks", "ltd", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "she", "should", "shouldn't", "since", "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure", "t's", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "that's", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "there's", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "these", "they", "they'd", "they'll", "they're", "they've", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "value", "various", "very", "via", "viz", "vs", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "welcome", "well", "went", "were", "weren't", "what", "what's", "whatever", "when", "whence", "whenever", "where", "where's", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "who's", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within", "without", "won't", "wonder", "would", "wouldn't", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "zero", "I", "a", "about", "an", "are", "as", "at", "be", "by", "com", "for", "from", "how", "in", "is", "it", "of", "on", "or", "that", "the", "this", "to", "was", "what", "when", "where", "who", "will", "with", "the", "www", "a", "able", "about", "above", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ah", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "are", "aren", "arent", "arise", "around", "as", "aside", "ask", "asking", "at", "auth", "available", "away", "awfully", "b", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "between", "beyond", "biol", "both", "brief", "briefly", "but", "by", "c", "ca", "came", "can", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "could", "couldnt", "d", "date", "did", "didn't", "different", "do", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "et-al", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "few", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "for", "former", "formerly", "forth", "found", "four", "from", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h", "had", "happens", "hardly", "has", "hasn't", "have", "haven't", "having", "he", "hed", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "how", "howbeit", "however", "hundred", "i", "id", "ie", "if", "i'll", "im", "immediate", "immediately", "importance", "important", "in", "inc", "indeed", "index", "information", "instead", "into", "invention", "inward", "is", "isn't", "it", "itd", "it'll", "its", "itself", "i've", "j", "just", "k", "keep", "keeps", "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "mug", "must", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "ones", "only", "onto", "or", "ord", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out",
                                         "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "she", "shed", "she'll", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "t", "take", "taken", "taking", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'll", "theyre", "they've", "think", "this", "those", "thou", "though", "thoughh", "thousand", "throug", "through", "throughout", "thru", "thus", "til", "tip", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twice", "two", "u", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "v", "value", "various", "'ve", "very", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "was", "wasnt", "way", "we", "wed", "welcome", "we'll", "went", "were", "werent", "we've", "what", "whatever", "what'll", "whats", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "whose", "why", "widely", "willing", "wish", "with", "within", "without", "wont", "words", "world", "would", "wouldnt", "www", "x", "y", "yes", "yet", "you", "youd", "you'll", "your", "youre", "yours", "yourself", "yourselves", "you've", "z", "zero"]
stop_words_set = set(stop_words)
nlp = spacy.load('en_core_web_sm')

combinations = [f"{letter1}{letter2}" for letter1 in 'abcdefghijklmnopqrstuvwxyz' for letter2 in 'abcdefghijklmnopqrstuvwxyz']
combinations_set = set(combinations)

def detect_encoding(text):
    result = chardet.detect(text.encode())
    return result['encoding'], result['confidence']

def lemmatize_word(word):
    return lemmatizer.lemmatize(word, pos='n')

def lemmatize_and_filter(word):
    lemma = lemmatizer.lemmatize(word, pos='n')
    return lemma if lemma in set(nltk.corpus.words.words()) else ''

def remove_repeating_letters(s):
    words = s.split()
    result = []
    for word in words:
        new_word = re.sub(r'(.)\1{2,}', r'\1', word)
        result.append(new_word)
    return ' '.join(result)

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word.lower() not in stop_words_set])

def remove_unimportant_words(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

def remove_binaries(text):
    return ' '.join([word for word in text.split() if word.lower() not in combinations_set])

def lemmatize_and_remove_adverbs(sentence):
    doc = nlp(sentence)
    lemmatized_tokens = [token.lemma_ if token.pos_ != 'ADV' else '' for token in doc]
    lemmatized_tokens = [token for token in lemmatized_tokens if token]
    return ' '.join(lemmatized_tokens)

def preprocess_function(text):
    combinations_set = set(combinations)
    
    text = text.replace("<br />", "")
    text = text.replace("\n\n", "")
    text = text.replace("…", " ")
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text.lower())
    text = re.sub(' +', ' ', text)
    text = remove_repeating_letters(text)
    text = emoji.replace_emoji(text, replace='')
    text = remove_repeating_letters(text)
    
    text = ' '.join([word for word in text.split() if langid.classify(word)[0] == 'en'])
    
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    
    text = re.sub('[^a-zA-Z\s]', '', text)
    text = ' '.join([lemmatize_word(word) for word in text.split()])
    text = ' '.join([lemmatize_and_filter(word) for word in text.split()])
    text = remove_stopwords(text)
    text = remove_unimportant_words(text)
    text = remove_binaries(text)
    text = lemmatize_and_remove_adverbs(text)
    
    custom_stopwords = ['excellent', 'outstanding', 'superb', 'fantastic', 'terrific', 'marvelous', 'wonderful',
                        'exceptional', 'admirable', 'splendid', 'poor', 'inferior', 'subpar', 'mediocre', 'lousy',
                        'terrible', 'awful', 'horrible', 'abysmal', 'dismal', 'overwhelming', 'exemplary',
                        'extraordinary', 'remarkable', 'unparalleled', 'exceptional', 'unsurpassed', 'superlative',
                        'peerless', 'incomparable', 'atrocious', 'dreadful', 'deplorable', 'appalling', 'catastrophic',
                        'abominable', 'monstrous', 'detestable', 'reprehensible', 'unbearable', 'good', 'bad', 'great',
                        'eatery', 'diner', 'bistro', 'cafe', 'brasserie', 'tavern', 'cafeteria', 'grill', 'pub',
                        'trattoria', 'location', 'spot', 'venue', 'area', 'site', 'locale', 'setting', 'region',
                        'space', 'position', 'restaurant', 'place', 'sommeli', 'cha', 'second', 'minute', 'hour', 'day', 'week',
                        'fortnight', 'month', 'year', 'decade', 'century', 'millennium', 'moment', 'quarter', 'half',
                        'nighttime', 'midnight', 'noon', 'future', 'era', 'epoch', 'age', 'period', 'interval', 'schedule',
                        'calendar', 'clock', 'watch', 'stopwatch', 'timer', 'chronometer', 'timepiece', 'o\'clock', 'a.m.',
                        'p.m.', 'yesterday', 'today', 'tomorrow', 'dawn', 'dusk', 'zone', 'daylight', 'lunar', 'solar',
                        'calendar', 'leap', 'gregorian', 'julian', 'sidereal', 'equinox', 'solstice', 'century', 'millennium', 'san']
    text = ' '.join([word for word in text.split() if word not in custom_stopwords])
    
    return text

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/salimkilinc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/salimkilinc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
new_document = "Due to last-minute reservations at the bar next to another individual from New Orleans area. We both thought the restaurant was overrated and overpriced (food good, not outstanding at these prices) certainly compared to even Paris or New York City. But was particularly annoying to me is I thought they missed giving me 1 of the courses and rather than replacing it up as each of the courses were very tiny, they said no, you're given a course. Obviously they do not believe that the customers always right."
preprocessed_text = preprocess_function(new_document)
print(preprocessed_text)

reservation bar individual think food price city annoying think tiny customer


In [35]:
new_document = "Dining at Providence in Los Angeles was a culinary voyage that celebrated the flavors of the sea and the art of fine dining. The restaurant’s elegant and contemporary ambiance set the stage for an exceptional meal. The menu was a symphony of fresh seafood and seasonal ingredients, highlighting the richness of Californian cuisine. The tasting menu was a journey into creative dishes and exquisite flavors. The staff’s knowledge of the menu and their dedication to culinary excellence made the dining experience truly exceptional. Pairing the dishes with wines from the extensive list was a delightful exploration of the region’s gastronomy. Providence is not just a restaurant; it’s a tribute to the beauty of fresh, locally sourced ingredients and a must-visit for anyone seeking an exceptional gastronomic experience in Los Angeles."
preprocessed_text = preprocess_function(new_document)
print(preprocessed_text)

dine providence culinary voyage celebrate flavor sea art fine dining elegant contemporary set stage meal menu symphony fresh seasonal tasting menu creative dish exquisite flavor knowledge menu dedication culinary dining experience dish wine extensive list exploration gastronomy providence tribute beauty fresh visit seek gastronomic experience


In [61]:
new_document = "This is my first time to dine in this Michelin 2 star restaurant. My favorites were the langotti pasta with crab and uni, the lobster with matsutake mushrooms and the salmon with pepper truffle sauce with a side of sun chokes and white truffle shavings. The apple sorbet with apple chip was refreshingly delightful. Margo and the other staff members made this 3 hour dining experience remarkable."
preprocessed_text = preprocess_function(new_document)
print(preprocessed_text)

time dine star favorite crab lobster mushroom salmon pepper choke white shaving apple apple chip staff member dining experience


**Prediction**

In [48]:
def classify_new_document(new_document, nmf_model, vectorizer, topic_labels):
    preprocessed_document = preprocess_function(new_document)

    new_doc_word = vectorizer.transform([preprocessed_document])

    new_doc_topic = nmf_model.transform(new_doc_word)

    topic_distribution = pd.DataFrame(new_doc_topic.round(10), columns=topic_labels.values())
    topic_distribution["Dominant Topic"] = topic_distribution.idxmax(axis=1)

    return topic_distribution

In [64]:
result = classify_new_document(new_document, nmf_model, vectorizer, topic_labels)
result

Unnamed: 0,Efficient Dining Service,Quality Food and Atmosphere,Wine and Culinary Delights,Ordering and Waiting,Attentive Dining Experience,Delicious Dinner Atmosphere,Timely Dining Experience,Flavorful Menu Options,Overall Dining Experience,Hotel Breakfast Delights,Dominant_Topic
0,0.019381,0.165058,0.014489,0.022426,0.010457,0.010826,0.014446,0.004233,0.006819,0.011728,Quality Food and Atmosphere


**Saving the Model for Streamlit Application**

In [58]:
from joblib import dump, load

dump(nmf_model, 'reviews_nmf_model.pkl')

['reviews_nmf_model.pkl']

In [59]:
dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [60]:
dump(stop_words_set, 'stop_words_set.pkl')

['stop_words_set.pkl']