In [79]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Copy the cleaned dataset from Drive into the current working directory
# (shell command run through the notebook's `!` escape).
! cp -r --verbose '/content/drive/MyDrive/IAA2/NLP/dataset_cleaned.csv' .

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
'/content/drive/MyDrive/IAA2/NLP/dataset_cleaned.csv' -> './dataset_cleaned.csv'


In [80]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

# Location of the cleaned dataset, relative to the working directory.
DATASET_FILE = "./dataset_cleaned.csv"

# Load the CSV into a DataFrame; the bare `df` on the last line makes the
# notebook display it.
df = pd.read_csv(DATASET_FILE)
df

Unnamed: 0,text,stars,length,text_cleaned
0,I've only had food from here once and it wasn'...,1,68,food memorable panang curry balance flavor lik...
1,I will never return here again. Ever. I was ...,1,87,NOT_return ever sit booth wait dinner come scu...
2,I wish my experience was great as others. I di...,1,166,wish experience great others din wednesday nig...
3,Are the rosemary grapefruit scones supposed to...,1,81,rosemary grapefruit scone suppose taste like w...
4,Our takeout order was half wrong. Food was mis...,1,32,takeout order half wrong food miss portion siz...
...,...,...,...,...
24995,I was a loyal fan of Aroy before the ownership...,5,75,loyal fan aroy ownership change apprehensive v...
24996,Stopped here for a bite while wandering around...,5,55,stopped bite wander around faneuil hall pleasa...
24997,"A quiet place with excellent food, great music...",5,32,quiet place excellent food great music helpful...
24998,Super delicious food. Awesome vibe. I suffered...,5,41,super delicious food awesome vibe suffer disne...


In [81]:
# Keep only the reviews rated 1 or 2 stars (original index labels are kept).
low_star_mask = df["stars"].isin([1, 2])
df = df.loc[low_star_mask]

In [82]:
# Show the cleaned text of the review whose index label is 0.
print(df.loc[0, "text_cleaned"])

food memorable panang curry balance flavor like taste coconut lemongrass lime lack east asia offer way sweet care owner thai eat anything cook well place claim serve thai food


In [83]:
# Show the cleaned text of the review whose index label is 1.
print(df.loc[1, "text_cleaned"])

NOT_return ever sit booth wait dinner come scurry mouse booth dining room immediately get leave inform front desk issue blow nothing say oh ya mouse problem NOT_big deal certainly report department health disgust


In [84]:
def display_topics(model, feature_names, num_top_words, topic_names=None):
    '''Print every topic of a fitted model followed by its top-weighted terms.

    Args:
        model: Fitted model exposing ``components_``, the topic-term matrix
            (one row per topic, one column per feature).
        feature_names: Sequence mapping a column position of
            ``components_`` to the corresponding term.
        num_top_words: Number of highest-weighted terms printed per topic.
        topic_names: Optional sequence of human-readable topic labels. It may
            be shorter than the number of topics or contain empty entries;
            any topic without a label is printed with its number instead.

    Returns:
        None. The topics are written to standard output.
    '''
    # iterate through topics in topic-term matrix, 'H' aka
    # model.components_
    for ix, topic in enumerate(model.components_):
        # Guard the lookup so a label list shorter than the number of topics
        # falls back to the numeric header instead of raising IndexError.
        label = None
        if topic_names and ix < len(topic_names):
            label = topic_names[ix]

        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")

        # argsort is ascending, so walk it backwards from the end to get the
        # `num_top_words` largest weights, highest first.
        top_indices = topic.argsort()[:-num_top_words - 1:-1]
        print(", ".join([str(feature_names[i]) for i in top_indices]))

In [85]:
# TF-IDF vectorizer: drop terms found in more than 80% of the documents
# (max_df) or in fewer than 1% of them (min_df).
vectorizer = TfidfVectorizer(max_df=.8, min_df=.01)
# Fit on the cleaned review text and transform it into a TF-IDF Doc-Term Matrix
X = vectorizer.fit_transform(df.text_cleaned)
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and only fall back on releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Dense data-frame of the Doc-Term Matrix with the vocabulary terms as column names
data_dtm_noun = pd.DataFrame(X.toarray(), columns=feature_names)
# Reuse the index of `df` so each row stays tied to its review
data_dtm_noun.index = df.index
# Visually inspect Document Term Matrix
data_dtm_noun.head()



Unnamed: 0,00,10,100,11,12,15,18,20,25,30,...,wrong,yeah,year,yell,yelp,yes,yesterday,yet,young,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.392054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.449125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
# Hand-written labels, one per NMF topic, in topic order. They were chosen by
# reading the top words printed at the end of this cell, so re-check them
# against that output whenever the data or the model settings change.
topics = ['mauvais accueil','pas bon gout','mauvaise pizza','livraison retardée','rapport qualité/prix mauvais','mauvais service','mauvais burger','trop d\'attente','mauvais poulet','mauvaise ambiance au bar','mauvaise 2eme visite','manager rude et arrogant','mauvais sandwich','mauvais sushi','mauvaise experience d\'habitue']
# Fixed seed so that re-running the notebook reproduces the same factorization
# and the labels above stay attached to the same topics.
NMF_RANDOM_STATE = 0
# One component per label keeps the model size and the label list in sync.
nmf_model = NMF(n_components=len(topics), random_state=NMF_RANDOM_STATE)
# Learn an NMF model for given Document Term Matrix 'V'
# Extract the document-topic matrix 'W'
doc_topic = nmf_model.fit_transform(data_dtm_noun)
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and only fall back on releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Extract top words from the topic-term matrix 'H'
display_topics(nmf_model, feature_names, 15, topics)




Topic: ' mauvais accueil '
place, like, really, get, look, try, people, want, good, staff, love, give, coffee, eat, seem

Topic: ' pas bon gout '
good, taste, like, dish, sauce, price, flavor, menu, restaurant, really, much, pretty, bland, well, small

Topic: ' mauvaise pizza '
pizza, crust, cheese, topping, slice, delivery, good, pie, sauce, thin, cold, call, deliver, eat, soggy

Topic: ' livraison retardée '
order, take, delivery, get, wrong, call, food, come, deliver, half, drink, item, receive, tell, ask

Topic: ' rapport qualité/prix mauvais '
food, service, bad, good, slow, terrible, horrible, restaurant, ever, great, mediocre, poor, quality, cold, average

Topic: ' mauvais service '
table, come, server, waitress, ask, waiter, take, sit, seat, water, bring, check, restaurant, meal, leave

Topic: ' mauvais burger '
burger, fry, bun, onion, cheese, good, medium, cook, well, get, cold, eat, rare, bacon, ring

Topic: ' trop d'attente '
wait, minute, hour, 30, 15, 20, 10, long, min, 



In [87]:
# Sanity check: one row per review kept in `df`, one column per topic label.
assert doc_topic.shape == (len(df), len(topics)), doc_topic.shape
# Index the document-topic weights with `df.index` instead of a fresh 0..n-1
# range: later cells take row labels from this frame and use them to look
# reviews up in `df`, which only works when both frames share the same labels.
df_topic = pd.DataFrame(doc_topic, columns=topics, index=df.index)
df_topic

Unnamed: 0,mauvais accueil,pas bon gout,mauvaise pizza,livraison retardée,rapport qualité/prix mauvais,mauvais service,mauvais burger,trop d'attente,mauvais poulet,mauvaise ambiance au bar,mauvaise 2eme visite,manager rude et arrogant,mauvais sandwich,mauvais sushi,mauvaise experience d'habitue
0,0.022586,0.060059,0.000000,0.000000,0.035117,0.000000,0.000916,0.000000,0.010683,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.023299,0.000000,0.026612,0.000493,0.004329,0.000000,0.035078,0.004079,0.000000,0.002008
2,0.000000,0.029875,0.000000,0.003352,0.016163,0.023924,0.000000,0.000000,0.059848,0.024692,0.003234,0.011264,0.000000,0.005904,0.023389
3,0.014167,0.059952,0.000000,0.000000,0.000000,0.014919,0.000206,0.000000,0.000000,0.006559,0.007687,0.003795,0.000000,0.000000,0.019467
4,0.027982,0.005371,0.000000,0.050737,0.016896,0.000000,0.000000,0.017325,0.001044,0.001094,0.000000,0.000000,0.004848,0.001252,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.000000,0.033866,0.000509,0.003357,0.000000,0.000000,0.000000,0.000000,0.002802,0.000000,0.000000,0.000000,0.004272,0.000000,0.000000
9996,0.034937,0.056163,0.063227,0.008135,0.000000,0.001797,0.001676,0.000000,0.007232,0.000000,0.014799,0.017893,0.060759,0.000000,0.011422
9997,0.000000,0.052957,0.000000,0.035215,0.021266,0.033709,0.000554,0.000000,0.000000,0.016996,0.003633,0.038171,0.000000,0.000000,0.004455
9998,0.000000,0.038281,0.010027,0.000000,0.016411,0.000000,0.000000,0.000000,0.002230,0.000000,0.000000,0.000000,0.045615,0.000000,0.023334


In [88]:
# For each topic label, collect the index labels of the 3 rows of `df_topic`
# with the largest weight in that topic's column.
reviews = [df_topic.nlargest(3, topic).index.values for topic in topics]
print(reviews)

[array([ 920, 2490,  123]), array([9098, 5043, 5838]), array([5484, 6518,  976]), array([6369, 4146, 2365]), array([ 375, 6301, 1727]), array([9601, 6166, 4712]), array([6270, 7346, 6536]), array([5558, 4098, 1032]), array([9130, 1114, 2875]), array([5669, 9897, 6987]), array([9983, 7517, 3719]), array([1510,  278, 4502]), array([6384, 7258, 8503]), array([6286, 9041, 9480]), array([4079,  463, 2517])]


In [89]:
# Print the selected row labels, one topic per line.
for top_rows in reviews:
    print(top_rows)

[ 920 2490  123]
[9098 5043 5838]
[5484 6518  976]
[6369 4146 2365]
[ 375 6301 1727]
[9601 6166 4712]
[6270 7346 6536]
[5558 4098 1032]
[9130 1114 2875]
[5669 9897 6987]
[9983 7517 3719]
[1510  278 4502]
[6384 7258 8503]
[6286 9041 9480]
[4079  463 2517]


In [90]:
# Print the list of topic labels.
print(topics)

['mauvais accueil', 'pas bon gout', 'mauvaise pizza', 'livraison retardée', 'rapport qualité/prix mauvais', 'mauvais service', 'mauvais burger', "trop d'attente", 'mauvais poulet', 'mauvaise ambiance au bar', 'mauvaise 2eme visite', 'manager rude et arrogant', 'mauvais sandwich', 'mauvais sushi', "mauvaise experience d'habitue"]


In [91]:
# For every topic, print the raw text of its selected reviews (looked up in
# `df` by index label), then the topic label followed by a dashed rule.
for topic_label, row_labels in zip(topics, reviews):
    for row_label in row_labels:
        print(df.text[row_label])
    print(topic_label,'----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
    print('\n')

This place used to be a cool, chill place. Now its a bunch of neanderthal bouncers hopped up on steroids acting like the can do whatever they want. There are so many better places in davis square where they are glad you are visiting their business. Sad that the burren is now the worst place in davis.
Heard that this is a very popular place in Boston. Went there on a Friday night. The bouncers were rude and racist. Not a ideal place to get inot the weekend mood. With so many places around, would recommend going to the other places around
I cant recomend this place its anti gun
I value my safety
Other places are gun friendly
mauvais accueil ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


I love Greek food and have eaten at many Greek restaurants in Atlanta and New York.  Since I live less than 1.5 miles from this relatively new restaurant,

In [92]:
import pickle

# Serialize the fitted NMF model to `model_file`. The `with` block closes the
# handle on exit, so no explicit close() is needed afterwards.
with open('model_file', 'wb') as model_fh:
    pickle.dump(nmf_model, model_fh)

In [93]:
# Serialize the document-topic matrix to `doc_model`. The `with` block closes
# the handle on exit, so no explicit close() is needed afterwards.
with open('doc_model', 'wb') as doc_fh:
    pickle.dump(doc_topic, doc_fh)