In [30]:
# Write the list of currently installed packages (with versions) to requirements.txt.
# NOTE(review): this cell comes before the `pip install` cell, so on a fresh
# environment the snapshot is taken before those installs run — confirm the order is intended.
!pip freeze > requirements.txt  

In [31]:
!pip install streamlit
!pip install textblob
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
from google.colab import drive

# Mount Google Drive so its content is reachable under /content/drive.
drive.mount('/content/drive')

# Copy the model and vectorizer artifacts from Drive into the current working
# directory; they are opened with pickle further down via MODEL_FILE and
# VECTORIZER_FILE.
! cp -r --verbose '/content/drive/MyDrive/IAA2/NLP/model' .
! cp -r --verbose '/content/drive/MyDrive/IAA2/NLP/vectorizer' .

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
'/content/drive/MyDrive/IAA2/NLP/model' -> './model'
'/content/drive/MyDrive/IAA2/NLP/vectorizer' -> './vectorizer'


In [33]:
# Paths (relative to the working directory) of the artifacts that are loaded
# with pickle in the following cells.
VECTORIZER_FILE = "./vectorizer"
MODEL_FILE = "./model"

In [34]:
import pickle

# Deserialize the topic model from disk.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on artifacts that come from a trusted source.
with open(MODEL_FILE, mode="rb") as f:
  model = pickle.load(f)

# Bare trailing expression: the notebook renders the model's repr as cell output.
model

NMF(n_components=15)

In [35]:
# Deserialize the text vectorizer from disk.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on artifacts that come from a trusted source.
with open(VECTORIZER_FILE, mode="rb") as f:
  vectorizer = pickle.load(f)

# Bare trailing expression: the notebook renders the vectorizer's repr as cell output.
vectorizer

TfidfVectorizer(max_df=0.8, min_df=0.01)

In [36]:
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import contractions
# Fetch the NLTK data packages used by the preprocessing cells below.
# The last call is left as a bare expression, so its return value is what the
# notebook shows as this cell's output.
nltk.download('punkt')  # presumably required by nltk.word_tokenize — verify
nltk.download('averaged_perceptron_tagger')  # presumably required by nltk.pos_tag — verify
nltk.download('wordnet')  # WordNet data, presumably for WordNetLemmatizer — verify
nltk.download('omw-1.4')  # NOTE(review): looks like a WordNet companion corpus — confirm it is needed
nltk.download('stopwords')  # stopword lists read through nltk.corpus.stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [37]:
# Tokenizer that extracts runs of \w characters; everything else acts as a separator.
tokenizer = RegexpTokenizer(r'\w+')


def tokenize_text(text):
    """Return the word tokens of ``text`` joined by single spaces."""
    return " ".join(tokenizer.tokenize(text))

In [38]:
import en_core_web_sm
# spaCy English pipeline loaded with the parser, tagger and NER components
# disabled. It is not referenced by the other cells visible in this notebook.
nlp = en_core_web_sm.load(disable=['parser', 'tagger', 'ner'])

lemmatizer = WordNetLemmatizer()

# First letter of the tag returned by nltk.pos_tag -> part-of-speech code
# passed to the WordNet lemmatizer.
_WORDNET_POS_BY_TAG_INITIAL = {
    'J': 'a',  # adjectives
    'V': 'v',  # verbs
    'N': 'n',  # nouns
    'R': 'r',  # adverbs
}


def lemmatize_text(text):
    """Lemmatize every token of ``text`` using its part-of-speech tag.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Tokens tagged as adjective, verb, noun or adverb are
    lemmatized with the matching WordNet part of speech; any other token is
    lemmatized with the lemmatizer's default setting.

    Returns the lemmas joined by single spaces.
    """
    lemmas = []
    for word, tag in nltk.pos_tag(nltk.word_tokenize(text)):
        wordnet_pos = _WORDNET_POS_BY_TAG_INITIAL.get(tag[:1])
        if wordnet_pos is None:
            # No specific part of speech for this tag: generic lemmatisation.
            lemmas.append(lemmatizer.lemmatize(word))
        else:
            lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))

    return " ".join(lemmas)

  config_value=config["nlp"][key],


In [39]:
def normalize_text(text):
    """Lower-case each whitespace-separated word of ``text`` and re-join the words with single spaces."""
    lowered_words = map(str.lower, text.split())
    return " ".join(lowered_words)

In [40]:
def contraction_text(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

In [41]:
# Words treated as negations, and the prefix glued onto the token that follows one.
negative_words = ['not', 'no', 'never', 'nor', 'hardly', 'barely']
negative_prefix = "NOT_"


def get_negative_token(text):
    """Fold each negation word into the token that follows it.

    ``text`` is split on whitespace. Every token that directly follows a
    negation word (see ``negative_words``) is prefixed with
    ``negative_prefix``, and the negation word itself is dropped. For example
    ``"this is not good"`` becomes ``"this is NOT_good"``.

    A negation word in the last position has no following token and is kept
    unchanged. Matching is case-sensitive.

    Returns the resulting tokens joined by single spaces.
    """
    tokens = text.split()
    negation_lookup = set(negative_words)

    # Positions of negation words that have a following token. Kept as a set
    # so the membership tests below are O(1) instead of a list scan per token.
    negation_positions = {
        position
        for position in range(len(tokens) - 1)
        if tokens[position] in negation_lookup
    }

    merged = []
    for position, token in enumerate(tokens):
        if position in negation_positions:
            # The negation word is removed; it survives only as the prefix of
            # the token that follows it.
            continue
        if position - 1 in negation_positions:
            token = negative_prefix + token
        merged.append(token)

    return " ".join(merged)

In [42]:
from spacy.lang.en.stop_words import STOP_WORDS

def remove_stopwords(text):
    """Drop stopwords from ``text``.

    The stopword collection is the union of NLTK's English list, spaCy's
    ``STOP_WORDS`` and the two extra words "tell" and "restaurant".
    Comparison is exact (case-sensitive) on whitespace-separated words.

    Returns the remaining words joined by single spaces.
    """
    # A set gives O(1) membership tests; the previous concatenated list was
    # scanned linearly for every word of the text.
    english_stopwords = {*stopwords.words("english"), *STOP_WORDS, "tell", "restaurant"}

    return " ".join(word for word in text.split() if word not in english_stopwords)


In [43]:
def preprocess_text(text):
    """Run a raw review through the cleaning steps and return the cleaned string.

    The steps are applied in this order: tokenize_text, lemmatize_text,
    normalize_text, contraction_text, get_negative_token, remove_stopwords.

    NOTE(review): tokenize_text runs first and keeps only ``\\w+`` runs, so
    apostrophes are already gone when contraction_text runs. Confirm this
    order matches the preprocessing used to build the pickled vectorizer
    before changing it.
    """
    steps = (
        tokenize_text,       # keep word tokens only
        lemmatize_text,      # part-of-speech aware lemmatisation
        normalize_text,      # lower-case
        contraction_text,    # expand contractions
        get_negative_token,  # fold negations into prefixed tokens
        remove_stopwords,    # drop stopwords
    )
    for step in steps:
        text = step(text)

    return text

In [44]:
from textblob import TextBlob

# Sample review used to exercise the whole pipeline by hand.
txt = "I will never return here again.  Ever.  I was sitting in my booth waiting for dinner to come out and out scurried a mouse from under my booth and through the dining room.  After immediately getting up to leave, I informed the front desk of this issue and they blew it off as if it was nothing!  They said, oh ya, we have a mouse problem as if it was no big deal.  I will certainly be reporting this to the department of health- that is disgusting!"

# Sentiment polarity is computed on the raw text, before any preprocessing.
blob = TextBlob(txt)
polarity = blob.sentiment.polarity
print(polarity)

# Clean the review and wrap it in a list: the vectorizer receives a single document.
txt = [preprocess_text(txt)]
x = vectorizer.transform(txt)
doc_topic = model.transform(x)
print(doc_topic[0])

# Ascending argsort of the topic weights: the strongest topics sit at the end of the row.
argsort = np.argsort(doc_topic, axis = 1 )
print(argsort)

# Indices of the two highest-weighted topics, strongest first.
index = list(argsort[0][::-1][:2])
print(index)

# Human-readable label for each topic index (position in the list == topic index).
topic_list = [
  'mauvais accueil',
  'pas bon gout',
  'mauvaise pizza',
  'livraison retardée',
  'rapport qualité/prix mauvais',
  'mauvais service',
  'mauvais burger',
  'trop d\'attente',
  'mauvais poulet',
  'mauvaise ambiance au bar',
  'mauvaise 2eme visite',
  'manager rude et arrogant',
  'mauvais sandwich',
  'mauvais sushi',
  'mauvaise experience d\'habitue',
]
topics = [topic_list[ind] for ind in index]
print(topics)

-0.2619047619047619
[0.         0.         0.         0.         0.         0.02447996
 0.         0.02527711 0.         0.0054033  0.00042327 0.01748273
 0.00203393 0.         0.        ]
[[ 0  1  2  3  4  6  8 13 14 10 12  9 11  5  7]]
[7, 5]
["trop d'attente", 'mauvais service']


  "X does not have valid feature names, but"


In [45]:
from textblob import TextBlob
import pandas as pd

def fonction_prediction(model, vectorizer, n_topics, text):
  """Return the sentiment polarity of a review and the labels of its strongest topics.

  Args:
    model: object whose ``transform`` maps the vectorized text to one row of
      topic weights.
    vectorizer: object whose ``transform`` turns a list of cleaned texts into
      the model's input.
    n_topics: number of topic labels to return. A value larger than the
      number of topics produced by ``model`` is capped to that number; a
      value <= 0 yields an empty list.
    text: raw review text.

  Returns:
    A ``(polarity, topics)`` tuple: the TextBlob polarity of the raw text and
    the list of topic labels ordered from strongest to weakest.
  """
  # Label for each topic index (position in the list == topic index).
  topic_list = ['qualité de l\'accueil','goût','qualité des pizzas','livraison en retard','rapport qualité/prix mauvais','qualité du service','qualité des burgers','trop d\'attente','qualité du poulet','ambiance au bar','2ème visite','staff/management','qualité des sandwichs','qualité des sushis','mauvaise expérience d\'habitués']

  # Polarity is measured on the raw text, before preprocessing.
  polarity = TextBlob(text).sentiment.polarity

  # The vectorizer receives a single cleaned document.
  X = vectorizer.transform([preprocess_text(text)])
  doc_topic = model.transform(X)

  # Topic indices of that single document, strongest first.
  ranked = np.argsort(doc_topic, axis=1)[0][::-1]

  # Cap the request. The previous index arithmetic (len - (i + 1)) went
  # negative once n_topics exceeded the number of topics and wrapped around,
  # returning duplicate labels and eventually raising IndexError.
  n_selected = max(0, min(n_topics, len(ranked)))

  topics = [topic_list[ind] for ind in ranked[:n_selected]]
  return polarity,topics





In [46]:
# Longer sample review used to exercise fonction_prediction end to end.
txt = "First time in town not knowing where to go and not wanting fast food, based on proximity and reviews, opted for here. Happy hour rita...not too bad for shelf.  Wanted small app while I decided so chose to go with small guac...not so good, seems like an over processed version  and a bit generic, definitely not fresh.  Didn't get plate or napkins either.   I asked the waiter if I should go with garlic or fajita shrimp...he suggested fajita so ok...plate of rice and beans came out as hot to the touch as the cast iron pan...rice was crunchy and burnt on top. Waiter brought me another plate of rice, he didn't need to bother, flavorless.  Beans...bland.  Shrimp and veg not flavorful at all and shrimp was overdone.  I had to ask for plate and silverware. ..guess it wasn't that important.  SItting in the bar area is definitely at your own risk,  as is anywhere,  I  learned a lot about the national  champions tonite thanks to the loud talker at the bar...who wasn't this loud 30 mins ago lol.  Bottom line this is a half step above the   chain of many across the street.  Authentic is found at origin or neighborhoods purely ethnic...this place doesn't nearly deserve the praise and appreciation it gets from locals let along anyone...including the chihuahuas. Thank God for happy hour"

# Polarity of the raw text, printed on its own for comparison with the tuple below.
blob = TextBlob(txt)
polarity = blob.sentiment.polarity
print(polarity)

# Polarity plus the four strongest topic labels for the same review.
salut = fonction_prediction(model, vectorizer, n_topics=4, text=txt)
print(salut)

0.21990476190476194
(0.21990476190476194, ['pas bon gout', 'mauvaise ambiance au bar', 'mauvais service', "trop d'attente"])


  "X does not have valid feature names, but"
