In [None]:
import pandas as pd
import gensim
import warnings

In [None]:
# Never truncate cell contents when a frame is displayed, so that long
# reviews can be read in full.
pd.options.display.max_colwidth = None

In [None]:
# Tab-separated review dataset, looked up relative to the working directory.
REVIEWS_PATH = "Restaurant_Reviews.tsv"
df = pd.read_table(REVIEWS_PATH)

In [None]:
# Preview the first rows to check that the review text and the Liked label loaded.
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1
4,The selection on the menu was great and so were the prices.,1


In [None]:
import locale
def getpreferredencoding(do_setlocale: bool = True) -> str:
    """Replacement for ``locale.getpreferredencoding`` that always reports UTF-8.

    ``do_setlocale`` is accepted only so the signature matches the stdlib
    function; its value is ignored.
    """
    return "UTF-8"
# Monkeypatch the stdlib: from here on, every caller of
# locale.getpreferredencoding() in this kernel gets "UTF-8".
# NOTE(review): presumably a workaround for a hosted runtime whose locale is
# not UTF-8 (which breaks `!` shell commands) - confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding

# Data Preparation

In [None]:
!pip install contractions
import contractions
import nltk
import re
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

def fix_contractions(text):
    """Run ``contractions.fix`` on every whitespace-separated word of ``text``.

    The processed words are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space and leading/trailing
    whitespace disappears.
    """
    return ' '.join(contractions.fix(token) for token in text.split())

def clean(text):
    """Reduce ``text`` to lower-case ASCII letters and digits separated by single spaces.

    Steps, in order:
      1. every character that is neither a word character nor whitespace
         (i.e. punctuation) is replaced by a space;
      2. anything still not an ASCII letter, digit or whitespace - underscores,
         accented letters, ... - is deleted outright (no space inserted);
      3. each run of whitespace is squeezed to one space;
      4. the result is lower-cased.

    Leading/trailing whitespace is not stripped, so the result may begin or end
    with a single space.
    """
    punctuation_spaced = re.sub(r'[^\w\s]', ' ', text)
    ascii_alnum_only = re.sub(r"[^\sA-Za-z0-9]", "", punctuation_spaced)
    single_spaced = re.sub(r"\s+", " ", ascii_alnum_only)
    return single_spaced.lower()

def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

# NLTK's English stop words, minus the negations: "not" and "no" are kept in
# the text, so they must not be filtered out as stop words.
STOP_WORDS = set(stopwords.words("english"))
for negation in ("not", "no"):
    STOP_WORDS.remove(negation)

def filter_stopwords(tokenized_text):
    """Return the tokens that are not in ``STOP_WORDS``, in their original order."""
    return [token for token in tokenized_text if token not in STOP_WORDS]

def lemmatize(filtered_text):
    """Lemmatize every token with a ``WordNetLemmatizer`` and return the results as a list.

    Only the token is passed to ``lemmatize``, so the lemmatizer's default
    part-of-speech setting applies to every word.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in filtered_text]

# def lemmatize_custom(sentence):
#     return " ".join(list(map(WordNetLemmatizer().lemmatize, filter_stopword(word_tokenize(sentence)))))

def stemming(lemmatized_text):
    """Stem every token with the English Snowball stemmer and join the stems with spaces.

    Note that this returns a single string, not a list of tokens.
    """
    stemmer = SnowballStemmer("english")
    stems = [stemmer.stem(token) for token in lemmatized_text]
    return " ".join(stems)

def preprocess(text):
    """Turn one raw review into a space-joined string of lemmatized tokens.

    The steps run in a fixed order: contractions are expanded first, then
    ``clean`` normalises the text, after which it is tokenized, stripped of
    stop words and lemmatized. ``stemming`` is not part of this pipeline.
    """
    expanded = fix_contractions(text)
    normalised = clean(expanded)
    tokens = tokenize(normalised)
    content_tokens = filter_stopwords(tokens)
    lemmas = lemmatize(content_tokens)
    return " ".join(lemmas)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.5/104.5 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.5/287.5 KB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Run every raw review through preprocess(); the new column holds the
# space-joined lemmas that the topic model is built from.
df["cleaned_review"] = df["Review"].apply(preprocess)

In [None]:
# Split the reviews by label. Boolean indexing yields a slice of df, and adding
# columns to such a slice raises SettingWithCopyWarning; .copy() makes each
# subset an independent frame that can be extended safely.
positive_df = df[df["Liked"] == 1].copy()
negative_df = df[df["Liked"] == 0].copy()

# Model

## LDA

In [None]:
#Source: https://github.com/marcmuon/nlp_yelp_review_unsupervised/tree/master/notebooks

def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is converted with ``str`` and tokenized by
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens

def bigrams(words, bi_min = 15):
    """Fit a gensim ``Phrases`` model on ``words`` and return it wrapped in a ``Phraser``.

    ``bi_min`` is forwarded to ``Phrases`` as ``min_count``.
    """
    phrase_model = gensim.models.Phrases(words, min_count = bi_min)
    return gensim.models.phrases.Phraser(phrase_model)

def get_corpus(df, column):
    """Build the topic-model inputs from one text column of ``df``.

    Returns a tuple ``(corpus, id2word, bigram)``:
      corpus  - one bag-of-words vector (``doc2bow`` output) per row;
      id2word - the gensim ``Dictionary`` built from the documents, after
                ``filter_extremes(no_below = 10)`` and ``compactify()``;
      bigram  - the per-row token lists after the bigram model was applied.
    """
    tokenized_docs = list(sent_to_words(df[column]))
    phraser = bigrams(tokenized_docs)
    bigram = [phraser[doc] for doc in tokenized_docs]

    id2word = gensim.corpora.Dictionary(bigram)
    # Prune the vocabulary in place, then close the gaps left in the id space.
    id2word.filter_extremes(no_below = 10)
    id2word.compactify()

    corpus = [id2word.doc2bow(doc) for doc in bigram]
    return corpus, id2word, bigram

In [None]:
# Build a separate bag-of-words corpus, dictionary and bigram-processed token
# lists for each sentiment class, so that each class gets its own topic model.
positive_corpus, positive_id2word, positive_bigram = get_corpus(positive_df, "cleaned_review")
negative_corpus, negative_id2word, negative_bigram = get_corpus(negative_df, "cleaned_review")

In [None]:
import logging

# Number of topics fitted for each sentiment class.
NUM_TOPICS = 8
# Fixed seed for the model's random initialisation. Without it the topic ids
# change from run to run, which invalidates any cell that refers to a topic by
# number.
LDA_SEED = 42


def train_lda(corpus, id2word, num_topics = NUM_TOPICS, random_state = LDA_SEED):
    """Fit an ``LdaMulticore`` topic model on a bag-of-words corpus.

    Parameters
    ----------
    corpus : list of bag-of-words vectors, as produced by ``Dictionary.doc2bow``.
    id2word : the gensim ``Dictionary`` the corpus was built with.
    num_topics : number of topics to fit.
    random_state : seed forwarded to gensim so the fit is repeatable.

    Returns
    -------
    The trained ``LdaMulticore`` model.

    Warnings raised while training are suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        return gensim.models.ldamulticore.LdaMulticore(
            corpus = corpus,
            num_topics = num_topics,
            id2word = id2word,
            per_word_topics = True,
            random_state = random_state)


lda_positive = train_lda(positive_corpus, positive_id2word)
lda_negative = train_lda(negative_corpus, negative_id2word)



In [None]:
# Show the 15 highest-weighted words of each positive-review topic.
# NOTE(review): 10 topics are requested but the model was trained with
# num_topics = 8 - presumably gensim just returns all 8; confirm.
lda_positive.print_topics(10, num_words = 15)

[(0,
  '0.171*"great" + 0.080*"food" + 0.070*"place" + 0.070*"friendly" + 0.070*"restaurant" + 0.059*"service" + 0.043*"staff" + 0.043*"really" + 0.038*"nice" + 0.027*"good" + 0.027*"always" + 0.027*"one" + 0.022*"not" + 0.022*"go" + 0.022*"love"'),
 (1,
  '0.241*"good" + 0.190*"food" + 0.089*"service" + 0.045*"delicious" + 0.045*"like" + 0.039*"love" + 0.032*"pretty" + 0.026*"place" + 0.026*"friendly" + 0.026*"even" + 0.020*"time" + 0.020*"experience" + 0.013*"great" + 0.013*"nice" + 0.013*"best"'),
 (2,
  '0.099*"good" + 0.086*"service" + 0.067*"great" + 0.060*"amazing" + 0.053*"nice" + 0.047*"food" + 0.047*"not" + 0.047*"fresh" + 0.040*"also" + 0.034*"pizza" + 0.034*"price" + 0.034*"fantastic" + 0.027*"menu" + 0.027*"atmosphere" + 0.027*"go"'),
 (3,
  '0.208*"great" + 0.076*"place" + 0.076*"made" + 0.058*"not" + 0.058*"pizza" + 0.048*"service" + 0.048*"like" + 0.039*"price" + 0.029*"really" + 0.029*"time" + 0.029*"server" + 0.029*"experience" + 0.020*"good" + 0.020*"food" + 0.020*"l

In [None]:
# Show the 15 highest-weighted words of each negative-review topic.
# NOTE(review): 10 topics are requested but the model was trained with
# num_topics = 8 - presumably gensim just returns all 8; confirm.
lda_negative.print_topics(10, num_words = 15)

[(0,
  '0.254*"not" + 0.098*"ever" + 0.074*"minute" + 0.074*"no" + 0.050*"food" + 0.050*"worst" + 0.049*"get" + 0.038*"never" + 0.038*"would" + 0.038*"got" + 0.038*"restaurant" + 0.038*"bland" + 0.026*"like" + 0.026*"one" + 0.025*"disappointed"'),
 (1,
  '0.287*"not" + 0.128*"back" + 0.090*"really" + 0.052*"table" + 0.040*"food" + 0.040*"one" + 0.040*"get" + 0.040*"going" + 0.027*"place" + 0.027*"ever" + 0.027*"go" + 0.027*"restaurant" + 0.021*"much" + 0.021*"like" + 0.014*"service"'),
 (2,
  '0.135*"came" + 0.135*"bland" + 0.091*"food" + 0.069*"not" + 0.069*"good" + 0.047*"service" + 0.047*"like" + 0.047*"bad" + 0.047*"minute" + 0.025*"back" + 0.025*"time" + 0.025*"would" + 0.025*"one" + 0.025*"never" + 0.025*"table"'),
 (3,
  '0.123*"worst" + 0.108*"service" + 0.092*"like" + 0.078*"food" + 0.077*"bad" + 0.071*"go" + 0.062*"eat" + 0.047*"no" + 0.047*"one" + 0.039*"back" + 0.039*"ever" + 0.032*"not" + 0.032*"would" + 0.032*"slow" + 0.024*"never"'),
 (4,
  '0.228*"not" + 0.196*"food" + 

In [None]:
# A review is only assigned to its strongest topic when that topic's
# probability exceeds this threshold (about one third).
MIN_TOPIC_PROB = 0.33333335
# Topic id recorded for reviews without a sufficiently dominant topic. It must
# not be a real topic id: the model's topics are numbered from 0, so the
# previous fallback of 4 made unassigned reviews indistinguishable from
# genuine members of topic 4.
NO_TOPIC = -1

# Strongest (topic_id, probability) pair of every positive review.
# minimum_probability = 0.0 keeps all topics in the result, so there is always
# a pair to pick. The sort is stable, so exact ties resolve to the last-listed
# topic.
positive_topic_vec = [
    sorted(
        lda_positive.get_document_topics(bow, minimum_probability = 0.0),
        key = lambda pair: pair[1],
    )[-1]
    for bow in positive_corpus
]

# Replace weak assignments by the sentinel with probability 0.
positive_topic_set = [
    pair if pair[1] > MIN_TOPIC_PROB else (NO_TOPIC, 0)
    for pair in positive_topic_vec
]
positive_topic = [topic_id for topic_id, _ in positive_topic_set]
positive_topic_values = [prob for _, prob in positive_topic_set]

In [None]:
# assign() returns a new frame with the two columns added, instead of writing
# into a frame that was sliced from df (which raises SettingWithCopyWarning).
# Re-running the cell simply overwrites the columns with the same values.
positive_df = positive_df.assign(
    topic = positive_topic,
    topic_prob = positive_topic_values,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_df["topic"] = positive_topic
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_df["topic_prob"] = positive_topic_values


In [None]:
# Show a short preview instead of rendering the whole frame.
positive_df.head(10)

Unnamed: 0,Review,Liked,cleaned_review,topic,topic_prob
0,Wow... Loved this place.,1,wow loved place,7,0.708167
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1,stopped late may bank holiday rick steve recommendation loved,2,0.561919
4,The selection on the menu was great and so were the prices.,1,selection menu great price,3,0.780826
8,The fries were great too.,1,fry great,3,0.562255
9,A great touch.,1,great touch,3,0.562239
...,...,...,...,...,...
899,"Overall, a great experience.",1,overall great experience,3,0.708163
901,Their regular toasted bread was equally satisfying with the occasional pats of butter... Mmmm...!,1,regular toasted bread equally satisfying occasional pat butter mmmm,4,0.000000
907,The chips and sals a here is amazing!!!!!!!!!!!!!!!!!!!,1,chip sals amazing,2,0.562127
909,This is my new fav Vegas buffet spot.,1,new fav vega buffet spot,6,0.562434
