In [1]:
import pandas as pd
import sqlite3
import os
import numpy as np
import jinja2
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from itertools import chain
from sspipe import p, px
import warnings

from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import spacy

warnings.filterwarnings("ignore")

  from imp import reload


In [2]:
# Connection to the local SQLite file; it is kept in `db` and not closed here.
db = sqlite3.connect('database.db')

# Selects every column for Chipotle reviews rated two stars or fewer.
# NOTE(review): the table name is spelled "resturants_review" — presumably it
# matches the table as created in the database; confirm before "fixing" it.
chipotle_negative_reviews_query = """
SELECT * FROM resturants_review
WHERE name = 'Chipotle Mexican Grill'
and stars <= 2
"""

chipotle_negative_reviews = pd.read_sql(sql=chipotle_negative_reviews_query, con=db)
# Show five randomly chosen rows as a quick sanity check (unseeded sample).
chipotle_negative_reviews.sample(n=5)

Unnamed: 0,business_id,review_id,date,stars,text,name,tokenized
1312,BvomXXWqtKSkEfS037gcfw,hGoABk5EG9-H05DOhe_Crg,2021-09-18 20:36:50,1,This Chipotle is trifling and dirty. They're a...,Chipotle Mexican Grill,thi chipotl trifl dirti theyr alway one two to...
326,iAP8eZ847uCHLS-r81jUFg,nk-i5xUu7LRHPFIQPQPcbQ,2021-11-04 03:45:22,1,Worst Chipotle I have ever been to. Countless ...,Chipotle Mexican Grill,worst chipotl ever countless time ingredi dure...
4551,dcq-bXnuPzeauvI8x-AMEQ,9UZrZfO_NMb_WmYQzbOZ9g,2021-10-07 02:35:18,1,What is happening at Chipotle? The quality of...,Chipotle Mexican Grill,happen chipotl qualiti food prepar go tube mea...
1106,ZnY77IphK8pTi_jPvSAA9A,8_QdTa-KK0eALaYkkhWRYw,2021-03-12 01:01:19,2,Whoever is packing mobile orders is NOT doing ...,Chipotle Mexican Grill,whoever pack mobil order good job order twice ...
2047,Zf3tMqJQOflOY7yPj5SJKw,rUH4_leXpqcA2BpIAQC8sQ,2015-02-11 18:50:36,1,I was sooo excited when I heard that Chipotle ...,Chipotle Mexican Grill,wa sooo excit heard chipotl wa come downtown i...


In [3]:
def set_phase(x):
    """Label a review date with its phase relative to the outbreak window.

    Parameters
    ----------
    x : datetime.date, datetime.datetime, pandas.Timestamp, str or None
        The review date. Anything ``pd.to_datetime`` can parse is accepted.

    Returns
    -------
    str or None
        ``'before_breakout'`` for [2014-07-01, 2015-07-01),
        ``'during_breakout'`` for [2015-07-01, 2016-07-01),
        ``'after_breakout'``  for [2016-07-01, 2017-07-01),
        and ``None`` for dates outside those windows or for missing values.
    """
    # Normalize to a Timestamp first: comparing a plain ``datetime.date`` with a
    # ``pd.Timestamp`` is deprecated in pandas 1.x and raises TypeError in 2.x.
    moment = pd.to_datetime(x)
    if moment is None or moment is pd.NaT:
        return None

    # Half-open windows [start, end): each boundary day belongs to the later phase.
    windows = (
        ('before_breakout', '2014-07-01', '2015-07-01'),
        ('during_breakout', '2015-07-01', '2016-07-01'),
        ('after_breakout', '2016-07-01', '2017-07-01'),
    )
    for label, start, end in windows:
        if pd.Timestamp(start) <= moment < pd.Timestamp(end):
            return label
    return None

# Parse each stored timestamp and keep only its calendar date (time of day dropped).
chipotle_negative_reviews['date'] = chipotle_negative_reviews['date'].apply(
    lambda raw_timestamp: pd.to_datetime(raw_timestamp).date()
)
# Tag every review with its outbreak phase; dates outside the windows get None.
chipotle_negative_reviews['phase'] = chipotle_negative_reviews['date'].apply(set_phase)
chipotle_negative_reviews.head()

Unnamed: 0,business_id,review_id,date,stars,text,name,tokenized,phase
0,L7i_5DydYEKwPLfcDBRYDA,HdTUAwNaPQ_vzMiwVdsmiA,2016-10-26,1,WORST location out there. I eat Chipotle 4+ t...,Chipotle Mexican Grill,worst locat eat chipotl 4 time week everi time...,after_breakout
1,tNOLXgYTykXmLaAZnvo1vg,4zmPh5c6m6u0M_zUtxsykg,2017-11-19,1,My family and I love Chipotle! We choose thi...,Chipotle Mexican Grill,famili love chipotl choos thi chain often cons...,
2,bKgyXSQUGA8IHGt8ne5NuA,nS_PY3gWNJnwldVXyyrE-A,2016-09-06,1,"We received a coupon for a buy one-get-one, so...",Chipotle Mexican Grill,receiv coupon buy onegeton decid tri thi place...,after_breakout
3,F-eHPbdh9bl8aeYDRws4BQ,ImsibThFiiO-Ed1KYh4sEA,2015-01-19,1,Just left there...horrible service and miserab...,Chipotle Mexican Grill,left therehorr servic miser employe never go m...,before_breakout
4,2VvIvL-Dyp6QTk0-KfYeDg,8eNci67vFhNUMZQnXG_kYw,2017-12-16,2,Maybe this place is still new.. I don't know.....,Chipotle Mexican Grill,mayb thi place still new dont know ill give mo...,


In [11]:
def topic_contains_food_related_words(words, topic_terms):
    """Return True when every entry of ``words`` is found in ``topic_terms``.

    Membership is tested with ``in`` for each word in turn; an empty ``words``
    yields True.
    """
    for candidate in words:
        # The first missing word settles the answer.
        if candidate not in topic_terms:
            return False
    return True

def create_model(n_components, df, count_text_vectors):
    """Fit and return an LDA model with ``n_components`` topics.

    ``count_text_vectors`` is the document-term matrix the model is fitted on.
    ``df`` is accepted but not used by this function. The model is built with
    ``random_state=1`` and ``n_jobs=-1``.
    """
    # ``fit`` returns the estimator itself, so the fitted model can be returned directly.
    return LatentDirichletAllocation(
        n_components=n_components,
        random_state=1,
        n_jobs=-1,
    ).fit(count_text_vectors)
    
   
def get_number_of_topics(df, food_related_sickness, min_components=5, max_components=104):
    """Find the smallest LDA topic count whose term table contains all target words.

    The ``tokenized`` column of ``df`` is vectorized once with ``CountVectorizer``.
    Then, for each topic count from ``min_components`` to ``max_components``
    (inclusive), an LDA model is fitted, prepared with pyLDAvis, and the ``Term``
    column of the resulting ``topic_info`` table is checked for every word in
    ``food_related_sickness``.

    Note that the check is made against the whole ``Term`` column flattened into
    one list, i.e. the words may come from different rows of ``topic_info``;
    it does not require them to appear together for a single topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``tokenized`` column of text documents.
    food_related_sickness : iterable of str
        Words that must all be present in the term table.
    min_components, max_components : int, optional
        Inclusive bounds of the topic counts to try (defaults: 5 and 104).

    Returns
    -------
    int or None
        The first topic count for which all words are present, or ``None`` when
        no count in the range qualifies. A message is printed in both cases.
    """
    count_text_vectorizer = CountVectorizer()
    count_text_vectors = count_text_vectorizer.fit_transform(df['tokenized'])

    for n_components in range(min_components, max_components + 1):
        lda_para_model = create_model(n_components, df, count_text_vectors)
        lda_display = pyLDAvis.sklearn.prepare(lda_para_model, count_text_vectors, count_text_vectorizer)
        topic_terms = lda_display.topic_info.Term.tolist()

        if topic_contains_food_related_words(food_related_sickness, topic_terms):
            print("Found all terms when topic's n_components is {}".format(n_components))
            # Return the match itself; stopping here keeps the search minimal.
            return n_components

    # Reached only when every candidate topic count has been tried without success.
    # The 1-tuple keeps %-formatting correct even if a tuple of words is passed.
    print("No n_components found where topic set will contain %s" % (food_related_sickness,))
    return None

In [None]:
# Words that must all show up in the LDA term table for a topic count to qualify.
food_related_sickness = ['food', 'poison', 'sick', 'stomach']

# Run the topic-count search separately on the reviews of each outbreak phase.
for phase in ('before_breakout', 'during_breakout', 'after_breakout'):
    df = chipotle_negative_reviews[chipotle_negative_reviews['phase'] == phase]
    print('\nPhase: {}'.format(phase))
    get_number_of_topics(df, food_related_sickness)


Phase: before_breakout
Found all terms when topic's n_components is 24

Phase: during_breakout
Found all terms when topic's n_components is 14

Phase: after_breakout
Found all terms when topic's n_components is 12


In [15]:
# Derive the calendar year of each review from its date.
chipotle_negative_reviews['year'] = chipotle_negative_reviews['date'].pipe(pd.to_datetime).dt.year
# List the distinct years present, in order of first appearance.
chipotle_negative_reviews['year'].unique()

array([2016, 2017, 2015, 2018, 2012, 2014, 2013, 2008, 2010, 2019, 2011,
       2020, 2021, 2022, 2009, 2007], dtype=int64)

In [None]:
# Maps each year to the value returned by the topic-count search for that year.
topic_year_n = {}
for year in range(2013, 2022):
    # Restrict to the reviews written in this year (2013 through 2021 inclusive).
    df = chipotle_negative_reviews[chipotle_negative_reviews['year'] == year]
    print('\nYear: {}'.format(year))
    n = get_number_of_topics(df, food_related_sickness)
    topic_year_n[year] = n


Year: 2013
Found all terms when topic's n_components is 10

Year: 2014
Found all terms when topic's n_components is 9

Year: 2015
Found all terms when topic's n_components is 12

Year: 2016
Found all terms when topic's n_components is 23

Year: 2017
Found all terms when topic's n_components is 38

Year: 2018
Found all terms when topic's n_components is 13

Year: 2019
Found all terms when topic's n_components is 30

Year: 2020
Found all terms when topic's n_components is 33

Year: 2021
Found all terms when topic's n_components is 34
