# Part A

## Function prep: create count vectorizer

In [1]:
import pandas as pd
import re
import numpy as np
from collections import Counter
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
# NLTK resources this notebook relies on:
#   punkt                       -> nltk.word_tokenize
#   stopwords                   -> stopwords.words("english")
#   averaged_perceptron_tagger  -> nltk.pos_tag (used by lemmatize_sentence)
#   wordnet                     -> WordNetLemmatizer (used by lemmatize_sentence)
# Without the last two, lemmatization raises LookupError on a fresh environment.
# NOTE(review): some NLTK releases use differently named resources (e.g. 'punkt_tab',
# 'averaged_perceptron_tagger_eng', 'omw-1.4') -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/sherryliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sherryliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def get_vec(words, stop=None):
    '''Build a bag-of-words count matrix for a list of documents.

    Parameters
    ----------
    words : iterable of str
        The raw documents (one review per element).
    stop : str, list or None
        Forwarded to CountVectorizer(stop_words=...).

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per kept term; cells are term counts.
        The shape of the matrix is printed as a side effect.
    '''
    # min_df=0.001 keeps only terms that appear in at least 0.1% of the documents
    vectorizer = CountVectorizer(stop_words=stop, lowercase=True, min_df=0.001)
    counts = vectorizer.fit_transform(words).toarray()
    print(counts.shape)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer get_feature_names_out() and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature = vectorizer.get_feature_names_out()
    else:
        feature = vectorizer.get_feature_names()
    corpus_df = pd.DataFrame(counts, columns=feature)
    return corpus_df

## Pre-processing: observe potential stopwords and decide regex cleaning target

In [3]:
# Read all lines of the good-review file into a list (trailing newlines are kept).
# The context manager closes the file handle as soon as the lines are read, and
# good_review is only ever bound to the list, never to the file object.
with open('g_amazon.txt', 'r') as good_review_file:
    good_review = good_review_file.readlines()

In [5]:
# Read all lines of the poor-review file into a list (trailing newlines are kept).
# The context manager closes the file handle as soon as the lines are read, and
# poor_review is only ever bound to the list, never to the file object.
with open('p_amazon.txt', 'r') as poor_review_file:
    poor_review = poor_review_file.readlines()

In [6]:
# Count terms in the good reviews using scikit-learn's built-in English stop words,
# then display the 50 terms with the highest total count.
good_review_vec = get_vec(good_review, stop='english')
good_review_vec.sum().sort_values(ascending=False).head(50)

(102217, 1531)


great        27723
br           22019
love         17024
loves        13564
old          11165
fun          11123
good         10534
just         10126
little        9499
game          9498
like          9163
year          8834
kids          8556
loved         8250
34            8163
perfect       7934
really        7486
product       7240
play          7112
son           7089
daughter      7002
quality       6959
time          6937
toy           6832
cute          6635
nice          6185
bought        6147
set           6032
easy          5631
price         5456
got           5122
gift          5057
use           4783
awesome       4242
grandson      3993
birthday      3935
recommend     3672
buy           3601
happy         3478
make          3398
playing       3385
size          3336
excellent     3211
fast          3106
best          3009
came          2994
small         2980
box           2960
cards         2940
don           2929
dtype: int64

In [7]:
# Count terms in the poor reviews using scikit-learn's built-in English stop words,
# then display the 50 terms with the highest total count.
poor_review_vec = get_vec(poor_review, stop='english')
poor_review_vec.sum().sort_values(ascending=False).head(50)

(12700, 1875)


br              2149
34              1606
like            1572
product         1572
just            1466
money           1405
disappointed    1306
work            1260
don             1252
buy             1218
got             1082
did             1053
quality         1016
didn            1014
bought          1009
broke            955
time             953
cheap            909
box              905
waste            877
small            861
use              859
item             807
received         800
toy              782
old              774
really           760
good             739
came             705
plastic          664
does             660
doesn            658
son              653
pieces           637
return           627
way              614
water            586
year             580
kids             574
broken           565
used             563
worth            548
picture          534
ordered          533
day              529
thing            499
make             498
poor         

In [8]:
poor_review_vec[poor_review_vec['don'] > 0]

Unnamed: 0,00,10,100,1000,11,12,13,14,15,150,...,years,yellow,yes,yesterday,young,younger,youtube,yr,zero,zipper
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
40,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12666,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12667,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
poor_review[14]

'"Sadness! My daughter used her own money that she worked hard to earn on these bubbles that she has been seeing on tv, and they don\'t work. We can barely get any bubbles to leave the blower and when we do they pop instantly! Very disappointed. Wish she could get her money back to buy a toy without false advertising. Don\'t buy! Zero stars!"\n'

Looking at the count vectorizer results without any word processing, we can see several things that can be refined:

1. "br" should be removed and added into stopwords list.
2. "34" and other numbers should be excluded by applying regex rule in count vectorizer.
3. "year" and "old" can be interpreted together as "_year_old_". In addition, different ways of writing it should be taken into account (years-old, year-old, yr old...).
4. "child" and "kid" have the same meaning and they can be interpreted together as "_kid_" (also for son, daughter, grandson, granddaughter).
5. "birthday" and "bday" have the same meaning and they can be interpreted together as "_birthday_".
6. We can observe that "work", "don", "didn" appear in the poor review count vectorizer. They can be interpreted together as "_broken_" since reviewers usually write "don't work" in the reviews.
7. "buy", "order", "purchase" have the same meaning and they can be interpreted together as "_buy_".

## Revise count vectorizer & Lemmatization & Regex cleaning

I chose lemmatization because lemmas are easier to interpret than stems when looking at word counts. Stems are harder to interpret because you have to go back to the original text to figure out which exact words were used. In contrast, a lemma is the base or dictionary form of a word, which is more intuitive.

In [32]:
# NLTK's English stop words plus 'br', which showed up as a top term in the raw counts
# (presumably left over from HTML line breaks in the review text).
stopword_list = stopwords.words("english") + ['br']

def get_vec(words, stop=stopword_list):
    '''Build a bag-of-words count matrix with custom stop words and a letters-only token pattern.

    This redefinition replaces the earlier get_vec in the notebook: stop words now
    default to stopword_list, and tokens must be at least three characters long and
    consist only of ASCII letters or underscores (so numbers such as "34" are dropped
    while placeholder tokens such as "_kid_" survive).

    Parameters
    ----------
    words : iterable of str
        The documents to vectorize.
    stop : str, list or None
        Forwarded to CountVectorizer(stop_words=...).

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per kept term; cells are term counts.
        The shape of the matrix is printed as a side effect.
    '''
    # min_df=0.001 keeps only terms that appear in at least 0.1% of the documents
    vectorizer = CountVectorizer(
        stop_words=stop,
        lowercase=True,
        min_df=0.001,
        token_pattern=r'[a-zA-Z\_]{3,}',
    )
    counts = vectorizer.fit_transform(words).toarray()
    print(counts.shape)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer get_feature_names_out() and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature = vectorizer.get_feature_names_out()
    else:
        feature = vectorizer.get_feature_names()
    corpus_df = pd.DataFrame(counts, columns=feature)
    return corpus_df

In [13]:
## Reference: https://gist.github.com/gaurav5430/9fce93759eb2f6b1697883c3782f30de#file-nltk-lemmatize-sentences-py
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    '''Translate an NLTK POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; any other tag yields None.
    '''
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None

def lemmatize_sentence(sentence):
    '''Lemmatize one sentence using POS-aware WordNet lemmatization.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps to a
    WordNet POS are lemmatized with that POS; all other tokens are kept unchanged.
    The resulting tokens are joined with single spaces.
    '''
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, treebank_tag in tagged_tokens:
        wordnet_pos = nltk_tag_to_wordnet_tag(treebank_tag)
        # No WordNet equivalent for this tag: keep the token exactly as it is
        lemmas.append(token if wordnet_pos is None else lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

def lem(sentence):
    '''Lemmatize every review in a list (with a tqdm progress bar) and return the results as a new list.'''
    return [lemmatize_sentence(review) for review in tqdm(sentence)]

In [25]:
def word_replace_pre_lem(line):
    '''Before lemmatization, collapse "break"/"broke(n)" forms and "don't/doesn't/didn't work" into _broken_.

    Matching is case-sensitive, so the caller is expected to pass lower-cased text.
    '''
    broken_pattern = r'\b((breaks?(ing)?)|(broken?)|(((do(es)?)|(did))n\'t work))\b'
    return re.sub(broken_pattern, '_broken_', line)

In [26]:
def word_replace_post_lem(line):
    '''After lemmatization, normalize synonyms into _year_old_, _kid_, _birthday_ and _buy_.

    The substitutions are applied in the order listed below.
    '''
    substitutions = (
        (r'\b([0-9]+-?)?y(ea)?rs?( |-)old\b', '_year_old_'),
        (r'\b((child)|(kid)|((grand)?(son|daughter)))\b', '_kid_'),
        (r'\bb(irth)?day\b', '_birthday_'),
        (r'\b((buy)|(order)|(purchase))\b', '_buy_'),
    )
    for pattern, placeholder in substitutions:
        line = re.sub(pattern, placeholder, line)
    return line

## Data processing

In [30]:
# Pool the reviews (good reviews first, then poor reviews), lower-case each one and
# apply the pre-lemmatization regex replacement.
review_all = good_review + poor_review
review_all_pre_lem = [word_replace_pre_lem(text.lower()) for text in review_all]

In [31]:
review_all_lem = lem(review_all_pre_lem)

100%|██████████| 114917/114917 [02:06<00:00, 911.50it/s] 


In [34]:
review_all_done = [word_replace_post_lem(review) for review in review_all_lem]

In [36]:
# Vectorize the fully cleaned reviews with the default stop-word list and display
# the 50 terms with the highest total count.
review_all_vec = get_vec(review_all_done)
review_all_vec.sum().sort_values(ascending=False).head(50)

(114917, 1418)


love          39408
_kid_         39344
great         28292
_buy_         21606
get           16789
one           15728
play          13351
make          13197
good          13090
like          12582
game          11559
fun           11384
well          11061
use           10995
toy           10233
_year_old_    10127
little        10008
product        9473
time           9181
would          9134
look           9029
come           8447
really         8246
work           8040
perfect        7994
quality        7991
set            7728
cute           6893
nice           6416
easy           6365
price          6329
much           5986
gift           5946
take           5537
even           5414
card           5299
also           5213
put            5185
piece          4963
small          4939
want           4814
color          4736
_birthday_     4635
lot            4538
recommend      4533
thing          4509
think          4351
awesome        4287
figure         4250
box            4202


In [37]:
review_all_vec

Unnamed: 0,_birthday_,_broken_,_buy_,_kid_,_year_old_,aaa,ability,able,absolute,absolutely,...,year,yellow,yes,yesterday,yet,young,youtube,zero,zip,zipper
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114912,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
114913,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
114914,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
114915,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Part B

## data prep

In [129]:
df = pd.read_csv('mcdonalds.csv')

In [130]:
def vec(words, stop=None):
    '''Build a bag-of-words count matrix using a letters-only token pattern.

    Tokens must be at least three ASCII letters long. Unlike get_vec, no min_df
    filter is applied, so every token that occurs at least once becomes a column.

    Parameters
    ----------
    words : iterable of str
        The documents to vectorize.
    stop : str, list or None
        Forwarded to CountVectorizer(stop_words=...).

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per term; cells are term counts.
        The shape of the matrix is printed as a side effect.
    '''
    vectorizer = CountVectorizer(stop_words=stop, lowercase=True, token_pattern=r'[a-zA-Z]{3,}')
    counts = vectorizer.fit_transform(words).toarray()
    print(counts.shape)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer get_feature_names_out() and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature = vectorizer.get_feature_names_out()
    else:
        feature = vectorizer.get_feature_names()
    corpus_df = pd.DataFrame(counts, columns=feature)
    return corpus_df

In [131]:
result = df['review'].tolist()

## Pre-processing: observe potential stopwords and decide regex cleaning target

In [132]:
# Vectorize the raw reviews (no stop-word removal) and display the 50 terms with
# the highest total count.
result_vec = vec(result)
result_vec.sum().sort_values(ascending=False).head(50)

(1525, 8010)


the          7112
and          4315
this         1892
was          1812
for          1694
they         1575
that         1413
you          1390
not          1053
but          1008
have          961
food          887
mcdonald      852
order         850
with          834
there         806
one           769
are           724
drive         693
get           678
just          596
here          594
when          585
mcdonalds     580
had           577
service       540
time          533
out           511
like          501
thru          478
place         477
what          476
all           455
can           449
were          446
only          434
your          430
because       405
don           398
their         392
location      388
from          373
about         371
been          366
people        353
she           344
would         333
back          332
even          328
fries         314
dtype: int64

In [133]:
result_vec

Unnamed: 0,aaaaaaaahhhhhhhhhhh,abbreviated,abc,ability,able,abode,abour,about,above,abrams,...,zak,zax,zee,zeke,zero,zesty,zip,zombie,zombies,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1523,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,1,0,0,0,0,0


## Regex cleaning

I did not wrap the regex replacement tokens in underscores here, because allowing underscores in the count vectorizer's token pattern ("[a-zA-Z\\_]{3,}") introduced noisy tokens in the result.

In [134]:
def word_replace(line):
    '''Normalize drive-through, service, food and McDonald's variants to single tokens.

    The substitutions are applied in the order listed below. Matching is
    case-sensitive, so the caller is expected to pass lower-cased text.
    '''
    substitutions = (
        (r'\b(drive[ -]thru|drive[ -]through)\b', 'drivethrough'),
        (r'\b(services?|staff|managers?|employees?|(customer )?experiences?)\b', 'service'),
        (r'\b(food|coffees?|chicken( nuggets)?|sandwich(es)?|sauce|breakfast|fries|(ham)?burgers?|ice cream|ketchup|big mac)\b', 'food'),
        (r'\b(mcdonald(\')?s?|mcds?)\b', 'mcdonald'),
    )
    for pattern, token in substitutions:
        line = re.sub(pattern, token, line)
    return line

In [135]:
result_regex = [word_replace(review.lower()) for review in result]

## Perform stemming: 6043 dimensions

In [136]:
### Reference: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python
stemmer = PorterStemmer()

def stem(sentence):
    '''Stem every token of each review in a list and return the stemmed reviews as a new list.

    Each review is tokenized with nltk.word_tokenize and every token is passed through
    the module-level stemmer. Each stem is followed by a single space, so a non-empty
    result ends with a trailing space.
    '''
    stemmed_reviews = []
    for review in tqdm(sentence):
        pieces = [stemmer.stem(token) + " " for token in nltk.word_tokenize(review)]
        stemmed_reviews.append("".join(pieces))
    return stemmed_reviews

In [137]:
result_stem = stem(result_regex)

100%|██████████| 1525/1525 [00:02<00:00, 570.96it/s]


In [138]:
# Vectorize the stemmed reviews (no stop-word removal) and display the 50 terms
# with the highest total count.
result_stem_vec = vec(result_stem)
result_stem_vec.sum().sort_values(ascending=False).head(50)

(1525, 6043)


the             7038
and             4316
food            2484
thi             1888
for             1694
mcdonald        1579
they            1575
that            1423
you             1386
servic          1307
order           1294
not             1069
have            1051
but             1008
with             835
there            805
get              804
one              783
are              742
time             736
just             596
here             594
drivethrough     590
had              588
when             585
place            545
like             521
out              511
what             479
were             462
all              453
locat            446
your             434
onli             433
wait             431
becaus           405
their            394
would            377
from             373
about            371
ask              367
been             366
peopl            354
can              348
even             347
back             340
she              339
did          

In [139]:
result_stem_vec

Unnamed: 0,aaaaaaaahhhhhhhhhhh,abbrevi,abc,abil,abl,abod,abour,about,abov,abram,...,zak,zax,zee,zeke,zero,zesti,zip,zombi,zombie,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1523,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,1,0,0,0,0,0


## Perform lemmatization: 6612 dimensions

In [148]:
result_lem = lem(result_regex)

100%|██████████| 1525/1525 [00:05<00:00, 295.33it/s]


In [150]:
# Vectorize the lemmatized reviews (no stop-word removal) and display the 50 terms
# with the highest total count.
result_lem_vec = vec(result_lem)
result_lem_vec.sum().sort_values(ascending=False).head(50)

(1525, 6612)


the             7112
and             4315
food            2483
have            1896
this            1892
for             1694
mcdonald        1579
they            1575
that            1413
you             1390
service         1328
order           1285
get             1162
not             1068
but             1008
with             834
there            807
one              788
time             735
just             596
here             594
drivethrough     590
when             585
place            545
like             516
out              512
what             476
all              455
say              447
take             442
only             434
your             430
bad              429
wait             424
location         419
because          405
their            392
give             389
make             389
come             386
would            377
from             373
about            371
ask              365
good             362
people           355
can              348
she          

In [151]:
result_lem_vec

Unnamed: 0,aaaaaaaahhhhhhhhhhh,abbreviate,abc,ability,able,abode,abour,about,above,abrams,...,yuppie,zak,zax,zee,zeke,zero,zesty,zip,zombie,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1523,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,1,0,0,0,0


## Perform lemmatization and remove stopwords: 6363 dimensions

In [152]:
# Vectorize the lemmatized reviews with scikit-learn's built-in English stop words
# removed, then display the 50 terms with the highest total count.
result_lem_remove_vec = vec(result_lem, stop='english')
result_lem_remove_vec.sum().sort_values(ascending=False).head(50)

(1525, 6363)


food            2483
mcdonald        1579
service         1328
order           1285
time             735
just             596
drivethrough     590
place            545
like             516
say              447
bad              429
wait             424
location         419
make             389
come             386
ask              365
good             362
people           355
work             315
customer         306
minute           285
want             277
line             274
window           270
know             265
look             253
tell             253
eat              252
right            246
fast             243
really           239
think            219
need             218
way              204
wrong            184
long             183
meal             182
inside           181
star             179
thing            177
car              176
drive            171
slow             170
try              170
pay              169
leave            169
review           167
day          

In [153]:
result_lem_remove_vec

Unnamed: 0,aaaaaaaahhhhhhhhhhh,abbreviate,abc,ability,able,abode,abour,abrams,abrasive,absolute,...,yuppie,zak,zax,zee,zeke,zero,zesty,zip,zombie,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1523,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
