In [33]:
#for working on dataframe
import pandas as pd
import re
#for tokenizing and filtering
import nltk
from nltk import word_tokenize
#for lemmatizing
from nltk.stem import WordNetLemmatizer
#for sentiment analysis
import random
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

In [2]:
review_data = pd.read_csv('review_data.csv')
review_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 10 columns):
Unnamed: 0     395 non-null int64
business_id    395 non-null object
cool           395 non-null int64
date           395 non-null object
funny          395 non-null int64
review_id      395 non-null object
stars          395 non-null int64
text           395 non-null object
useful         395 non-null int64
user_id        395 non-null object
dtypes: int64(5), object(5)
memory usage: 30.9+ KB


# Text mining

In [3]:
reviews = review_data.loc[:,['review_id','text']]
reviews.head()

Unnamed: 0,review_id,text
0,x7mDIiDB3jEiPGPHOmDzyw,The pizza was okay. Not the best I've had. I p...
1,VluIpojME0yKOcRKI5L0Iw,"came here on Monday, no line. its good, but no..."
2,nsB9JAeghk0zOaSulSm9Yw,This place is truly a secret! Its so hidden t...
3,pcn01EMERpCpHEcoaohdEg,This SECRET PIZZA was a secret for about 5 st...
4,0mFtAPTmInbXHqDjX9eiOg,this is the BEST place to grab a quick slice o...


In [16]:
txt = []
rid = []
for index,row in reviews.iterrows():
    text = row['text']
    text = text.split('. ')
    txt.extend(text)
    for i in range(0,len(text)):
        rid.append(row['review_id'])

t = pd.DataFrame(txt, columns=['text'])
i = pd.DataFrame(rid, columns=['rid'])
review_text = pd.concat([i,t], axis=1)
review_text.head()

Unnamed: 0,rid,text
0,x7mDIiDB3jEiPGPHOmDzyw,The pizza was okay
1,x7mDIiDB3jEiPGPHOmDzyw,Not the best I've had
2,x7mDIiDB3jEiPGPHOmDzyw,I prefer Biaggio's on Flamingo / Fort Apache
3,x7mDIiDB3jEiPGPHOmDzyw,The chef there can make a MUCH better NY style...
4,x7mDIiDB3jEiPGPHOmDzyw,The pizzeria @ Cosmo was over priced for the q...


In [17]:
token_list = []
for index,row in review_text.iterrows():
    text = row['text'].lower()
    tokens = nltk.word_tokenize(text)
    token_list.append([tokens])

token_col = pd.DataFrame(token_list, columns=['tokens'])
review_text = pd.concat([review_text, token_col], axis=1)
review_text.head()

Unnamed: 0,rid,text,tokens
0,x7mDIiDB3jEiPGPHOmDzyw,The pizza was okay,"[the, pizza, was, okay]"
1,x7mDIiDB3jEiPGPHOmDzyw,Not the best I've had,"[not, the, best, i, 've, had]"
2,x7mDIiDB3jEiPGPHOmDzyw,I prefer Biaggio's on Flamingo / Fort Apache,"[i, prefer, biaggio, 's, on, flamingo, /, fort..."
3,x7mDIiDB3jEiPGPHOmDzyw,The chef there can make a MUCH better NY style...,"[the, chef, there, can, make, a, much, better,..."
4,x7mDIiDB3jEiPGPHOmDzyw,The pizzeria @ Cosmo was over priced for the q...,"[the, pizzeria, @, cosmo, was, over, priced, f..."


In [18]:
ttext = []
for index,row in review_text.iterrows():
    filtered_text = row['tokens']
    tagged_text = nltk.pos_tag(filtered_text) 
    ttext.append([tagged_text])

tagged_col = pd.DataFrame(ttext, columns=['tagged_tokens'])
review_text = pd.concat([review_text, tagged_col], axis=1)
review_text.head()

Unnamed: 0,rid,text,tokens,tagged_tokens
0,x7mDIiDB3jEiPGPHOmDzyw,The pizza was okay,"[the, pizza, was, okay]","[(the, DT), (pizza, NN), (was, VBD), (okay, JJ)]"
1,x7mDIiDB3jEiPGPHOmDzyw,Not the best I've had,"[not, the, best, i, 've, had]","[(not, RB), (the, DT), (best, JJS), (i, NN), (..."
2,x7mDIiDB3jEiPGPHOmDzyw,I prefer Biaggio's on Flamingo / Fort Apache,"[i, prefer, biaggio, 's, on, flamingo, /, fort...","[(i, NN), (prefer, VBP), (biaggio, NN), ('s, P..."
3,x7mDIiDB3jEiPGPHOmDzyw,The chef there can make a MUCH better NY style...,"[the, chef, there, can, make, a, much, better,...","[(the, DT), (chef, NN), (there, EX), (can, MD)..."
4,x7mDIiDB3jEiPGPHOmDzyw,The pizzeria @ Cosmo was over priced for the q...,"[the, pizzeria, @, cosmo, was, over, priced, f...","[(the, DT), (pizzeria, NN), (@, NNP), (cosmo, ..."


In [19]:
ntext = []
for index,row in review_text.iterrows():
    tagged_text = row['tagged_tokens']
    nouns = []
    for tt in tagged_text:
        tag = tt[1]
        if tag in ['NN','NNS','NNP','NNPS']:
            nouns.append(tt[0])
    ntext.append([nouns])

noun_col = pd.DataFrame(ntext, columns=['nouns'])
review_text = pd.concat([review_text, noun_col], axis=1)
review_text.head()

Unnamed: 0,rid,text,tokens,tagged_tokens,nouns
0,x7mDIiDB3jEiPGPHOmDzyw,The pizza was okay,"[the, pizza, was, okay]","[(the, DT), (pizza, NN), (was, VBD), (okay, JJ)]",[pizza]
1,x7mDIiDB3jEiPGPHOmDzyw,Not the best I've had,"[not, the, best, i, 've, had]","[(not, RB), (the, DT), (best, JJS), (i, NN), (...",[i]
2,x7mDIiDB3jEiPGPHOmDzyw,I prefer Biaggio's on Flamingo / Fort Apache,"[i, prefer, biaggio, 's, on, flamingo, /, fort...","[(i, NN), (prefer, VBP), (biaggio, NN), ('s, P...","[i, biaggio, flamingo, /, fort, apache]"
3,x7mDIiDB3jEiPGPHOmDzyw,The chef there can make a MUCH better NY style...,"[the, chef, there, can, make, a, much, better,...","[(the, DT), (chef, NN), (there, EX), (can, MD)...","[chef, style, pizza]"
4,x7mDIiDB3jEiPGPHOmDzyw,The pizzeria @ Cosmo was over priced for the q...,"[the, pizzeria, @, cosmo, was, over, priced, f...","[(the, DT), (pizzeria, NN), (@, NNP), (cosmo, ...","[pizzeria, @, cosmo, quality, lack, personalit..."


## Noun filtering

In [20]:
filtered_nouns = review_text['nouns']
filtered_nouns = filtered_nouns.tolist()

lemmatizer = WordNetLemmatizer()
lemm_nouns = []
for fn in filtered_nouns:
    lf = []
    for f in fn:
        lf.append(lemmatizer.lemmatize(f))
    lemm_nouns.append(lf)
lemm_nouns[0:5]

[['pizza'],
 ['i'],
 ['i', 'biaggio', 'flamingo', '/', 'fort', 'apache'],
 ['chef', 'style', 'pizza'],
 ['pizzeria', '@', 'cosmo', 'quality', 'lack', 'personality', 'food']]

# Frequent feature extraction

In [21]:
features = {}
for nouns in lemm_nouns:
    keys = features.keys()
    visited = []
    for n in nouns:
        if n not in visited:
            if n in keys:
                features[n] += 1
            else:
                features.update({n: 1})

In [100]:
frequent_ftr = []
threshold = int(review_text['text'].count() * 0.03)
for k,v in features.items():
    if v >= threshold:
        frequent_ftr.append(k)

# Feature report

In [101]:
for k,v in features.items():
    if k in frequent_ftr:
        print(v, ' people commented on ', k)

125  people commented on  line
306  people commented on  place
86  people commented on  time
826  people commented on  pizza
65  people commented on  pepperoni
95  people commented on  vega
56  people commented on  hallway
491  people commented on  i
60  people commented on  strip
57  people commented on  cheese
60  people commented on  pie
78  people commented on  floor
286  people commented on  slice
85  people commented on  night
90  people commented on  crust


# Sentiment analysis
### (not working)

In [104]:
from nltk.corpus import movie_reviews as pc

documents = [(list(pc.words(fileid)), category)
             for category in pc.categories()
             for fileid in pc.fileids(category)]

random.shuffle(documents)

all_words = []
for w in pc.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

In [105]:
word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]
#featuresets[0:5]

In [106]:
training_set = featuresets
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [119]:
dat1 = []
for index,row in review_text.iterrows():
    words = row['tokens']
    features = {}
    for w in word_features:
        if w in words:
            features[w] = 'pos'
    dat1.append(features)

testing_set = dat1

In [121]:
for i in range(10,20):
    print(review_text['text'][i])
    print(SVC_classifier.classify(testing_set[i]))

 price is reasonable, it is hard to find
neg
but that was the fun of it trying to find it
neg
 no name, no sign
neg
 if your clubbing @ the Cosmo and your have the munchies this is your place
neg
 they use fresh toppings, crust is thin and very tasty.
neg
This place is truly a secret!  Its so hidden that there are no signs in the directory of the restaurant itself except for the line of people waiting to enjoy a slice of the best pizza they ever had
neg
 It is thin crust pizza, crispy crust and delicious toppings!  If you can go for lunch there is less of a line.
Their pepperoni was nice and oily just the way i like it
neg
This SECRET  PIZZA was a secret for about 5 stays at the Cosmo
neg
Now that I know where it is its my favorite late night snack
neg
Ask for it well done and the crust will be perfect
neg
