# Yelp reviews classification with advanced ML

## 1: Reading in the Yelp Reviews

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

ModuleNotFoundError: No module named 'nltk'

In [2]:
yelp = pd.read_csv('data/yelp.csv')

In [4]:
yelp.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [4]:
# use every review (all star ratings); `yelp_best_worst` is only another name for
# the full `yelp` DataFrame here (no copy, no filtering)
yelp_best_worst = yelp
# binary variant -- uncomment to keep only the 5-star and 1-star reviews:
#yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

In [5]:
# features are the raw review text; the target is the star rating
X = yelp_best_worst['text']
y = yelp_best_worst['stars']

In [7]:
# split the review text (X) and star ratings (y) into training and testing sets;
# random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## 2: Tokenization

- **What:** Separate text into units such as sentences or words
- **Why:** Gives structure to previously unstructured text
- **Notes:** Relatively easy with English language text, not easy with some languages

In [8]:
# use CountVectorizer to create document-term matrices from X_train and X_test
vect = CountVectorizer()

In [9]:
# learn the vocabulary from the training text and build its document-term matrix
X_train_dtm = vect.fit_transform(X_train)

In [10]:
# encode the test text with the vocabulary already learned from the training text
# (transform only -- the vectorizer is not refitted on the test set)
X_test_dtm = vect.transform(X_test)

In [11]:
X_train.shape

(7500,)

In [12]:
X_test.shape

(2500,)

In [13]:
# rows are documents, columns are terms (aka "tokens" or "features")
X_train_dtm.shape

(7500, 25797)

In [14]:
X_test_dtm.shape

(2500, 25797)

In [15]:
# last 100 features
print (vect.get_feature_names()[-100:])

['yums', 'yumtastic', 'yumzoid', 'yung', 'yup', 'yupha', 'yuppie', 'yuppies', 'yuppified', 'yuppy', 'yur', 'yuri', 'yusef', 'yusefs', 'yuukk', 'yuuuummmmae', 'yuuuuummmmmyyy', 'yuxiang', 'yuyuyummy', 'yuzu', 'yyyeeaahhhh', 'yyyyy', 'z11', 'za', 'zabba', 'zach', 'zag', 'zagat', 'zam', 'zanella', 'zankou', 'zanos', 'zap', 'zapped', 'zappos', 'zatsiki', 'zealand', 'zealous', 'zebra', 'zen', 'zero', 'zest', 'zesty', 'zexperience', 'zha', 'zhou', 'zia', 'zichini', 'zig', 'zilch', 'zillion', 'zin', 'zinburger', 'zinburgergeist', 'zinc', 'zinfandel', 'zing', 'zip', 'zipcar', 'zipper', 'zippers', 'zipps', 'zippy', 'ziti', 'zoe', 'zoey', 'zoftik', 'zola', 'zombie', 'zombies', 'zone', 'zoned', 'zoners', 'zones', 'zoning', 'zoo', 'zoom', 'zoomed', 'zoos', 'zoyo', 'zpizza', 'zu', 'zucca', 'zucchini', 'zuccini', 'zuch', 'zuchinni', 'zuma', 'zumba', 'zupa', 'zur', 'zuzu', 'zuzus', 'zweigel', 'zwiebel', 'zy', 'zzed', 'zzzzzzzzzzzzzzzzz', 'école', 'òc']


In [16]:
# preview the document-term matrix with the terms as column labels
# (note: toarray() converts the whole sparse matrix to a dense array)
temp = pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())
temp.head()

Unnamed: 0,00,000,007,00a,00am,00pm,01,02,04,05,...,zur,zuzu,zuzus,zweigel,zwiebel,zy,zzed,zzzzzzzzzzzzzzzzz,école,òc
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- **lowercase:** boolean, True by default
- Convert all characters to lowercase before tokenizing.

In [17]:
# don't convert to lowercase: tokens that differ only in case are kept as
# separate features, so the vocabulary is larger than with the default
vect = CountVectorizer(lowercase=False)
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape

(7500, 32420)

- **ngram_range:** tuple (min_n, max_n)
- The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.

In [18]:
# include 1-grams, 2-grams and 3-grams
vect = CountVectorizer(ngram_range=(1, 3))
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape

(7500, 1025755)

In [19]:
# last 100 features
print (vect.get_feature_names()[-100:])

['zucchini on steady', 'zucchini pieces', 'zucchini pieces amongst', 'zucchini red', 'zucchini red bell', 'zucchini rounds', 'zucchini rounds chicken', 'zucchini shisito', 'zucchini shisito peppers', 'zucchini squash', 'zucchini squash onion', 'zucchini strips', 'zucchini strips appetizer', 'zucchini the', 'zucchini the shisito', 'zucchini the steak', 'zucchini veal', 'zucchini veal demi', 'zucchini very', 'zucchini very good', 'zucchini was', 'zucchini was just', 'zucchini we', 'zucchini we all', 'zucchini with', 'zucchini with no', 'zucchini with some', 'zuccini', 'zuccini italian', 'zuccini italian beef', 'zuccini so', 'zuccini so they', 'zuch', 'zuch and', 'zuch and asparagus', 'zuchinni', 'zuchinni peppers', 'zuchinni peppers it', 'zuchinni the', 'zuchinni the sampler', 'zuchinni wtf', 'zuchinni wtf chopped', 'zuma', 'zuma and', 'zuma and don', 'zuma because', 'zuma because the', 'zuma roka', 'zuma roka group', 'zuma since', 'zuma since started', 'zumba', 'zumba class', 'zumba cla

**Predicting the star rating:**

In [20]:
# Baseline: bag-of-words counts (default CountVectorizer options) fed to
# Multinomial Naive Bayes
vect = CountVectorizer()
nb = MultinomialNB()

# document-term matrices: vocabulary is learned on the training text only
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# fit the classifier and predict the star rating of every test review
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# accuracy, computed via the metrics helper and via the estimator's own score()
print(metrics.accuracy_score(y_test, y_pred_class))
print(nb.score(X_test_dtm, y_test))

0.4712
0.4712


In [21]:
# NOTE(review): this cell repeats the Naive Bayes baseline of the previous cell.
# LogisticRegression is imported below (it is also part of the import cell at the
# top of the notebook) but nothing in this cell uses it.
# use default options for CountVectorizer
from sklearn.linear_model import LogisticRegression
vect = CountVectorizer()
# create document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# use Naive Bayes to predict the star rating
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# calculate accuracy (metrics helper and estimator score() report the same value)
print (metrics.accuracy_score(y_test, y_pred_class))
print(nb.score(X_test_dtm,y_test))

0.4712
0.4712


In [22]:
X_test

9953    "Hipster,Trendy" ????-I think NOT !!!! Very di...
3850    My husband & I have been going to one Zipp's o...
4962    Perfect place for a hot summer day.  Amazing p...
3886    I personally like this Fry's. I think most of ...
5437    Successfully blending the idea of bar food and...
                              ...                        
6955    The Service is great.  The food is ok.  The sa...
557     Good drinks, great food, and cool music. There...
2455    Haji-Baba is unique by being both a restaurant...
3920    My husband and I loved it.  The food was delic...
6405    So i'm not here to buy spices or shop for shee...
Name: text, Length: 2500, dtype: object

In [23]:
# predicted probability of each star rating for every test review
# (one row per review, one column per class)
nb.predict_proba(X_test_dtm)

array([[7.07713376e-05, 1.31392768e-02, 4.18069724e-02, 9.41057071e-01,
        3.92590829e-03],
       [4.71804306e-22, 9.61444165e-13, 7.25375413e-06, 7.10342456e-01,
        2.89650290e-01],
       [8.90517685e-09, 4.04108559e-07, 9.91633806e-04, 1.88280984e-01,
        8.10726969e-01],
       ...,
       [3.24138524e-15, 1.88863063e-09, 6.97775908e-04, 9.90164027e-01,
        9.13819544e-03],
       [6.70973612e-09, 2.33005016e-07, 3.97013603e-06, 2.55207386e-01,
        7.44788404e-01],
       [5.49292292e-41, 2.76175489e-27, 1.91898561e-09, 9.48516811e-01,
        5.14831871e-02]])

In [24]:
nb.predict(X_test_dtm)

array([4, 4, 5, ..., 4, 5, 4], dtype=int64)

In [25]:
X_test

9953    "Hipster,Trendy" ????-I think NOT !!!! Very di...
3850    My husband & I have been going to one Zipp's o...
4962    Perfect place for a hot summer day.  Amazing p...
3886    I personally like this Fry's. I think most of ...
5437    Successfully blending the idea of bar food and...
                              ...                        
6955    The Service is great.  The food is ok.  The sa...
557     Good drinks, great food, and cool music. There...
2455    Haji-Baba is unique by being both a restaurant...
3920    My husband and I loved it.  The food was delic...
6405    So i'm not here to buy spices or shop for shee...
Name: text, Length: 2500, dtype: object

In [26]:
# per-class breakdown of the Naive Bayes predictions on the test set
from sklearn.metrics import classification_report, confusion_matrix

predicted = nb.predict(X_test_dtm)

# confusion matrix: rows are the true star ratings, columns the predicted ones
print(confusion_matrix(y_test, predicted))
# precision, recall and F1 for each star rating
print(classification_report(y_test, predicted))

[[ 55  14  24  65  27]
 [ 28  16  41 122  27]
 [  5   7  35 281  37]
 [  7   0  16 629 232]
 [  6   4   6 373 443]]
              precision    recall  f1-score   support

           1       0.54      0.30      0.38       185
           2       0.39      0.07      0.12       234
           3       0.29      0.10      0.14       365
           4       0.43      0.71      0.53       884
           5       0.58      0.53      0.55       832

    accuracy                           0.47      2500
   macro avg       0.45      0.34      0.35      2500
weighted avg       0.46      0.47      0.43      2500



In [49]:
# calculate null accuracy: the accuracy obtained by always predicting the most
# frequent class of the test set. The share of the largest class is correct for
# any number of star ratings; a 5-star-vs-rest indicator only equals it when the
# data contains exactly two ratings.
y_test_binary = np.where(y_test==5, 1, 0)  # 5-star indicator, left defined in case other cells rely on it
y_test.value_counts(normalize=True).max()

0.8199608610567515

In [64]:
# define a function that accepts a vectorizer and reports train/test accuracy
def tokenize_test(vect):
    """Fit ``vect`` on the training text and print Naive Bayes accuracy.

    Relies on the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. Prints the number of features the vectorizer produced,
    followed by the accuracy of a freshly fitted ``MultinomialNB`` on the
    training and on the testing document-term matrix.

    Parameters
    ----------
    vect : object with ``fit_transform`` and ``transform`` methods
        Typically a ``CountVectorizer``. It is fitted on ``X_train`` by this
        call, so the caller's object holds the learned vocabulary afterwards.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    # learn the vocabulary from the training text only
    X_train_dtm = vect.fit_transform(X_train)
    print('Features: ', X_train_dtm.shape[1])
    # encode the test text with the already-fitted vocabulary
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    # score() predicts internally, so no separate predict() call is needed
    print("Training Accuracy")
    print(nb.score(X_train_dtm, y_train))
    print("Testing Accuracy")
    print(nb.score(X_test_dtm, y_test))

In [65]:
# 1-grams only (same as the CountVectorizer default), evaluated with the helper
vect = CountVectorizer(ngram_range=(1, 1))
tokenize_test(vect)

Features:  25797
Training Accuracy
0.7801333333333333
Testing Accuracy
0.4712


## 3: Stopword Removal

- **What:** Remove common words that will likely appear in any text
- **Why:** They don't tell you much about your text

In [37]:
# show vectorizer options
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

- **stop_words:** string {'english'}, list, or None (default)
- If 'english', a built-in stop word list for English is used.
- If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
- If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms.

In [38]:
vect = CountVectorizer(stop_words='english')

In [39]:
# set of stop words
print(vect.get_stop_words())

frozenset({'formerly', 'done', 'whoever', 'toward', 'more', 'much', 'and', 'nothing', 'yet', 'he', 'on', 'only', 'now', 'beside', 'against', 'has', 'the', 'many', 'found', 'bill', 'ourselves', 'yourself', 'over', 'that', 'is', 'almost', 'whence', 'another', 'eleven', 'alone', 'ltd', 'even', 'thereafter', 'give', 'hereby', 'rather', 'should', 'whenever', 'who', 'our', 'six', 'behind', 'being', 'full', 'must', 'nevertheless', 'still', 'three', 'front', 'below', 'hence', 'been', 'fill', 'any', 'can', 'seeming', 'then', 'etc', 'eight', 'empty', 'except', 'beyond', 'anyone', 'first', 'somehow', 'me', 'whereas', 'noone', 'own', 'two', 'their', 'him', 'do', 'however', 'by', 'sixty', 'few', 'mostly', 'else', 'per', 'whereby', 'whole', 'whatever', 'hasnt', 'whom', 'these', 'those', 'thereby', 'con', 'never', 'top', 'keep', 'yourselves', 'could', 'well', 'same', 'into', 'thru', 'call', 'therein', 'this', 'we', 'my', 'anywhere', 'interest', 'couldnt', 'thereupon', 'put', 'out', 'hereupon', 'for',

In [40]:
# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

Features:  16528
Training Accuracy
0.9758485639686684
Testing Accuracy
0.9158512720156555


In [41]:
# dtm size with the default settings (stop words are kept)
vect = CountVectorizer()
vect.fit_transform(X_train)

<3064x16825 sparse matrix of type '<class 'numpy.int64'>'
	with 237720 stored elements in Compressed Sparse Row format>

In [42]:
# dtm size with the built-in English stop words removed
vect = CountVectorizer(stop_words='english')
vect.fit_transform(X_train)

<3064x16528 sparse matrix of type '<class 'numpy.int64'>'
	with 143743 stored elements in Compressed Sparse Row format>

In [43]:
from sklearn.feature_extraction import text

# extra terms to drop in addition to scikit-learn's built-in English list
my_additional_stop_words = ['abcd']

# combined stop word set: built-in English words plus the additions above
stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

In [44]:
# dtm size with the extended stop word set (built-in English + additions) removed
vect = CountVectorizer(stop_words=stop_words)
vect.fit_transform(X_train)

<3064x16528 sparse matrix of type '<class 'numpy.int64'>'
	with 143743 stored elements in Compressed Sparse Row format>

In [45]:
# remove updated stop words 
vect = CountVectorizer(stop_words=stop_words)
tokenize_test(vect)

Features:  16528
Training Accuracy
0.9758485639686684
Testing Accuracy
0.9158512720156555


## 4: Other CountVectorizer Options

- **max_features:** int or None, default=None
- If not None, build a vocabulary that only considers the top max_features terms, ordered by term frequency across the corpus.

In [46]:
# remove English stop words and keep only the single most frequent feature
vect = CountVectorizer(stop_words='english', max_features=1)
tokenize_test(vect)

Features:  1
Training Accuracy
0.8156005221932114
Testing Accuracy
0.8199608610567515


In [47]:
# show every feature the vectorizer kept
print (vect.get_feature_names())

['place']


In [48]:
# include 1-grams through 5-grams, and limit the number of features
vect = CountVectorizer(ngram_range=(1, 5), max_features=70000)
tokenize_test(vect)

Features:  70000
Training Accuracy
0.9918407310704961
Testing Accuracy
0.9246575342465754


- **min_df:** float in range [0.0, 1.0] or int, default=1
- When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold. This value is also called the cut-off in the literature. If float, the parameter represents a proportion of documents; if integer, an absolute count of documents.

In [49]:
# include 1-grams and 2-grams, and only keep terms that appear in at least 2 documents
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)

Features:  43957
Training Accuracy
0.9895561357702349
Testing Accuracy
0.9324853228962818


In [50]:
# include 1-grams through 5-grams, and only keep terms that appear in at least 2 documents
vect = CountVectorizer(ngram_range=(1, 5), min_df=2)
tokenize_test(vect)

Features:  76347
Training Accuracy
0.9924934725848564
Testing Accuracy
0.9246575342465754
