In [1]:
import pandas as pd
import nltk
#import spacy
#nlp = spacy.load('en_core_web_lg')

path = r'C:\Users\user\Desktop\Revature\Projects\Yelp\yelp_reviews_sample.csv'

yelpSmall = pd.read_csv(path)


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), #This creates vectors
                      ('tfidf', TfidfTransformer()), #This calls the TFIDF transformer.
                      ('clf', MultinomialNB()),]) #This calls the Naive Bayes algorithm.
text_clf = text_clf.fit(yelpSmall['text'], yelpSmall['stars'])

In [5]:
import numpy as np
test_data = yelpSmall.sample(frac=1) #A random sample of the dataframe.
predicted = text_clf.predict(test_data['text'])
print(np.mean(predicted == test_data['stars']))

0.64646


In [6]:
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score

print(mean_absolute_error(test_data['stars'], predicted), r2_score(test_data['stars'], predicted), accuracy_score(test_data['stars'], predicted))

0.51304 0.5390510013777268 0.64646


<blockquote> How come the r2 score is so low, but accuracy is fine and mean absolute error is fine? From the MAE, it looks like we're just one star off from the right answer on average. Double check my code, but I believe the reason for this is that r2 score is meant to show how well we can get a linear prediction from our variables. In short, r2 score isn't meant for classification problems. I found this explanation: https://stats.stackexchange.com/questions/273133/interpretation-of-r-squared-score-of-a-neural-network-for-classification </blockquote>

In [5]:
print(predicted[:20], test_data['stars'].iloc[:20])

[5. 1. 5. 5. 5. 5. 5. 5. 4. 5. 5. 5. 5. 5. 4. 5. 5. 5. 4. 5.] 31610    2.0
49888    1.0
78457    1.0
21593    5.0
79034    5.0
24797    5.0
70057    5.0
69402    4.0
9252     3.0
91822    5.0
64513    4.0
32932    3.0
55230    5.0
72791    3.0
46187    4.0
26327    4.0
54427    5.0
63154    3.0
25613    4.0
39833    5.0
Name: stars, dtype: float64


In [6]:
#C-type classifier is basically useless. It's way too slow and not good for large datasets.
# from sklearn.svm import SVC

# text_clf = Pipeline([('vect', CountVectorizer()), #This creates vectors
#                       ('tfidf', TfidfTransformer()), #This calls the TFIDF transformer.
#                       ('clf', SVC()),]) #This calls the Support Vector classifier algorithm.
# text_clf = text_clf.fit(yelpSmall['text'], yelpSmall['stars'])
# test_data = yelpSmall.sample(frac=1) #A random sample of the dataframe.
# predicted = text_clf.predict(test_data['text'])
# print(np.mean(predicted == test_data['stars']))
# print(mean_absolute_error(test_data['stars'], predicted), r2_score(test_data['stars'], predicted), accuracy_score(test_data['stars'], predicted))

In [15]:
from sklearn.linear_model import SGDClassifier
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english')) #Stop words didn't help. It may be an issue with the code here.

text_clf = Pipeline([('vect', CountVectorizer(stop_words = stop_words)), #This creates vectors
                      ('tfidf', TfidfTransformer()), #This calls the TFIDF transformer.
                      ('clf', SGDClassifier()),]) #This calls the Support Vector classifier algorithm.
text_clf = text_clf.fit(yelpSmall['text'], yelpSmall['stars'])
test_data = yelpSmall.sample(frac=1) #A random sample of the dataframe.
predicted = text_clf.predict(test_data['text'])
print(np.mean(predicted == test_data['stars']))
print(mean_absolute_error(test_data['stars'], predicted), r2_score(test_data['stars'], predicted), accuracy_score(test_data['stars'], predicted))

0.67375
0.46738 0.5858416162407456 0.67375


In [8]:
'''from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 3)],
              'vect__lowercase': (True, False),
              }
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(yelpSmall['text'], yelpSmall['stars'])
print(gs_clf.best_score_)
print(gs_clf.best_params_)
best_predictions = gs_clf.predict(test_data['text'])
print(mean_absolute_error(test_data['stars'], best_predictions), r2_score(test_data['stars'], best_predictions), accuracy_score(test_data['stars'], best_predictions)) '''
#I called GridSearchCV. Sadly, nothing improved.
#Lowercase and unigram gave the better results.

"from sklearn.model_selection import GridSearchCV\nparameters = {'vect__ngram_range': [(1, 1), (1, 3)],\n              'vect__lowercase': (True, False),\n              }\ngs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)\ngs_clf = gs_clf.fit(yelpSmall['text'], yelpSmall['stars'])\nprint(gs_clf.best_score_)\nprint(gs_clf.best_params_)\nbest_predictions = gs_clf.predict(test_data['text'])\nprint(mean_absolute_error(test_data['stars'], best_predictions), r2_score(test_data['stars'], best_predictions), accuracy_score(test_data['stars'], best_predictions)) "

In [9]:
print(CountVectorizer(yelpSmall['text'].loc[:20]))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8',
                input=0     Total bill for this horrible service? Over $8G...
1     I *adore* Travis at the Hard Rock's new Kelly ...
2     I have to say that this office really has it t...
3     Went in for a lunch. Steak sandwich was delici...
4     Today was my second out of three sessions I ha...
5     I'll be the first to admit that...
17    I love chinese food and I love mexican food. W...
18    We've been a huge Slim's fan since they opened...
19    Good selection of classes of beers and mains. ...
20    Our family LOVES the food here. Quick, friendl...
Name: text, dtype: object,
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [7]:
#Due to the possibility of misspellings, I felt it would be worth trying an analyzer that goes by characters. Accuracy improved slightly.
text_clf = Pipeline([('vect', CountVectorizer(stop_words = stop_words, analyzer='char_wb', ngram_range=(5,5))), #This creates vectors
                      ('tfidf', TfidfTransformer()), #This calls the TFIDF transformer.
                      ('clf', SGDClassifier()),]) #This calls the Support Vector classifier algorithm.
text_clf = text_clf.fit(yelpSmall['text'], yelpSmall['stars'])
test_data = yelpSmall.sample(frac=1) #A random sample of the dataframe.
predicted = text_clf.predict(test_data['text'])
print(np.mean(predicted == test_data['stars']))
print(mean_absolute_error(test_data['stars'], predicted), r2_score(test_data['stars'], predicted), accuracy_score(test_data['stars'], predicted))

0.68967
0.43422 0.6257317969926526 0.68967
