# Bad McDonald's

Aspect-based sentiment analysis

In [1]:
import pandas as pd
pd.options.display.max_colwidth = 10
from scipy.stats import wilcoxon
from cytoolz import identity, sliding_window, concat

## Load data

In [2]:
df = pd.read_csv('mcdonalds.csv')
df.sum(axis=0)

_unit_id        103617...
review          I'm no...
Cost                   54
OrderProblem          336
Filthy                130
RudeService           503
ScaryMcDs             135
MissingFood            54
BadFood               261
SlowService           363
negative             1471
dtype: object

In [3]:
len(df)

1525

## Tokenize and normalize text

In [4]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en', disable=['ner', 'parser'])

In [5]:
def tokenize(text):
    return [tok.lower_ for tok in nlp.tokenizer(text)]
df['tokens'] = df['review'].apply(tokenize)

## Import sklearn

In [6]:
from sklearn.model_selection import *
from sklearn.feature_selection import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import *
from sklearn.metrics import *

## Logistic regression

In [7]:
model1 = make_pipeline(CountVectorizer(analyzer=identity), 
                       LogisticRegression())     

In [8]:
params1 = {'logisticregression__C': [0.001, 0.01, 0.1, 1.0, 10.0],
           'countvectorizer__min_df': [1, 2, 5, 10],
           'countvectorizer__max_df': [0.1, 0.25, 0.5, 0.75, 1.0]}
grid_search1 = GridSearchCV(model1, params1, scoring=('f1'),
                            n_jobs=-1, verbose=1, return_train_score=True)
grid_search1.fit(df['tokens'], df['RudeService'])   

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   16.4s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=<cyfunction identity at 0x7ffb4dfc0100>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), prepr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1.0, 10.0], 'countvectorizer__min_df': [1, 2, 5, 10], 'countvectorizer__max_df': [0.1, 0.25, 0.5, 0.75, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=1)

In [9]:
grid_search1.best_params_

{'countvectorizer__max_df': 0.75,
 'countvectorizer__min_df': 5,
 'logisticregression__C': 1.0}

In [10]:
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

In [11]:
model1.set_params(**grid_search1.best_params_)
score1 = pd.DataFrame(cross_validate(model1, df['tokens'], df['RudeService'],
                           scoring=('precision','recall', 'f1'),
                           cv=folds, n_jobs=-1))
score1

Unnamed: 0,fit_time,score_time,test_f1,test_precision,test_recall,train_f1,train_precision,train_recall
0,0.098054,0.019786,0.527473,0.6,0.470588,0.985442,0.997732,0.973451
1,0.16647,0.03768,0.652174,0.731707,0.588235,0.986577,0.997738,0.975664
2,0.10679,0.028172,0.72,0.734694,0.705882,0.986607,0.995495,0.977876
3,0.172483,0.038654,0.651685,0.74359,0.58,0.988889,0.995526,0.98234
4,0.167271,0.039769,0.631579,0.666667,0.6,0.983203,0.997727,0.969095
5,0.17394,0.042096,0.643678,0.756757,0.56,0.98434,0.997732,0.971302
6,0.162483,0.039697,0.688172,0.744186,0.64,0.987737,0.997748,0.977925
7,0.164443,0.037652,0.646465,0.653061,0.64,0.985507,0.995495,0.975717
8,0.156315,0.040415,0.637363,0.707317,0.58,0.989967,1.0,0.980132
9,0.154638,0.019874,0.62069,0.72973,0.54,0.988864,0.997753,0.980132


In [12]:
score1[['test_f1', 'test_precision', 'test_recall']].mean()

test_f1           0.641928
test_precision    0.706771
test_recall       0.590471
dtype: float64

-----

## Tf-Idf

In [13]:
model2 = make_pipeline(CountVectorizer(analyzer=identity), 
                       TfidfTransformer(),
                       LogisticRegression())     

In [14]:
params2 = {'tfidftransformer__norm': ['l2', None],
           'tfidftransformer__use_idf': [True, False],
           'tfidftransformer__sublinear_tf': [True, False],
           'logisticregression__C': [0.001, 0.01, 0.1, 1.0, 10.0],
           'countvectorizer__min_df': [1, 2, 5],
           'countvectorizer__max_df': [0.1, 0.25, 0.5, 0.75, 1.0]}
grid_search2 = GridSearchCV(model2, params2, 
                            scoring=('f1'),
                            n_jobs=-1, verbose=1, return_train_score=True)
grid_search2.fit(df['tokens'], df['RudeService'])   

Fitting 3 folds for each of 600 candidates, totalling 1800 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   13.1s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   26.3s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed:  1.7min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=<cyfunction identity at 0x7ffb4dfc0100>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), prepr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'tfidftransformer__norm': ['l2', None], 'tfidftransformer__use_idf': [True, False], 'tfidftransformer__sublinear_tf': [True, False], 'logisticregression__C': [0.001, 0.01, 0.1, 1.0, 10.0], 'countvectorizer__min_df': [1, 2, 5], 'countvectorizer__max_df': [0.1, 0.25, 0.5, 0.75, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=1)

In [15]:
grid_search2.best_params_

{'countvectorizer__max_df': 0.5,
 'countvectorizer__min_df': 5,
 'logisticregression__C': 10.0,
 'tfidftransformer__norm': 'l2',
 'tfidftransformer__sublinear_tf': True,
 'tfidftransformer__use_idf': False}

In [16]:
model2.set_params(**grid_search2.best_params_)
score2 = pd.DataFrame(cross_validate(model2, df['tokens'], df['RudeService'], 
                        scoring=('precision','recall', 'f1'),
                        cv=folds, n_jobs=-1))
score2

Unnamed: 0,fit_time,score_time,test_f1,test_precision,test_recall,train_f1,train_precision,train_recall
0,0.089102,0.021869,0.55814,0.685714,0.470588,0.94198,0.969555,0.915929
1,0.139671,0.040998,0.617021,0.674419,0.568627,0.935927,0.969194,0.904867
2,0.084302,0.027665,0.707071,0.729167,0.686275,0.937286,0.967059,0.909292
3,0.14098,0.040804,0.650602,0.818182,0.54,0.945578,0.972028,0.92053
4,0.140508,0.041059,0.666667,0.72093,0.62,0.943567,0.965358,0.922737
5,0.147316,0.043221,0.641975,0.83871,0.52,0.935154,0.964789,0.907285
6,0.142095,0.04094,0.744681,0.795455,0.7,0.931663,0.962353,0.90287
7,0.165903,0.040704,0.652632,0.688889,0.62,0.939841,0.96729,0.913907
8,0.131896,0.043049,0.574713,0.675676,0.5,0.936508,0.962704,0.9117
9,0.131927,0.035744,0.615385,0.857143,0.48,0.942111,0.969626,0.916115


In [17]:
score2[['test_f1', 'test_precision', 'test_recall']].mean()

test_f1           0.642889
test_precision    0.748428
test_recall       0.570549
dtype: float64

## Investigate features

In [18]:
def print_top_words(M, k=10):
    V = M.named_steps['countvectorizer'].get_feature_names()
    coef = M.named_steps['logisticregression'].coef_[0]
    for w in coef.argsort()[-k:][::-1]:
        print(f'{V[w]:15s} {coef[w]:6.3f}')

In [19]:
model2.fit(df['tokens'], df['RudeService'])
print_top_words(model2, k=20)

rude            12.720
service          7.267
horrible         5.908
staff            5.413
unfriendly       5.200
customer         4.804
attitude         4.433
terrible         4.211
worst            4.150
says             4.053
management       4.047
me               3.938
lazy             3.764
employees        3.657
bit              3.635
manager          3.548
her              3.530
wanted           3.486
talk             3.434
give             3.409


## Model comparison

In [20]:
wilcoxon(score2['test_precision'], score1['test_precision'], 
         zero_method='pratt', correction=True)

WilcoxonResult(statistic=9.0, pvalue=0.066545721343716141)

In [21]:
wilcoxon(score2['test_recall'], score1['test_recall'], 
         zero_method='pratt', correction=True)

WilcoxonResult(statistic=12.5, pvalue=0.13889992157439635)

## Error analysis

In [22]:
train = df[:1000]
test = df[1000:]
model2.fit(train['tokens'], train['RudeService'])
pred = model2.predict(test['tokens'])

In [23]:
pd.options.display.max_colwidth = 250
test[pred!=test['RudeService']]

Unnamed: 0,_unit_id,city,review,Cost,OrderProblem,Filthy,RudeService,ScaryMcDs,MissingFood,BadFood,SlowService,negative,tokens
1002,679456674,Houston,"100% worst McDonald's on the planet. We have given them chance after chance and I just cant do it anymore. Our order is NEVER right, you better check your order including inside your burger before you even walk away from the counter or leave the ...",False,True,False,True,False,False,False,False,True,"[100, %, worst, mcdonald, 's, on, the, planet, ., we, have, given, them, chance, after, chance, and, i, just, ca, nt, do, it, anymore, ., our, order, is, never, right, ,, you, better, check, your, order, including, inside, your, burger, before, y..."
1003,679456675,Houston,Horrible service. Our cashier was no where near friendly. The only reason I gave this McDonalds 2 stars was because the supervisor/manager stepped in to be nice. Other than that I would have given it 1/2 a star.,False,False,False,True,False,False,False,False,True,"[horrible, service, ., our, cashier, was, no, where, near, friendly, ., the, only, reason, i, gave, this, mcdonalds, 2, stars, was, because, the, supervisor, /, manager, stepped, in, to, be, nice, ., other, than, that, i, would, have, given, it, ..."
1006,679456679,Houston,This McDonald's is a mess. They had nothing to drink except tea and took me 20 minutes to get a breakfast sandwich. I could lay an egg faster than it takes them to cook one.,True,True,True,True,True,True,True,True,True,"[this, mcdonald, 's, is, a, mess, ., they, had, nothing, to, drink, except, tea, and, took, me, 20, minutes, to, get, a, breakfast, sandwich, ., i, could, lay, an, egg, faster, than, it, takes, them, to, cook, one, .]"
1009,679456682,Houston,"I normally don't take the time come out and review a fast-food place, cause most of the time they suck.. I mean lets face the facts.. they're garbage food to fill our need for crap, run by idiots and adolescents who don't give a crap. Back in the...",False,True,False,True,False,False,False,False,True,"[i, normally, do, n't, take, the, time, come, out, and, review, a, fast, -, food, place, ,, cause, most, of, the, time, they, suck, .., i, mean, lets, face, the, facts, .., they, 're, garbage, food, to, fill, our, need, for, crap, ,, run, by, idi..."
1015,679456688,Houston,"I've never seen any reason to add a review to a McDonald's; who checks a review on fast food prior to walking in the door? However, this McDonald's has consistently under-delivered and I'm honestly beyond frustrated at this point.I moved to an ap...",False,False,False,True,False,False,False,False,True,"[i, 've, never, seen, any, reason, to, add, a, review, to, a, mcdonald, 's, ;, who, checks, a, review, on, fast, food, prior, to, walking, in, the, door, ?, however, ,, this, mcdonald, 's, has, consistently, under, -, delivered, and, i, 'm, hones..."
1018,679456691,Houston,Unsanitary place! I walked in because I was filling up for gas instead of taking the drive thru. There are flies EVERYWHERE. Literally everywhere. I didn't stay long enough to order my food. Staff was standing around doing nothing - totally unpro...,False,False,True,True,False,False,False,False,True,"[unsanitary, place, !, i, walked, in, because, i, was, filling, up, for, gas, instead, of, taking, the, drive, thru, ., there, are, flies, everywhere, ., literally, everywhere, ., i, did, n't, stay, long, enough, to, order, my, food, ., staff, wa..."
1019,679456692,Houston,one of the most horrible mikey ds ive ever visited. connected to the chevron. they are always out of something. fries are never hot. management is not friendly. redbox outside has scracted dvds. USELESS! nuff said.,False,False,False,True,False,True,True,False,True,"[one, of, the, most, horrible, mikey, ds, i, ve, ever, visited, ., connected, to, the, chevron, ., they, are, always, out, of, something, ., fries, are, never, hot, ., management, is, not, friendly, ., redbox, outside, has, scracted, dvds, ., use..."
1025,679456698,Houston,This McDonald's has the lowest hiring standards in all of Houston. I went to the drive thru with my fiancee and she only ordered a large drink - half Sprite half HiC orange and the drive thru guy said they couldn't do it because of the way the so...,False,False,False,True,False,False,False,False,True,"[this, mcdonald, 's, has, the, lowest, hiring, standards, in, all, of, houston, ., i, went, to, the, drive, thru, with, my, fiancee, and, she, only, ordered, a, large, drink, -, half, sprite, half, hic, orange, and, the, drive, thru, guy, said, t..."
1029,679456702,Houston,"Was denied service because the store manger did not like how I ordered my food.One side salad, and one chicken patty.",False,True,False,True,False,False,False,False,True,"[was, denied, service, because, the, store, manger, did, not, like, how, i, ordered, my, food, ., one, side, salad, ,, and, one, chicken, patty, .]"
1039,679456712,Houston,"Why has mc Donald's gotten so slow... I hate coming here anyway. And, the service does not make it any better.",False,False,False,False,False,False,False,True,True,"[why, has, mc, donald, 's, gotten, so, slow, ..., i, hate, coming, here, anyway, ., and, ,, the, service, does, not, make, it, any, better, .]"


------