# Modeling of Yelp dataset

In [1]:
# Seed passed to train_test_split, LogisticRegression and LinearSVC further down in this notebook.
random_state = 0

In [2]:
import pandas as pd
import matplotlib
import seaborn as sns
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import classification_report
from time import time

In [3]:
# Directory that holds the input CSV, given relative to the notebook's working directory.
# Keep the trailing slash: the file name is appended to this string when the CSV is loaded.
data_dir = '../../data/'
# Last expression of the cell: lists the directory so the available files show up as output.
os.listdir(data_dir)

['review.csv']

In [4]:
# Load the review CSV and time the read so the cost of a full re-run is visible.
csv_path = data_dir + 'review.csv'
t = time()
df = pd.read_csv(csv_path)
elapsed = time() - t

# Assemble the report text first; the adjacent literals are joined before .format() is applied.
n_rows, n_cols = df.shape
summary = (
    "----- DataFrame loaded"
    "\nin {0:.2f} seconds".format(elapsed)
    + "\nwith {0:,} rows\nand {1:,} columns".format(n_rows, n_cols)
    + "\n-- Column names:\n"
)

# Two positional arguments on purpose: print() separates them with a single space.
print(summary, df.columns)

----- DataFrame loaded
in 65.14 seconds
with 6,685,900 rows
and 9 columns
-- Column names:
 Index(['stars', 'review_id', 'user_id', 'funny', 'text', 'date', 'useful',
       'cool', 'business_id'],
      dtype='object')


## Drop missing values

In [5]:
# Flag the rows whose review text is missing, then keep everything else.
mask1 = df['text'].isnull()
df = df.loc[~mask1]

remaining = len(df)
print("Records with no review text were dropped. {0:,} records remain in the DataFrame.".format(remaining))

Records with no review text were dropped. 6,685,898 records remain in the DataFrame.


## Train and test classification algorithms

In [6]:
def tokenizer(text):
    """Break ``text`` into a list of tokens at whitespace.

    Uses ``text.split()`` with no separator, so (for a ``str``) runs of
    whitespace count as one delimiter and no empty tokens are returned.
    Nothing else is normalised here: case and punctuation are left as-is.
    """
    tokens = text.split()
    return tokens


# Quick sanity check, shown as the cell output.
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [7]:
# Features are the review text column; the target is the star rating column.
X, y = df['text'], df['stars']

# Hold out 30% of the rows for testing, stratified on the target and seeded with random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)

### Multinomial Naive Bayes

In [9]:
t = time()

# Unigram token counts, tokenised with the whitespace `tokenizer` defined earlier in the notebook.
# Despite the `_tfidf` suffix in the pipeline name, the features are raw counts
# (CountVectorizer), not TF-IDF weights.
bow = CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 1))
multinb_tfidf = Pipeline(steps=[
    ('vect', bow),
    ('clf', MultinomialNB()),
])

# fit() hands back the fitted pipeline, so training and prediction are chained.
y_pred = multinb_tfidf.fit(X_train, y_train).predict(X_test)

# The timer covers vectorisation, fitting and prediction on the test split.
elapsed = time() - t

print("Model fit, took {0:,.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60))
print(classification_report(y_test, y_pred))

Model fit, took 667.16 seconds (11.12 minutes)
              precision    recall  f1-score   support

         1.0       0.64      0.74      0.68    300647
         2.0       0.35      0.26      0.30    162718
         3.0       0.43      0.35      0.38    221784
         4.0       0.46      0.63      0.53    440696
         5.0       0.82      0.71      0.76    879925

    accuracy                           0.62   2005770
   macro avg       0.54      0.54      0.53   2005770
weighted avg       0.63      0.62      0.62   2005770



### Complement Naive Bayes

In [8]:
t = time()

# Same unigram count features as the other models in this notebook; only the classifier differs.
# Despite the `_tfidf` suffix in the pipeline name, the vectoriser is CountVectorizer (raw counts).
bow = CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 1))
complnb_tfidf = Pipeline(steps=[
    ('vect', bow),
    ('clf', ComplementNB()),
])

# fit() hands back the fitted pipeline, so training and prediction are chained.
y_pred = complnb_tfidf.fit(X_train, y_train).predict(X_test)

# The timer covers vectorisation, fitting and prediction on the test split.
elapsed = time() - t

print("Model fit, took {0:,.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60))
print(classification_report(y_test, y_pred))

Model fit, took 663.00 seconds (11.05 minutes)
              precision    recall  f1-score   support

         1.0       0.55      0.89      0.68    300647
         2.0       0.44      0.08      0.13    162718
         3.0       0.43      0.16      0.24    221784
         4.0       0.46      0.51      0.48    440696
         5.0       0.78      0.81      0.80    879925

    accuracy                           0.63   2005770
   macro avg       0.53      0.49      0.47   2005770
weighted avg       0.61      0.63      0.59   2005770



### Logistic Regression

In [8]:
t = time()

# Unigram token counts from the whitespace `tokenizer`; despite the `_tfidf` suffix in the
# pipeline name, the vectoriser is CountVectorizer (raw counts), not TF-IDF.
bow = CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 1))

# One-vs-rest, L2-regularised logistic regression solved with L-BFGS.
# NOTE(review): the recorded output reports a SequentialBackend with 1 worker even though
# n_jobs=-1 is requested — confirm whether parallelism is actually available here.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class` argument —
# confirm against the installed version before upgrading.
lr_tfidf = Pipeline(steps=[
    ('vect', bow),
    ('clf', LogisticRegression(
        penalty='l2',
        C=1.0,
        solver='lbfgs',
        multi_class='ovr',
        random_state=random_state,
        n_jobs=-1,
        verbose=1,
    )),
])

# fit() hands back the fitted pipeline, so training and prediction are chained.
y_pred = lr_tfidf.fit(X_train, y_train).predict(X_test)

# The timer covers vectorisation, fitting and prediction on the test split.
elapsed = time() - t

print("Model fit, took {0:,.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60))
print(classification_report(y_test, y_pred))

[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 63.1min finished


Model fit, took 4,454.82 seconds (74.25 minutes)
              precision    recall  f1-score   support

         1.0       0.74      0.85      0.79    300647
         2.0       0.52      0.23      0.32    162718
         3.0       0.53      0.34      0.41    221784
         4.0       0.53      0.42      0.47    440696
         5.0       0.73      0.91      0.81    879925

    accuracy                           0.68   2005770
   macro avg       0.61      0.55      0.56   2005770
weighted avg       0.65      0.68      0.65   2005770



### Linear Support Vector Classifier

In [None]:
t = time()

# Unigram token counts from the whitespace `tokenizer`; despite the `_tfidf` suffix in the
# pipeline name, the vectoriser is CountVectorizer (raw counts), not TF-IDF.
bow = CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 1))

# Linear SVM with an L2 penalty, seeded with the notebook-wide random_state.
lsvc_tfidf = Pipeline(steps=[
    ('vect', bow),
    ('clf', LinearSVC(penalty='l2', C=1.0, random_state=random_state)),
])

# fit() hands back the fitted pipeline, so training and prediction are chained.
y_pred = lsvc_tfidf.fit(X_train, y_train).predict(X_test)

# The timer covers vectorisation, fitting and prediction on the test split.
elapsed = time() - t

print("Model fit, took {0:,.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60))
print(classification_report(y_test, y_pred))