In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import accuracy_score

In [28]:
original_df = pd.read_json(
    "datasets/yelp_reviews.json", orient="records", lines=True
)
print(f"Dataset size: {original_df.size}")
original_df.count()

Dataset size: 14123376


review_id      1569264
user_id        1569264
business_id    1569264
stars          1569264
useful         1569264
funny          1569264
cool           1569264
text           1569264
date           1569264
dtype: int64

In [29]:
df = original_df.set_index("review_id")

df.drop(
    columns=[c for c in df.columns if c not in ["stars", "text"]],
    inplace=True,
)
assert (df.columns == ["stars", "text"]).all()

df.stars = df.stars - 1
assert ((df.stars >= 0) & (df.stars <= 4)).all()

df.text = df.text.str.replace("\n", " ")  # replace line breaks with whitespace
assert df[df.text.str.contains("\n")].empty

df.head()

Unnamed: 0_level_0,stars,text
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1
fYPIcACr6deeTW9k1cSAtg,4,David & Scott have an awesome restaurant and t...
eaJEFDJnWzlaA8lDdQY7UA,4,"I'm a video game designer, and it's a pleasure..."
t7yI1IPHU7tU6xQ1xDkwoQ,4,I originally ordered my fireplace doors throug...
Rp4XamHh5nQVSoswA6SzwQ,4,Thanks to all previous reviewers helping us to...
oRTVi_h7hYiQptRzK3mTPg,0,The restaurant itself and overall vibe doesn't...


In [37]:
test_df = pd.concat([
    df[df.stars == star].sample(10_000, random_state=20) 
    for star in df.stars.unique()
])
assert (test_df.stars.value_counts(sort=False) == [10_000]*5).all()

train_df = df.drop(test_df.index).sample(130_000, random_state=20)
train_df.count()
assert (train_df.count() == [130_000]*2).all()

text_train, y_train, text_test, y_test = (
    train_df.text.values, train_df.stars.values, test_df.text.values, test_df.stars.values
)
assert text_train.shape == y_train.shape
assert text_test.shape == y_test.shape

In [4]:
#train, test = train_test_split(df, test_size=0.2, random_state=40)
#text_train, y_train, text_test, y_test = (
#    train.text.values, train.stars.values, test.text.values, test.stars.values
#)
#assert text_train.shape == y_train.shape
#assert text_test.shape == y_test.shape

In [45]:
train_df.stars.value_counts()

4    57903
3    28909
0    19090
2    14032
1    10066
Name: stars, dtype: int64

In [48]:
%%time
pipe = make_pipeline(
    CountVectorizer(max_features=50_000),
    MaxAbsScaler(copy=False),
    SGDClassifier(
        loss="log",  # logistic regression
        random_state=40,
    ),
    verbose=True,
)
param_grid = {
    "sgdclassifier__alpha": [0.00001, 0.0001, 0.001],
}
grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
grid.fit(text_train, y_train)
print(f"Best cross-validation score: {grid.best_score_}")
print(f"Best parameters: {grid.best_params_}")
print(f"Test score: {grid.score(text_test, y_test)}")

[Pipeline] ... (step 1 of 3) Processing countvectorizer, total=   7.0s
[Pipeline] ...... (step 2 of 3) Processing maxabsscaler, total=   0.5s
[Pipeline] ..... (step 3 of 3) Processing sgdclassifier, total=   3.9s
Best cross-validation score: 0.6544538461538462
Best parameters: {'sgdclassifier__alpha': 1e-05}
Test score: 0.52496
CPU times: user 17.7 s, sys: 2.84 s, total: 20.6 s
Wall time: 55 s


In [39]:
%%time
pipe = make_pipeline(
    TfidfVectorizer(max_features=50_000, norm=None),
    MaxAbsScaler(copy=False),
    SGDClassifier(
        loss="log",  # logistic regression
        random_state=40,
    ),
    verbose=True,
)
param_grid = {
    "sgdclassifier__alpha": [0.00001, 0.0001, 0.001],
}
grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
grid.fit(text_train, y_train)
print(f"Best cross-validation score: {grid.best_score_}")
print(f"Best parameters: {grid.best_params_}")
print(f"Test score: {grid.score(text_test, y_test)}")

[Pipeline] ... (step 1 of 3) Processing tfidfvectorizer, total=   7.1s
[Pipeline] ...... (step 2 of 3) Processing maxabsscaler, total=   0.3s
[Pipeline] ..... (step 3 of 3) Processing sgdclassifier, total=   4.1s
Best cross-validation score: 0.6544538461538462
Best parameters: {'sgdclassifier__alpha': 1e-05}
Test score: 0.52496
CPU times: user 17.9 s, sys: 2.77 s, total: 20.7 s
Wall time: 56.3 s
