In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MaxAbsScaler

In [70]:
# Read the Yelp review sample. `lines=True` parses the file as JSON Lines,
# i.e. one JSON record per line.
reviews_path = "datasets/yelp_reviews_sample.json"
original_df = pd.read_json(reviews_path, orient="records", lines=True)
original_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,Q1sbwvVQXV2734tPgoKj4Q,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1,6,1,0,Total bill for this horrible service? Over $8G...,2013-05-07 04:34:36
1,GJXCdrto3ASJOqKeVWPi6Q,yXQM5uF2jS6es16SJzNHfg,NZnhc2sEQy3RmzKTZnqtwQ,5,0,0,0,I *adore* Travis at the Hard Rock's new Kelly ...,2017-01-14 21:30:33
2,2TzJjDVDEuAW6MR5Vuc1ug,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5,3,0,0,I have to say that this office really has it t...,2016-11-09 20:09:03
3,yi0R0Ugj_xUx_Nek0-_Qig,dacAIZ6fTM6mqwW5uxkskg,ikCg8xy5JIg_NGPx-MSIDA,5,0,0,0,Went in for a lunch. Steak sandwich was delici...,2018-01-09 20:56:38
4,11a8sVPMUFtaC7_ABRkmtw,ssoyf2_x0EQMed6fgHeMyQ,b1b1eb3uo-w561D0ZfCEiQ,1,7,0,0,Today was my second out of three sessions I ha...,2018-01-30 23:07:38


In [71]:
# Build the modelling frame from scratch on every run: index by review id and
# keep only the label ("stars") and the review body ("text").
kept_columns = ["stars", "text"]
indexed_df = original_df.set_index("review_id")
df = indexed_df.drop(columns=indexed_df.columns.difference(kept_columns))
assert (df.columns == kept_columns).all()

# Shift the labels down by one so that class ids start at 0; the assertion
# below checks that the result lies in 0..4.
df = df.assign(stars=df["stars"] - 1)
assert df["stars"].between(0, 4).all()

# Replace line breaks with whitespace so that every review is a single line.
df = df.assign(text=df["text"].str.replace("\n", " "))
assert df.loc[df["text"].str.contains("\n")].empty

df.head()

Unnamed: 0_level_0,stars,text
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1
Q1sbwvVQXV2734tPgoKj4Q,0,Total bill for this horrible service? Over $8G...
GJXCdrto3ASJOqKeVWPi6Q,4,I *adore* Travis at the Hard Rock's new Kelly ...
2TzJjDVDEuAW6MR5Vuc1ug,4,I have to say that this office really has it t...
yi0R0Ugj_xUx_Nek0-_Qig,4,Went in for a lunch. Steak sandwich was delici...
11a8sVPMUFtaC7_ABRkmtw,0,Today was my second out of three sessions I ha...


In [12]:
# Hold out 20% of the reviews for the final test score; the seed is fixed so
# that the split is reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=40)

# Plain NumPy arrays of review texts and their labels for each partition.
text_train, y_train = train["text"].values, train["stars"].values
text_test, y_test = test["text"].values, test["stars"].values

# Every text must have exactly one label.
assert text_train.shape == y_train.shape
assert text_test.shape == y_test.shape

In [92]:
%%time
pipe = make_pipeline(
    CountVectorizer(max_features=50_000),
    MaxAbsScaler(copy=False),
    SGDClassifier(
        loss="log",  # logistic regression
        random_state=40,
    ),
    verbose=True,
)
param_grid = {
    "sgdclassifier__alpha": [0.00001, 0.0001, 0.001],
}
grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
grid.fit(text_train, y_train)
print(f"Best cross-validation score: {grid.best_score_}")
print(f"Best parameters: {grid.best_params_}")
print(f"Test score: {grid.score(text_test, y_test)}")

[Pipeline] ... (step 1 of 3) Processing countvectorizer, total=   6.3s
[Pipeline] ...... (step 2 of 3) Processing maxabsscaler, total=   0.5s
[Pipeline] ..... (step 3 of 3) Processing sgdclassifier, total=   2.5s
Best cross-validation score: 0.6455500000000001
Best parameters: {'sgdclassifier__alpha': 0.0001}
Test score: 0.64605
CPU times: user 13.3 s, sys: 1.65 s, total: 15 s
Wall time: 1min 13s


In [93]:
%%time
pipe = make_pipeline(
    TfidfVectorizer(max_features=50_000, norm=None),
    MaxAbsScaler(copy=False),
    SGDClassifier(
        loss="log",  # logistic regression
        random_state=40,
    ),
    verbose=True,
)
param_grid = {
    "sgdclassifier__alpha": [0.00001, 0.0001, 0.001],
}
grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
grid.fit(text_train, y_train)
print(f"Best cross-validation score: {grid.best_score_}")
print(f"Best parameters: {grid.best_params_}")
print(f"Test score: {grid.score(text_test, y_test)}")

[Pipeline] ... (step 1 of 3) Processing tfidfvectorizer, total=   6.4s
[Pipeline] ...... (step 2 of 3) Processing maxabsscaler, total=   0.3s
[Pipeline] ..... (step 3 of 3) Processing sgdclassifier, total=   2.5s
Best cross-validation score: 0.6455500000000001
Best parameters: {'sgdclassifier__alpha': 0.0001}
Test score: 0.64605
CPU times: user 13.1 s, sys: 2.01 s, total: 15.1 s
Wall time: 1min 18s
