In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns

import random
# Seed Python's built-in `random` module, which a later cell uses to pick a test review.
# NOTE: this does not seed NumPy or scikit-learn; train_test_split is given its own random_state.
random.seed(42)

In [None]:
# Install scikit-plot (imported as `scikitplot` in the next cell) via a shell call.
# NOTE(review): `!pip` may target a different environment than the running kernel;
# consider `%pip install` with a pinned version - confirm which version was used.
!pip install scikit-plot

In [None]:
import scikitplot as skplt

In [None]:
# Load the reviews table from the gzip-compressed CSV (path is relative to the working directory).
reviews = pd.read_csv('reviews.csv.gz')

In [None]:
# Show the loaded frame as the cell's output for a quick visual check.
reviews

In [None]:
# Features: the raw review text as a unicode array.
# Missing reviews are replaced with an empty string first; casting NaN straight
# to unicode would turn it into the literal word "nan" and pollute the vocabulary.
review_text = reviews['Review Text'].fillna('').values.astype('U')

# Target: True for reviewers younger than 40, False otherwise
# (a missing Age, if any, compares as False).
age_group = (reviews['Age'] < 40).values

# Hold out 15% of the reviews for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    review_text, age_group, test_size=0.15, random_state=42
)

# Train model

In [None]:
# Text-classification pipeline:
# token counts -> TF-IDF weighting -> linear classifier trained with SGD.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Fixed random_state so SGD's data shuffling, and therefore the fitted
    # model and the cross-validation scores, are reproducible across runs.
    ('clf', SGDClassifier(random_state=42)),
])

# Hyper-parameter grid for GridSearchCV. Keys follow the
# '<pipeline step>__<parameter>' convention. Only max_df (3 values) and the
# TF-IDF norm (2 values) actually vary, giving 6 candidate settings.
parameters = {
    'vect__stop_words': ('english',),
    'vect__max_df': (0.5, 0.75, 1.0),
    # Disabled alternative: also search over the vocabulary size.
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 2),),  # unigrams and bigrams together
    'tfidf__use_idf': (True, ),
    'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (100, ),
    'clf__alpha': (0.00001, ),
    'clf__penalty': ('l2', ),
    # Disabled alternative: search over the number of SGD epochs instead of
    # fixing it at 100 above.
    # 'clf__max_iter': (10, 50, 80),
    # Logistic loss; this is what makes predict_proba available on the model.
    # NOTE: scikit-learn >= 1.1 renames this value to 'log_loss' and 1.3
    # removes 'log' - switch the string when upgrading.
    'clf__loss': ('log', )
}

In [None]:
# Exhaustive cross-validated search over `parameters`, using all available
# CPU cores (n_jobs=-1) and printing progress (verbose=1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# fit() returns the search object itself, so `estimator` is the fitted
# GridSearchCV and later cells predict through it.
estimator = grid_search

In [None]:
    # Report the search result: the best cross-validation score, followed by
    # the winning value of every hyper-parameter that was part of the grid.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# Predictions on the training data itself (an in-sample check, not a generalisation estimate).
y_train_pred = estimator.predict(X_train)

In [None]:
# Confusion matrix of raw counts (normalize=False) for the training-set predictions.
ax = skplt.metrics.plot_confusion_matrix(
    y_true=y_train,
    y_pred=y_train_pred,
    normalize=False,
)

In [None]:
# Class-membership probabilities for the held-out test reviews.
y_test_probs = estimator.predict_proba(X_test)

# ROC curves on the test set. The title now names the actual task (age-group
# classification) rather than the unrelated "Digits" label it carried before.
skplt.metrics.plot_roc(y_test, y_test_probs,
                       title="Age Group ROC Curve", figsize=(12, 6));

# Play

In [None]:
# Pick a random index into the test set.
# randrange excludes its upper bound, so the result is always a valid index;
# random.randint(0, len(X_test)) includes len(X_test) and could raise an
# IndexError when used to index X_test / y_test.
selected_review = random.randrange(len(X_test))

In [None]:
# Display the text of the randomly selected test review.
X_test[selected_review]

In [None]:
# Predict for the single selected review: it is wrapped in a one-element list
# for predict(), and [0] unwraps the single prediction.
estimator.predict([ X_test[selected_review] ])[0]

In [None]:
# True label of the selected review (True means the reviewer's Age is below 40), to compare with the prediction.
y_test[selected_review]