Based on the scikit-learn example "Classification of text documents using sparse features": https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py

In [1]:
import logging
import numpy as np
import pandas
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.metrics import f1_score

import os, sys, datetime
from google.colab import drive
# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/gdrive')
gitDir = "/content/gdrive/My Drive/nlp/"
# Work from the project root so that relative paths such as "data/cache/..."
# used by later cells resolve. (A previous chdir into "training/" was
# immediately overridden by this call and has been dropped.)
os.chdir(gitDir)
print(os.listdir("."))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
['.git', 'README.md', 'data', 'presentations', '.idea', 'training', 'blobs', '.gitignore', 'template.py', 'serverblobs', '.ipynb_checkpoints', 'report.ipynb']


In [0]:
def getCategory2IndexDict():
    """Return a new dict mapping each product-category name to its class index.

    The index of a category is its position in the tuple below (0 through 33),
    so the order of the names defines the label encoding and must not change.
    """
    categories = (
        # 0-9
        'Video DVD',
        'Music',
        'Books',
        'Mobile_Apps',
        'Digital_Video_Download',
        'Digital_Music_Purchase',
        'Toys',
        'Digital_Ebook_Purchase',
        'PC',
        'Camera',
        # 10-19
        'Wireless',
        'Electronics',
        'Video',
        'Sports',
        'Video Games',
        'Watches',
        'Shoes',
        'Home',
        'Musical Instruments',
        'Baby',
        # 20-29
        'Home Improvement',
        'Home Entertainment',
        'Office Products',
        'Personal_Care_Appliances',
        'Automotive',
        'Lawn and Garden',
        'Luggage',
        'Kitchen',
        'Furniture',
        'Health & Personal Care',
        # 30-33
        'Software',
        'Grocery',
        'Pet Products',
        'Beauty',
    )
    return {name: index for index, name in enumerate(categories)}

In [0]:
# Upper bound on the number of rows read from a review file.
maxrows = 100 * 1000
frame = pandas.read_csv(
    "data/cache/amazon_reviews_us_balanced.csv.shuffled",
    nrows=maxrows,
)
# Raw review text is the model input; the category name is encoded as an int label.
features = frame["review_body"].values
c2i = getCategory2IndexDict()
labels = [c2i[category] for category in frame["product_category"].values]

# The first 80% of the rows form the training set, the remainder the test set.
# No shuffling happens here -- presumably the ".shuffled" file is already in
# random order; confirm against the data-preparation step.
split = int(len(features) * 0.8)
x_train, x_test = features[:split], features[split:]
y_train, y_test = labels[:split], labels[split:]

In [0]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the
# fitted vectorizer to transform the held-out test reviews.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

In [0]:
def benchmark(clf, print_report=True, print_cm=False):
    """Fit ``clf`` on the training split, evaluate it on the test split, print a summary.

    Reads the notebook-level ``X_train``/``y_train`` and ``X_test``/``y_test``.

    Parameters
    ----------
    clf : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``.
    print_report : bool, default True
        Print ``metrics.classification_report`` for the test predictions.
    print_cm : bool, default False
        Print ``metrics.confusion_matrix`` for the test predictions.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time)`` where ``clf_descr`` is
        the text of ``str(clf)`` before the first ``"("``, ``score`` is the
        test accuracy, and the two times are wall-clock seconds spent in
        ``fit`` and ``predict`` respectively.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred))

    if print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    # Keep only the class name, dropping the parameter list from the repr.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

In [6]:
p = Perceptron(max_iter=100, tol=1e-3)
%time benchmark(p)

knn = KNeighborsClassifier(n_neighbors=34)
%time benchmark(knn)

rf = RandomForestClassifier(n_estimators=10)
%time benchmark(rf)

svc = LinearSVC(penalty="l1", dual=False,tol=1e-3)
%time benchmark(svc)


________________________________________________________________________________
Training: 
Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=100, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)
train time: 7.245s
test time:  0.050s
accuracy:   0.449
classification report:
              precision    recall  f1-score   support

           0       0.33      0.29      0.31       602
           1       0.56      0.65      0.60       616
           2       0.38      0.37      0.38       592
           3       0.56      0.59      0.58       560
           4       0.37      0.48      0.42       551
           5       0.61      0.50      0.55       591
           6       0.39      0.32      0.35       576
           7       0.54      0.54      0.54       616
           8       0.37      0.31      0.34       548
   

  'precision', 'predicted', average, warn_for)


train time: 129.656s
test time:  0.422s
accuracy:   0.389
classification report:
              precision    recall  f1-score   support

           0       0.26      0.27      0.26       602
           1       0.51      0.65      0.57       616
           2       0.36      0.44      0.39       592
           3       0.41      0.66      0.50       560
           4       0.28      0.50      0.36       551
           5       0.47      0.53      0.50       591
           6       0.24      0.26      0.25       576
           7       0.54      0.56      0.55       616
           8       0.24      0.25      0.25       548
           9       0.50      0.55      0.52       611
          10       0.30      0.44      0.35       578
          11       0.26      0.25      0.25       613
          12       0.50      0.39      0.44       581
          13       0.19      0.14      0.16       611
          14       0.49      0.46      0.47       579
          15       0.58      0.70      0.63       576


('LinearSVC', 0.5428, 69.61615347862244, 0.03239250183105469)

In [7]:
# Micro-averaged F1 of every fitted classifier on the English test split.
for name, model in (
    ("Perceptron", p),
    ("Random Forest", rf),
    ("KNN", knn),
    ("SVC", svc),
):
    print(name, f1_score(y_test, model.predict(X_test), average='micro'))

Perceptron 0.44905
Random Forest 0.389
KNN 0.05185
SVC 0.5428


# Test on German

In [0]:
# Load the German reviews and encode their categories with the same mapping
# used for the English data. This rebinds frame/features/labels.
# NOTE(review): the file is named ".tsv" but is parsed with read_csv's default
# comma delimiter -- confirm the cached file really is comma-separated.
frame = pandas.read_csv(
    "data/cache/amazon_reviews_multilingual_DE_v1_00.tsv.shuffled",
    nrows=maxrows,
)
features = frame["review_body"].values
c2i = getCategory2IndexDict()
labels = [c2i[category] for category in frame["product_category"].values]

# transform() only: the German text is projected with the vectorizer that was
# fitted on the English training reviews, without re-fitting.
y_de = labels
x_de = vectorizer.transform(features)

In [9]:
# Micro-averaged F1 of the English-trained classifiers on the German reviews.
for name, model in (
    ("Perceptron", p),
    ("Random Forest", rf),
    ("KNN", knn),
    ("SVC", svc),
):
    print(name, f1_score(y_de, model.predict(x_de), average='micro'))

Perceptron 0.3263
Random Forest 0.44436
KNN 0.0409
SVC 0.39833
