# Text classification

In [45]:
# load various models from scikit-learn's library
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

# also get some metrics to try
from sklearn.metrics import accuracy_score

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

import re

import numpy as np

from gensim.models.word2vec import Word2Vec, LineSentence
from gensim.models import KeyedVectors

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM, GlobalAveragePooling1D
from keras.optimizers import Adam
from keras.datasets import imdb
from keras.preprocessing import sequence

import nltk
from nltk.corpus import reuters
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

import timeit

## Getting data

### IMDB reviews sentiment analysis

This is a neural network ready dataset from [Keras](https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification). The words in the dataset have already been converted into integer IDs, so you can't easily have a look at what's in there.

This is a sentiment analysis or polarity dataset, which means that the target labels are positive or negative. It's a relatively simpler task for a ML model to solve.

I'll be using dictionaries to store my data. After I've grabbed the data from keras, I join the integer IDs with spaces to make text for scikit-learn: scikit-learn's vectorizers expect strings.

In [2]:
imdb_data = {"name" : "imdb", "ovr" : False}
(a, b), (c, d) = imdb.load_data(num_words=50000)
imdb_data["X_train_ids"], imdb_data["y_train"], imdb_data["X_test_ids"], imdb_data["y_test"] = a, b, c, d

# For scikit-learn to like the input data, it will needs strings
imdb_data["X_train"] = [" ".join([str(x) for x in line]) for line in imdb_data["X_train_ids"]]
imdb_data["X_test"] = [" ".join([str(x) for x in line]) for line in imdb_data["X_test_ids"]]

Here's a look at what we're dealing with.

In [3]:
imdb_data["train_size"], imdb_data["test_size"] = len(imdb_data["X_train"]), len(imdb_data["X_test"])
imdb_data["avg_length"] = sum([len(i) for i in imdb_data["X_train_ids"]])/len(imdb_data["X_train_ids"])
imdb_data["vocab_size"] = len(set([i for j in imdb_data["X_train_ids"] for i in j]))

print(f"Observations in training data: {imdb_data['train_size']}; test data: {imdb_data['test_size']}")
print(f"Min number of words per line in training set: {min([len(i) for i in imdb_data['X_train_ids']])}")
print(f"Max number of words per line in training set: {max([len(i) for i in imdb_data['X_train_ids']])}")
print(f"Average number of words per line in training set: {imdb_data['avg_length']}")
print(f"Total vocabulary size: {imdb_data['vocab_size']}")

Observations in training data: 25000; test data: 25000
Min number of words per line in training set: 11
Max number of words per line in training set: 2494
Average number of words per line in training set: 238.71364
Total vocabulary size: 49998


### A lot of baby names

The US government has made available [baby names](https://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-level-data) from social security card applications. These records go back to 1880 and also indicate the sex of the baby. I'll be trying to predict which names are male and which are female.

Once you've extracted the files to a folder, the following Python code will join them all into a csv file.

```
import os
import re

with open("babies.csv", "w") as w:
    for f in [f for f in os.listdir(os.getcwd()) if "txt" in str(f)]:
        with open(f) as f:
            year = re.search(r'[\d]{4}', f.name)[0]
            for line in f:
                w.write(year+","+line)
```

What I want to do is sort the names by year, remove all duplicates, and then split older and newer names into the training and test sets, respectively. This way, my machine learning task is correctly inferring the gender of newer names only having seen older ones.

Below you can see a sample of a few names.

In [4]:
with open("babies.csv") as f:
    baby_list = f.readlines()

# Sort by first 4 characters, the year
baby_list.sort(key=lambda x: x[:4])

print(baby_list[:5])

['1880,Mary,F,7065\n', '1880,Anna,F,2604\n', '1880,Emma,F,2003\n', '1880,Elizabeth,F,1939\n', '1880,Minnie,F,1746\n']


What I'm going to do is keep every combination of name and sex in a `set()`. Python sets do not keep duplicates and are very fast at `if x in y` operations, making them perfect for this work. With the names properly organized, it's easy to keep 20,000 for the test set.

In [5]:
baby_set = set()
unique_baby_list = []

for baby in baby_list:
    if " ".join(baby.split(",")[1:3]) in baby_set:
        pass
    else:
        baby_set.add(" ".join(baby.split(",")[1:3]))
        unique_baby_list.append(baby)

# Pick test set
baby_train = unique_baby_list[:-20000]
baby_test = unique_baby_list[-20000:]

baby_data = {"name" : "baby", "ovr" : False}
baby_data["X_train"] = [baby.split(",")[1] for baby in baby_train]
baby_data["X_test"] = [baby.split(",")[1] for baby in baby_test]
baby_data["y_train"] = [baby.split(",")[2] == "M" for baby in baby_train]
baby_data["y_test"] = [baby.split(",")[2] == "M" for baby in baby_test]

Since the data is split by character, we have a small vocabulary size. Even with the removal of duplicates, we still have 100,000+ records.

In [6]:
baby_data["train_size"], baby_data["test_size"] = len(baby_data["X_train"]), len(baby_data["X_test"])
baby_data["avg_length"] = sum([len(i) for i in baby_data["X_train"]])/len(baby_data["X_train"])
baby_data["vocab_size"] = len(set([i for j in baby_data["X_train"] for i in j]))

print(f"Observations in training data: {baby_data['train_size']}; test data: {len(baby_data['X_test'])}")
print(f"Min number of words per line in training set: {min([len(i) for i in baby_data['X_train']])}")
print(f"Max number of words per line in training set: {max([len(i) for i in baby_data['X_train']])}")
print(f"Average number of characters per name in training set: {baby_data['avg_length']}")
print(f"Total character vocabulary size: {baby_data['vocab_size']}")

Observations in training data: 87973; test data: 20000
Min number of words per line in training set: 2
Max number of words per line in training set: 15
Average number of characters per name in training set: 6.442374364861946
Total character vocabulary size: 52


### Newsgroup posts

These are categorized newsgroup posts you can get [from scikit-learn](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html) ([user guide](scikit-learn.org/stable/datasets/twenty_newsgroups.html). These are rather long and varied texts drawn from 18,000 posts. Each of these belong in a different topic. You can read a bit more about the dataset [here](http://qwone.com/~jason/20Newsgroups/).

In [7]:
ng_train_raw = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
ng_test_raw = fetch_20newsgroups(subset="test", remove=("headers", "footers", "quotes"))

Since this is a scikit-learn datasource, there are extras you can play with. For example, the target labels can be accessed this way:

In [8]:
print(ng_train_raw.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


An example post looks lie this. As you can see, these are multiple sentences.

In [9]:
print(ng_train_raw.data[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [10]:
ng_data = {"name" : "newsgroup20", "ovr" : False}
ng_data.update({"X_train" : ng_train_raw.data, "y_train" : ng_train_raw.target})
ng_data.update({"X_test" : ng_test_raw.data, "y_test" : ng_test_raw.target})

On average these posts are shorter than the imdb reviews; however, there are some monster posts lurking in there.

The total vocabulary size of the newsgroup set is **much higher** than the others.

In [11]:
ng_data["train_size"], ng_data["test_size"] = len(ng_data["X_train"]), len(ng_data["X_test"])
ng_data["avg_length"] = sum([len(i.split(' ')) for i in ng_data["X_train"]])/len(ng_data["X_train"])
ng_data["vocab_size"] = len(set([i.lower() for j in ng_data["X_train"] for i in j.split(" ")]))

print(f"Observations in training data: {len(ng_data['X_train'])}; test data: {len(ng_data['X_test'])}")
print(f"Min number of words per line in training set: {min([len(i.split(' ')) for i in ng_data['X_train']])}")
print(f"Max number of words per line in training set: {max([len(i.split(' ')) for i in ng_data['X_train']])}")
print(f"Average number of words per line in training set: {ng_data['avg_length']}")
print(f"Total vocabulary size: {ng_data['vocab_size']}")

Observations in training data: 11314; test data: 7532
Min number of words per line in training set: 1
Max number of words per line in training set: 20083
Average number of words per line in training set: 206.15980201520242
Total vocabulary size: 282099


### Reuters newswire dataset

The Reuters dataset is a collection of short categorized news stories. I followed [Martin Thoma's blog post to get started](https://martin-thoma.com/nlp-reuters/).

We're usign the nltk version of the dataset, but I'm not sure what that is exactly. Our dataset has 14333 records, but the more popular [reuters-21578](https://archive.ics.uci.edu/ml/datasets/reuters-21578+text+categorization+collection) has 21578. Since that dataset was collected from 1987 newswire texts, I assume the one we're using is similar.

To get a copy of the Reuters data, you have to use `nltk.download("reuters")`.

In [12]:
def load_reuters():
    reuters_data = {"name" : "reuters", "ovr" : True}
    
    # The test and train sets are listed as IDs in the .fileids() member
    train_ids = list(filter(lambda x: x[:5] == "train", reuters.fileids()))
    test_ids = list(filter(lambda x: x[:4] == "test", reuters.fileids()))
    reuters_data["X_train"] = list(map(lambda x: reuters.raw(x), train_ids))
    reuters_data["X_test"] = list(map(lambda x: reuters.raw(x), test_ids))
    
    # The MultiLabelBinarizer will get you the 1s and 0s your model wants
    mlb = MultiLabelBinarizer(sparse_output=True)
    reuters_data["y_train"] = mlb.fit_transform(list(map(lambda x: reuters.categories(x), train_ids)))
    reuters_data["y_test"] = mlb.transform(list(map(lambda x: reuters.categories(x), test_ids)))
    
    return reuters_data
    
reuters_data = load_reuters()

The main challenge with the Reuters dataset are its large amount of classes and their multi-label nature. Models have to cope with these news items belonging to more than one category.

In [13]:
print(f"Example observation targets: {reuters.categories('test/14832')}")
print(f"Number of classes: {len(reuters.categories())}")
print(reuters.categories())

Example observation targets: ['corn', 'grain', 'rice', 'rubber', 'sugar', 'tin', 'trade']
Number of classes: 90
['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 

Most of the observations only have one label.

In [14]:
print(f"Min number of target labels: {min([len(reuters.categories(i)) for i in reuters.fileids()])}")
print(f"Min number of target labels: {max([len(reuters.categories(i)) for i in reuters.fileids()])}")
print(f"Average number of target labels per observation: {sum([len(reuters.categories(i)) for i in reuters.fileids()])/len(reuters.fileids())}")

Min number of target labels: 1
Min number of target labels: 15
Average number of target labels per observation: 1.235446792732666


Although the stats below aren't as high as imdb and newsgroup20, the models will take longer to do the multi-label. I use an `ovr` flag to tell scikit-learn to treat this dataset as a one-vs-rest problem.

In [15]:
reuters_data["train_size"], reuters_data["test_size"] = len(reuters_data["X_train"]), len(reuters_data["X_test"])
reuters_data["avg_length"] = sum([len(i.split(' ')) for i in reuters_data["X_train"]])/len(reuters_data["X_train"])
reuters_data["vocab_size"] = len(set([i.lower() for j in reuters_data["X_train"] for i in j.split(" ")]))

print(f"Observations in training data: {len(ng_data['X_train'])}; test data: {len(reuters_data['X_test'])}")
print(f"Min number of words per line in training set: {min([len(i.split(' ')) for i in reuters_data['X_train']])}")
print(f"Max number of words per line in training set: {max([len(i.split(' ')) for i in reuters_data['X_train']])}")
print(f"Average number of words per line in training set: {sum([len(i.split(' ')) for i in reuters_data['X_train']])/len(reuters_data['X_train'])}")
print(f"Total vocabulary size: {len(set([i.lower() for j in reuters_data['X_train'] for i in j.split(' ')]))}")

Observations in training data: 11314; test data: 3019
Min number of words per line in training set: 3
Max number of words per line in training set: 1571
Average number of words per line in training set: 166.21688763032566
Total vocabulary size: 70000


## Convenience functions

Whenever we want to train on our datasets, we'll have to pre-process them and then train a bunch of different models on them. To do these things I've written some simple functions.

The `vectorize()` functions expects one of scikit-learn's [vectorizers](http://scikit-learn.org/stable/modules/feature_extraction.html) as its first argument, and then vectorizes the two datasets it's given.

In [16]:
def vectorize(vectorizer, x_train, x_test=None):
    train_vec = vectorizer.fit_transform(x_train)
    if x_test:
        test_vec = vectorizer.transform(x_test)
    else:
        test_vec = None
    return train_vec, test_vec

The `model_eval()` function iterates over models and datasets, training and evaluating each one. I had originally included more classification metrics, but I found that evaluating test sets so often can take up a lot of time. I'll stick with test accuracy as my main score.

When you wrap a model in the [`OneVsRestClassifier()`](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) function, it'll be re-run for each label separately. This makes training take a lot more time.

In [17]:
def models_eval(models, datasets, train_key="X_train_vec", test_key="X_test_vec"):
    for dataset in datasets:
        print(f"{dataset['name']:20} train/test {dataset['train_size']}/{dataset['test_size']} total vocab {dataset['vocab_size']}")
        print(f"{20*' '}{57*'-'}")
        results = []
        for name, model in models.items():
            if dataset["ovr"]: model = OneVsRestClassifier(model)
            timer = timeit.default_timer()
            model.fit(dataset[train_key], dataset["y_train"])
            train_elapsed = timeit.default_timer() - timer
            timer = timeit.default_timer()
            train_acc = accuracy_score(y_true = dataset["y_train"], y_pred = model.predict(X=dataset[train_key]))
            test_acc = accuracy_score(y_true = dataset["y_test"], y_pred = model.predict(X=dataset[test_key]))
            eval_elapsed = timeit.default_timer() - timer
            results.append({
                "name" : name, 
                "model" : model, 
                "train_acc" : train_acc, 
                "test_acc" : test_acc, 
                "train_elapsed" : train_elapsed, 
                "eval_elapsed" : eval_elapsed
                })
        results.sort(key=lambda x: -x["test_acc"])
        for result in results:
            print("{:>19} | TRAIN {:5.1f}s | EVAL {:5.1f}s | TRAIN/TEST acc {:4.2f}/{:4.2f} |".format(
                result["name"], 
                result["train_elapsed"], 
                result["eval_elapsed"], 
                result["train_acc"], 
                result["test_acc"]
            ))
        print(20*" "+57*"-")

## Text classification with basic vectorization

We will start off out adventure with the easy-to-use [vectorizers](http://scikit-learn.org/stable/modules/feature_extraction.html) in scikit-learn. Without much effort these will give good results, which shows how useful a well-organized library like scikit-learn is.

My choice of models comes down to whatever will run reasonably fast. I learned about LogisticRegression's `C=` parameter from [Martin Thoma's blog post](https://martin-thoma.com/nlp-reuters/). It's a parameter that's easy to miss in the [scikit-learn documentation](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html), but it sometimes gives really nice results.

As I'm starting with word-based vectorization, I won't use the `baby` dataset just yet. I'll need character-level vectorization for that.

In [43]:
list_of_models = {"Logistic" : LogisticRegression(solver="lbfgs", n_jobs = -1), 
                  "Logistic C=1000" : LogisticRegression(solver="lbfgs", n_jobs = -1, C=1000), 
                  "RandomForest 10" : RandomForestClassifier(n_jobs = -1), 
                  "RandomForest 100" : RandomForestClassifier(n_jobs = -1, n_estimators=100), 
                  "RndForest 100 MD25" : RandomForestClassifier(n_jobs = -1, n_estimators=100, max_depth=25), 
                  "DecisionTree" : DecisionTreeClassifier(), 
                  "DecisionTree MD25" : DecisionTreeClassifier(max_depth=25), 
                  "MultinomialNB":MultinomialNB()
                 }

list_of_datasets = [imdb_data, ng_data, reuters_data]

### The simplest bag of words

We'll start with the most simple. If a word is present, it gets a 1; otherwise it gets a 0.

Whenever we test for words and flag them to the models, we're using a technique called "bag of words". Even if we're identifiying short sequences of words, like the presence of "not good" or "red meat", it's still bag of words, or maybe called bag of n-grams. The common alternative is modelling the sequence of words directly, as a kind of time series.

In [19]:
for dataset in list_of_datasets:
    dataset["X_train_vec"], dataset["X_test_vec"] = vectorize(CountVectorizer(max_features=50000, binary=True), dataset["X_train"], dataset["X_test"])

models_eval(list_of_models, list_of_datasets)

imdb                 train/test 25000/25000 total vocab 49998
                    ---------------------------------------------------------
           Logistic | TRAIN   1.8s | EVAL   0.0s | TRAIN/TEST acc 1.00/0.87 |
    Logistic C=1000 | TRAIN   1.7s | EVAL   0.0s | TRAIN/TEST acc 1.00/0.86 |
   RandomForest 100 | TRAIN  12.0s | EVAL   1.1s | TRAIN/TEST acc 1.00/0.85 |
 RndForest 100 MD25 | TRAIN   2.3s | EVAL   0.9s | TRAIN/TEST acc 0.95/0.84 |
      MultinomialNB | TRAIN   0.0s | EVAL   0.0s | TRAIN/TEST acc 0.90/0.83 |
    RandomForest 10 | TRAIN   1.5s | EVAL   0.2s | TRAIN/TEST acc 0.99/0.75 |
  DecisionTree MD25 | TRAIN  10.8s | EVAL   0.0s | TRAIN/TEST acc 0.89/0.73 |
       DecisionTree | TRAIN  23.7s | EVAL   0.0s | TRAIN/TEST acc 1.00/0.71 |
                    ---------------------------------------------------------
newsgroup20          train/test 11314/7532 total vocab 282099
                    ---------------------------------------------------------
           Logisti

### tdidf with unigrams

Here is a better approach: the [Term Document Inverser Document Frequency (TD-IDF) vectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html). This vectorizer will count word occurences in a sentence (or "document") but then divide these counts with how frequent each word appears in general. For example, if *aardvark* appears once in a sentence and once in the entire corpus, it gets a count of 1; however, if *and* appears once in a sentence but 10,000 times in the corpus, it gets a TD-IDF count of 0.0001. Normalizes your data in a way that gives an edge to rarer words but a penalty to more common words.

According to [this Wikipedia article](https://en.wikipedia.org/wiki/Tf%E2%80%93idf), 83% of text classification tasks use TD-IDF.

Anyways, I'll keep setting the maximum vocabulary size to 50,000 to make of these examples roughly comparable. Increasing the maximum vocabulary usually increases model accuracy. When scikit-learn is given a max vocabulary size, it'll only keep the most frequent words.

We should get **89%** for imdb, **68%** for newsgroup20, and **80%** for reuters. These are the amounts we'll try to beat afterwards.

In [20]:
for dataset in list_of_datasets:
    dataset["X_train_vec"], dataset["X_test_vec"] = vectorize(TfidfVectorizer(max_features=50000), dataset["X_train"], dataset["X_test"])

models_eval(list_of_models, list_of_datasets)

imdb                 train/test 25000/25000 total vocab 49998
                    ---------------------------------------------------------
           Logistic | TRAIN   1.2s | EVAL   0.0s | TRAIN/TEST acc 0.94/0.89 |
    Logistic C=1000 | TRAIN   1.7s | EVAL   0.0s | TRAIN/TEST acc 1.00/0.87 |
   RandomForest 100 | TRAIN  10.3s | EVAL   1.1s | TRAIN/TEST acc 1.00/0.84 |
 RndForest 100 MD25 | TRAIN   2.5s | EVAL   0.9s | TRAIN/TEST acc 0.96/0.83 |
      MultinomialNB | TRAIN   0.0s | EVAL   0.0s | TRAIN/TEST acc 0.91/0.83 |
    RandomForest 10 | TRAIN   1.4s | EVAL   0.5s | TRAIN/TEST acc 0.99/0.74 |
  DecisionTree MD25 | TRAIN  13.1s | EVAL   0.0s | TRAIN/TEST acc 0.91/0.72 |
       DecisionTree | TRAIN  24.6s | EVAL   0.0s | TRAIN/TEST acc 1.00/0.71 |
                    ---------------------------------------------------------
newsgroup20          train/test 11314/7532 total vocab 282099
                    ---------------------------------------------------------
    Logistic C=100

### tdidf with bigrams

We can help the models by informing them of some short word sequences, say sequences of two: these are called bi-grams. For example the imdb models will get a bit of extra help by knowing the presence of "not good" instead of only "not" and "good" separately; this helps in the sentiment analysis task.

We get slight decreases with **90%** for imdb, **67%** for newsgroup20, and **79%** for reuters. Some rarer words are important to the models, and they're being pushed out by more common bi-grams. Increasing the `max_features` to 75,000 doesn't make a lot of difference.

In [21]:
for dataset in list_of_datasets:
    dataset["X_train_vec"], dataset["X_test_vec"] = vectorize(TfidfVectorizer(max_features=50000, ngram_range = [1, 2]), dataset["X_train"], dataset["X_test"])

models_eval(list_of_models, list_of_datasets)

imdb                 train/test 25000/25000 total vocab 49998
                    ---------------------------------------------------------
           Logistic | TRAIN   2.3s | EVAL   0.0s | TRAIN/TEST acc 0.95/0.90 |
    Logistic C=1000 | TRAIN   2.5s | EVAL   0.0s | TRAIN/TEST acc 1.00/0.89 |
      MultinomialNB | TRAIN   0.0s | EVAL   0.0s | TRAIN/TEST acc 0.92/0.88 |
   RandomForest 100 | TRAIN  12.1s | EVAL   1.5s | TRAIN/TEST acc 1.00/0.85 |
 RndForest 100 MD25 | TRAIN   2.9s | EVAL   1.3s | TRAIN/TEST acc 0.96/0.85 |
    RandomForest 10 | TRAIN   1.6s | EVAL   0.6s | TRAIN/TEST acc 0.99/0.76 |
  DecisionTree MD25 | TRAIN  20.0s | EVAL   0.0s | TRAIN/TEST acc 0.91/0.72 |
       DecisionTree | TRAIN  36.8s | EVAL   0.0s | TRAIN/TEST acc 1.00/0.71 |
                    ---------------------------------------------------------
newsgroup20          train/test 11314/7532 total vocab 282099
                    ---------------------------------------------------------
    Logistic C=100

### Pre-processing

Before we move on to other things, we can try pre-processing our text data further. I got the lemmatization code [here](http://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes).

In [22]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

Below you can see that this beats the best we had so far. It won't work with the imdb set. With 75,000 and 1,2 ngrams we get **67%** for newsgroup20 and **80%** for reuters. With only 1,1, we **81%** for reuters, which is out best so far. Newsgroup20 gets **66%**.

In [23]:
list_of_datasets = [ng_data, reuters_data]

In [24]:
for dataset in list_of_datasets:
    dataset["X_train_vec"], dataset["X_test_vec"] = vectorize(TfidfVectorizer(max_features=50000, ngram_range = [1, 2], 
                                                                              tokenizer=LemmaTokenizer(), stop_words="english"), 
                                                              dataset["X_train"], dataset["X_test"])

models_eval(list_of_models, list_of_datasets)

newsgroup20          train/test 11314/7532 total vocab 282099
                    ---------------------------------------------------------
    Logistic C=1000 | TRAIN  19.8s | EVAL   0.0s | TRAIN/TEST acc 0.97/0.66 |
           Logistic | TRAIN   8.5s | EVAL   0.0s | TRAIN/TEST acc 0.88/0.64 |
      MultinomialNB | TRAIN   0.0s | EVAL   0.0s | TRAIN/TEST acc 0.83/0.62 |
   RandomForest 100 | TRAIN   7.3s | EVAL   1.4s | TRAIN/TEST acc 0.97/0.60 |
 RndForest 100 MD25 | TRAIN   0.7s | EVAL   0.9s | TRAIN/TEST acc 0.76/0.56 |
    RandomForest 10 | TRAIN   1.0s | EVAL   0.4s | TRAIN/TEST acc 0.97/0.49 |
       DecisionTree | TRAIN   9.2s | EVAL   0.0s | TRAIN/TEST acc 0.97/0.42 |
  DecisionTree MD25 | TRAIN   3.3s | EVAL   0.0s | TRAIN/TEST acc 0.46/0.31 |
                    ---------------------------------------------------------
reuters              train/test 7769/3019 total vocab 70000
                    ---------------------------------------------------------
    Logistic C=1000 

### hashing trick with character ngrams

In [25]:
list_of_models = {"Logistic" : LogisticRegression(solver="lbfgs", n_jobs = -1), 
                  "Logistic C=1000" : LogisticRegression(solver="lbfgs", n_jobs = -1, C=1000), 
                  "RandomForest 10" : RandomForestClassifier(n_jobs = -1)
                 }

list_of_datasets = [imdb_data, baby_data, ng_data, reuters_data]

In [27]:
for dataset in list_of_datasets:
    dataset["X_train_vec"], dataset["X_test_vec"] = vectorize(HashingVectorizer(n_features = 50000, analyzer="char_wb", ngram_range=[2,5]), 
                                                              dataset["X_train"], dataset["X_test"])

models_eval(list_of_models, list_of_datasets)

imdb                 train/test 25000/25000 total vocab 49998
                    ---------------------------------------------------------
    Logistic C=1000 | TRAIN   7.0s | EVAL   0.1s | TRAIN/TEST acc 0.90/0.86 |
           Logistic | TRAIN   6.0s | EVAL   0.1s | TRAIN/TEST acc 0.81/0.80 |
    RandomForest 10 | TRAIN   2.6s | EVAL   0.7s | TRAIN/TEST acc 0.99/0.69 |
                    ---------------------------------------------------------
baby                 train/test 87973/20000 total vocab 52
                    ---------------------------------------------------------
           Logistic | TRAIN   1.9s | EVAL   0.0s | TRAIN/TEST acc 0.85/0.81 |
    Logistic C=1000 | TRAIN   1.9s | EVAL   0.0s | TRAIN/TEST acc 0.87/0.79 |
    RandomForest 10 | TRAIN  84.5s | EVAL   0.6s | TRAIN/TEST acc 0.90/0.78 |
                    ---------------------------------------------------------
newsgroup20          train/test 11314/7532 total vocab 282099
                    -----------------

## Training word embeddings

In [28]:
def preprocessor(x):
    return re.sub(r"[ ]+", " ", re.sub(r"[^\w]+", " ", x)).lower()

In [29]:
def w2v_prepare(dataset, by_words=True):
    if by_words:
        return [preprocessor(line).split() for line in dataset]
    else:
        return [list(line) for line in dataset]

def w2v_fit(text, size=100, alpha=0.025, window=5, min_count=5, workers=4, iter=5):
    w2v_model = Word2Vec(text, size=size, alpha=alpha, window=window, min_count=min_count, workers=workers)
    word_vectors = w2v_model.wv
    del w2v_model
    print(f"word2vec model has {len(word_vectors.vocab)} words")
    return word_vectors

In [30]:
ng_wv = w2v_fit(w2v_prepare(ng_data["X_train"]), min_count=1, iter=50, alpha=0.05)
baby_wv = w2v_fit(w2v_prepare(baby_data["X_train"], by_words=False), size=20)
reuters_wv = w2v_fit(w2v_prepare(reuters_data["X_train"]), min_count=1, iter=50, alpha=0.05)

word2vec model has 101675 words
word2vec model has 52 words
word2vec model has 26319 words


In [31]:
def w2v_transform(text, word_vectors):
    vocab = set(word_vectors.vocab)
    size = word_vectors.vector_size
    vectorized = []
    for line in text:
        line = list(filter(lambda x: x in vocab, line))
        if line:
            line = np.mean(list(map(lambda x: word_vectors[x], line)), axis=0)
            vectorized.append(line)
        else:
            vectorized.append(np.zeros(size))
    return np.array(vectorized)

In [32]:
ng_data["X_train_wv"] = w2v_transform(w2v_prepare(ng_data["X_train"]), ng_wv)
ng_data["X_test_wv"] = w2v_transform(w2v_prepare(ng_data["X_test"]), ng_wv)

baby_data["X_train_wv"] = w2v_transform(w2v_prepare(baby_data["X_train"], by_words=False), baby_wv)
baby_data["X_test_wv"] = w2v_transform(w2v_prepare(baby_data["X_test"], by_words=False), baby_wv)

reuters_data["X_train_wv"] = w2v_transform(w2v_prepare(reuters_data["X_train"]), reuters_wv)
reuters_data["X_test_wv"] = w2v_transform(w2v_prepare(reuters_data["X_test"]), reuters_wv)

In [33]:
list_of_models = {"Logistic" : LogisticRegression(solver="lbfgs", n_jobs = -1), 
                  "Logistic C=1000" : LogisticRegression(solver="lbfgs", n_jobs = -1, C=1000), 
                  "RandomForest 10" : RandomForestClassifier(n_jobs = -1), 
                  "RandomForest 100" : RandomForestClassifier(n_jobs = -1, n_estimators=100), 
                  "RandomForest 100/10" : RandomForestClassifier(n_jobs = -1, n_estimators=100, max_depth=10) 
                 }

list_of_datasets = [ng_data, baby_data, reuters_data]

In [34]:
models_eval(list_of_models, list_of_datasets, train_key="X_train_wv", test_key="X_test_wv")

newsgroup20          train/test 11314/7532 total vocab 282099
                    ---------------------------------------------------------
           Logistic | TRAIN   8.5s | EVAL   0.0s | TRAIN/TEST acc 0.53/0.47 |
    Logistic C=1000 | TRAIN   8.7s | EVAL   0.0s | TRAIN/TEST acc 0.54/0.47 |
RandomForest 100/10 | TRAIN   1.6s | EVAL   0.4s | TRAIN/TEST acc 0.90/0.40 |
   RandomForest 100 | TRAIN   2.2s | EVAL   0.6s | TRAIN/TEST acc 0.97/0.40 |
    RandomForest 10 | TRAIN   0.4s | EVAL   0.2s | TRAIN/TEST acc 0.97/0.31 |
                    ---------------------------------------------------------
baby                 train/test 87973/20000 total vocab 52
                    ---------------------------------------------------------
   RandomForest 100 | TRAIN   7.4s | EVAL   1.2s | TRAIN/TEST acc 0.87/0.65 |
RandomForest 100/10 | TRAIN   4.8s | EVAL   0.6s | TRAIN/TEST acc 0.75/0.64 |
    RandomForest 10 | TRAIN   1.0s | EVAL   0.3s | TRAIN/TEST acc 0.86/0.63 |
    Logistic C=1000 |

You can get a 3.6GB word vector file from [this blogger](http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/) or this [archived Google Code post](https://code.google.com/archive/p/word2vec/)

In [35]:
googlenews = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
print(f"word2vec model has {len(googlenews.vocab)} words")

word2vec model has 3000000 words


In [36]:
ng_data["X_train_wv"] = w2v_transform(w2v_prepare(ng_data["X_train"]), googlenews)
ng_data["X_test_wv"] = w2v_transform(w2v_prepare(ng_data["X_test"]), googlenews)

reuters_data["X_train_wv"] = w2v_transform(w2v_prepare(reuters_data["X_train"]), googlenews)
reuters_data["X_test_wv"] = w2v_transform(w2v_prepare(reuters_data["X_test"]), googlenews)

In [37]:
list_of_models = {"Logistic" : LogisticRegression(solver="lbfgs", n_jobs = -1), 
                  "Logistic C=1000" : LogisticRegression(solver="lbfgs", n_jobs = -1, C=1000), 
                  "RandomForest 10" : RandomForestClassifier(n_jobs = -1), 
                  "RandomForest 100" : RandomForestClassifier(n_jobs = -1, n_estimators=100), 
                  "RandomForest 100/10" : RandomForestClassifier(n_jobs = -1, n_estimators=100, max_depth=10) 
                 }

list_of_datasets = [ng_data, reuters_data]

In [38]:
models_eval(list_of_models, [ng_data, reuters_data], train_key="X_train_wv", test_key="X_test_wv")

newsgroup20          train/test 11314/7532 total vocab 282099
                    ---------------------------------------------------------
           Logistic | TRAIN  11.0s | EVAL   0.0s | TRAIN/TEST acc 0.65/0.60 |
    Logistic C=1000 | TRAIN  20.5s | EVAL   0.0s | TRAIN/TEST acc 0.75/0.60 |
RandomForest 100/10 | TRAIN   2.6s | EVAL   0.5s | TRAIN/TEST acc 0.95/0.50 |
   RandomForest 100 | TRAIN   3.3s | EVAL   0.7s | TRAIN/TEST acc 0.97/0.50 |
    RandomForest 10 | TRAIN   0.5s | EVAL   0.2s | TRAIN/TEST acc 0.97/0.34 |
                    ---------------------------------------------------------
reuters              train/test 7769/3019 total vocab 70000
                    ---------------------------------------------------------
    Logistic C=1000 | TRAIN  97.4s | EVAL   0.3s | TRAIN/TEST acc 0.94/0.74 |
   RandomForest 100 | TRAIN  68.7s | EVAL  19.9s | TRAIN/TEST acc 0.99/0.64 |
           Logistic | TRAIN  69.6s | EVAL   0.3s | TRAIN/TEST acc 0.61/0.64 |
RandomForest 100/10 

## Neural networks

In [39]:
train_sentences = [preprocessor(line).split() for line in ng_train_raw.data]
test_sentences = [preprocessor(line).split() for line in ng_train_raw.data]

In [40]:
from collections import Counter

def keras_data(train_set, test_set, by_words=True, max_unigrams=50000):
    train_set = w2v_prepare(train_set, by_words)
    test_set = w2v_prepare(test_set, by_words)
    
    id2word = [i for line in train_set for i in line]
    
    if max_unigrams > 0:
        id2word = Counter(id2word)
        id2word = list(id2word.items())
        id2word.sort(key=lambda x: -x[1])
        id2word = [x[0] for x in id2word[:max_unigrams-1]]
        id2word = ["<NULL>"] + list(set(id2word))
    else:
        id2word = ["<NULL>"] + list(set(id2word))

    word2id = dict()
    vocab_size = len(id2word)
    print(f"Size of vocabulary: {vocab_size}")
    for i in range(vocab_size):
        word2id[id2word[i]] = i

    train_set = [[word2id.get(token, 0) for token in line] for line in train_set]
    test_set = [[word2id.get(token, 0) for token in line] for line in test_set]
    
    return train_set, test_set

In [41]:
ng_data["X_train_ids"], ng_data["X_test_ids"] = keras_data(ng_data["X_train"], ng_data["X_test"])
baby_data["X_train_ids"], baby_data["X_test_ids"] = keras_data(baby_data["X_train"], baby_data["X_test"], by_words=False)
reuters_data["X_train_ids"], reuters_data["X_test_ids"] = keras_data(reuters_data["X_train"], reuters_data["X_test"])

Size of vocabulary: 50000
Size of vocabulary: 53
Size of vocabulary: 26320


https://github.com/keras-team/keras/blob/master/examples/imdb_fasttext.py

In [46]:
x_train = sequence.pad_sequences(imdb_data["X_train_ids"], maxlen=400)
x_test = sequence.pad_sequences(imdb_data["X_test_ids"], maxlen=400)

model = Sequential()
model.add(Embedding(50000, 4, input_length=400))
model.add(GlobalAveragePooling1D())
model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(lr=0.01)
    
model.compile(loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'])

early_stop = EarlyStopping(min_delta=0.01, patience=2)

model.fit(x_train, imdb_data["y_train"],
          batch_size=32,
          epochs=10,
          callbacks=[early_stop],
          validation_data=(x_test, imdb_data["y_test"]))

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x7f27fdf7add8>

In [47]:
x_train = sequence.pad_sequences(ng_data["X_train_ids"], maxlen=400)
x_test = sequence.pad_sequences(ng_data["X_test_ids"], maxlen=400)

model = Sequential()
model.add(Embedding(50000, 32, input_length=400))
model.add(GlobalAveragePooling1D())
model.add(Dense(len(ng_train_raw.target_names), activation='softmax'))

optimizer = Adam(lr=0.01)
    
model.compile(loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'])

early_stop = EarlyStopping(min_delta=0.01, patience=2)

model.fit(x_train, ng_data["y_train"],
          batch_size=64,
          epochs=10,
          callbacks=[early_stop],
          validation_data=(x_test, ng_data["y_test"]))

Train on 11314 samples, validate on 7532 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f27fe1759e8>

In [48]:
reuters_data["y_train"].shape

(7769, 90)

In [49]:
x_train = sequence.pad_sequences(reuters_data["X_train_ids"], maxlen=200)
x_test = sequence.pad_sequences(reuters_data["X_test_ids"], maxlen=200)

model = Sequential()
model.add(Embedding(50000, 64, input_length=200))
model.add(GlobalAveragePooling1D())
model.add(Dense(90, activation='softmax'))

optimizer = Adam(lr=0.01)
    
model.compile(loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'])

early_stop = EarlyStopping(min_delta=0.01, patience=2)

model.fit(x_train, reuters_data["y_train"],
          batch_size=64,
          epochs=10,
          callbacks=[early_stop],
          validation_data=(x_test, reuters_data["y_test"]))

Train on 7769 samples, validate on 3019 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


<keras.callbacks.History at 0x7f27fd78f898>

https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py

In [None]:
x_train = sequence.pad_sequences(baby_data["X_train_ids"], maxlen=20)
x_test = sequence.pad_sequences(baby_data["X_test_ids"], maxlen=20)

model = Sequential()
model.add(Embedding(55, 64, input_length=20))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(lr=0.01)
    
model.compile(loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'])

early_stop = EarlyStopping(min_delta=0.01, patience=2)

model.fit(x_train, baby_data["y_train"],
          batch_size=64,
          epochs=10,
          callbacks=[early_stop],
          validation_data=(x_test, baby_data["y_test"]))

Train on 87973 samples, validate on 20000 samples
Epoch 1/10
Epoch 2/10
15680/87973 [====>.........................] - ETA: 17s - loss: 0.4349 - acc: 0.7975