In [1]:
# load various models from scikit-learn's library
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# also get some metrics to try
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.datasets import fetch_20newsgroups_vectorized, fetch_20newsgroups

from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

import re

import numpy as np

from gensim.models.word2vec import Word2Vec, LineSentence

from keras.callbacks import EarlyStopping
from keras.preprocessing.text import hashing_trick
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, LSTM, BatchNormalization
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.optimizers import Adam
from keras.datasets import imdb

import nltk
from nltk.corpus import reuters

import timeit

Using TensorFlow backend.


IMDb dataset from keras.

In [7]:
imdb_data = {"name" : "imdb", "ovr" : False}
(a, b), (c, d) = imdb.load_data(num_words=20000)
imdb_data["X_train_ids"], imdb_data["y_train"], imdb_data["X_test_ids"], imdb_data["y_test"] = a, b, c, d

# For scikit-learn to like the input data, it will needs strings
imdb_data["X_train"] = [" ".join([str(x) for x in line]) for line in imdb_data["X_train_ids"]]
imdb_data["X_test"] = [" ".join([str(x) for x in line]) for line in imdb_data["X_test_ids"]]

Baby names from social security card applications. https://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-level-data

```
import os
import re

with open("babies.csv", "w") as w:
    for f in [f for f in os.listdir(os.getcwd()) if "txt" in str(f)]:
        with open(f) as f:
            year = re.search(r'[\d]{4}', f.name)[0]
            for line in f:
                w.write(year+","+line)
```

In [5]:
with open("babies.csv") as f:
    baby_list = f.readlines()

baby_list.sort(key=lambda x: x[:4])

print(len(baby_list))
print(baby_list[:10])
print(baby_list[-10:])

In [20]:
baby_set = set()
unique_baby_list = []

for baby in baby_list:
    if " ".join(baby.split(",")[1:3]) in baby_set:
        pass
    else:
        baby_set.add(" ".join(baby.split(",")[1:3]))
        unique_baby_list.append(baby)

print(len(unique_baby_list))
print(unique_baby_list[:10])
print(unique_baby_list[-10:])

107973
['2017,Zavdiel,M,5\n', '2017,Zavonte,M,5\n', '2017,Zayer,M,5\n', '2017,Zechari,M,5\n', '2017,Zennith,M,5\n', '2017,Zeo,M,5\n', '2017,Zhiyuan,M,5\n', '2017,Zkari,M,5\n', '2017,Zohaan,M,5\n', '2017,Zykai,M,5\n']


In [23]:
baby_train = unique_baby_list[:-20000]
baby_test = unique_baby_list[-20000:]
print(len(baby_train))
print(len(baby_test))

87973
20000


Newsgroup20 dataset from scikit-learn.

In [3]:
ng_train_raw = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
ng_test_raw = fetch_20newsgroups(subset="test", remove=("headers", "footers", "quotes"))

In [4]:
print(ng_train_raw.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [5]:
print(ng_train_raw.data[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [6]:
ng_data = {"name" : "newsgroup20", "ovr" : False}
ng_data.update({"X_train" : ng_train_raw.data, "y_train" : ng_train_raw.target})
ng_data.update({"X_test" : ng_test_raw.data, "y_test" : ng_test_raw.target})

Guide to using Reuters dataset here: https://martin-thoma.com/nlp-reuters/

To get a copy of the Reuters data, you have to use `nltk.download("reuters")`.

In [2]:
def load_reuters():
    reuters_data = {"name" : "reuters", "ovr" : True}
    
    # The test and train sets are listed as IDs in the .fileids() member
    train_ids = list(filter(lambda x: x[:5] == "train", reuters.fileids()))
    test_ids = list(filter(lambda x: x[:4] == "test", reuters.fileids()))
    reuters_data["X_train"] = list(map(lambda x: reuters.raw(x), train_ids))
    reuters_data["X_test"] = list(map(lambda x: reuters.raw(x), test_ids))
    
    # The MultiLabelBinarizer will get you the 1s and 0s your model wants
    mlb = MultiLabelBinarizer(sparse_output=True)
    reuters_data["y_train"] = mlb.fit_transform(list(map(lambda x: reuters.categories(x), train_ids)))
    reuters_data["y_test"] = mlb.transform(list(map(lambda x: reuters.categories(x), test_ids)))
    
    return reuters_data
    
reuters_data = load_reuters()

In [8]:
def vectorize(vectorizer, x_train, x_test=None):
    train_vec = vectorizer.fit_transform(x_train)
    if x_test:
        test_vec = vectorizer.transform(x_test)
    else:
        test_vec = None

    return train_vec, test_vec
    

In [41]:
list_of_models = {"Logistic" : LogisticRegression(solver="lbfgs", n_jobs = -1), 
                  "Logistic C=1000" : LogisticRegression(solver="lbfgs", n_jobs = -1, C=1000), 
                  "RandomForest 10" : RandomForestClassifier(n_jobs = -1), 
                  "RandomForest 200" : RandomForestClassifier(n_jobs = -1, n_estimators=200), 
                  "MultinomialNB":MultinomialNB()
                 }

In [28]:
def models_eval(models, datasets, train_key="X_train_vec", test_key="X_test_vec"):
    for dataset in datasets:
        print(f"{dataset['name']:20}{63*'-'}")
        results = []
        for name, model in models.items():
            if dataset["ovr"]: model = OneVsRestClassifier(model)
            timer = timeit.default_timer()
            model.fit(dataset[train_key], dataset["y_train"])
            elapsed = timeit.default_timer() - timer
            results.append({
                "name" : name, 
                "model" : model, 
                "train_acc" : accuracy_score(y_true = dataset["y_train"], y_pred = model.predict(X=dataset[train_key])), 
                "test_acc" : accuracy_score(y_true = dataset["y_test"], y_pred = model.predict(X=dataset[test_key])), 
                "precision" : precision_score(y_true = dataset["y_test"], y_pred = model.predict(X=dataset[test_key]), average="micro"), 
                "recall" : recall_score(y_true = dataset["y_test"], y_pred = model.predict(X=dataset[test_key]), average="micro"), 
                "f1_score" : f1_score(y_true = dataset["y_test"], y_pred = model.predict(X=dataset[test_key]), average="micro"), 
                "elapsed" : elapsed
                })
        results.sort(key=lambda x: -x["f1_score"])
        for result in results:
            print("{:>19} | {:5.2f}s | TRAIN/TEST acc {:4.2f}/{:4.2f} | pr/re/f1 {:4.2f}/{:4.2f}/{:4.2f} |".format(
                result["name"], 
                result["elapsed"], 
                result["train_acc"], 
                result["test_acc"], 
                result["precision"], 
                result["recall"], 
                result["f1_score"]
            ))
        print(20*" "+63*"-")

### tdidf with unigrams

In [42]:
for dataset in [reuters_data, ng_data, imdb_data]:
    dataset["X_train_vec"], dataset["X_test_vec"] = vectorize(TfidfVectorizer(), dataset["X_train"], dataset["X_test"])

models_eval(list_of_models, [imdb_data, ng_data, reuters_data])

imdb                ---------------------------------------------------------------
           Logistic |  0.96s | TRAIN/TEST acc 0.93/0.89 | pr/re/f1 0.89/0.89/0.89 |
    Logistic C=1000 |  1.47s | TRAIN/TEST acc 1.00/0.86 | pr/re/f1 0.86/0.86/0.86 |
   RandomForest 200 | 18.03s | TRAIN/TEST acc 1.00/0.85 | pr/re/f1 0.85/0.85/0.85 |
      MultinomialNB |  0.06s | TRAIN/TEST acc 0.89/0.83 | pr/re/f1 0.83/0.83/0.83 |
    RandomForest 10 |  1.25s | TRAIN/TEST acc 0.99/0.76 | pr/re/f1 0.76/0.76/0.76 |
                    ---------------------------------------------------------------
newsgroup20         ---------------------------------------------------------------
    Logistic C=1000 | 28.20s | TRAIN/TEST acc 0.97/0.68 | pr/re/f1 0.68/0.68/0.68 |
           Logistic | 11.48s | TRAIN/TEST acc 0.90/0.68 | pr/re/f1 0.68/0.68/0.68 |
   RandomForest 200 | 21.59s | TRAIN/TEST acc 0.97/0.61 | pr/re/f1 0.61/0.61/0.61 |
      MultinomialNB |  0.12s | TRAIN/TEST acc 0.81/0.61 | pr/re/f1 0.61/0.61

### tdidf with bigrams

In [None]:
for dataset in [reuters_data, ng_data, imdb_data]:
    dataset["X_train_vec"], dataset["X_test_vec"] = vectorize(TfidfVectorizer(ngram_range = [2, 2]), dataset["X_train"], dataset["X_test"])

models_eval(list_of_models, [imdb_data, ng_data, reuters_data])

### hashing trick with bigrams

In [26]:
list_of_models = {"Logistic" : LogisticRegression(solver="lbfgs", n_jobs = -1), 
                  "Logistic C=1000" : LogisticRegression(solver="lbfgs", n_jobs = -1, C=1000), 
                  "RandomForest 10" : RandomForestClassifier(n_jobs = -1), 
                  "RandomForest 200" : RandomForestClassifier(n_jobs = -1, n_estimators=200)
                 }

In [27]:
for dataset in [reuters_data, ng_data, imdb_data]:
    dataset["X_train_vec"], dataset["X_test_vec"] = vectorize(HashingVectorizer(n_features = 5000), dataset["X_train"], dataset["X_test"])

models_eval(list_of_models, [imdb_data, ng_data, reuters_data])

reuters             ------------------------------------------------------
    Logistic C=1000 | 40.30s | TRAIN/TEST acc 0.99/0.79 | pr/re/f1 0.92/0.79/0.85 |
   RandomForest 200 | 54.19s | TRAIN/TEST acc 1.00/0.66 | pr/re/f1 0.97/0.60/0.74 |
           Logistic | 30.20s | TRAIN/TEST acc 0.67/0.65 | pr/re/f1 0.97/0.59/0.73 |
    RandomForest 10 | 11.63s | TRAIN/TEST acc 0.94/0.61 | pr/re/f1 0.96/0.56/0.71 |
                    ------------------------------------------------------
newsgroup20         ------------------------------------------------------
    Logistic C=1000 |  6.72s | TRAIN/TEST acc 0.97/0.58 | pr/re/f1 0.58/0.58/0.58 |
   RandomForest 200 | 10.37s | TRAIN/TEST acc 0.97/0.55 | pr/re/f1 0.55/0.55/0.55 |
           Logistic |  4.08s | TRAIN/TEST acc 0.72/0.54 | pr/re/f1 0.54/0.54/0.54 |
    RandomForest 10 |  0.83s | TRAIN/TEST acc 0.97/0.42 | pr/re/f1 0.42/0.42/0.42 |
                    ------------------------------------------------------
imdb                --------

In [None]:
for dataset in [reuters_data, ng_data, imdb_data]:
    dataset["X_train_vec"], dataset["X_test_vec"] = vectorize(HashingVectorizer(n_features = 50000, analyzer="char_wb", ngram_range=[2,5]), 
                                                              dataset["X_train"], dataset["X_test"])

models_eval(list_of_models, [imdb_data, ng_data, reuters_data])

## Trying a neural network

In [None]:
# Do the same with the output data, except build one-hot vectors instead
y_set = set(ng_train_raw.target)
y_train_onehot = []
for i in ng_train_raw.target:
    y_train_onehot.append([0] * len(y_set))
    y_train_onehot[-1][i] = 1
y_train_onehot = np.array(y_train_onehot)
print(y_train_onehot[0:2])
print(y_train_onehot.shape)

y_test_onehot = []
for i in ng_test_raw.target:
    y_test_onehot.append([0] * len(y_set))
    y_test_onehot[-1][i] = 1
y_test_onehot = np.array(y_test_onehot)
print(y_test_onehot[0:2])
print(y_test_onehot.shape)

In [None]:
ng_train, ng_test = vectorize(TfidfVectorizer(max_features=50000))

print('Build model...')
model = Sequential()


model.add(Dense(64, activation='relu', input_shape=(50000,)))

model.add(Dense(len(ng_train_raw.target_names), activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

early_stop = EarlyStopping(min_delta=0.01)

model.fit(ng_train["X"], y_train_onehot,
          batch_size=32,
          epochs=10,
          callbacks=[early_stop],
          validation_data=(ng_test["X"], y_test_onehot))

## Training word embeddings

In [29]:
def preprocessor(x):
    return re.sub(r"[ ]+", " ", re.sub(r"[^\w]+", " ", x)).lower()

In [None]:
sentences = [preprocessor(line).split() for line in ng_train_raw.data]

print(len(ng_train_raw.data))
print(len(sentences))

In [30]:
def w2v_prepare(dataset):
    return [preprocessor(line).split() for line in dataset]

def w2v_fit(text, size=100, alpha=0.025, window=5, min_count=5, workers=4, iter=5):
    w2v_model = Word2Vec(text, size=size, alpha=alpha, window=window, min_count=min_count, workers=workers)
    word_vectors = w2v_model.wv
    del w2v_model
    print(f"word2vec model has {len(word_vectors.vocab)} words")
    return word_vectors

reuters_wv = w2v_fit(w2v_prepare(reuters_data["X_train"]), min_count=1, iter=50, alpha=0.05)
ng_wv = w2v_fit(w2v_prepare(ng_data["X_train"]), min_count=1, iter=50, alpha=0.05)
imdb_wv = w2v_fit(w2v_prepare(imdb_data["X_train"]), min_count=1, iter=50, alpha=0.05)

word2vec model has 26319 words
word2vec model has 101675 words
word2vec model has 19998 words


In [32]:
def w2v_transform(text, word_vectors):
    vocab = set(word_vectors.vocab)
    size = word_vectors.vector_size
    vectorized = []
    for line in text:
        line = list(filter(lambda x: x in vocab, line))
        if line:
            line = np.mean(list(map(lambda x: word_vectors[x], line)), axis=0)
            vectorized.append(line)
        else:
            vectorized.append(np.zeros(size))
    return np.array(vectorized)

imdb_data["X_train_wv"] = w2v_transform(w2v_prepare(imdb_data["X_train"]), imdb_wv)
imdb_data["X_test_wv"] = w2v_transform(w2v_prepare(imdb_data["X_test"]), imdb_wv)

ng_data["X_train_wv"] = w2v_transform(w2v_prepare(ng_data["X_train"]), ng_wv)
ng_data["X_test_wv"] = w2v_transform(w2v_prepare(ng_data["X_test"]), ng_wv)

reuters_data["X_train_wv"] = w2v_transform(w2v_prepare(reuters_data["X_train"]), reuters_wv)
reuters_data["X_test_wv"] = w2v_transform(w2v_prepare(reuters_data["X_test"]), reuters_wv)

In [33]:
list_of_models = {"Logistic" : LogisticRegression(solver="lbfgs", n_jobs = -1), 
                  "Logistic C=1000" : LogisticRegression(solver="lbfgs", n_jobs = -1, C=1000), 
                  "RandomForest 10" : RandomForestClassifier(n_jobs = -1), 
                  "RandomForest 200" : RandomForestClassifier(n_jobs = -1, n_estimators=200)
                 }

In [34]:
models_eval(list_of_models, [imdb_data, ng_data, reuters_data], train_key="X_train_wv", test_key="X_test_wv")

imdb                ---------------------------------------------------------------
    Logistic C=1000 |  1.03s | TRAIN/TEST acc 0.86/0.85 | pr/re/f1 0.85/0.85/0.85 |
           Logistic |  1.40s | TRAIN/TEST acc 0.86/0.85 | pr/re/f1 0.85/0.85/0.85 |
   RandomForest 200 |  7.24s | TRAIN/TEST acc 1.00/0.81 | pr/re/f1 0.81/0.81/0.81 |
    RandomForest 10 |  0.61s | TRAIN/TEST acc 0.99/0.75 | pr/re/f1 0.75/0.75/0.75 |
                    ---------------------------------------------------------------
newsgroup20         ---------------------------------------------------------------
           Logistic |  8.43s | TRAIN/TEST acc 0.54/0.47 | pr/re/f1 0.47/0.47/0.47 |
    Logistic C=1000 |  8.95s | TRAIN/TEST acc 0.54/0.46 | pr/re/f1 0.46/0.46/0.46 |
   RandomForest 200 |  3.87s | TRAIN/TEST acc 0.97/0.41 | pr/re/f1 0.41/0.41/0.41 |
    RandomForest 10 |  0.41s | TRAIN/TEST acc 0.97/0.30 | pr/re/f1 0.30/0.30/0.30 |
                    --------------------------------------------------------

In [35]:
fil9_wv = w2v_fit(LineSentence("fil9"))

word2vec model has 218316 words


In [38]:
ng_data["X_train_wv"] = w2v_transform(w2v_prepare(ng_data["X_train"]), fil9_wv)
ng_data["X_test_wv"] = w2v_transform(w2v_prepare(ng_data["X_test"]), fil9_wv)

reuters_data["X_train_wv"] = w2v_transform(w2v_prepare(reuters_data["X_train"]), fil9_wv)
reuters_data["X_test_wv"] = w2v_transform(w2v_prepare(reuters_data["X_test"]), fil9_wv)

In [39]:
models_eval(list_of_models, [ng_data, reuters_data], train_key="X_train_wv", test_key="X_test_wv")

newsgroup20         ---------------------------------------------------------------
           Logistic |  8.90s | TRAIN/TEST acc 0.59/0.53 | pr/re/f1 0.53/0.53/0.53 |
    Logistic C=1000 |  8.78s | TRAIN/TEST acc 0.59/0.53 | pr/re/f1 0.53/0.53/0.53 |
   RandomForest 200 |  3.84s | TRAIN/TEST acc 0.97/0.44 | pr/re/f1 0.44/0.44/0.44 |
    RandomForest 10 |  0.31s | TRAIN/TEST acc 0.97/0.30 | pr/re/f1 0.30/0.30/0.30 |
                    ---------------------------------------------------------------
reuters             ---------------------------------------------------------------
           Logistic | 54.86s | TRAIN/TEST acc 0.67/0.66 | pr/re/f1 0.85/0.66/0.74 |
    Logistic C=1000 | 59.35s | TRAIN/TEST acc 0.74/0.63 | pr/re/f1 0.69/0.71/0.70 |
   RandomForest 200 | 84.15s | TRAIN/TEST acc 0.99/0.57 | pr/re/f1 0.96/0.49/0.65 |
    RandomForest 10 | 16.64s | TRAIN/TEST acc 0.90/0.54 | pr/re/f1 0.92/0.47/0.63 |
                    --------------------------------------------------------

In [None]:
print('Build model...')
model = Sequential()


model.add(Dense(256, activation='relu', input_shape=(200,)))

model.add(Dense(len(ng_train_raw.target_names), activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

model.fit(w2v_train, y_train_onehot,
          batch_size=32,
          epochs=10,
validation_data=(w2v_test, y_test_onehot))

In [None]:
list_of_models = {"LogisticRegression" : LogisticRegression(solver="lbfgs", n_jobs = -1), 
                  "RandomForest" : RandomForestClassifier(n_jobs = -1), 
                  "RandomForest 100" : RandomForestClassifier(n_jobs = -1, n_estimators=100),
                  "GradientBoost" : GradientBoostingClassifier()
                 }

In [None]:
for name, model in list_of_models.items():
    print()
    print(name)
    model.fit(w2v_train, ng_train_raw.target)
    train_score = accuracy_score(y_true = ng_train_raw.target, y_pred = model.predict(X=w2v_train))
    print("Train score {0}:".format(train_score))
    test_score = accuracy_score(y_true = ng_test_raw.target, y_pred = model.predict(X=w2v_test))
    print("Test score {0}:".format(test_score))
    print()

## Recurrent networks

In [None]:
train_sentences = [preprocessor(line).split() for line in ng_train_raw.data]
test_sentences = [preprocessor(line).split() for line in ng_train_raw.data]

In [43]:
def keras_data(train_set, test_set):
    train_set = w2v_prepare(train_set)
    test_set = w2v_prepare(test_set)
    
    # Keep 0 for unknown tokens
    id2word = ["<NULL>"] + list(set([i for j in train_set for i in j]))

    word2id = dict()
    for i in range(len(id2word)):
        word2id[id2word[i]] = i + 1

    train_set = [[word2id.get(token, 0) for token in line] for line in train_set]
    test_set = [[word2id.get(token, 0) for token in line] for line in test_set]
    
    return train_set, test_set

ng_data["X_train_ids"], ng_data["X_test_ids"] = keras_data(ng_data["X_train"], ng_data["X_test"])
reuters_data["X_train_ids"], reuters_data["X_test_ids"] = keras_data(reuters_data["X_train"], reuters_data["X_test"])

In [53]:
def onehot_y(train_set, test_set):
    y_set = set(train_set)
    y_train_onehot = []
    for i in train_set:
        y_train_onehot.append([0] * len(y_set))
        y_train_onehot[-1][i] = 1
    y_train_onehot = np.array(y_train_onehot)

    y_test_onehot = []
    for i in test_set:
        y_test_onehot.append([0] * len(y_set))
        y_test_onehot[-1][i] = 1
    y_test_onehot = np.array(y_test_onehot)
    
    return y_train_onehot, y_test_onehot

imdb_data["y_train_onehot"], imdb_data["y_test_onehot"] = imdb_data["y_train"], imdb_data["y_test"]
ng_data["y_train_onehot"], ng_data["y_test_onehot"] = onehot_y(ng_data["y_train"], ng_data["y_test"])
#reuters_data["y_train_onehot"], reuters_data["y_test_onehot"] = onehot_y(reuters_data["y_train"], reuters_data["y_test"])

In [None]:
train_sentences = [[word2id.get(token, 0) for token in preprocessor(line).split()] for line in ng_train_raw.data]
test_sentences = [[word2id.get(token, 0) for token in preprocessor(line).split()] for line in ng_test_raw.data]

In [None]:
print(len(train_sentences))
print(len(test_sentences))

In [55]:
imdb_data["y_train_onehot"]

array([1, 0, 0, ..., 0, 1, 0])

https://github.com/keras-team/keras/blob/master/examples/imdb_fasttext.py

In [62]:
for dataset, classes, loss in zip([imdb_data, ng_data], [1, 20], ["binary_crossentropy", "sparse_categorical_crossentropy"]):
    # Set parameters:
    max_features = 100000
    batch_size = 32
    embedding_dims = 50
    epochs = 20
    maxlen = 400

    x_train = sequence.pad_sequences(dataset["X_train_ids"], maxlen=maxlen)
    x_test = sequence.pad_sequences(dataset["X_test_ids"], maxlen=maxlen)

    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(classes, activation='softmax'))
    optimizer = Adam(lr=0.01)
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

    early_stop = EarlyStopping(min_delta=0.01)

    model.fit(x_train, dataset["y_train"],
              batch_size=batch_size,
              epochs=epochs,
              callbacks=[early_stop],
              validation_data=(x_test, dataset["y_test"]))

Build model...
Train on 25000 samples, validate on 25000 samples
Epoch 1/20
Epoch 2/20
Build model...
Train on 11314 samples, validate on 7532 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py

In [None]:
# Set parameters:
max_features = 100000
maxlen = 100
batch_size = 32
embedding_dims = 100
epochs = 20
maxlen = 100

x_train = sequence.pad_sequences(train_sentences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sentences, maxlen=maxlen)

print('Build model...')
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))

model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

model.add(Dense(len(ng_train_raw.target_names), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

model.fit(x_train, y_train_onehot,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test_onehot))