In [2]:
%load_ext autoreload
%autoreload 2

In [7]:
import os
import glob
from collections import Counter
import math
import re
import json
import subprocess

import pandas as pd
import umap
from tqdm.autonotebook import tqdm
from nltk.tokenize import word_tokenize

import numpy as np
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.utils import class_weight

from tensorflow import keras
layers = keras.layers
models = keras.models
from tensorflow.keras.optimizers import Adam


# Originally noted as tested with TensorFlow v1.8; the recorded output of this cell reports 2.0.0.
print("You have TensorFlow version", tf.__version__)

import language
import news

You have TensorFlow version 2.0.0


In [3]:
def load_data(file_name):
    """Read a labelled text file into a DataFrame.

    Every non-blank line is split at its first space: the leading token is the
    label (any ``__label__`` marker is removed from it) and the remainder of the
    line is the text. A line that holds only a label yields an empty text.

    Args:
        file_name: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        pandas.DataFrame with the string columns ``category`` and ``text``,
        one row per non-blank input line, in file order.
    """
    labels = []
    texts = []
    # Explicit encoding so non-ASCII text is decoded the same way on every
    # platform instead of depending on the locale default.
    with open(file_name, encoding="utf-8") as f:
        for raw_line in f:
            # Drop the line terminator; otherwise it ends up inside the text
            # (or inside the label when the line has no text at all).
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                # Blank lines carry no example; skipping them avoids a bogus
                # whitespace-only category.
                continue
            first_token, _, text = line.partition(" ")
            labels.append(first_token.replace("__label__", ""))
            texts.append(text)

    return pd.DataFrame({"category": labels, "text": texts})

In [27]:
# Load the two labelled splits; each call returns a DataFrame with "category" and "text" columns.
# NOTE(review): per the recorded shapes the "train" file (1390 rows) is far smaller than the
# "val" file (26328 rows) — confirm the two paths are not swapped.
train_data = load_data("data/category_train.txt")
test_data = load_data("data/category_val.txt")

In [28]:
# Per-category row counts of each split (sorted by frequency), to inspect class balance.
print(train_data['category'].value_counts())
print(test_data['category'].value_counts())

society          628
junk             232
economy          201
other            138
sports            78
science           44
technology        38
entertainment     31
Name: category, dtype: int64
society          13408
sports            3296
other             2436
economy           2274
entertainment     1983
junk              1279
science           1034
technology         618
Name: category, dtype: int64


In [29]:
# Vocabulary cap handed to the tokenizer as num_words; the model cell also uses it as the input width.
max_words = 10000
# Word-level (not character-level) tokenizer.
tokenize = keras.preprocessing.text.Tokenizer(num_words=max_words, 
                                              char_level=False)

In [30]:
# The vocabulary is learned from the training texts only; both splits are then
# converted to matrices with that same vocabulary (one row per document).
tokenize.fit_on_texts(train_data.text) # fit tokenizer to our training text data
x_train = tokenize.texts_to_matrix(train_data.text)
x_test = tokenize.texts_to_matrix(test_data.text)

In [31]:
# Use sklearn utility to convert label strings to numbered index
# Use sklearn utility to convert label strings to numbered index.
# The encoder is fitted on the training labels only, so every test label must
# also occur in the training data for transform() to succeed.
encoder = LabelEncoder()
encoder.fit(train_data.category)
y_train_num = encoder.transform(train_data.category)
y_test_num = encoder.transform(test_data.category)

In [32]:
# Converts the labels to a one-hot representation
# Converts the labels to a one-hot representation.
# The class count is derived from the largest encoded training label (+1 because indices start at 0).
num_classes = np.max(y_train_num) + 1
y_train = keras.utils.to_categorical(y_train_num, num_classes)
y_test = keras.utils.to_categorical(y_test_num, num_classes)

In [33]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
# Inspect the dimensions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (1390, 10000)
x_test shape: (26328, 10000)
y_train shape: (1390, 8)
y_test shape: (26328, 8)


In [34]:
# This model trains very quickly and 2 epochs are already more than enough
# Training for more epochs will likely lead to overfitting on this dataset
# You can try tweaking these hyperparamaters when using this model with your own data
batch_size = 32
epochs = 20
drop_ratio = 0.3

In [35]:
# Build the model
# Build the model: input dropout, two Dense(512) -> BatchNorm -> ReLU -> Dropout
# blocks, and a softmax output with one unit per class.
model = models.Sequential()
# The input shape must be declared on the FIRST layer of a Sequential model.
# It was previously passed to the Dense layer below, where Sequential does not
# use it, leaving the model unbuilt until the first fit/predict call.
model.add(layers.Dropout(0.7, input_shape=(max_words,)))
model.add(layers.Dense(512, use_bias=True))
model.add(layers.BatchNormalization())
model.add(layers.Activation('relu'))
model.add(layers.Dropout(drop_ratio))
model.add(layers.Dense(512, use_bias=True))
model.add(layers.BatchNormalization())
model.add(layers.Activation('relu'))
model.add(layers.Dropout(drop_ratio))
model.add(layers.Dense(num_classes))
model.add(layers.Activation('softmax'))

# categorical_crossentropy matches the one-hot encoded targets.
# `learning_rate` is the supported keyword in TF 2.x; `lr` is a deprecated alias.
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.0001),
              metrics=['accuracy'])

In [36]:
# model.fit trains the model.
# validation_split tells Keras what fraction of the training data to hold out for
# validation (taken from the end of the arrays, before shuffling).
# No early-stopping callback is configured: training always runs for `epochs` epochs.

# "balanced" weights are inversely proportional to class frequency. Keyword
# arguments are used because newer scikit-learn releases reject positional ones.
class_labels = np.unique(y_train_num)
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=class_labels,
                                                  y=y_train_num)

# Keras expects class_weight as a {class index: weight} dict. A bare array is not
# the documented form and, depending on the TF version, may be silently ignored
# or rejected, so build the mapping explicitly.
class_weight_by_index = {int(label): float(weight)
                         for label, weight in zip(class_labels, class_weights)}

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    class_weight=class_weight_by_index)

Train on 1251 samples, validate on 139 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [37]:
# Evaluate the trained model on the test split; with metrics=['accuracy'] the
# result is [loss, accuracy].
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Predict the whole test set and map each arg-max class index back to its label string.
text_labels = encoder.classes_ 

predictions = model.predict(x_test)
predicted_labels = text_labels[np.argmax(predictions, axis=1)]
# Write the predicted labels to disk, one per line, then show their distribution.
predicted_labels.tofile("data/tf_test_labels", sep="\n", format="%s")
print(Counter(predicted_labels))

Test loss: 1.5218549808880653
Test accuracy: 0.4790717
Counter({'society': 13055, 'junk': 10678, 'economy': 1234, 'sports': 889, 'other': 261, 'technology': 162, 'science': 26, 'entertainment': 23})


In [15]:


# Show the first few test articles next to their actual and predicted category.
# All previewed rows are scored with a single batched predict() call instead of
# one call per article.
n_preview = 10
preview_predictions = model.predict(x_test[:n_preview])
preview_labels = text_labels[np.argmax(preview_predictions, axis=1)]

for i, predicted_label in enumerate(preview_labels):
    # Only the first 250 characters of each article are shown.
    print(test_data.text.iloc[i][:250], "...")
    print('Actual label:' + test_data.category.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

тренд зимы правильно красиво носить берет пережить зиму легко красиво теплые стильные расскажем правильно красиво носить французский берет комплекте самом деле берет самых универсальных головных уборов покажем подготовили полноценный гид шнуровке кож ...
Actual label:junk
Predicted label: junk

канаде чёрную корову продали рекордные тысяч животное которому дали имя боролись фермеры канады мексики сша корову породы чёрный продали аукционе городе тысяч канадских долларов тысяч телеканал заплатить солидную сумму согласились генетикой даррен со ...
Actual label:economy
Predicted label: economy

торговые павильоны ликвидируют возле речного вокзала хабаровске хабаровске ликвидируют точки шаурмой офисы туристических фирм располагались возле здания речного вокзала срок аренды объектов истек данным мэрии владельцы точек общепита протестовали про ...
Actual label:other
Predicted label: other

суд обязал журналиста азара выплатить тыс хамовнический районный суд москвы обязал специального корреспо

In [11]:
# Demo of the project-local news.stem helper on an English sentence: print each
# token next to its stem. Tokens come from splitting on single spaces only, so
# punctuation stays attached to the neighbouring word (e.g. "label,").
s = "Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label).".split(" ")

for w, ws in zip(s, news.stem(s, "english")):
    print(f"{w} -> {ws}")

Calculate -> calcul
metrics -> metric
for -> for
each -> each
label, -> label,
and -> and
find -> find
their -> their
average, -> average,
weighted -> weight
by -> by
support -> support
(the -> (the
number -> number
of -> of
true -> true
instances -> instanc
for -> for
each -> each
label). -> label).
