In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
from collections import Counter
import math
import re
import json

import pandas as pd
import umap
from tqdm.autonotebook import tqdm
from nltk.tokenize import word_tokenize

import numpy as np
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split, KFold

from tensorflow import keras

import tensorflow as tf
# List the physical GPUs TensorFlow can see. An empty list is tolerated here
# because the assertion below is commented out.
gpus = tf.config.experimental.list_physical_devices('GPU')
#assert gpus
try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    # Report how many physical and logical GPU devices are available.
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)


from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.utils import class_weight
layers = keras.layers
models = keras.models
from tensorflow.keras.optimizers import Adam

import convert_model

import language
import text_nn
import grab_category
import news

In [3]:
# Part 1 (bootstrap sample): read the "langs/ru" dump, train the text
# normalizer on it, normalize its texts and load its ground-truth labels.
folder_bootstrap = "data/sample"
file_info = language.read_dump(os.path.join(folder_bootstrap, "langs", "ru"))
normalizer = news.TextNormalizer("russian")
normalizer.train(file_info)
train_texts = normalizer.normalize_texts(file_info)
ground_truth = grab_category.load_gt(folder_bootstrap)

# Part 2: two further samples, concatenated into one record list and one
# ground-truth dict (on a duplicate key the later folder overwrites the earlier).
folders_part2 = ["data/sample2", "data/sample3"]
file_info2 = []
ground_truth2 = {}
for folder in folders_part2:
    file_info2.extend(language.read_dump(os.path.join(folder, "langs", "ru")))
    ground_truth2.update(grab_category.load_gt(folder).items())

# The normalizer is NOT re-trained here; parts 2 and 3 reuse the state learned
# from part 1.
train_texts2 = normalizer.normalize_texts(file_info2)

# Part 3: loaded the same way; later cells only predict on it and score the
# predictions against ground_truth3.
folder_part3 = "data/sample4"
file_info3 = language.read_dump(os.path.join(folder_part3, "langs", "ru"))
train_texts3 = normalizer.normalize_texts(file_info3)
ground_truth3 = grab_category.load_gt(folder_part3)

43344 word stems
4043 GT labels loaded from data/sample
2837 GT labels loaded from data/sample2
1935 GT labels loaded from data/sample3
1504 GT labels loaded from data/sample4


In [4]:
# Initial labels for part 1 from keyword matching (Russian keyword table).
kw_categories = text_nn.keyword_categories(file_info, text_nn.category_words_rus)
# URL-based junk detection, applied on top of the keyword labels.
# NOTE(review): the return value of gt_to_linear is discarded, so it presumably
# overwrites kw_categories in place — confirm in grab_category.
url_junk = grab_category.junk_by_url(file_info)
grab_category.gt_to_linear(kw_categories, url_junk, file_info)
# Work on a copy so that later changes to `categories` leave kw_categories
# available for comparison.
categories = kw_categories.copy() 
# An empty string marks a document that received no category.
print(f"{sum(cat != '' for cat in categories)} / {len(categories)} ({sum(cat != '' for cat in categories) / len(categories) * 100}%) have categories")
print(Counter(categories))

HBox(children=(IntProgress(value=0, description='assigning keyword-based labels', max=88150, style=ProgressSty…


87149 / 88150 (98.8644356211004%) have categories
Counter({'society': 43943, 'sports': 10663, 'economy': 7917, 'other': 7373, 'entertainment': 6405, 'junk': 5603, 'science': 3402, 'technology': 1843, '': 1001})


In [5]:
# Apply the part-1 ground-truth labels on top of the keyword/URL labels.
# NOTE(review): the result is not assigned, so gt_to_linear presumably mutates
# `categories` in place — confirm in grab_category.
grab_category.gt_to_linear(categories, ground_truth, file_info)

In [6]:
def print_test_stats(predicted_labels, ground_truth, file_info, name):
    """Print the predicted label distribution and the error count against GT.

    Args:
        predicted_labels: Sequence of predicted category labels, aligned by
            position with ``file_info``.
        ground_truth: Mapping from a record's ``file`` value to its expected
            category label.
        file_info: Sequence of records exposing a ``file`` attribute.
        name: Classifier name used as a prefix in the printed report.

    Only records whose ``file`` is present in ``ground_truth`` are scored. The
    percentage is relative to ``len(ground_truth)``; with an empty ground truth
    it is reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    errors = sum(
        1
        for i, fi in enumerate(file_info)
        if fi.file in ground_truth and predicted_labels[i] != ground_truth[fi.file]
    )
    gt_size = len(ground_truth)
    # Guard the division: an empty ground truth has no measurable error rate.
    error_rate = errors / gt_size * 100 if gt_size else 0.0

    print(f"{name} predicted categories:", Counter(predicted_labels))
    print(f"{errors} / {gt_size} ({error_rate}%) {name} errors in GT")

In [7]:
# Seed NumPy's global RNG before the label-reassignment step.
# NOTE(review): presumably this makes reassign_labels_one_dataset_ft
# reproducible — confirm that text_nn draws from np.random.
np.random.seed(3)
reassigned_labels = text_nn.reassign_labels_one_dataset_ft(train_texts, categories)

# Score both the reassigned labels and the raw keyword labels against the
# part-1 ground truth BEFORE the ground truth is applied to reassigned_labels
# on the following line (the order of these statements matters).
print_test_stats(reassigned_labels, ground_truth, file_info, "FastText")
print_test_stats(kw_categories, ground_truth, file_info, "keywords")
grab_category.gt_to_linear(reassigned_labels, ground_truth, file_info)

# Part 2: predict labels from part-1 texts/labels, score them against the
# part-2 ground truth, then apply that ground truth to the pseudo-labels.
part2_pseudolabels = text_nn.fasttext_predict_category(train_texts, reassigned_labels, train_texts2)    
print_test_stats(part2_pseudolabels, ground_truth2, file_info2, "FastText")
grab_category.gt_to_linear(part2_pseudolabels, ground_truth2, file_info2)

# Part 3: predict from parts 1+2 combined; these labels are only scored, the
# part-3 ground truth is never applied to them.
part3_predicted_labels = text_nn.fasttext_predict_category(train_texts + train_texts2, reassigned_labels + part2_pseudolabels, train_texts3)    
print_test_stats(part3_predicted_labels, ground_truth3, file_info3, "FastText")

HBox(children=(IntProgress(value=0, description='reassigning labels [ft]', max=5, style=ProgressStyle(descript…


FastText predicted categories: Counter({'society': 45419, 'sports': 10441, 'economy': 8045, 'other': 7255, 'entertainment': 6335, 'junk': 5818, 'science': 3139, 'technology': 1698})
792 / 4043 (19.589413801632453%) FastText errors in GT
keywords predicted categories: Counter({'society': 43943, 'sports': 10663, 'economy': 7917, 'other': 7373, 'entertainment': 6405, 'junk': 5603, 'science': 3402, 'technology': 1843, '': 1001})
1431 / 4043 (35.394509027949546%) keywords errors in GT
FastText predicted categories: Counter({'society': 61709, 'sports': 12322, 'economy': 11455, 'other': 9728, 'entertainment': 7758, 'junk': 6933, 'science': 4052, 'technology': 2041})
933 / 4772 (19.551550712489522%) FastText errors in GT
FastText predicted categories: Counter({'society': 33332, 'sports': 8815, 'economy': 5608, 'other': 5287, 'entertainment': 4417, 'junk': 4010, 'science': 2419, 'technology': 1515})
233 / 1504 (15.492021276595743%) FastText errors in GT


In [8]:
# Training set = parts 1 and 2 (texts and labels concatenated in the same
# order); part 3 is only predicted on.
all_train_texts = train_texts + train_texts2
predict_texts = train_texts3
train_categories = reassigned_labels + part2_pseudolabels
verbose = 1
max_words = 7000  # vocabulary cap, passed to the tokenizer as num_words
tokenize = keras.preprocessing.text.Tokenizer(num_words=max_words, char_level=False)
# NOTE(review): the vocabulary is fitted on part 1 only (train_texts), although
# the training matrix below is built from all_train_texts — confirm that this
# is intentional.
tokenize.fit_on_texts(train_texts)  # fit tokenizer to our training text data
# One float32 row per text; the model defined later takes input_shape=(max_words,).
x_train = tokenize.texts_to_matrix(all_train_texts).astype(np.float32)
x_test = tokenize.texts_to_matrix(predict_texts).astype(np.float32)

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_categories)
y_train_num = encoder.transform(train_categories)

# Converts the labels to a one-hot representation
# (class count is derived from the largest encoded label index).
num_classes = np.max(y_train_num) + 1
y_train = keras.utils.to_categorical(y_train_num, num_classes).astype(np.float32)

def ground_truth_idx(ground_truth, file_info):
    """Return a boolean mask telling which records have a ground-truth entry.

    Args:
        ground_truth: Container of file identifiers (membership is tested
            with ``in``).
        file_info: Iterable of records exposing a ``file`` attribute.

    Returns:
        A list with one bool per record, in input order: ``True`` when the
        record's ``file`` is found in ``ground_truth``.
    """
    return [record.file in ground_truth for record in file_info]

# Boolean masks of the part-1 and part-2 records that have ground truth,
# concatenated in the same order as the training texts.
gt1 = ground_truth_idx(ground_truth, file_info)
gt2 = ground_truth_idx(ground_truth2, file_info2)
# mask * 4 + 1 -> weight 5.0 for records with ground truth, 1.0 for the rest.
sample_weights = np.array(gt1 + gt2, np.float32) * 4 + 1

In [9]:
# Export the tokenizer vocabulary and the category list as plain-text files.
# word_index is stored as a JSON string inside the tokenizer's JSON config,
# hence the second json.loads.
tokenizer_data = json.loads(tokenize.to_json())
word_index = json.loads(tokenizer_data["config"]["word_index"])
# The vocabulary contains non-ASCII (Russian) words, so the encoding is pinned
# to UTF-8 instead of relying on the platform default.
with open("data/dictionary_ru.tsv", "w", encoding="utf-8") as f:
    for word, index in word_index.items():
        # Keep only indices below max_words, matching the tokenizer's num_words cap.
        if index < max_words:
            # Columns: word, tokenizer index, IDF from the normalizer (0 when unknown).
            f.write(f"{word}\t{index}\t{normalizer.idf[word] if word in normalizer.idf else 0}\n")
            
# One class name per line, in the label encoder's class order.
with open("data/categories_ru", "w", encoding="utf-8") as f:
    for c in encoder.classes_:
        f.write(c + "\n")

In [10]:
%reset -f in out
batch_size = 32
epochs = 10
drop_ratio = 0.5

# Build the model
model = models.Sequential()
model.add(layers.Dropout(0.4, input_shape=(max_words,)))
model.add(layers.Dense(512, use_bias=True, activation="relu"))
model.add(layers.Dropout(drop_ratio))
model.add(layers.Dense(512, use_bias=True, activation="relu"))
model.add(layers.Dropout(drop_ratio))
model.add(layers.Dense(num_classes, activation="softmax"))

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.0001),
              metrics=['accuracy'])

model.summary()

Flushing input history
Flushing output cache (0 entries)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 7000)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               3584512   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 4104      
Total params: 3,851,272
Trainable params: 3,851,272
Non-trainable

In [11]:
# model.fit trains the model on x_train / y_train.
# - validation_split=0.0: no training samples are held out during fitting.
# - class_weight=None: class balancing is disabled; the balanced variant is
#   kept below as commented-out code.
# - sample_weight: per-sample weights taken from `sample_weights`.
#class_weights = class_weight.compute_class_weight('balanced', list(np.unique(y_train_num)), y_train_num)

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=verbose,
                    validation_split=0.0,
                    class_weight=None,
                    sample_weight=sample_weights)

Train on 204148 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Predict on the whole x_test matrix (part 3), pick the highest-scoring class
# per row, map the class indices back to label strings via the encoder's class
# list, and score the result against the part-3 ground truth.
text_labels = encoder.classes_

predictions = model.predict(x_test)
predicted_labels = text_labels[np.argmax(predictions, axis=1)]
print_test_stats(predicted_labels, ground_truth3, file_info3, "TF")

TF predicted categories: Counter({'society': 35950, 'sports': 8648, 'economy': 4733, 'entertainment': 4681, 'other': 4066, 'junk': 3096, 'science': 2665, 'technology': 1564})
157 / 1504 (10.438829787234043%) TF errors in GT


In [13]:
# Save the trained model to HDF5 without optimizer state, then convert that
# file to JSON with the project's convert_model helper.
# NOTE(review): no_tests=False presumably keeps the converter's self-checks
# enabled — confirm in convert_model.
model.save('data/keras_model.h5', include_optimizer=False)
convert_model.convert("data/keras_model.h5", "data/category_model_ru.json", no_tests=False)

loading data/keras_model.h5
Forward pass took 0.051303 s.
Forward pass took 0.002445 s.
Forward pass took 0.0022 s.
Starting performance measurements.
Forward pass took 0.00242 s.
Forward pass took 0.002405 s.
Forward pass took 0.002167 s.
Forward pass took 0.0022 s.
Forward pass took 0.002319 s.
Forward pass took 0.0023022 s on average.
Converting model architecture.
Converting model weights.
Done converting model weights.
Calculating model hash.
Model conversion finished.
writing data/category_model_ru.json


In [14]:
# Classify the part-3 files through news.classify_ru and score the result the
# same way as the Keras predictions, so the two reports can be compared.
# NOTE(review): judging by the "TF-C++" label this presumably runs the exported
# model in the C++ implementation — confirm in news.
cpp_predictions3 = news.classify_ru([fi.file for fi in file_info3])
print_test_stats(cpp_predictions3, ground_truth3, file_info3, "TF-C++")

TF-C++ predicted categories: Counter({'society': 35950, 'sports': 8648, 'economy': 4733, 'entertainment': 4681, 'other': 4066, 'junk': 3096, 'science': 2665, 'technology': 1564})
157 / 1504 (10.438829787234043%) TF-C++ errors in GT


In [15]:
# Manual inspection: take the 21st-30th part-3 documents that the TF model
# labelled "science", skip those that already have ground truth, and print the
# FastText label, the TF label, the site and the text of each remaining one.
science_idx = [i for i in range(len(file_info3)) if predicted_labels[i] == "science"]
for j in science_idx[20:30]:
    if file_info3[j].file not in ground_truth3:
        print("\tFT: " + part3_predicted_labels[j])
        print("\tTF: " + predicted_labels[j])
        print("\t" + file_info3[j].site)
        print(file_info3[j].text)
        print()

	FT: junk
	TF: science
	sprosi.d3.ru
[детский трансплантолог] пересадки почки детям с весом до 9 кг Добрый день. Меня зовут Михаил Каабак. С 1989 года занимаюсь детской трансплантологией. Вместе с моей командой мы разработали уникальные технологии, позволяющие успешно пересаживать почки от взрослых доноров маленьким детям, с весом менее 9 кг. В России трансплантация детям с таким весом... Врач–трансплантолог Михаил Каабак: «Мы будем оставаться с пациентами до тех пор, пока последний из них жив» В ноябре Национальный медицинский исследовательский центр здоровья детей уволил трансплантологов Михаила Каабака и Надежду Бабенко за то, что они использовали п…rtvi.com Добрый день.  Меня зовут Михаил Каабак. С 1989 года занимаюсь детской трансплантологией. Вместе с моей командой мы разработали уникальные технологии, позволяющие успешно пересаживать почки от взрослых доноров маленьким детям, с весом менее 9 кг. В России трансплантация детям с таким весом проводится только нашей командой, другие

In [16]:
def cat_words(text, category_words=None):
    """Find which category keywords occur in ``text``.

    Args:
        text: Document text. It is lower-cased before matching; keywords are
            compared exactly as given.
        category_words: Mapping from category name to an iterable of keyword
            strings. When omitted, ``text_nn.category_words_rus`` is used.
            NOTE(review): that table is assumed to be such a mapping — confirm
            in text_nn.

    Returns:
        A dict that maps every category with at least one hit to the set of
        its keywords found as substrings of the lower-cased text. Categories
        without hits are absent.
    """
    if category_words is None:
        # The notebook defines no global `category_words`; fall back to the
        # keyword table that the labelling step passes to text_nn.
        category_words = text_nn.category_words_rus

    lowered = text.lower()
    found = {}
    for cat, words in category_words.items():
        hits = {word for word in words if word in lowered}
        if hits:
            found[cat] = hits

    return found


# Indices of the part-1 records whose entry in `categories` equals chosen_cat;
# the empty string selects records without a category.
chosen_cat = ""
cat_idx = [i for i in range(len(file_info)) if categories[i] == chosen_cat]

def print_by_idx(i, file_info):
    """Print one record for manual inspection.

    Args:
        i: Position of the record inside ``file_info``.
        file_info: Indexable collection of records exposing ``site``, ``file``
            and ``text`` attributes.

    Output is a "<site> <file>" header line, the tab-prefixed text and a
    trailing blank line.
    """
    record = file_info[i]
    print(f"{record.site} {record.file}")
    print("\t", record.text)
    # Optional diagnostics, currently disabled:
    #print("\t", ", ".join(word for word, _ in tfidf(text)[:50]))
    #print("\t", categories[i])
    #for cat, words in cat_words(text).items():
    #    print(f"\t\t{cat}: {' '.join(words)}")

    print()

#for i in np.random.choice(cat_idx, 10):
#    print_by_idx(i, file_info)

In [17]:
# Count part-1 records per source site and display the 15 most frequent sites.
site_counter = Counter(fi.site for fi in file_info)
site_counter.most_common(15)

[('Невские Новости', 2451),
 ('Регион Online', 1552),
 ('НВ', 1496),
 ('ФедералПресс', 1348),
 ('Корреспондент.net', 1291),
 ('Interfax.ru', 1020),
 ('charter97.org', 958),
 ('EADaily', 914),
 ('РИА Мелитополь', 911),
 ('Коммерсантъ', 898),
 ('Фонтанка.ру', 873),
 ('RT на русском', 871),
 ('Allhockey.ru', 866),
 ('РИА Новости', 844),
 ('Українські Новини', 823)]

In [18]:
# Per-site count of the part-3 documents that the TF model labelled
# "technology", printed from most to least frequent.
selected_sites = Counter(
    fi.site
    for i, fi in enumerate(file_info3)
    if predicted_labels[i] == "technology"
)

print(selected_sites.most_common())

[('CAR.RU', 119), ('Актуальные новости', 60), ('4PDA - Новости мира мобильных устройств', 49), ('ТВОЕ АВТО', 47), ('ВладТайм', 46), ('Русаргумент', 42), ('@ASTERA: Новости IT и финансов', 41), ('3DNews - Daily Digital Digest', 35), ('Автоцентр', 34), ('Телеграф', 32), ('Автоновости дня', 31), ('AKKet', 30), ('Drom.ru', 27), ('iLenta', 24), ('Нью Информ', 20), ('Анонсенс', 20), ('Известия', 20), ('www.zr.ru', 17), ('tvoygorodpskov.ru', 17), ('drive2.ru', 17), ('unionnews.ru', 16), ('110KM', 15), ('Popmech.ru', 15), ('Overclockers.ru', 15), ('Интересные новости OAnews', 14), ('Planet Today', 14), ('Журнал Движок', 14), ('Новости Мойка78', 13), ('Onliner', 13), ('Народные Новости России', 13), ('ITC.ua', 13), ('autoconsulting.com.ua', 13), ('ФБА «Экономика сегодня»', 12), ('Инфореактор', 11), ('RuNews24', 11), ('Лайфхакер', 10), ('Корреспондент.net', 10), ('Andro-News', 10), ('FaceNews', 10), ('VistaNews.ru', 10), ('Код Дурова', 9), ('iXBT.com', 9), ('НВ', 9), ('iPhones.ru', 8), ('Life.ru