In [1]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
from collections import Counter
import math
import re
import json
import requests

import pandas as pd
import umap
from tqdm.autonotebook import tqdm
from nltk.tokenize import word_tokenize

import numpy as np
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split, KFold

from tensorflow import keras

import tensorflow as tf
# Switch every visible GPU to on-demand memory growth. A RuntimeError is
# printed instead of raised, so the cell never aborts the notebook.
gpus = tf.config.experimental.list_physical_devices('GPU')
#assert gpus
try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)


from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.utils import class_weight
# Short aliases for the Keras sub-modules used when the model is built.
layers = keras.layers
models = keras.models
from tensorflow.keras.optimizers import Adam

import convert_model

import language
import text_nn
import grab_category
import news

In [3]:
# Part 1 (bootstrap sample): the normalizer is trained on this part only and
# is then reused unchanged for parts 2 and 3.
folder_bootstrap = "data/sample"
file_info = language.read_dump(os.path.join(folder_bootstrap, "langs", "dump", "en"))
normalizer = news.TextNormalizer("english")
normalizer.train(file_info)
train_texts = normalizer.normalize_texts(file_info)

# Part 2: the dumps of both folders are concatenated into one list.
folders_part2 = ["data/sample2", "data/sample3"]
file_info2 = []
for folder in folders_part2:
    file_info2.extend(language.read_dump(os.path.join(folder, "langs", "dump", "en")))

train_texts2 = normalizer.normalize_texts(file_info2)

# Part 3: used further down as the prediction / evaluation set.
folder_part3 = "data/sample4"
file_info3 = language.read_dump(os.path.join(folder_part3, "langs", "dump", "en"))
train_texts3 = normalizer.normalize_texts(file_info3)

35197 word stems


In [4]:
# Ground-truth labels per part (looked up by `fi.file` elsewhere in the notebook).
ground_truth = grab_category.load_gt(folder_bootstrap)

# Merge the labels of both part-2 folders into a single dict; if a key occurs
# in both folders, the entry from the later folder replaces the earlier one.
ground_truth2 = {}
for folder in folders_part2:
    ground_truth2.update(grab_category.load_gt(folder).items())

ground_truth3 = grab_category.load_gt(folder_part3)

7542 GT labels loaded from data/sample
6848 GT labels loaded from data/sample2
5818 GT labels loaded from data/sample3
5853 GT labels loaded from data/sample4


In [5]:
def print_test_stats(predicted_labels, ground_truth, file_info, name):
    """Print the predicted label distribution and the error count against the ground truth.

    Args:
        predicted_labels: Labels aligned by position with ``file_info``.
        ground_truth: Mapping from ``fi.file`` to the expected label.
        file_info: Sequence of objects exposing a ``file`` attribute.
        name: Text inserted into both printed lines to identify the classifier.

    Only entries whose ``file`` is a key of ``ground_truth`` are compared. The
    percentage is relative to ``len(ground_truth)``, not to the number of
    ground-truth files that actually occur in ``file_info``. An empty
    ``ground_truth`` is reported as 0.0% instead of raising ZeroDivisionError.
    """
    errors = 0
    for i, fi in enumerate(file_info):
        if fi.file in ground_truth and predicted_labels[i] != ground_truth[fi.file]:
            errors += 1

    total = len(ground_truth)
    # Guard the division: with no ground truth there is nothing to be wrong about.
    error_pct = errors / total * 100 if total else 0.0

    print(f"{name} predicted categories:", Counter(predicted_labels))
    print(f"{errors} / {total} ({error_pct}%) {name} errors in GT")

In [6]:
# Seed labels for part 1: keyword-based categories plus URL-based junk detection.
kw_categories = text_nn.keyword_categories(file_info, text_nn.category_words_en)
url_junk = grab_category.junk_by_url(file_info, grab_category.junk_url_pattern_en)
# NOTE(review): gt_to_linear appears to write the url_junk labels into
# kw_categories in place — confirm in grab_category.
grab_category.gt_to_linear(kw_categories, url_junk, file_info)
# Work on a copy so kw_categories stays available for the later comparison.
categories = kw_categories.copy() 
# An empty string marks an article that received no category.
print(f"{sum(cat != '' for cat in categories)} / {len(categories)} ({sum(cat != '' for cat in categories) / len(categories) * 100}%) have categories")
print(Counter(categories))

print_test_stats(kw_categories, ground_truth, file_info, "keywords")

HBox(children=(IntProgress(value=0, description='assigning keyword-based labels', max=70638, style=ProgressSty…


70555 / 70638 (99.88249950451599%) have categories
Counter({'society': 24653, 'junk': 10430, 'sports': 10125, 'entertainment': 9390, 'economy': 7448, 'technology': 4333, 'other': 2496, 'science': 1680, '': 83})
keywords predicted categories: Counter({'society': 24653, 'junk': 10430, 'sports': 10125, 'entertainment': 9390, 'economy': 7448, 'technology': 4333, 'other': 2496, 'science': 1680, '': 83})
1729 / 7542 (22.92495359321135%) keywords errors in GT


In [7]:
# NOTE(review): presumably overwrites entries of `categories` with the known
# ground-truth labels in place — confirm in grab_category.
grab_category.gt_to_linear(categories, ground_truth, file_info)

In [8]:
# Seed NumPy's global RNG before the label reassignment step.
np.random.seed(3)
reassigned_labels = text_nn.reassign_labels_one_dataset_ft(train_texts, categories)

# Compare the reassigned labels with the original keyword labels on part 1.
print_test_stats(reassigned_labels, ground_truth, file_info, "FastText")
print_test_stats(kw_categories, ground_truth, file_info, "keywords")
# NOTE(review): presumably puts the ground-truth labels back into reassigned_labels — confirm.
grab_category.gt_to_linear(reassigned_labels, ground_truth, file_info)

# Pseudo-label part 2 from part 1.
# presumably the arguments are (train texts, train labels, texts to label) — verify in text_nn.
part2_pseudolabels = text_nn.fasttext_predict_category(train_texts, reassigned_labels, train_texts2)    
print_test_stats(part2_pseudolabels, ground_truth2, file_info2, "FastText")
grab_category.gt_to_linear(part2_pseudolabels, ground_truth2, file_info2)

# Label part 3 from parts 1 + 2 combined; these labels are only scored here.
part3_predicted_labels = text_nn.fasttext_predict_category(train_texts + train_texts2, reassigned_labels + part2_pseudolabels, train_texts3)    
print_test_stats(part3_predicted_labels, ground_truth3, file_info3, "FastText")

HBox(children=(IntProgress(value=0, description='reassigning labels [ft]', max=5, style=ProgressStyle(descript…


FastText predicted categories: Counter({'society': 24713, 'junk': 10649, 'sports': 10434, 'entertainment': 9285, 'economy': 7501, 'technology': 4081, 'other': 2354, 'science': 1621})
1134 / 7542 (15.035799522673033%) FastText errors in GT
keywords predicted categories: Counter({'society': 24653, 'junk': 10430, 'sports': 10125, 'entertainment': 9390, 'economy': 7448, 'technology': 4333, 'other': 2496, 'science': 1680, '': 83})
1729 / 7542 (22.92495359321135%) keywords errors in GT
FastText predicted categories: Counter({'society': 37551, 'junk': 16046, 'sports': 14385, 'entertainment': 13999, 'economy': 13216, 'technology': 6675, 'other': 2940, 'science': 2615})
1312 / 7998 (16.404101025256317%) FastText errors in GT
FastText predicted categories: Counter({'society': 20079, 'sports': 9886, 'junk': 9085, 'entertainment': 7276, 'economy': 6008, 'technology': 2694, 'other': 1495, 'science': 1150})
312 / 5853 (5.330599692465402%) FastText errors in GT


In [9]:
# Training set: parts 1 and 2 with their (pseudo-)labels; part 3 is what gets predicted.
all_train_texts = train_texts + train_texts2
predict_texts = train_texts3
train_categories = reassigned_labels + part2_pseudolabels
verbose = 1
# Vocabulary limit; also the width of the model's input layer.
max_words = 7000
tokenize = keras.preprocessing.text.Tokenizer(num_words=max_words, char_level=False)
# NOTE(review): the vocabulary is fitted on part 1 only (train_texts), while the
# model is trained on all_train_texts — confirm this is intended.
tokenize.fit_on_texts(train_texts)  # fit tokenizer to our training text data
x_train = tokenize.texts_to_matrix(all_train_texts).astype(np.float32)
x_test = tokenize.texts_to_matrix(predict_texts).astype(np.float32)

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_categories)
y_train_num = encoder.transform(train_categories)

# Converts the labels to a one-hot representation
num_classes = np.max(y_train_num) + 1
y_train = keras.utils.to_categorical(y_train_num, num_classes).astype(np.float32)

def ground_truth_idx(ground_truth, file_info):
    """Return one boolean per entry of ``file_info``.

    The flag is True when the entry's ``file`` is a key of ``ground_truth``.
    """
    return [entry.file in ground_truth for entry in file_info]

# Per-sample training weights: 5 for articles with a ground-truth label, 1 for
# all others (boolean mask * 4 + 1). The order is part 1 followed by part 2,
# matching all_train_texts.
gt1 = ground_truth_idx(ground_truth, file_info)
gt2 = ground_truth_idx(ground_truth2, file_info2)
sample_weights = np.array(gt1 + gt2, np.float32) * 4 + 1

In [10]:
# Export the tokenizer vocabulary (word, index, idf) and the class names to
# plain text files.
tokenizer_data = json.loads(tokenize.to_json())
word_index = json.loads(tokenizer_data["config"]["word_index"])
# Explicit UTF-8: the platform default encoding differs between systems and
# may fail on, or mangle, non-ASCII vocabulary words.
with open("data/dictionary_en.tsv", "w", encoding="utf-8") as f:
    for word, index in word_index.items():
        # Only indices below the tokenizer's num_words limit are exported.
        if index < max_words:
            # Words unknown to the normalizer get an idf of 0.
            f.write(f"{word}\t{index}\t{normalizer.idf[word] if word in normalizer.idf else 0}\n")

# One class name per line, in the order used by the label encoder.
with open("data/categories_en", "w", encoding="utf-8") as f:
    for c in encoder.classes_:
        f.write(c + "\n")

In [11]:
# Flush IPython's input history and output cache without asking for confirmation.
%reset -f in out
# Training hyper-parameters.
batch_size = 32
epochs = 10
drop_ratio = 0.5

# Build the model: dropout on the bag-of-words input, two 512-unit ReLU layers
# (each followed by dropout) and a softmax output with one unit per category.
model = models.Sequential()
model.add(layers.Dropout(0.4, input_shape=(max_words,)))
model.add(layers.Dense(512, use_bias=True, activation="relu"))
model.add(layers.Dropout(drop_ratio))
model.add(layers.Dense(512, use_bias=True, activation="relu"))
model.add(layers.Dropout(drop_ratio))
model.add(layers.Dense(num_classes, activation="softmax"))

# `learning_rate` is the supported argument name; `lr` is only a deprecated
# alias and is rejected by newer Keras releases.
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.0001),
              metrics=['accuracy'])

model.summary()

Flushing input history
Flushing output cache (0 entries)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 7000)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               3584512   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 4104      
Total params: 3,851,272
Trainable params: 3,851,272
Non-trainable

In [12]:
# Train on the complete training set (no validation split). Articles with a
# ground-truth label are emphasised through sample_weights; class weights are
# not used.
#class_weights = class_weight.compute_class_weight('balanced', list(np.unique(y_train_num)), y_train_num)

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=verbose,
                    validation_split=0.0,
                    class_weight=None,
                    sample_weight=sample_weights)

Train on 178065 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
# Predict a category for every part-3 article: take the class with the highest
# model output and map its index back to the label string, then score the
# result against the part-3 ground truth.
text_labels = encoder.classes_

predictions = model.predict(x_test)
predicted_labels = text_labels[np.argmax(predictions, axis=1)]
print_test_stats(predicted_labels, ground_truth3, file_info3, "TF")

TF predicted categories: Counter({'society': 21449, 'sports': 10391, 'junk': 8141, 'entertainment': 6624, 'economy': 6111, 'technology': 2729, 'other': 1124, 'science': 1104})
249 / 5853 (4.254228600717581%) TF errors in GT


In [14]:
# Save the trained network without optimizer state, then convert the HDF5 file
# to JSON with the project's convert_model helper.
# NOTE(review): no_tests=False presumably keeps the converter's own test passes enabled — confirm.
model.save('data/keras_model.h5', include_optimizer=False)
convert_model.convert("data/keras_model.h5", "data/category_model_en.json", no_tests=False)

loading data/keras_model.h5
Forward pass took 0.052015 s.
Forward pass took 0.00255 s.
Forward pass took 0.00234 s.
Starting performance measurements.
Forward pass took 0.002141 s.
Forward pass took 0.002165 s.
Forward pass took 0.002184 s.
Forward pass took 0.002301 s.
Forward pass took 0.00208 s.
Forward pass took 0.0021742000000000003 s on average.
Converting model architecture.
Converting model weights.
Done converting model weights.
Calculating model hash.
Model conversion finished.
writing data/category_model_en.json


In [15]:
# Cross-check: classify part 3 through news.classify_en and score it the same
# way as the Keras predictions.
# NOTE(review): the "TF-C++" label suggests this runs the converted model in native code — confirm.
cpp_predictions3 = news.classify_en(file_info3)
print_test_stats(cpp_predictions3, ground_truth3, file_info3, "TF-C++")

TF-C++ predicted categories: Counter({'society': 21448, 'sports': 10391, 'junk': 8142, 'entertainment': 6624, 'economy': 6111, 'technology': 2729, 'other': 1124, 'science': 1104})
249 / 5853 (4.254228600717581%) TF-C++ errors in GT


In [170]:
# Manual inspection: of the part-3 articles the Keras model labelled "science",
# show entries 20-29 that have no ground-truth label.
for j in [k for k in range(len(file_info3)) if predicted_labels[k] == "science"][20:30]:
    if file_info3[j].file not in ground_truth3:
        print("\tFT: " + part3_predicted_labels[j])
        print("\tTF: " + predicted_labels[j])
        print("\t" + file_info3[j].site)
        print(file_info3[j].text)
        print()

	FT: science
	TF: science
	Engadget
Astronomers create first global map of Saturn's moon Titan It has landscape almost as diverse as Earth. Scientists finally have a comprehensive view of Titan, Saturn's largest moon. A team of astronomers has created the first global map of Titan by using the Cassini probe's over 100 fly-bys to stitch together both imagery and radar measurements. The comprehensive view reveals a landscape that's almost as diverse as Earth in key way. Just shy of two thirds of the surface is dominated by flat plains, but 17 percent of it (mostly at the equator) is blanketed in dunes. Another 14 percent is hilly or mountainous, while 1.5 percent of it has a labyrinthine terrain shaped by erosion and rain. What you don't see is just as important, too. Only 1.5 percent of Titan is covered in lakes (methane, not water), and the distinct lack of impact craters suggests the surface is comparatively young. The map should help researchers address some of Titan's mysteries, suc

In [16]:
def cat_words(text, category_words=None):
    """Find which category keywords occur in ``text``.

    Args:
        text: Article text; it is lower-cased before matching.
        category_words: Mapping of category name -> iterable of keywords.
            When omitted, ``text_nn.category_words_en`` is used, so the
            function no longer depends on an undefined notebook global.

    Returns:
        dict mapping each category with at least one hit to the set of
        keywords found. Matching is a plain substring test.
    """
    if category_words is None:
        # NOTE(review): presumably the same category -> keywords table that is
        # passed to text_nn.keyword_categories — confirm its structure.
        category_words = text_nn.category_words_en

    lowered = text.lower()
    found = {}
    for cat, words in category_words.items():
        hits = {word for word in words if word in lowered}
        if hits:
            found[cat] = hits

    return found


# Indices of the part-1 articles whose label equals chosen_cat; the empty
# string selects the articles that received no category.
chosen_cat = ""
cat_idx = [i for i in range(len(file_info)) if categories[i] == chosen_cat]

def print_by_idx(i, file_info):
    """Print site, file name and text of the article at position ``i``.

    Output: one "<site> <file>" line, the text preceded by a tab, then a
    blank line.
    """
    entry = file_info[i]
    print(f"{entry.site} {entry.file}")
    print("\t", entry.text)
    # Disabled debug output:
    #print("\t", ", ".join(word for word, _ in tfidf(text)[:50]))
    #print("\t", categories[i])
    #for cat, words in cat_words(text).items():
    #    print(f"\t\t{cat}: {' '.join(words)}")
    print()

#for i in np.random.choice(cat_idx, 10):
#    print_by_idx(i, file_info)

In [4]:
# The 15 sites that contribute the most articles to part 1.
site_counter = Counter(fi.site for fi in file_info)
site_counter.most_common(15)

[('Reuters', 1817),
 ('the Guardian', 1742),
 ('mirror', 1725),
 ('CNA', 1671),
 ('The Washington Times', 1587),
 ('WFXT', 1561),
 ('Business-Standard', 1528),
 ('Forbes', 1372),
 ('Sputniknews', 1249),
 ('Malaymail', 1238),
 ('The Hindu', 1155),
 ('Fox News', 1068),
 ('Yahoo', 1029),
 ('www.ctvnews.ca', 826),
 ('Breaking News', 765)]

In [None]:
# Cache of downloaded Reuters pages, keyed by fi.file.
# presumably kept in its own cell so that re-running the download loop does not discard it.
all_reuters_pages = {}

In [None]:
# Fetch the mobile-site version of every Reuters article in all three parts,
# skipping files that are already in the cache.
# NOTE(review): no error handling is visible here — confirm how grab_page
# behaves on a failed request.
for fi in tqdm(file_info + file_info2 + file_info3):
    if fi.site == "Reuters" and fi.file not in all_reuters_pages:
        link = fi.url.replace("https://www.reuters.com/", "https://mobile.reuters.com/")
        all_reuters_pages[fi.file] = grab_category.grab_page(link)

HBox(children=(IntProgress(value=0, max=235738), HTML(value='')))

In [41]:
# Extract the comma-separated content of each page's <meta name="keywords">
# tag and store the keywords per file as JSON.
kw_regex = re.compile(r'<meta name="keywords" content="([^"]*)" />')
all_reuters_kw = {}
for file, page in all_reuters_pages.items():
    m = kw_regex.search(page)
    if m:
        # Lower-case and de-duplicate the keywords.
        keywords = frozenset([kw.lower() for kw in m.group(1).split(",")])
        # Sorted so the JSON file is identical between runs; the iteration
        # order of a set of strings changes with hash randomization.
        all_reuters_kw[file] = sorted(keywords)
    else:
        # Report pages without a keywords tag.
        print(file)
        print("No keywords")
        print()

with open("data/reuters_kw.json", "w") as f:
    json.dump(all_reuters_kw, f)

In [None]:
# List the URL and file name of every Reuters article in part 1.
for i, fi in enumerate(file_info):
    if fi.site == "Reuters":
        print(fi.url)
        print("\t" + fi.file)

In [18]:
# Count, per site, the part-3 articles that the Keras model labelled "technology".
selected_sites = Counter(
    fi.site
    for i, fi in enumerate(file_info3)
    if predicted_labels[i] == "technology"
)

print(selected_sites.most_common())

[('CAR.RU', 119), ('Актуальные новости', 60), ('4PDA - Новости мира мобильных устройств', 49), ('ТВОЕ АВТО', 47), ('ВладТайм', 46), ('Русаргумент', 42), ('@ASTERA: Новости IT и финансов', 41), ('3DNews - Daily Digital Digest', 35), ('Автоцентр', 34), ('Телеграф', 32), ('Автоновости дня', 31), ('AKKet', 30), ('Drom.ru', 27), ('iLenta', 24), ('Нью Информ', 20), ('Анонсенс', 20), ('Известия', 20), ('www.zr.ru', 17), ('tvoygorodpskov.ru', 17), ('drive2.ru', 17), ('unionnews.ru', 16), ('110KM', 15), ('Popmech.ru', 15), ('Overclockers.ru', 15), ('Интересные новости OAnews', 14), ('Planet Today', 14), ('Журнал Движок', 14), ('Новости Мойка78', 13), ('Onliner', 13), ('Народные Новости России', 13), ('ITC.ua', 13), ('autoconsulting.com.ua', 13), ('ФБА «Экономика сегодня»', 12), ('Инфореактор', 11), ('RuNews24', 11), ('Лайфхакер', 10), ('Корреспондент.net', 10), ('Andro-News', 10), ('FaceNews', 10), ('VistaNews.ru', 10), ('Код Дурова', 9), ('iXBT.com', 9), ('НВ', 9), ('iPhones.ru', 8), ('Life.ru