In [3]:
from collections import Counter
from itertools import chain, groupby
from random import choice
import json
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from nltk import ngrams

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import typing as tp
TCorpus = tp.List[tp.Tuple[str, str, tp.Dict[str, tp.Set[str]]]]
TNgrams = tp.Dict[tp.Tuple[str], int]

In [5]:
with open("tpc_2019_01_dataset.json") as f:
    data = json.load(f)
data[0]

{'questionId': 'b5f69407-d633-400c-9c11-ea40f59912c9',
 'userId': 'NyB5YNGjqxmsKjW3w',
 'title': "Facebook Auto-UnLiker — Your Facebook Page 'Likes' Might Drop This Week",
 'text': '\n\n\n\nDo you own a Facebook Business page? If yes, then you will notice a drop in the number of "likes" on your Facebook Page by next week, which could be quite disappointing but, Facebook believes, will help business to know their actual followers.\n\n\n\n\n\nFACEBOOK\'S OFFICIAL MASS AUTO-UNLIKE \n\nThe social network giant is giving its Pages a little spring cleaning, purging them of memorialized and voluntarily deactivated inactive Facebook accounts in an attempt to make its users data more meaningful for businesses and brands.\n\n   \n\n\nFacebook purge will begin from March 12, Facebook said, and should continue over the next few weeks.\n\n"Over the coming weeks, Page admins should expect to see a small dip in their number of Page likes as a result of this update," Facebook said in a blog post. "It’

In [6]:
def data_generator(data):
    for _, group in groupby(data, key=lambda x: x["questionId"]):
        labels = {}
        for x in group:
            title = x["title"]
            text = x["text"]
            labels.update({x["userId"]: x["labels"]})
        yield (title, text, labels)

def test_data_generator(data):
    for _, group in groupby(data, key=lambda x: x["questionId"]):
        labels = {}
        for x in group:
            title = x["title"]
            text = x["text"]
            labels.update({x["userId"]: x["labels"]})
        yield (title, text), labels

def batch_generator(data, preprocess=None, batch_size=8, shuffle=True):
    batch = []
    indexes = np.arange(len(data))
    for _ in range(epochs):
        if shuffle:
            np.random.shuffle(indexes)
        for i in indexes:
            batch.append(data[i])
            if len(batch) >= batch_size:
                yield batch
                batch = []

In [7]:
class Net(nn.Module):
    def __init__(self, input_size, output_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

In [8]:
class Solution:
    def __init__(
            self,            # type: Solution
            classes=None,    # type: tp.Optional[tp.List[str]],
            nonclass=None,   # type: tp.Optional[str]
            languages=None,  # type: tp.Optional[tp.Dict[str, tp.Set[str]]]
            epochs=100,      # type: tp.Optional[int]
            batch_size=16,   # type: tp.Optional[int]
            gramm_size=3,    # type: tp.Optional[int]
            verbose=0,       # type: tp.Optional[int]
            roc_auc=0.5,     # type: tp.Optional[float]
            lr=0.001,        # type: tp.Optional[float]
            lr_step=0.97     # type: tp.Optional[float]
            ):
        # type: (...) -> None
        self.epochs=epochs
        self.gramm_size=gramm_size
        self.batch_size = batch_size
        self.classes = self._get_classes(classes)
        self.nonclass = self._get_non_class(nonclass)
        self.languages = self._get_languages(languages)
        self.clfs = None
        self.verbose = verbose
        self.roc_auc = roc_auc
        self.lr=lr
        self.lr_step = lr_step

    def train(
            self,         # type: Solution
            train_corpus  # type: TCorpus
            ):
        # type: (...) -> None
        language_to_corpus = self._split_on_languages(train_corpus)
        self.clfs = {
            lang: self._train(corpus["corp"], {gramm: i for i, gramm in enumerate(corpus["ngrams"])}) 
            for lang, corpus in language_to_corpus.items()
        }

    def predict(
            self,  # type: Solution
            news   # type: List[Tuple[str, str]]
            ):
        # type: (...) -> List[Set[str]]
        result = []
        for new in news:
            text = new[1]
            lang = self._get_language(text)
            n_grams, clf = self.clfs[lang]
            vec = self._text_to_vector(text, n_grams)
            y = clf(torch.from_numpy(vec).type(torch.FloatTensor))
            y = {class_ for p, class_ in zip(y, self.classes) if p > self.roc_auc}
            result.append(y if len(y) > 0 else {self.nonclass})
        return result

    def _train(
            self,          # type: Solution
            train_corpus,  # type: TCorpus
            n_grams        # type: TNgrams
            ):
        # type: (...) -> None
        X, Y = self._corpus_to_vectors(train_corpus, n_grams)

        net = Net(len(n_grams), len(self.classes))
        criterion = nn.BCELoss()

        lr = self.lr
        optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=0.00001)
        optimizer.zero_grad()

        for epoch in range(self.epochs):
            epoch_loss = 0
            for i, (X_batch, y_batch) in enumerate(self._batch_generator(X, Y)):
                output = net(X_batch)
                loss = criterion(output, y_batch)
                loss.backward()
                optimizer.step()
                epoch_loss += loss
            self._verbose(epoch, epoch_loss, net, X, Y)

        return n_grams, net
    
    def _verbose(self, epoch, loss, net, X, Y):
        if (self.verbose > 0):
            print("epoch: {}".format(epoch), end=" ")
        if self.verbose > 1:
            print("loss: {}".format(loss), end=" ")
        if self.verbose > 2:
            print("score: {}".format(self._score(net, X, Y)), end=" ")
        if self.verbose > 0:
            print()

    def _score(self, clf, X, Y):
        Y_pred = clf(X)
        score = [f1_score(orig, pred)
                 for orig, pred in zip((Y.numpy() > self.roc_auc).astype(np.int), 
                                       Y_pred.detach().numpy().round().astype(np.int))]
        
        return sum(score) / len(score)

    def _batch_generator(self, X, Y):
        indexes = np.arange(len(X))
        np.random.shuffle(indexes)
        for i in range(len(X) // self.batch_size):
            x_batch, y_batch = (X[i*self.batch_size: (i+1)*self.batch_size], 
                                Y[i*self.batch_size: (i+1)*self.batch_size])
            yield x_batch, y_batch

    def _split_on_languages(self, corpus):
        corpuses = {lang: {"corp": [], "ngrams": set()} for lang in self.languages}
        for text_with_labels in corpus:
            text = text_with_labels[1]
            lang, n_grams = self._get_lang_and_ngrams(text)
            corpuses[lang]["corp"].append(text_with_labels)
            corpuses[lang]["ngrams"].update(n_grams)
        return corpuses

    def _get_lang_and_ngrams(self, text):
        return self._get_language(text), self._n_gramm(text)

    def _get_language(
            self,  # type: Solution
            text   # type: str
            ):
        # type: (...) -> str
        languages = dict.fromkeys(self.languages.keys(), 0)
        for sym in text:
            for lang, letters in self.languages.items():
                if sym in letters:
                    languages[lang] += 1
        return max(languages.items(), key=lambda tup: tup[1])[0]

    def _n_gramm(self, text):
        # type: (...) -> tp.Tuple[str]
        n=self.gramm_size
        n_grams = ngrams(" ".join(text.split()).lower(), n)
        for grams in n_grams:
            yield grams

    def _labels_to_vector(self, labels):
        classes = choice(list(labels))
        return np.array([bool(class_ in classes) for class_ in self.classes], dtype=np.int)

    def _corpus_to_vectors(self, corpus, ngramms):
        X = []
        Y = []
        for corp in corpus:
            X.append(self._text_to_vector(corp[1], ngramms))
            Y.append(self._labels_to_vector(corp[2].values()))
        return (torch.from_numpy(np.array(X)).type(torch.FloatTensor), 
                torch.from_numpy(np.array(Y)).type(torch.FloatTensor))

    def _text_to_vector(self, text, ngramms):
        vec = np.zeros(len(ngramms))
        for gramm in self._n_gramm(text):
            if gramm in ngramms:
                vec[ngramms[gramm]] += 1
        return (vec / np.linalg.norm(vec))

    @staticmethod
    def _get_classes(classes):
        if classes is not None:
            return classes
        return ["Угроза", "Уязвимость", "Эксплойт", "Инцидент", "Вредоносное ПО"]

    @staticmethod
    def _get_non_class(nonclass):
        if nonclass is not None:
            return nonclass
        return "Прочее"

    @staticmethod
    def _get_languages(languages):
        if languages is not None:
            return languages
        return {"ru": set("йцукенгшщзхъфывапролджэёячсмитьбю"),
                "en": set("qwertyuiopasdfghjklzxcvbnm")}

In [369]:
def to_vec(st):
    classes = ["Угроза", "Уязвимость", "Эксплойт", "Инцидент", "Вредоносное ПО", "Прочее"]
    return [int(elem in st) for elem in classes]

indexes = np.arange(len(data))
np.random.shuffle(indexes)
train_data = [data[i] for i in indexes[:1000]]
test_data = [data[i] for i in indexes[1000:]]

clf = Solution(verbose=2)
clf.train(data_generator(train_data))

metrics = []
for x, y in test_data_generator(test_data):
    y_pred, y_orig = to_vec(clf.predict([x])[0]), to_vec(list(y.values())[0])
    metrics.append(f1_score(y_orig, y_pred))
print(sum(metrics) / len(metrics))

epoch: 0 loss: 23.17462730407715 
epoch: 1 loss: 21.611413955688477 
epoch: 2 loss: 21.518016815185547 
epoch: 3 loss: 20.15555191040039 
epoch: 4 loss: 17.965036392211914 
epoch: 5 loss: 15.779212951660156 
epoch: 6 loss: 16.009010314941406 
epoch: 7 loss: 15.701870918273926 
epoch: 8 loss: 15.066007614135742 
epoch: 9 loss: 14.687211036682129 
epoch: 10 loss: 13.874427795410156 
epoch: 11 loss: 13.433914184570312 
epoch: 12 loss: 12.428201675415039 
epoch: 13 loss: 13.25733757019043 
epoch: 14 loss: 12.087092399597168 
epoch: 15 loss: 14.77341079711914 
epoch: 16 loss: 13.404492378234863 
epoch: 17 loss: 15.720309257507324 
epoch: 18 loss: 12.269837379455566 
epoch: 19 loss: 15.155976295471191 
epoch: 20 loss: 13.873270034790039 
epoch: 21 loss: 13.944623947143555 
epoch: 22 loss: 14.689910888671875 
epoch: 23 loss: 14.314431190490723 
epoch: 24 loss: 12.814324378967285 
epoch: 25 loss: 11.621940612792969 
epoch: 26 loss: 12.403376579284668 
epoch: 27 loss: 12.069376945495605 
epoch:

In [246]:
gen = test_data_generator(test_data)

In [247]:
metrics = []
def to_vec(st):
    classes = ["Угроза", "Уязвимость", "Эксплойт", "Инцидент", "Вредоносное ПО", "Прочее"]
    return [int(elem in st) for elem in classes]
for x, y in test_data_generator(test_data):
    y_pred, y_orig = to_vec(clf.predict([x])[0]), to_vec(list(y.values())[0])
    metrics.append(f1_score(y_orig, y_pred))
print(sum(metrics) / len(metrics))

0.6141997593261135
