In [1]:
# Read the training split as a list of raw lines (header row included).
# The encoding is given explicitly so the Polish text decodes the same way on
# every platform instead of depending on the locale's default encoding.
with open("klej_ar/train.tsv", "r", encoding="utf-8") as f:
    raw_train = f.readlines()

In [2]:
# Read the dev split as a list of raw lines (header row included).
# The encoding is given explicitly so the Polish text decodes the same way on
# every platform instead of depending on the locale's default encoding.
with open("klej_ar/dev.tsv", "r", encoding="utf-8") as f:
    raw_dev = f.readlines()

In [3]:
def prepare_data(raw_data):
    """Split raw TSV lines into parallel lists of texts and integer labels.

    Each line is stripped of surrounding whitespace and split on tabs. Lines
    that do not yield exactly two fields are skipped. The second field is
    parsed as a float and truncated to an int (so "3.0" becomes 3).

    Args:
        raw_data: iterable of strings, one record per item.

    Returns:
        A ``(corpus, labels)`` tuple of two lists of equal length.

    Raises:
        ValueError: if the second field of a two-field line is not numeric.
    """
    pairs = []
    for line in raw_data:
        fields = line.strip().split("\t")
        if len(fields) == 2:
            pairs.append((fields[0], int(float(fields[1]))))
    corpus = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    return corpus, labels

In [4]:
# Parse the training lines, skipping the first one (presumably the TSV header row - confirm).
train_corpus, train_labels = prepare_data(raw_train[1:])

In [5]:
# Parse the dev lines the same way; note the test_* names hold the dev split.
test_corpus, test_labels = prepare_data(raw_dev[1:])

# spaCy

In [6]:
# Collapse the five rating values into three classes:
# ratings 1-2 -> class 1, rating 3 -> class 2, ratings 4-5 -> class 3.
simplification = {1: 1, 2: 1, 3: 2, 4: 3, 5: 3}

# Derive the simplified labels from the raw rows rather than remapping
# train_labels / test_labels in place. Remapping in place made the cell
# non-idempotent: running it a second time would map already-simplified
# labels again (3 -> 2, 2 -> 1) and silently corrupt them.
train_labels = [simplification[rating] for rating in prepare_data(raw_train[1:])[1]]
test_labels = [simplification[rating] for rating in prepare_data(raw_dev[1:])[1]]

In [7]:
import random

import spacy
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from spacy.util import minibatch

# Load the pretrained `pl_core_news_md` pipeline that the classifier is added to.
nlp = spacy.load('pl_core_news_md')
# Seed Python's `random` module, which drives random.shuffle in the training loop.
random.seed(42)

In [8]:
# Show the pipeline components before the text classifier is added.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [9]:
# Create a text-categorizer component configured for exclusive classes with the
# "simple_cnn" architecture, and append it as the last pipeline step.
textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"})
nlp.add_pipe(textcat, last=True)

In [10]:
# Confirm that "textcat" now appears at the end of the pipeline.
nlp.pipe_names

['tagger', 'parser', 'ner', 'textcat']

In [11]:
# Register the three class names on the classifier. They are the string forms
# of the integer classes produced by `simplification` (1, 2, 3).
textcat.add_label("1")
textcat.add_label("2")
textcat.add_label("3")

1

In [12]:
# Check the labels registered on the classifier.
textcat.labels

('1', '2', '3')

In [13]:
# Build one annotation dict per training example: every class name maps to a
# boolean that is True only for the example's own class.
spacy_train_labels = [
    {"cats": {str(category): label == category for category in (1, 2, 3)}}
    for label in train_labels
]

In [14]:
# Inspect the annotation built for the first training example.
spacy_train_labels[0]

{'cats': {'1': False, '2': True, '3': False}}

In [15]:
# Pair each training text with its annotation dict: (text, {"cats": {...}}).
# Materialized as a list because the training loop shuffles it in place.
train_data = list(zip(train_corpus, spacy_train_labels))

In [16]:
# Inspect the first two (text, annotation) pairs.
train_data[:2]

[('Jako do ceny dobra. Przyssawka mogłaby być lepsza. Po 2 miesiącach użytkowania musiałem nóżkę z przyssawką rozkręcić i przyssawkę podkleić bo guma zaczęła pękać od strony mocowania do uchwytu (uchwyt zaczął się po prostu trząść bo zrobił się luz).  Mechanizm mocowania telefonu póki co (3 miesiące użytkowania) działa bez zarzutu. ',
  {'cats': {'1': False, '2': True, '3': False}}),
 ('Na słuchawkę czekałam spory czas a po zadzwonieniu okazało się ,że paczka im się zawieruszyła i w ten sam dzień mieli wysłać najszybszym kurierem i mimo to i tak czekałam znowu gdzie bardzo mi była potrzebna.  Do tego niby bateria trzyma długo gdzie tak nie jest i słuchawka się rozłącza . Dostałam ponownie wiadomość o tym ,że znowu dostanę te samą paczkę po wiadomości do nich ,że to pomyłka Pan odpisał mi,że nie bo to gratis no to się nastawiłam ,że gratis i ,że opłacona za to ,że tyle musiałam czekać .Tym razem przyjeżdża listonosz i mówi paczka do opłaty .Paczka poszła na pocztę i tam sobie leży ;) ! 

In [18]:
# Names of every pipeline component except the text classifier; these are the
# components disabled while the classifier is trained.
not_for_training = [name for name in nlp.pipe_names if name != "textcat"]

In [19]:
# Number of passes over train_data made by the training loop.
ITERATIONS = 10
# Batch size handed to spacy.util.minibatch.
BATCH = 48

In [None]:
# NOTE(review): this cell has no execution count and its code is identical to
# the next cell, which is the run whose output is recorded. Under
# "Restart & Run All" both cells execute and each calls nlp.begin_training();
# presumably the second call re-initializes the classifier, so the work done
# here would be discarded - confirm, and consider deleting this cell.
with nlp.disable_pipes(not_for_training):
    optimizer = nlp.begin_training()
    for i in range(ITERATIONS):
        # Losses accumulated by nlp.update over this pass.
        loss = {}
        # Shuffles train_data in place, so its order changes on every pass.
        random.shuffle(train_data)
        batches = minibatch(train_data, size=BATCH)
        for batch in batches:
            # Unzip the batch into a tuple of texts and a tuple of annotation dicts.
            text, labels = zip(*batch)
            nlp.update(text, labels, drop=0.5, sgd=optimizer, losses=loss)
        print(f"iter: {i}  loss: {loss['textcat']}", end="\t")
        # Evaluate on the dev split with the optimizer's averaged parameters swapped in.
        with textcat.model.use_params(optimizer.averages):
            # Tokenize only, then run just the classifier over the docs.
            docs = (nlp.tokenizer(text) for text in test_corpus)
            # Predicted class = highest-scoring category, converted back to int.
            preds = [int(max(doc.cats.items(), key=lambda x: x[1])[0]) for doc in textcat.pipe(docs)]
            acc = accuracy_score(test_labels, preds)
            print(f"dev acc: {acc}")

In [20]:
# Train the text classifier and report dev accuracy after every pass.
#
# `accuracy_score` comes from sklearn.metrics and is imported in the notebook's
# import cell; without that import this cell raises NameError on a fresh kernel.
# `optimizer` is left in the notebook namespace on purpose: the cell that saves
# the model reads `optimizer.averages`.

# Dropout rate passed to nlp.update.
DROPOUT = 0.5

# Only the classifier is updated; the other components are disabled for the
# duration of the block.
with nlp.disable_pipes(not_for_training):
    optimizer = nlp.begin_training()
    for epoch in range(ITERATIONS):
        # Losses accumulated by nlp.update over this pass.
        epoch_losses = {}
        # Shuffles train_data in place, so its order changes on every pass.
        random.shuffle(train_data)
        for batch in minibatch(train_data, size=BATCH):
            # Unzip the batch into a tuple of texts and a tuple of annotation dicts.
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=DROPOUT, sgd=optimizer, losses=epoch_losses)
        print(f"iter: {epoch}  loss: {epoch_losses['textcat']}", end="\t")
        # Evaluate on the dev split with the optimizer's averaged parameters swapped in.
        with textcat.model.use_params(optimizer.averages):
            # Tokenize only, then run just the classifier over the docs.
            dev_docs = (nlp.tokenizer(text) for text in test_corpus)
            # Predicted class = highest-scoring category, converted back to int.
            preds = [int(max(doc.cats, key=doc.cats.get)) for doc in textcat.pipe(dev_docs)]
            acc = accuracy_score(test_labels, preds)
            print(f"dev acc: {acc}")

iter: 0  loss: 0.0480814356560586	dev acc: 0.6182190378710338
iter: 1  loss: 0.04261774256883655	dev acc: 0.6816786079836233
iter: 2  loss: 0.037633761487086304	dev acc: 0.6980552712384852
iter: 3  loss: 0.03759206104587065	dev acc: 0.7093142272262026
iter: 4  loss: 0.03551659890945302	dev acc: 0.72978505629478
iter: 5  loss: 0.03301691386150196	dev acc: 0.7256908904810645
iter: 6  loss: 0.029631754121510312	dev acc: 0.72978505629478
iter: 7  loss: 0.03158019229886122	dev acc: 0.7287615148413511
iter: 8  loss: 0.027786335562268505	dev acc: 0.7287615148413511
iter: 9  loss: 0.028273644784349017	dev acc: 0.72978505629478


In [21]:
# Write the whole pipeline to the "spacy_model" directory while the optimizer's
# averaged parameters are swapped in. This saves the state after the final
# pass, not the pass with the best dev accuracy.
with nlp.use_params(optimizer.averages):
    nlp.to_disk("spacy_model")

In [22]:
# List what was written to the model directory.
import os
os.listdir("spacy_model")

['meta.json', 'parser', 'tagger', 'ner', 'tokenizer', 'vocab', 'textcat']

In [23]:
# Reload the saved pipeline from disk, rebinding `nlp`; every prediction below
# uses this reloaded pipeline rather than the in-memory one that was trained.
nlp = spacy.load("spacy_model")

In [24]:
# Run the reloaded pipeline over the training texts and keep the resulting docs.
doc_corpus = list(nlp.pipe(train_corpus))

In [25]:
# Category scores the classifier assigned to the first training document.
doc_corpus[0].cats

{'1': 0.2227286994457245, '2': 0.25765758752822876, '3': 0.5196137428283691}

In [26]:
# Highest-scoring (category, score) pair for the first training document.
max(doc_corpus[0].cats.items(), key=lambda x: x[1])

('3', 0.5196137428283691)

In [27]:
# Predicted class name per training document: the category with the highest score.
spacy_preds = [max(doc.cats, key=doc.cats.get) for doc in doc_corpus]

In [28]:
# Inspect the first three predictions (still strings at this point).
spacy_preds[:3]

['3', '1', '3']

In [29]:
# Convert the predicted class names to ints so they compare with train_labels.
spacy_preds = list(map(int, spacy_preds))

In [30]:
# Per-class precision / recall / F1 on the training set (not held-out data).
print(classification_report(train_labels, spacy_preds))

              precision    recall  f1-score   support

           1       0.77      0.88      0.82      2799
           2       0.50      0.02      0.04      1195
           3       0.84      0.95      0.89      5186

    accuracy                           0.81      9180
   macro avg       0.70      0.62      0.58      9180
weighted avg       0.77      0.81      0.76      9180



In [31]:
# Run the reloaded pipeline over the dev texts (held in the test_* variables).
test_doc_corpus = list(nlp.pipe(test_corpus))

In [32]:
# Predicted class name per dev document: the category with the highest score.
test_spacy_preds = [max(doc.cats, key=doc.cats.get) for doc in test_doc_corpus]

In [33]:
# Convert the predicted class names to ints so they compare with test_labels.
test_spacy_preds = list(map(int, test_spacy_preds))

In [34]:
# Per-class precision / recall / F1 on the dev split loaded from dev.tsv.
print(classification_report(test_labels, test_spacy_preds))

              precision    recall  f1-score   support

           1       0.69      0.79      0.74       327
           2       0.00      0.00      0.00       137
           3       0.76      0.88      0.82       513

    accuracy                           0.73       977
   macro avg       0.48      0.56      0.52       977
weighted avg       0.63      0.73      0.68       977

