In [1]:
with open("klej_polemo2.0-in/train.tsv", "r") as f:
    raw_train = f.readlines()

In [2]:
with open("klej_polemo2.0-in/dev.tsv", "r") as f:
    raw_dev = f.readlines()

In [3]:
def prepare_data(raw_data):
    corpus = []
    labels = []
    for doc in raw_data:
        text, target = doc.strip().split("\t")
        if "plus" in target:
            label = 0
        elif "minus" in target:
            label = 1
        else:
            label = 2
        corpus.append(text)
        labels.append(label)
    return corpus, labels

In [4]:
train_corpus, train_labels = prepare_data(raw_train[1:])

In [5]:
test_corpus, test_labels = prepare_data(raw_dev[1:])

# LSTM + słownik

```
!wget https://dl.fbaipublicfiles.com/fasttext/vector-crawl/cc.pl.300.bin.gz
!gunzip cc.pl.300.bin.gz
!pip install fasttext
```

In [6]:
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import fasttext
from sklearn.metrics import classification_report

torch.manual_seed(42)
random.seed(42)

In [7]:
senti_df = pd.read_csv("slownikWydzwieku01.csv", sep="\t", header=None)
senti_df.head()

Unnamed: 0,0,1,2,3,4,5
0,poddany,c,0,-1,-1,-4.6
1,przewrażliwiony,m,1,1,-2,-3.6
2,cudzoziemiec,c,0,0,0,-2.6
3,przekonywać,m,1,1,0,-1.8
4,skrupuł,c,1,1,1,-3.0


In [8]:
mins = np.min(senti_df[[2, 3, 4, 5]].values, axis=0)
maxs = np.max(senti_df[[2, 3, 4, 5]].values, axis=0)

In [9]:
senti_dict = {}
for row in senti_df.itertuples(index=False):
    key = row[0]
    value = np.array(row[2:])
    value = (value - mins) / (maxs - mins)
    senti_dict[key] = value

In [10]:
senti_dict["poddany"]

array([0.        , 0.        , 0.25      , 0.26744186])

In [11]:
VEX = fasttext.load_model("cc.pl.300.bin")
N_FEATS = VEX.get_dimension()



In [12]:
def w2v(token):
    try:
        return VEX.get_word_vector(token)
    except KeyError:
        return np.zeros((N_FEATS,))

In [13]:
tok_train_corpus = [doc.split() for doc in train_corpus]
tok_test_corpus = [doc.split() for doc in test_corpus]

In [14]:
train_data = list(zip(tok_train_corpus, train_labels))
test_data = list(zip(tok_test_corpus, test_labels))

In [15]:
PADDING_VECTOR = np.zeros((N_FEATS + 4,))

In [16]:
def datapoints_to_batch(datapoints, max_len, senti_dict):
    size = len(datapoints)
    vectors = []
    lengths = []
    labels = []
    for tokens, label in datapoints:
        vec = [np.concatenate((w2v(token), senti_dict.get(token, np.zeros(4)))) for token in tokens]
        tok_num = len(vec)
        if tok_num > max_len:
            vec = vec[:max_len]
            tok_num = max_len
        vectors.append(vec)
        lengths.append(tok_num)
        labels.append(label)
    max_len = max(lengths)
    
    for vec in vectors:
        while len(vec) < max_len:
            vec.insert(0, PADDING_VECTOR)
    X = torch.tensor(vectors, dtype=torch.float32)
    Y = torch.tensor(labels)
    return X, Y, lengths

In [17]:
def train_on_batch(model, criterion, optimizer, X, Y, lengths):
    model.train()
    optimizer.zero_grad()
    output = model(X, lengths)
    loss = criterion(output, Y)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()
    return loss.item()

In [18]:
def predict_on_batch(model, X, Y, lengths):
    model.eval()
    output = model(X, lengths)
    decision = output.topk(1).indices.squeeze()
    equal = decision == Y
    correct = sum(equal).item()
    return correct, decision

In [19]:
class LSTMModel(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size, dropout=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True)
        self.dense = nn.Linear(hidden_size * 2, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def init_state(self, batch_size):
        state = torch.zeros(2, batch_size, self.hidden_size)
        cell = torch.zeros(2, batch_size, self.hidden_size)
        return state, cell
    
    def forward(self, data, lengths):
        batch_size = data.shape[0]
        zero_hidden, zero_cell = self.init_state(batch_size)
        
        packed_input = pack_padded_sequence(data, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input, (zero_hidden, zero_cell))
        
        output, lengths = pad_packed_sequence(packed_output, batch_first=True)
        
        aggregated = self.dropout(output.sum(1).div(lengths.unsqueeze(1)))
        
        output = self.softmax(self.dense(aggregated))
        return output

In [20]:
model = LSTMModel(N_FEATS + 4, 20, 3)
criterion = torch.nn.NLLLoss()
learning_rate = 0.001
epochs = 20
batch_size = 10
max_len = 100
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [21]:
num_train_batches = len(train_data) // batch_size + int(bool(len(train_data) % batch_size))
num_test_batches = len(test_data) // batch_size + int(bool(len(test_data) % batch_size))

best_acc = 0

In [22]:
for epoch in range(epochs):
    random.shuffle(train_data)
    total_loss = 0
    for n in tqdm(range(num_train_batches)):
        datapoints = train_data[n * batch_size:(n + 1) * batch_size]
        X, Y, lengths = datapoints_to_batch(datapoints, max_len, senti_dict)
        loss = train_on_batch(model, criterion, optimizer, X, Y, lengths)
        total_loss += loss
    print(f"loss: {total_loss}")
    
    with torch.no_grad():
        total = 0
        correct = 0
        for n in tqdm(range(num_test_batches)):
            datapoints = test_data[n * batch_size:(n + 1) * batch_size]
            X, Y, lengths = datapoints_to_batch(datapoints, max_len, senti_dict)
            result, _ = predict_on_batch(model, X, Y, lengths)
            total += batch_size
            correct += result
        acc = correct/total * 100
        print(f"acc: {acc}")
        if acc > best_acc:
            best_acc = acc
            torch.save(model, "lstm.model")

100%|██████████| 579/579 [00:56<00:00, 10.21it/s]
  3%|▎         | 2/73 [00:00<00:03, 18.38it/s]

loss: 555.7316371202469


100%|██████████| 73/73 [00:04<00:00, 18.13it/s]
  0%|          | 0/579 [00:00<?, ?it/s]

acc: 62.73972602739726


100%|██████████| 579/579 [00:55<00:00, 10.40it/s]
  3%|▎         | 2/73 [00:00<00:04, 16.37it/s]

loss: 444.92512008547783


100%|██████████| 73/73 [00:04<00:00, 18.13it/s]
  0%|          | 1/579 [00:00<00:59,  9.76it/s]

acc: 68.4931506849315


100%|██████████| 579/579 [00:56<00:00, 10.17it/s]
  3%|▎         | 2/73 [00:00<00:03, 19.02it/s]

loss: 395.9656071290374


100%|██████████| 73/73 [00:04<00:00, 17.85it/s]
  0%|          | 2/579 [00:00<00:56, 10.27it/s]

acc: 71.78082191780823


100%|██████████| 579/579 [00:56<00:00, 10.25it/s]
  3%|▎         | 2/73 [00:00<00:04, 15.64it/s]

loss: 372.0642773061991


100%|██████████| 73/73 [00:04<00:00, 17.21it/s]
  0%|          | 1/579 [00:00<01:02,  9.23it/s]

acc: 69.17808219178082


100%|██████████| 579/579 [00:56<00:00, 10.26it/s]
  3%|▎         | 2/73 [00:00<00:03, 18.61it/s]

loss: 359.2016426771879


100%|██████████| 73/73 [00:04<00:00, 18.16it/s]
  0%|          | 2/579 [00:00<00:55, 10.35it/s]

acc: 71.64383561643835


100%|██████████| 579/579 [00:56<00:00, 10.28it/s]
  3%|▎         | 2/73 [00:00<00:04, 17.33it/s]

loss: 349.8338042348623


100%|██████████| 73/73 [00:04<00:00, 17.05it/s]
  0%|          | 1/579 [00:00<01:03,  9.07it/s]

acc: 75.89041095890411


100%|██████████| 579/579 [00:59<00:00,  9.71it/s]
  3%|▎         | 2/73 [00:00<00:04, 17.23it/s]

loss: 328.2367929816246


100%|██████████| 73/73 [00:04<00:00, 17.19it/s]
  0%|          | 1/579 [00:00<01:31,  6.34it/s]

acc: 74.24657534246575


100%|██████████| 579/579 [00:58<00:00,  9.97it/s]
  3%|▎         | 2/73 [00:00<00:03, 18.53it/s]

loss: 322.7046853899956


100%|██████████| 73/73 [00:04<00:00, 17.79it/s]
  0%|          | 1/579 [00:00<01:05,  8.84it/s]

acc: 70.82191780821918


100%|██████████| 579/579 [00:55<00:00, 10.40it/s]
  3%|▎         | 2/73 [00:00<00:03, 19.00it/s]

loss: 316.63884633406997


100%|██████████| 73/73 [00:04<00:00, 17.84it/s]
  0%|          | 2/579 [00:00<00:55, 10.39it/s]

acc: 73.15068493150685


100%|██████████| 579/579 [00:55<00:00, 10.42it/s]
  3%|▎         | 2/73 [00:00<00:03, 18.51it/s]

loss: 310.48247000947595


100%|██████████| 73/73 [00:04<00:00, 18.01it/s]
  0%|          | 2/579 [00:00<00:54, 10.61it/s]

acc: 73.28767123287672


100%|██████████| 579/579 [00:58<00:00,  9.98it/s]
  3%|▎         | 2/73 [00:00<00:03, 18.17it/s]

loss: 304.6229900084436


100%|██████████| 73/73 [00:04<00:00, 16.26it/s]
  0%|          | 2/579 [00:00<00:54, 10.60it/s]

acc: 74.93150684931507


100%|██████████| 579/579 [00:59<00:00,  9.72it/s]
  3%|▎         | 2/73 [00:00<00:05, 13.62it/s]

loss: 292.33895471319556


100%|██████████| 73/73 [00:04<00:00, 17.18it/s]
  0%|          | 1/579 [00:00<00:58,  9.83it/s]

acc: 72.32876712328768


100%|██████████| 579/579 [00:58<00:00,  9.93it/s]
  3%|▎         | 2/73 [00:00<00:04, 16.65it/s]

loss: 293.5813474059105


100%|██████████| 73/73 [00:04<00:00, 17.59it/s]
  0%|          | 1/579 [00:00<00:59,  9.77it/s]

acc: 75.75342465753425


100%|██████████| 579/579 [00:58<00:00,  9.90it/s]
  3%|▎         | 2/73 [00:00<00:04, 17.34it/s]

loss: 287.6773977652192


100%|██████████| 73/73 [00:04<00:00, 17.43it/s]
  0%|          | 1/579 [00:00<01:03,  9.05it/s]

acc: 75.61643835616438


100%|██████████| 579/579 [00:59<00:00,  9.74it/s]
  3%|▎         | 2/73 [00:00<00:05, 13.89it/s]

loss: 280.64492953475565


100%|██████████| 73/73 [00:04<00:00, 16.52it/s]
  0%|          | 1/579 [00:00<01:00,  9.48it/s]

acc: 75.75342465753425


100%|██████████| 579/579 [00:59<00:00,  9.70it/s]
  3%|▎         | 2/73 [00:00<00:04, 17.15it/s]

loss: 269.2567945048213


100%|██████████| 73/73 [00:04<00:00, 17.05it/s]
  0%|          | 1/579 [00:00<01:08,  8.41it/s]

acc: 73.28767123287672


100%|██████████| 579/579 [00:57<00:00, 10.07it/s]
  3%|▎         | 2/73 [00:00<00:05, 13.99it/s]

loss: 268.40229304879904


100%|██████████| 73/73 [00:04<00:00, 17.22it/s]
  0%|          | 1/579 [00:00<01:00,  9.59it/s]

acc: 75.75342465753425


100%|██████████| 579/579 [00:58<00:00,  9.97it/s]
  3%|▎         | 2/73 [00:00<00:04, 14.57it/s]

loss: 263.3012895239517


100%|██████████| 73/73 [00:04<00:00, 17.42it/s]
  0%|          | 1/579 [00:00<01:06,  8.75it/s]

acc: 74.38356164383562


100%|██████████| 579/579 [00:57<00:00, 10.15it/s]
  3%|▎         | 2/73 [00:00<00:03, 17.87it/s]

loss: 257.87953903432935


100%|██████████| 73/73 [00:04<00:00, 17.09it/s]
  0%|          | 1/579 [00:00<00:58,  9.84it/s]

acc: 75.34246575342466


100%|██████████| 579/579 [00:58<00:00,  9.96it/s]
  3%|▎         | 2/73 [00:00<00:05, 13.37it/s]

loss: 251.42889169976115


100%|██████████| 73/73 [00:04<00:00, 17.02it/s]

acc: 75.89041095890411





In [23]:
model = torch.load("lstm.model")
model.eval()

LSTMModel(
  (dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(304, 20, batch_first=True, bidirectional=True)
  (dense): Linear(in_features=40, out_features=3, bias=True)
  (softmax): LogSoftmax(dim=1)
)

In [24]:
X, Y, lengths = datapoints_to_batch(test_data, max_len, senti_dict)
_, pred = predict_on_batch(model, X, Y, lengths)

In [25]:
print(classification_report(test_labels, pred))

              precision    recall  f1-score   support

           0       0.80      0.72      0.76       209
           1       0.71      0.89      0.79       271
           2       0.84      0.66      0.74       243

    accuracy                           0.77       723
   macro avg       0.78      0.76      0.76       723
weighted avg       0.78      0.77      0.76       723

