In [0]:
!unzip  elmo4irony.zip

In [0]:
!pip install colored_traceback
!pip install tensorboardX
!pip install dataset

In [0]:
import os
os.chdir("./elmo4irony")

In [0]:
import os

from glob import glob

import torch
import colored_traceback
import numpy as np
from tqdm import tqdm
from tensorboardX import SummaryWriter

from src.corpus.corpus import ClassificationCorpus

from src.utils.logger import Logger
# from src.utils.ops import np_softmax

# from src.train import Trainer
# from src.optim.optim import OptimWithDecay
from src import config

# from src.models.classifier import Classifier

# from src.layers.pooling import PoolingLayer

from base_args import base_parser, CustomArgumentParser

In [0]:
# 此处生成语料库
corpus = ClassificationCorpus(config.corpora_dict, "palek",
                              force_reload=False,
                              train_data_proportion=0.8,
                              dev_data_proportion=0.2,
                              batch_size=32,
                              lowercase=True,
                              use_pos=False)

In [0]:
import torch
import time
from torch import nn
from torch.autograd import Variable
import numpy as np
import re
import random
import json
import os
from torch.autograd import Variable
import warnings
warnings.simplefilter("ignore", UserWarning)

In [0]:
class RNNModel(nn.Module):
  def __init__(self, n_token):
      embed_size = config['embedding_size']
      hidden_size = config['hidden_size']
      n_layers = config['n_layers']
      dropout = config['dropout']
      bidirectional = config['bidirectional']
      ffnn_layers = config['linear']
      super(RNNModel, self).__init__()
      self.embed = nn.Embedding(n_token, embed_size)
      self.RNN = nn.LSTM(embed_size, hidden_size, n_layers,
                          batch_first=True, dropout=dropout, bidirectional=bidirectional)
      sentence_length = 58
      batch_size = 32
      curr_dim = hidden_size * (bidirectional + 1) * sentence_length
      self.layers = [nn.Linear(curr_dim, ffnn_layers[0])]
      for i, o in zip(ffnn_layers[0:-1], ffnn_layers[1:]):
          self.layers.append(nn.Linear(i, o))
      self.softmax = nn.Softmax(dim=1)

  def forward(self, x):
      # print(x.size())
      embedding = self.embed(x)
      output, hidden = self.RNN(embedding)
      # print(output.size())
      output = output.reshape(output.size(0), -1).cuda()
      # print(output.size())
      for layer in self.layers:
          output = layer.cuda()(output)
          # output = nn.ReLU().cuda()(output)
          # print(output.size())

      output = self.softmax.cuda()(output)
      
      return output.view(-1, 2).cuda()

In [0]:
def train(model, corpus):
    model.cuda()
    model.train()
    start = time.time()
    batch_id = 0
    criterion = nn.CrossEntropyLoss()
    window_size = config['window_size']
    batch_size = config["batch_size"]
    window_step = config["window_step_size"]
    optimizer = torch.optim.Adam(model.parameters())
    data_generator = corpus.train_batches
    total_loss = 0
    print_period = 200
    for batchIter in data_generator.gen():
        x = batchIter["sequences"]
        y = batchIter["labels"]
        x = torch.LongTensor(x)
        y = torch.LongTensor(y)
        x, y = Variable(x.cuda()), Variable(y.cuda())
        output = model(x)
        # print(output)
        output = output.view(-1, 2)
        y = y.view(-1)
        # print(len(output))
        # print(len(output[0]))
        # print(y)

        loss = criterion(output, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.data
        batch_id += 1
        if batch_id % print_period == 0:
          print('batchid:{} time:{}s loss:{}'.format(batch_id, time.time()-start, total_loss/print_period))
          total_loss = 0
          start = time.time()

def test(model, corpus):
    model.cuda()
    model.eval()
    start = time.time()
    window_size = config['window_size']
    test_batch_size = config["test_batch_size"]
    window_step = config["window_step_size"]
    test_data = corpus.test_batches
    criterion = nn.CrossEntropyLoss()
    total_loss = 0
    true_pos = 0
    false_pos = 0
    true_neg = 0
    false_neg = 0
    batch_counter = 0
    mark_counter = 0
    for batchIter in test_data.gen():
        x = batchIter["sequences"]
        y = batchIter["labels"]
        x = torch.LongTensor(x)
        y = torch.LongTensor(y)

        x, y = Variable(x.cuda()), Variable(y.cuda())
        prediction = model(x)
        loss = criterion(prediction, y)
        total_loss += loss
        prediction[prediction[:, 1] > 0.5] = 1
        prediction[prediction[:, 1] <= 0.5] = 0
        prediction = prediction[:, 1].bool()
        # print(prediction)
        y= y.bool()
        true_pos += (y * prediction).sum()
        false_neg += (y * (~prediction)).sum()
        false_pos += ((~y) * prediction).sum()
        true_neg += ((~y) * (~prediction)).sum()
        # print(true_pos)

        batch_counter += 1
    return print_info(true_pos, false_pos, true_neg, false_neg, start, total_loss, batch_counter)
    print(total_loss/batch_counter)


In [0]:
def print_info(true_pos, false_pos, true_neg, false_neg, since, total_loss, batch_counter):
    print('-' * 15)
    print('TP: %d\nTN: %d\nFP: %d\nFN: %d\nmark_counter: \n' %
          (true_pos, true_neg, false_pos, false_neg))
    print('-' * 15)
    print('time: %.3f s\nloss: %.4f\nprecision: %.4f\nrecall: %.4f\nf score: %.4f\naccuracy: %.4f'
          % (
              time.time() - since,
              total_loss / batch_counter,
              true_pos / (true_pos + false_pos + 1e-6),
              true_pos / (true_pos + false_neg + 1e-6),
              2 * true_pos / (2 * true_pos + false_pos + false_neg + 1e-6),
              (true_pos + true_neg) / (true_neg + true_pos + false_neg + false_pos)
          ))
    print('-' * 15)
    return 2 * true_pos / (2 * true_pos + false_pos + false_neg + 1e-6)

In [0]:

config = {
    "embedding_size": 100,
    "hidden_size": 100,
    "n_layers": 3,
    "dropout": 0.2,
    "bidirectional": True,
    "linear": [100, 100, 2],
    "learning_rate": 0.01,
    'init_range': 0.1,
    'test_batch_size': 16,
    "batch_size": 32,
    "window_size": 28,
    "window_step_size": 25,
}
model = RNNModel(len(corpus.lang.token2id))
nEpochs = 100
for epoch in range(nEpochs):
    train(model, corpus)
    test(model, corpus)


batchid:200 time:6.697207689285278s loss:0.6424132585525513
batchid:400 time:6.591456174850464s loss:0.5855687856674194
batchid:600 time:6.615060091018677s loss:0.5685141682624817
batchid:800 time:6.602676868438721s loss:0.5823965072631836
batchid:1000 time:6.592909812927246s loss:0.5524418354034424
---------------
TP: 5813
TN: 3708
FP: 2552
FN: 594
mark_counter: 

---------------
time: 7.754 s
loss: 0.5503
precision: 0.6949
recall: 0.9073
f score: 0.7870
accuracy: 0.0000
---------------
batchid:200 time:6.661293268203735s loss:0.5486118793487549
batchid:400 time:6.606712579727173s loss:0.5261557102203369
batchid:600 time:6.597862958908081s loss:0.5142425894737244
batchid:800 time:6.573846817016602s loss:0.5152010917663574
batchid:1000 time:6.6177122592926025s loss:0.4951779544353485
---------------
TP: 5494
TN: 4632
FP: 1628
FN: 913
mark_counter: 

---------------
time: 7.747 s
loss: 0.5061
precision: 0.7714
recall: 0.8575
f score: 0.8122
accuracy: 0.0000
---------------
batchid:200 t

KeyboardInterrupt: ignored

In [0]:
batches = next(corpus.test_batches.gen())
x = batches['sequences']
y = batches['labels']
x = torch.LongTensor(x)
y = torch.LongTensor(y)
x, y = Variable(x.cuda()), Variable(y.cuda())

In [0]:
prediction = model(x)
prediction[prediction[:, 1] > 0.5] = 1
prediction[prediction[:, 1] <= 0.5] = 0

prediction = prediction[:, 1].bool()

In [0]:
for i in x: print(len(i))

58
58
58
58
58
58
58
58
58
58
58
58
58
58
58
58
