In [14]:
%load_ext blackcellmagic

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


In [669]:
import torch
from torch import nn, autograd
from torch.nn import functional as F
from skorch import NeuralNetClassifier
from skorch.helper import predefined_split
from skorch.callbacks import EarlyStopping

from nltk import casual_tokenize

from sklearn.metrics import recall_score, precision_score

import pandas as pd
import numpy as np

import csv


In [1038]:
TRAIN_SPLIT = 2
MSG_LEN = 20
EMBED_DIM = 300
HIDDEN_DIM = 100
NUM_LAYERS = 2
VOCAB_SIZE = 100000
OOV_IDX = 0
EOS_IDX = 1

In [1042]:
def load_wvs(embeddings_path: str, embedding_dim: int, limit=None):
    if limit is not None:
        limit = int(limit)
    with open(embeddings_path) as infile:
        if next(infile).split(" ") == embeddings_path:
            # Skip header for fasttext, don't for glove
            infile.seek(0)
        return pd.read_csv(
            infile,
            header=None,
            delim_whitespace=True,
            names=list(range(embedding_dim)),
            quoting=csv.QUOTE_NONE,
            nrows=limit,
            index_col=0,
        )


all_embeds = load_wvs("embeddings.txt", EMBED_DIM, VOCAB_SIZE)
raw_embeds = np.vstack([np.zeros(EMBED_DIM), all_embeds.values])

In [948]:
token_idxs = {tok: idx + 1 for idx, tok in enumerate(all_embeds.index.values.tolist())}

In [949]:
def vectorize(text):
    idxs = [token_idxs.get(tok, OOV_IDX) for tok in text.split(' ')[:MSG_LEN]]
    return np.array(idxs + [EOS_IDX] * (MSG_LEN - len(idxs)))

In [950]:
def make_dataset(topic_num: int):
    df = pd.read_csv("data/{}.csv".format(topic_num))
    presents = df[df["present"] == 1]
    missings = df[df["present"] == 0]
    train = pd.concat(
        [
            presents.iloc[: int(presents.shape[0] // TRAIN_SPLIT)],
            missings.iloc[: int(missings.shape[0] // TRAIN_SPLIT)],
        ],
        axis="rows",
    )
    validation = pd.concat(
        [
            presents.iloc[int(presents.shape[0] // TRAIN_SPLIT) :],
            missings.iloc[int(missings.shape[0] // TRAIN_SPLIT) :],
        ],
        axis="rows",
    )
    return train, validation


def make_x_and_y(dataset):
    x = [vectorize(text) for text in dataset['text'].values.tolist()]
    y = dataset['present'].values
    return torch.LongTensor(np.vstack(x).astype(np.int64)), \
        torch.LongTensor(y.astype(np.int64))


In [951]:
training, validation = make_dataset(36)

In [952]:
training_x, training_y = make_x_and_y(training)
validation_x, validation_y = make_x_and_y(validation)

In [953]:
training_x.shape, training_y.shape

(torch.Size([65, 20]), torch.Size([65]))

In [954]:
from skorch.dataset import Dataset

In [955]:
def prf1(predictions, true):
    p = precision_score(predictions, true)
    r = recall_score(predictions, true)
    f1 = 2 * p * r / (p + r)
    return p, r, f1

In [1085]:
class Classifier(nn.Module):
    def __init__(self, num_units=10, nonlin=F.relu):
        super().__init__()
        self.word_embeddings = nn.Embedding(VOCAB_SIZE + 1, embedding_dim=EMBED_DIM)
        self.word_embeddings.weight = nn.Parameter(torch.FloatTensor(raw_embeds))
        self.word_embeddings.weight.requires_grad = False
        self.lstm = nn.GRU(
            EMBED_DIM,
            HIDDEN_DIM,
            num_layers=NUM_LAYERS,
            dropout=0.2,
            batch_first=True,
            bidirectional=True,
        )
        self.softmax = nn.Linear(HIDDEN_DIM * 4, 2)
        self.dropout = nn.Dropout2d(0.1)
        self.hidden = self.init_hidden()
    
        print(sum(param.nelement() for param in self.lstm.parameters()))
            

    def forward(self, doc):
        embeds = self.word_embeddings(doc)
        if self.training:
            embeds = self.dropout(embeds.permute(0, 2, 1)).permute(0, 2, 1)
        lstm_out, self.hidden = self.lstm(embeds)
        tag_space = self.softmax(
            torch.cat(
                [
                    torch.max(self.dropout(lstm_out), 1)[0],
                    torch.mean(self.dropout(lstm_out), 1),
                ],
                1,
            )
        )
        tag_scores = F.softmax(tag_space, dim=-1)
        return tag_scores

    def init_hidden(self):
        return (
            autograd.Variable(torch.zeros(2, 1, HIDDEN_DIM)),
            autograd.Variable(torch.zeros(2, 1, HIDDEN_DIM)),
        )


net = NeuralNetClassifier(
    Classifier,
    batch_size=32,
    max_epochs=100,
    lr=0.2,
    train_split=predefined_split(Dataset(validation_x, validation_y)),
    callbacks=[EarlyStopping(patience=5, monitor='valid_loss')],
    device='cpu',
)


In [1086]:
net.fit(training_x, training_y)

422400
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.5681[0m       [32m0.8060[0m        [35m0.5533[0m  0.1845
      2        0.7198       0.8060        [35m0.5147[0m  0.2056
      3        0.6510       0.8060        [35m0.5007[0m  0.1954
      4        0.6235       0.8060        [35m0.4885[0m  0.2075
      5        0.6014       0.8060        [35m0.4808[0m  0.1935
      6        0.5734       0.8060        [35m0.4757[0m  0.1870
      7        [36m0.5584[0m       0.8060        [35m0.4712[0m  0.2024
      8        0.5629       0.8060        [35m0.4658[0m  0.1882
      9        [36m0.5391[0m       0.8060        [35m0.4607[0m  0.1922
     10        0.5439       0.8060        [35m0.4588[0m  0.2016
     11        [36m0.5358[0m       0.8060        [35m0.4552[0m  0.2067
     12        [36m0.5271[0m       0.8060        [35m0.4533[0m  0.1967
     13        [36m0.5129[0m   

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=Classifier(
    (word_embeddings): Embedding(100001, 300)
    (lstm): GRU(300, 100, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
    (softmax): Linear(in_features=400, out_features=2, bias=True)
    (dropout): Dropout2d(p=0.1)
  ),
)

In [1087]:
print('validation')
prf1(net.predict(validation_x), validation_y)

validation


(0.5384615384615384, 1.0, 0.7000000000000001)

In [1088]:
print('training')
prf1(net.predict(training_x), training_y)

training


(1.0, 1.0, 1.0)

In [1089]:
net.predict(torch.LongTensor(np.vstack([
    vectorize('she was not professional'),
    vectorize('she was not very professional'),
    vectorize('he was not very professional'),
    vectorize('he was very rude'),
    vectorize('she was very rude'),
    vectorize('he was very unprofessional'),
    vectorize('they were very condescending'),
    vectorize('she was very condescending'),
])))

array([0, 0, 0, 1, 1, 1, 1, 1])

In [1097]:
%%time
_ = net.predict(torch.cat([validation_x] * 100))


CPU times: user 13.3 s, sys: 128 ms, total: 13.4 s
Wall time: 3.35 s


In [1098]:
print(torch.cat([validation_x] * 100).shape)

torch.Size([6700, 20])


In [1083]:
print('gpu tokens processed per second:', 6700 * MSG_LEN / 0.6)

gpu tokens processed per second: 223333.33333333334


In [1099]:
print('cpu tokens processed per second:', 6700 * MSG_LEN / 4 / 3.35)

cpu tokens processed per second: 10000.0
