### Possible libraries and models to use: flair, fastai, allennlp, huggingface

### Summary report: https://nlpprogress.com/

# Pytorch Official Tutorial

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
torch.manual_seed(1)

In [5]:
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

In [6]:
inputs

[tensor([[-0.5525,  0.6355, -0.3968]]),
 tensor([[-0.6571, -1.6428,  0.9803]]),
 tensor([[-0.0421, -0.8206,  0.3133]]),
 tensor([[-1.1352,  0.3773, -0.2824]]),
 tensor([[-2.5667, -1.4303,  0.5009]])]

In [4]:
# initialize the hidden state.
hidden = (torch.randn(1, 1, 3),
          torch.randn(1, 1, 3))

In [5]:
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

In [7]:
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

tensor([[[-0.2682,  0.0304, -0.1526]],

        [[-0.5370,  0.0346, -0.1958]],

        [[-0.3947,  0.0391, -0.1217]],

        [[-0.1854,  0.0740, -0.0979]],

        [[-0.3600,  0.0893,  0.0215]]], grad_fn=<StackBackward>)
(tensor([[[-0.3600,  0.0893,  0.0215]]], grad_fn=<StackBackward>), tensor([[[-1.1298,  0.4467,  0.0254]]], grad_fn=<StackBackward>))


In [8]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [9]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [10]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-1.0388, -0.9874, -1.2962],
        [-0.9900, -1.0083, -1.3334],
        [-0.9792, -1.0514, -1.2912],
        [-0.9556, -1.0550, -1.3197],
        [-0.9470, -1.0579, -1.3284]])
tensor([[-0.0882, -2.5320, -5.3084],
        [-3.8870, -0.0483, -3.6236],
        [-2.2643, -3.0358, -0.1648],
        [-0.0950, -2.6379, -3.9579],
        [-4.1251, -0.0204, -5.5152]])


# Flair (https://github.com/zalandoresearch/flair)


In [2]:
import os
import gc

In [3]:
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.models import TextClassifier

from flair.data import TaggedCorpus
from flair.data_fetcher import  NLPTaskDataFetcher, NLPTask

In [4]:
train_pos = os.listdir('data/aclImdb_v1/aclImdb/train/pos/')
train_neg = os.listdir('data/aclImdb_v1/aclImdb/train/neg/')
test_pos = os.listdir('data/aclImdb_v1/aclImdb/test/pos/')
test_neg = os.listdir('data/aclImdb_v1/aclImdb/test/neg/')

In [5]:
pos_data = []
for name in train_pos:
    with open('data/aclImdb_v1/aclImdb/train/pos/' + name, 'r') as f:
        pos_data.append(f.readline())
for name in test_pos:
    with open('data/aclImdb_v1/aclImdb/test/pos/' + name, 'r') as f:
        pos_data.append(f.readline())

In [6]:
neg_data = []
for name in train_neg:
    with open('data/aclImdb_v1/aclImdb/train/neg/' + name, 'r') as f:
        neg_data.append(f.readline())
for name in test_neg:
    with open('data/aclImdb_v1/aclImdb/test/neg/' + name, 'r') as f:
        neg_data.append(f.readline())

In [7]:
sent_model = TextClassifier.load('en-sentiment')

In [9]:
def hook(m, i, o): 
    print(i[0].data.shape)

h = sent_model.decoder.register_forward_hook(hook)

In [13]:
len(pos_data) / 6400

3.90625

In [10]:
%%time
sent_model.predict([Sentence(s) for s in pos_data[:1000]])

torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([32, 2048])
torch.Size([8, 2048])
CPU times: user 1min 51s, sys: 12.7 s, total: 2min 4s
Wall time: 2min 4s


[Sentence: "I stumbled across rerun syndication of this show several years ago, and fell in love with it. It features Téa Leoni and Holland Taylor and kept me laughing, one episode after the next. I guess it didn't make it so big, and was cancelled after a few seasons, but I believe it was a good run, and would suggest watching it...if the opportunity arises." - 64 Tokens,
 Sentence: "Paulie is a fantasy of a littler girl or perhaps her recollection of what her youth was like growing up.<br /><br />Tony Shaloub executes a flawless performance as an Russian Scientist (PhD) who cannot find decent work in America. He befriends an isolated parrot while performing meanial duties of a janitor at a behavioral science lab.<br /><br />The chief Doctor is a bitter man, as Paulie, who can speak and fully comprehend language and learn, embarasses the Doctor, who later banishses him to the lower levels of the building, where Mikail (Tony S.) finds him.<br /><br />Paulie recants his life with Marie 

In [8]:
records = []

def hook(m, i, o): 
    val = torch.zeros(1, 2048)
    val.copy_(i[0].data)
    records.append(val.tolist())

h = sent_model.decoder.register_forward_hook(hook)
for (i, sent) in enumerate(pos_data):
    sent_model.predict(Sentence(sent))
    print(i)
h.remove()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

KeyboardInterrupt: 

In [35]:
for sen in sentences:
    print(sen.labels)

[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (0.7514839172363281)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (0.21342727541923523)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (0.5871809720993042)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (0.8683277368545532)]
[POSITIVE (0.48940327763557434)]
[POSITIVE (0.8193634748458862)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE (0.7528064250946045)]
[POSITIVE (1.0)]
[POSITIVE (1.0)]
[POSITIVE

In [27]:
%%time
inter_val

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs


[tensor([[-0.0411,  0.0048, -0.0511,  ..., -0.0895, -0.1808,  0.0509]]),
 tensor([[ 0.0370, -0.1056,  0.0722,  ..., -0.0667, -0.0495, -0.1277]]),
 tensor([[-0.0193, -0.0480,  0.0119,  ..., -0.1480, -0.0269, -0.1073]]),
 tensor([[-0.0140, -0.0737,  0.0113,  ...,  0.1729, -0.0655, -0.0659]]),
 tensor([[ 0.1415, -0.0319, -0.0013,  ..., -0.1360, -0.0949, -0.2074]]),
 tensor([[-0.0221, -0.0007, -0.0531,  ..., -0.1498,  0.1480, -0.2181]]),
 tensor([[ 0.0567, -0.1174,  0.1452,  ...,  0.0286,  0.0652,  0.0825]]),
 tensor([[ 0.0655, -0.0371,  0.0578,  ..., -0.1602,  0.1299, -0.1328]]),
 tensor([[-0.0255,  0.0019, -0.0559,  ...,  0.1681, -0.1455, -0.0203]]),
 tensor([[-0.0877, -0.0327,  0.0070,  ..., -0.0902, -0.0880, -0.0886]]),
 tensor([[-0.1594,  0.0520,  0.0768,  ..., -0.0364,  0.1285, -0.0340]]),
 tensor([[-0.0577, -0.0559,  0.0334,  ..., -0.0590, -0.0160, -0.0193]]),
 tensor([[-0.0321, -0.0037, -0.0690,  ...,  0.2156, -0.0177, -0.0405]]),
 tensor([[-0.2475, -0.1108, -0.0898,  ...,  0.0203,

In [60]:
corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH)

2019-02-20 16:13:28,144 https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-dev.conllu not found in cache, downloading to /tmp/tmpk6qgnfvm


1667872B [00:00, 22522954.83B/s]         

2019-02-20 16:13:28,314 copying /tmp/tmpk6qgnfvm to cache at /home/snie/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2019-02-20 16:13:28,317 removing temp file /tmp/tmpk6qgnfvm





2019-02-20 16:13:28,635 https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-test.conllu not found in cache, downloading to /tmp/tmp4_peid4s


1661693B [00:00, 16564735.51B/s]         

2019-02-20 16:13:28,837 copying /tmp/tmp4_peid4s to cache at /home/snie/.flair/datasets/ud_english/en_ewt-ud-test.conllu
2019-02-20 16:13:28,840 removing temp file /tmp/tmp4_peid4s





2019-02-20 16:13:29,553 https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu not found in cache, downloading to /tmp/tmp73b0827p


13301153B [00:00, 53427017.37B/s]         

2019-02-20 16:13:29,881 copying /tmp/tmp73b0827p to cache at /home/snie/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2019-02-20 16:13:29,889 removing temp file /tmp/tmp73b0827p
2019-02-20 16:13:29,890 Reading data from /home/snie/.flair/datasets/ud_english
2019-02-20 16:13:29,890 Train: /home/snie/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2019-02-20 16:13:29,891 Dev: /home/snie/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2019-02-20 16:13:29,891 Test: /home/snie/.flair/datasets/ud_english/en_ewt-ud-test.conllu





In [65]:
s = corpus.train[0]

In [72]:
s.to_tagged_string('pos')

'Al <NNP> - <HYPH> Zaman <NNP> : <:> American <JJ> forces <NNS> killed <VBD> Shaikh <NNP> Abdullah <NNP> al <NNP> - <HYPH> Ani <NNP> , <,> the <DT> preacher <NN> at <IN> the <DT> mosque <NN> in <IN> the <DT> town <NN> of <IN> Qaim <NNP> , <,> near <IN> the <DT> Syrian <JJ> border <NN> . <.>'

In [77]:
pos_tagger = SequenceTagger.load('pos-multi')

2019-02-20 16:19:54,682 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4/release-dodekapos-512-l2-multi/pos-multi-v0.1.pt not found in cache, downloading to /tmp/tmpmkvm3hyv


100%|██████████| 314055714/314055714 [01:13<00:00, 4268088.76B/s]

2019-02-20 16:21:08,792 copying /tmp/tmpmkvm3hyv to cache at /home/snie/.flair/models/pos-multi-v0.1.pt





2019-02-20 16:21:08,987 removing temp file /tmp/tmpmkvm3hyv


In [80]:
pos_tagger

SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.5)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.5)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout()
  (locked_dropout): LockedDropout()
  (embedding2nn): Linear(in_features=4096, out_features=4096, bias=True)
  (rnn): LSTM(4096, 512, num_layers=2, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=21, bias=True)
)

In [108]:
s = corpus.test[1]

In [110]:
pos_tagger.predict(s)

[Sentence: "What if Google expanded on its search - engine ( and now e-mail ) wares into a full - fledged operating system ?" - 23 Tokens]

In [111]:
s.to_tagged_string()

'What <what/PRON/WP/root/Int> if <if/SCONJ/IN/mark> Google <Google/PROPN/NNP/nsubj/Sing> expanded <expand/VERB/VBD/advcl/Ind/Past/Fin> on <on/ADP/IN/case> its <its/PRON/PRP$/nmod:poss/Neut/Sing/3/Yes/Prs> search <search/NOUN/NN/compound/Sing> - <-/PUNCT/HYPH/punct> engine <engine/NOUN/NN/compound/Sing> ( <(/PUNCT/-LRB-/punct> and <and/CCONJ/CC/cc> now <now/ADV/RB/advmod> e-mail <e-mail/NOUN/NN/conj/Sing> ) <)/PUNCT/-RRB-/punct> wares <wares/VERB/NNS/obl/Plur> into <into/ADP/IN/case> a <a/DET/DT/det/Ind/Art> full <full/ADJ/RB/advmod> - <-/PUNCT/HYPH/punct> fledged <fledged/VERB/JJ/amod/Pos> operating <operating/NOUN/NN/compound/Sing> system <system/NOUN/NN/obl/Sing> ? <?/PUNCT/./punct>'