In [1]:
from custom.pipeline import ClassicPipeline, TransformerPipeline, txt2list, PreProcess
from custom.embeddings_generation import EmbeddingGenerator
from custom.sentiment import BiLSTMClassifier
import torch
import spacy

In [2]:
preprocess = PreProcess()
batch_sequence = preprocess.load_text(
    txt_path='/Users/sergicastellasape/Repos/zeta-alpha/datasets/quora_questions_lite.txt')
batch_tensors, indices_chunk = preprocess.forward(batch_sequence)

In [3]:
i = 0
print(batch_sequence[i])
print(indices_chunk[i])
print(batch_tensors.size())

Are convolutional neural networks useful for tasks other than image classification?
[(0,), (1,), (2, 3, 4, 5, 6, 7, 8), (9,), (10,), (11,), (12,), (13,), (14, 15), (16,), (17,)]
torch.Size([16, 48, 768])


In [4]:
def max_pooling(input_tensors, dim=0):
    # input tensors have shape (n, EmbDimension), 
    # this is problematic for now because this operation cannot be parallelized on GPU
    tensor, _ = torch.max(input_tensors, dim=0)
    return tensor

In [5]:
generator = EmbeddingGenerator(pool_function=max_pooling)

In [6]:
compressed, mask = generator.forward(batch_tensors, indices_chunk)

In [7]:
print(compressed.size())
print(mask.size())
print(batch_tensors.size())
print(mask[0,:])

torch.Size([16, 48, 768])
torch.Size([16, 48])
torch.Size([16, 48, 768])
tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False])


In [8]:
sentset_size = 3 # negative, neutral, positive 
batch_size = 16
num_layers = 2
embedding_size = 768
hidden_size = 768

lengths = list(mask.sum(dim=1))
lengths = [int(i) for i in lengths] # list to int instead of tensors

padded_sequence = torch.nn.utils.rnn.pack_padded_sequence(compressed, lengths, batch_first=True, enforce_sorted=False)
model = BiLSTMClassifier(embedding_size, hidden_size, sentset_size, num_layers, batch_size, dropout=0.)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)


prediction = model.forward(compressed)
target = torch.zeros_like(prediction)
target[:, 0] = torch.ones((16,))
L = model.loss(prediction, target)
L.backward()
optimizer.step()

In [2]:
nlp = spacy.load('en_trf_bertbaseuncased_lg')
classic_nlp = spacy.load('en_core_web_sm')

In [3]:
#text = "What is a bayesian neural network?"
cls_pipeline = ClassicPipeline()
trf_pipeline = TransformerPipeline()

In [9]:
texts = txt2list(
    txt_path='/Users/sergicastellasape/Repos/zeta-alpha/datasets/quora_questions.txt')
docs = trf_pipeline.make_docs(texts[0:10])
cls_docs = cls_pipeline.make_docs(texts)

In [5]:
#texts = ['What are Convolutional Neural Networks this is.', 'Are bayesian networks new.']

spacy_tokenization = cls_pipeline.make_base_tokenization(cls_docs)
base_chunks = cls_pipeline.make_noun_phrase_chunks(cls_docs)

brackets, chunk2spacy_idx = cls_pipeline.compact_tokens(texts)
spacy2trf_idx = trf_pipeline.spacy2trf_indices(docs)

chunk2trf_idx = trf_pipeline.chunk2trf_indices(chunk2spacy_idx, spacy2trf_idx)

In [8]:
i = 3

print("Spacy chunks: ", base_chunks[i])
print("Spacy tokens: ", spacy_tokenization[i])
print("Transformer wordpieces: ", docs[i]._.trf_word_pieces_)
print("Chunk to trans idx: ", chunk2trf_idx[i])

Spacy chunks:  [('what',), ('a', 'dirichlet', 'process'), ('layman', "'s", 'terms')]
Spacy tokens:  ['bayesian', 'inference', ':', 'what', 'is', 'a', 'dirichlet', 'process', 'in', 'layman', "'s", 'terms', '?']
Transformer wordpieces:  ['[CLS]', 'bay', '##esian', 'inference', ':', 'what', 'is', 'a', 'dir', '##ich', '##let', 'process', 'in', 'lay', '##man', "'", 's', 'terms', '?', '[SEP]']
Chunk to trans idx:  [(1, 2), (3,), (4,), (5,), (6,), (7, 8, 9, 10, 11), (12,), (13, 14, 15, 16, 17), (18,)]


In [7]:
classic_doc = classic_nlp(text)
chunks_tokenization = [chunk.text for chunk in classic_doc.noun_chunks]
spacy_tokenization = [word.text for word in doc]

NameError: name 'text' is not defined

In [None]:
print(chunks_tokenization)
print(spacy_tokenization)
spacy

In [None]:
from spacy.gold import align
a = ['What', 'is', 'a bayesian neural network', '?']
b = ['What', 'is', 'a', 'bayesian', 'neural', 'network', '?']
cost, a2b, b2a, a2b_multi, b2a_multi = align(a, b)
print("Misaligned tokens:", cost)  # 2
print("One-to-one mappings a -> b", a2b)  # array([0, 1, 2, 3, -1, -1, 5, 6])
print("One-to-one mappings b -> a", b2a)  # array([0, 1, 2, 3, 5, 6, 7])
print("Many-to-one mappings a -> b", a2b_multi)  # {4: 4, 5: 4}
print("Many-to-one mappings b-> a", b2a_multi)  # {}

In [None]:
a = [(1,), (2,), (3, 4,), (5,)]
print(a[1:])

In [None]:
s = 'sergi     castella'
s.split()

In [None]:
print(s)

In [11]:
import torch
t = torch.Tensor([1, 2, 3, 4])

In [14]:
t.remove(2)

AttributeError: 'Tensor' object has no attribute 'remove'