# Testing Bert

In [19]:
import pandas as pd
import torch
from transformers import BertTokenizer

### loading and correction of corpus

In [2]:
corpus = pd.read_csv("../corpora/german_poems.csv")

In [3]:
corpus.sample(3)

Unnamed: 0,pid,filename,poet,title,year,poem,poemlength
54920,1323,"Gryphius, Andreas_18._1640","Gryphius, Andreas",18.,1640,Nembt wider von mir hin/ was ihr mir nechst ve...,133
45707,48606,"Freiligrath, Ferdinand_3._1843","Freiligrath, Ferdinand",3.,1843,Wie unsre mut'gen Orlogsmänner heißen? Komm mi...,141
58883,6164,"Tersteegen, Gerhard_7. Jesus zu der Seele_1733","Tersteegen, Gerhard",7. Jesus zu der Seele,1733,"Ei, stör dich nicht, mein Kind, bleib innig ab...",68


In [4]:
orthography_correction = {"ſ": "s", "uͤ": "ue", "aͤ": "ae", "oͤ": "oe"}
for k, v in orthography_correction.items():
    corpus["poem"] = corpus["poem"].str.replace(k, v)

In [5]:
sentences = corpus.poem #todo: rename
labels = corpus.poet

### BertTokenizer

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-german-dbmdz-uncased')

In [7]:
print(sentences[0][:30])
print(tokenizer.tokenize(sentences[0])[:8])
print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])[:8]))

Saß der König Artaxerxes In de
['saß', 'der', 'kon', '##ig', 'art', '##ax', '##er', '##xe']
[11189, 127, 367, 209, 1548, 6905, 105, 10215]


### Sentences to IDs

In [8]:
max_len = 30 #normally up to 512

In [9]:
input_ids = []
for sent in sentences:
    encoded_sent = tokenizer.encode(sent[:max_len], add_special_tokens = True)
    input_ids.append(encoded_sent)
print('Original: ', sentences[0][:30])
print('Token IDs:', input_ids[0])

Original:  Saß der König Artaxerxes In de
Token IDs: [102, 11189, 127, 367, 209, 1548, 6905, 105, 10215, 30941, 142, 433, 103]


### Padding

In [10]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [11]:
#padding
max_pad_len = 64
from keras.preprocessing.sequence import pad_sequences

input_ids = pad_sequences(input_ids, 
                          maxlen=max_pad_len, 
                          dtype="long", 
                          value=0, 
                          truncating="post", 
                          padding="post")

In [12]:
input_ids

array([[  102, 11189,   127, ...,     0,     0,     0],
       [  102,   224,   281, ...,     0,     0,     0],
       [  102,   244,   377, ...,     0,     0,     0],
       ...,
       [  102,  1771, 19725, ...,     0,     0,     0],
       [  102,   261,   235, ...,     0,     0,     0],
       [  102,  2315,  3188, ...,     0,     0,     0]])

### Attention Mask

In [13]:
attention_masks = []
for sent in input_ids:

    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)

### Train Test Split

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, 
                                                                                    labels, 
                                                                                    random_state=2018, 
                                                                                    test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       labels,
                                                       random_state=2018, 
                                                       test_size=0.1)

### Convert into Tensors

In [21]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler# The DataLoader needs to know our batch size for training, so we specify it 

In [23]:
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

  """Entry point for launching an IPython kernel.
  


In [24]:
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

ValueError: too many dimensions 'str'

In [22]:
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

AttributeError: 'list' object has no attribute 'size'