<a href="https://colab.research.google.com/github/skaltenp/snlp_sdiercks_skaltenp_2021/blob/master/SNLP_skaltenp_sdiercks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SNLP

In [None]:
# Download the datasets from Google Drive.
# gdown is installed into the runtime first; each `gdown` call below fetches
# one file by its Drive id into the current working directory.
!pip install gdown
# Per the recorded cell output this id resolves to SNLP2020_test.tsv.
!gdown https://drive.google.com/uc?id=1K_SWHsTGxbzojgElkSd1cSVY7wGFWqim
# Per the recorded cell output this id resolves to SNLP2020_training.tsv.
!gdown https://drive.google.com/uc?id=1xBZKAe7ezUIkPFlOpWwYH5hRcWkEB9Vi

Downloading...
From: https://drive.google.com/uc?id=1K_SWHsTGxbzojgElkSd1cSVY7wGFWqim
To: /content/SNLP2020_test.tsv
100% 73.9k/73.9k [00:00<00:00, 27.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1xBZKAe7ezUIkPFlOpWwYH5hRcWkEB9Vi
To: /content/SNLP2020_training.tsv
100% 72.1k/72.1k [00:00<00:00, 28.0MB/s]


In [None]:
# imports
# Install Hugging Face transformers into the runtime. The version is not
# pinned; the recorded output of this cell shows 4.15.0 being installed.
!pip install transformers

from transformers import BertTokenizer, BertModel

import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's random number generator with a fixed value.
torch.manual_seed(2012)

from sklearn.model_selection import train_test_split

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 12.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.4 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 499 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, tra

In [None]:
# load dataset

# The TSV is read without a header row, so pandas labels the columns 0, 1, 2, ...
# Column 0 is dropped straight away; later cells read column 1 as the sentence
# and column 2 as the regression target.
df = pd.read_table('SNLP2020_training.tsv', header=None).drop(columns=[0])

# train, validation split

# shuffle=False keeps the file order, so the last 10% of the rows become the
# validation set. train_test_split already returns DataFrames, so no further
# wrapping is needed.
train_set, val_set = train_test_split(df, test_size=0.1, shuffle=False)

In [None]:
# get the longest sentence
# The length is measured in characters (len() of each string in column 1), not
# in words or tokens. Later cells pass this value to the tokenizer as
# max_length, i.e. as the fixed padded sequence length.
# NOTE: the name `max` shadows the Python builtin, but the following cells read
# the value under exactly this name, so it is kept.
max = 0

for sentence in df.get(1):
    sentence_length = len(sentence)
    if sentence_length > max:
        max = sentence_length

print(f'longest sentence is {max} characters long')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

lonest sentence is 83 words long


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# dataloader with preprocessing
def dataloader(dataset, batch_size = 32, max_length= 100):
    """Tokenize a labelled frame and wrap it in a batched ``DataLoader``.

    Args:
        dataset: DataFrame whose column ``1`` holds the sentences and whose
            column ``2`` holds the target values.
        batch_size: number of samples per batch.
        max_length: every sentence is truncated or padded to exactly this
            many token ids.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding
        ``(input_ids, attention_mask, target)`` batches in dataset order.
    """
    sentences = dataset[1].to_numpy()
    results = dataset.get(2).to_numpy()
    pad_id = tokenizer.pad_token_id

    sentences_encodes = []
    attention_masks = []

    for sentence in sentences:
        sentence_encoded = tokenizer.encode(
            sentence,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=max_length,
        )
        sentences_encodes.append(sentence_encoded)

        # Attention mask: 1 for every position before the first padding id,
        # 0 from there on. A sequence that fills max_length completely (or was
        # truncated to it) contains no padding and gets an all-ones mask.
        # Previously such a sequence produced no mask at all, which left the
        # mask list shorter than the id list.
        if pad_id in sentence_encoded:
            real_tokens = sentence_encoded.index(pad_id)
        else:
            real_tokens = len(sentence_encoded)
        attention_masks.append(
            [1] * real_tokens + [0] * (len(sentence_encoded) - real_tokens)
        )

    torch_dataset = torch.utils.data.TensorDataset(
        torch.tensor(sentences_encodes),
        torch.tensor(attention_masks),
        torch.tensor(results),
    )

    return torch.utils.data.DataLoader(torch_dataset, batch_size)

# Both loaders use a batch size of 32 and pad to the length stored in `max`.
train_dataloader = dataloader(train_set, batch_size=32, max_length=max)
val_dataloader = dataloader(val_set, batch_size=32, max_length=max)

In [None]:
# load test dataset
# Read the header-less test TSV and re-wrap it in a DataFrame in one step.
test_set = pd.DataFrame(
    pd.read_table('SNLP2020_test.tsv', header=None),
    index=None,
)

# testdataset dataloader with preprocessing
def test_dataloader(dataset, batch_size = 32, max_length= 100):
    sentences = dataset[1].to_numpy()

    sentences_encodes = list()
    attation_masks = list()

    for sentence in sentences:
        sentence_encoded = tokenizer.encode(sentence, 
                                        add_special_tokens= True,
                                        truncation= True,
                                        padding = 'max_length',
                                        max_length = max_length                             
                                        )
        
        sentences_encodes.append(sentence_encoded)

        #attation mask
        for i, encoded in enumerate(sentence_encoded):
            if encoded == 0:
                mask = [1 if j<i else 0 for j in range(0, max_length)]
                attation_masks.append(mask)
                break
        
 
    torch_dataset = torch.utils.data.TensorDataset(
        torch.tensor(sentences_encodes), 
        torch.tensor(attation_masks),
        torch.tensor(dataset[0].to_numpy())
        )

    return torch.utils.data.DataLoader(torch_dataset, batch_size)

# NOTE: this rebinds the name `test_dataloader` from the function defined above
# to the DataLoader it returns. Later cells pass `test_dataloader` to train()
# and predict(), so the name is kept; the function itself is only reachable
# again after re-running its definition.
test_dataloader = test_dataloader(test_set, 32, max_length= max)

In [None]:
#########
# Model #
#########

# Define model
class FakeCheckerModel(nn.Module):
    """BERT encoder followed by a small regression head producing one score.

    The head maps BERT's pooled output through three linear layers
    (hidden_size -> 256 -> 256 -> 1).

    NOTE(review): there is no activation between the linear layers, so the
    head is mathematically a single affine map. This is left as is, because
    adding a non-linearity would change the trained model.
    """

    def __init__(self):
        super().__init__()

        self.model_loaded = BertModel.from_pretrained("bert-base-uncased")

        hidden_size = self.model_loaded.config.hidden_size
        self.linear1 = nn.Linear(hidden_size, out_features=256, bias=True)
        self.linear12 = nn.Linear(256, out_features=256, bias=True)
        self.linear2 = nn.Linear(256, out_features=1, bias=False)

    def forward(self, input_ids, attention_mask):
        """Return one unbounded score per input sequence.

        Args:
            input_ids: token ids passed to BERT as ``input_ids``.
            attention_mask: mask passed to BERT as ``attention_mask``.

        Returns:
            The output of the final linear layer (last dimension of size 1).
        """
        # BertModel
        outputs = self.model_loaded(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # extra layers on top; outputs[1] is BERT's pooled output
        logits1 = self.linear1(outputs[1])
        logits12 = self.linear12(logits1)
        logits2 = self.linear2(logits12)

        return logits2

    def save_pretrained(self, path):
        """Save the BERT encoder to ``path``.

        Only ``self.model_loaded`` is written; the linear head is not saved.
        """
        self.model_loaded.save_pretrained(path)

    def from_pretrained(self, path):
        """Load BERT encoder weights saved at ``path`` into this model.

        The weights are copied into the existing encoder via
        ``load_state_dict``, so the encoder's parameter objects stay the same.
        The previous implementation called ``load_pretrained``, which
        ``BertModel`` does not provide. The linear head is left untouched.
        """
        restored = BertModel.from_pretrained(path)
        self.model_loaded.load_state_dict(restored.state_dict())

# Define device (GPU vs CPU)
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Build the network and move its weights to the selected device.
model = FakeCheckerModel().to(device)

# AdamW over all model parameters (BERT encoder and linear head).
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

# Mean squared error between the predicted score and the target value.
criterion = torch.nn.MSELoss()

# Show the full module tree.
print(model)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FakeCheckerModel(
  (model_loaded): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [None]:
!export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

In [None]:
def train(train_data, val_data, test_data, epochs, model, optimizer, criterion):
    """Run ``epochs`` rounds of training, each followed by a validation pass.

    Args:
        train_data: iterable of ``(input_ids, attention_mask, target)`` batches
            used for the optimisation steps.
        val_data: iterable of batches of the same form, used for evaluation only.
        test_data: accepted for interface compatibility; not used here.
        epochs: number of passes over ``train_data``.
        model: module called as ``model(input_ids, attention_mask=...)``.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(prediction, target)``.

    Prints, once per epoch, the loss summed over all batches of each phase.
    Returns nothing.
    """
    for epoch in range(epochs):
        training_loss = 0
        val_loss = 0

        for phase in ('train', 'val'):
            # Compare by value: `is` on a string literal tests object identity.
            is_training = phase == 'train'

            if is_training:
                model.train()
                dataset = train_data
            else:
                model.eval()
                dataset = val_data

            total_loss = 0

            for text, attention_mask, results in dataset:
                text = text.to(device)
                attention_mask = attention_mask.to(device)
                results = results.to(device).type(torch.float32)

                if is_training:
                    optimizer.zero_grad()

                # Autograd is only enabled while training; the validation pass
                # needs no gradients.
                with torch.set_grad_enabled(is_training):
                    output = model(
                        text,
                        attention_mask=attention_mask
                    )
                    # Flatten to one value per sample. A bare squeeze() would
                    # also drop the batch dimension of a single-sample batch
                    # and then no longer match the target's shape.
                    loss = criterion(output.reshape(-1), results)

                if is_training:
                    loss.backward()
                    optimizer.step()

                total_loss += loss.item()

            if is_training:
                training_loss = total_loss
            else:
                val_loss = total_loss

        print(f'epoche: {epoch}, training_loss: {training_loss}, validation_loss: {val_loss}')
  
# Fine-tune for 8 epochs; the test loader is passed through as well.
train(
    train_data=train_dataloader,
    val_data=val_dataloader,
    test_data=test_dataloader,
    epochs=8,
    model=model,
    optimizer=optimizer,
    criterion=criterion,
)

epoche: 0, training_loss: 9.326798215508461, validation_loss: 0.9678271114826202
epoche: 1, training_loss: 8.114570826292038, validation_loss: 0.9308214634656906
epoche: 2, training_loss: 7.496062442660332, validation_loss: 0.9582123160362244
epoche: 3, training_loss: 6.92836956679821, validation_loss: 0.9683420807123184
epoche: 4, training_loss: 5.9177777618169785, validation_loss: 1.1080523133277893
epoche: 5, training_loss: 5.0714366510510445, validation_loss: 1.2409091591835022
epoche: 6, training_loss: 4.6017324812710285, validation_loss: 1.2663341462612152
epoche: 7, training_loss: 4.27728796005249, validation_loss: 1.3929442465305328


In [None]:
##########################
## predict test_dataset ##
##########################
def predict(model, test_data, output_path="./result.ttl"):
    """Score every test sample and write the results as triples to a file.

    Args:
        model: module called as ``model(input_ids, attention_mask=...)``.
        test_data: iterable of ``(input_ids, attention_mask, sample_id)`` batches.
        output_path: file to (over)write; defaults to ``./result.ttl``.

    Each sample produces one line linking its id to its score. Scores below 0
    are written as ``0``, scores above 1 as ``1``, and all others unchanged.
    """
    score_batches = []
    id_batches = []

    model.eval()

    torch.cuda.empty_cache()

    print('Predict test data')

    for text, attention_mask, sample_ids in test_data:
        text = text.to(device)
        attention_mask = attention_mask.to(device)

        with torch.no_grad():
            output = model(
                text,
                attention_mask=attention_mask
            )

        # Flatten to 1-D instead of squeeze(): squeeze() turns a batch holding
        # a single sample into a 0-d tensor, which cannot be iterated below.
        score_batches.append(output.to('cpu').reshape(-1))
        id_batches.append(sample_ids.to('cpu').reshape(-1))
        torch.cuda.empty_cache()

    # write to file
    with open(output_path, "w") as result_file:
        for scores, sample_ids in zip(score_batches, id_batches):
            for score, sample_id in zip(scores, sample_ids):

                # Clip the score to the interval [0, 1].
                if score < 0:
                    value = 0
                elif score > 1:
                    value = 1
                else:
                    value = score.numpy()

                result_file.write('<http://swc2017.aksw.org/task2/dataset/' + str(sample_id.numpy()) + '> <http://swc2017.aksw.org/hasTruthValue>"' + str(value) + '"^^<http://www.w3.org/2001/XMLSchema#double> .\n')

# Score the test set with the fine-tuned model and write the result file.
predict(model=model, test_data=test_dataloader)

Predict test data
