In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import nltk
import torch
import torch.nn as nn
import transformers
from transformers import AutoModel, BertTokenizerFast

# Select the compute device: use the GPU when CUDA is available, otherwise fall
# back to the CPU so the notebook still runs in a CPU-only session.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Locations of the saved encoder and tokenizer inside the attached dataset.
bert_dir = '../input/feedback-model/bert'
tokenizer_dir = '../input/feedback-model/tokenizer'

# Load the pretrained encoder; return_dict=False makes it return a plain tuple.
bert = AutoModel.from_pretrained(bert_dir, return_dict=False)

# Load the matching fast tokenizer.
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_dir)

In [None]:
# Freeze the encoder: turn off gradient tracking for every parameter of `bert`.
bert.requires_grad_(False)

In [None]:
class BERT_Arch(nn.Module):
    """Classification head stacked on top of a BERT-style encoder.

    The encoder's pooled output goes through
    ``Linear -> ReLU -> Dropout -> Linear -> LogSoftmax`` and the module returns
    per-class log-probabilities.

    Args:
        bert: Encoder module. It is called as ``bert(sent_id, attention_mask=mask)``
            and must return a 2-tuple; the second element is fed to the head.
        hidden_size: Width of the encoder output consumed by the first dense
            layer (default 768).
        fc_dim: Width of the intermediate dense layer (default 512).
        num_classes: Number of output classes (default 8).
        dropout: Dropout probability applied after the ReLU (default 0.1).

    The defaults reproduce the original fixed architecture, and the attribute
    names (``bert``, ``fc1``, ``fc2``) are unchanged so existing state dicts
    still load.
    """

    def __init__(self, bert, hidden_size=768, fc_dim=512, num_classes=8, dropout=0.1):
        super().__init__()

        self.bert = bert

        # dropout layer (only active in training mode; call .eval() for inference)
        self.dropout = nn.Dropout(dropout)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1: encoder output -> intermediate representation
        self.fc1 = nn.Linear(hidden_size, fc_dim)

        # dense layer 2 (output layer): one unit per class
        self.fc2 = nn.Linear(fc_dim, num_classes)

        # log-softmax over the class dimension (dim=1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities over the classes for each input sequence.

        Args:
            sent_id: Token-id tensor passed to the encoder as its first argument.
            mask: Attention-mask tensor passed as ``attention_mask``.

        Returns:
            Tensor of log-probabilities, normalised along ``dim=1``.
        """
        # The encoder returns a tuple; only its second element is used here.
        _, cls_hs = self.bert(sent_id, attention_mask=mask)

        hidden = self.dropout(self.relu(self.fc1(cls_hs)))

        # output layer followed by log-softmax
        return self.softmax(self.fc2(hidden))

In [None]:
# Wrap the pretrained encoder in the classification architecture and place the
# resulting module on the selected device in one step.
model = BERT_Arch(bert).to(device)

In [None]:
# Load the weights of the best model.
# map_location remaps the stored tensors onto the device in use, so a checkpoint
# saved from a GPU run can still be deserialized when that GPU is not present.
path = '../input/feedback-model/saved_weights.pt'
model.load_state_dict(torch.load(path, map_location=device))

In [None]:
# Device used for the inference tensors: CUDA when present, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
def create_test_texts_list(test_dir="../input/feedback-prize-2021/test", encoding="utf-8"):
    """Read every ``.txt`` file in ``test_dir`` into a list of records.

    Args:
        test_dir: Directory to scan (non-recursive). Defaults to the original
            hard-coded competition test folder.
        encoding: Text encoding used to read the files. Passed explicitly so the
            result does not depend on the platform's default encoding.

    Returns:
        A list of ``{'text': <file contents>, 'id': <file name without extension>}``
        dicts, ordered by file name so repeated runs give the same order.

    Raises:
        FileNotFoundError: If ``test_dir`` does not exist (raised by ``os.listdir``).
    """
    total_list = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(test_dir)):
        file_path = os.path.join(test_dir, filename)
        file_id, extension = os.path.splitext(filename)

        # Keep regular files ending in ".txt" only (skips directories and other types).
        if extension != ".txt" or not os.path.isfile(file_path):
            continue

        with open(file_path, encoding=encoding) as f:
            total_list.append({
                'text': f.read(),
                'id': file_id,
            })

    return total_list

In [None]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline into `nlp`.
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook (sentence splitting below uses nltk) — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Every sentence is padded or truncated to exactly this many tokens.
max_seq_len = 50


def tokenize(text):
    """Encode a batch of sentences with the module-level ``tokenizer``.

    Args:
        text: The batch to encode; the caller in this notebook passes a list of
            sentence strings.

    Returns:
        The tokenizer's batch encoding. Each sequence is padded/truncated to
        ``max_seq_len`` tokens and token-type ids are omitted.
    """
    return tokenizer.batch_encode_plus(
        text,
        max_length=max_seq_len,
        # `pad_to_max_length=True` is deprecated; this is its documented replacement.
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
    )


In [None]:
# get predictions for test data
import torch, gc

def predict(seq,mask):
    """Classify a batch of encoded sequences with the module-level ``model``.

    Args:
        seq: Batch of token-id sequences (anything ``torch.tensor`` accepts).
        mask: Matching batch of attention masks.

    Returns:
        1-D NumPy integer array holding the index of the highest-scoring class
        for each sequence. An empty batch yields an empty array.
    """
    # Nothing to classify: return early instead of feeding an empty tensor to the model.
    if len(seq) == 0:
        return torch.empty(0, dtype=torch.long).numpy()

    # Inference must run in eval mode; otherwise the dropout layer in the
    # classification head stays active and predictions vary between runs.
    model.eval()

    test_seq = torch.tensor(seq)
    test_mask = torch.tensor(mask)

    # Release unreferenced Python objects and cached CUDA blocks before the forward pass.
    gc.collect()
    torch.cuda.empty_cache()

    with torch.no_grad():
        preds = model(test_seq.to(device), test_mask.to(device))
        _, prediction = torch.max(preds, dim=1)

    return prediction.detach().cpu().numpy()

In [None]:
# Maps a predicted class index (0-7) to its discourse label.
# NOTE(review): the order presumably mirrors the label encoding used at training
# time, which is not visible in this notebook — verify before changing it.
label_dict = dict(enumerate([
    "Claim",
    "Concluding Statement",
    "Counterclaim",
    "Evidence",
    "Lead",
    "Other",
    "Position",
    "Rebuttal",
]))

In [None]:
# Read all test essays into a list of {'text', 'id'} records.
test_texts = create_test_texts_list()

In [None]:
def _rows_for_essay(essay_id, sentences, preds, labels, skip_class=5):
    """Turn per-sentence class predictions into submission rows for one essay.

    Consecutive sentences with the same predicted class are merged into a single
    row whose ``predictionstring`` lists the word indices they cover. Sentences
    predicted as ``skip_class`` produce no row, but their words are still
    counted so the indices of later sentences stay aligned with the essay.

    Args:
        essay_id: Identifier written to the ``id`` field of each row.
        sentences: Sentence strings, in essay order.
        preds: Predicted class index per sentence (same length as ``sentences``).
        labels: Mapping from class index to class name.
        skip_class: Class index that is never emitted (5 is "Other" in ``label_dict``).

    Returns:
        List of ``{"id", "class", "predictionstring"}`` dicts.
    """
    preds = [int(p) for p in preds]
    rows = []
    word_offset = 0    # index of the first word of the current sentence
    span_indices = []  # word indices collected for the current run of equal classes

    for i, (sentence, pred) in enumerate(zip(sentences, preds)):
        n_words = len(sentence.split())
        sentence_indices = range(word_offset, word_offset + n_words)
        # Always advance the offset, even for skipped sentences; otherwise every
        # later index would be shifted towards the start of the essay.
        word_offset += n_words

        if pred == skip_class:
            continue

        # A new run starts whenever the previous sentence had a different class.
        if i == 0 or preds[i - 1] != pred:
            span_indices = []
        span_indices.extend(sentence_indices)

        # The run continues into the next sentence: emit the row later.
        if i + 1 < len(preds) and preds[i + 1] == pred:
            continue

        rows.append({
            "id" : essay_id,
            "class" : labels.get(pred),
            "predictionstring" : " ".join(str(index) for index in span_indices)
        })

    return rows


pred_dicts_list = []

for test_text in test_texts:
    tokenized_sentences = nltk.sent_tokenize(test_text["text"])
    if not tokenized_sentences:
        # Empty essay: there is nothing to tokenize or classify.
        continue

    tokenized_text = tokenize(tokenized_sentences)
    preds = predict(tokenized_text['input_ids'],tokenized_text['attention_mask'])

    pred_dicts_list.extend(
        _rows_for_essay(test_text["id"], tokenized_sentences, preds, label_dict)
    )

In [None]:
# Build the submission table. The columns are named explicitly so the frame (and
# the CSV written from it) keeps its header even when no predictions were produced.
submission_df = pd.DataFrame(pred_dicts_list, columns=["id", "class", "predictionstring"])

In [None]:
# Display the submission table as the cell output for a quick visual check.
submission_df

In [None]:
# Write the submission file to the working directory, without the index column.
submission_df.to_csv(path_or_buf="submission.csv", index=False)