In [1]:
import numpy as np 
import pandas as pd 
import torch
import os

from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
from torch.utils.data import TensorDataset

tokenizer = AutoTokenizer.from_pretrained('../input/allenailongformerbase4096/longformer', 
                                          do_lower_case=True)

In [2]:
train_df = pd.read_csv('../input/feedback-prize-2021/train.csv')
train_df.head()

label_dict = {'Lead': 1, 'Position': 2, 'Evidence': 3, 'Claim': 4, 'Counterclaim': 5, 'Rebuttal': 6, 'Concluding Statement': 7, 'No Pred': 0}

In [3]:
# tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
new_df_holder = []
former_id = None
ongoing_text = ''
ongoing_tokenized = []
ongoing_labels = []

for index, row in train_df.iterrows():
    if not former_id:
        former_id = row['id']
    essay_id = row['id']
    text = row['discourse_text']
    # text_list = row['discourse_text'].split()
    # print(text.split())
    # tokenized_text = tokenizer.encode(text=text.split(), add_special_tokens=True, max_length=1024, return_tensors='pt', padding='max_length', truncation=True)['input_ids']
    tokenized_text = tokenizer.tokenize(text)
    # print(tokenized_text)
    labels = [label_dict[row['discourse_type']]] * len(tokenized_text)
    
    if former_id == essay_id:
        ongoing_text += text
        # ongoing_tokenized.extend(tokenized_text)
        ongoing_labels.extend(labels)
    else:
        new_df_holder.append({'essay_id': former_id, 'text': ongoing_text, 'labels': ongoing_labels})
        former_id = essay_id
        ongoing_text = text
        # ongoing_tokenized = tokenized_text
        ongoing_labels = labels

In [4]:
train_data = pd.DataFrame(new_df_holder)

constant_array = np.full(1024, 99)

label_holder = np.zeros([len(train_data), 1024])
# label_holder
for idx, label in enumerate(train_data.labels):
    label = np.array(label)
    label_len = len(label)
    if label_len > 1024:
        new_label = label[:1024]
    else:
        needed = 1024 - len(label)
        const = np.array([0] * needed)
        new_label = np.append(label, const)
    label_holder[idx] = new_label

label_holder

array([[1., 1., 1., ..., 0., 0., 0.],
       [2., 2., 2., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_data.index.values, 
                                                  train_data.text.values, 
                                                  test_size=0.2, 
                                                  random_state=42)

train_data['data_type'] = ['not_set']*train_data.shape[0]

train_data.loc[X_train, 'data_type'] = 'train'
train_data.loc[X_val, 'data_type'] = 'val'

# train_data.groupby(['labels', 'data_type']).count()
train_data.head()

Unnamed: 0,essay_id,text,labels,data_type
0,423A1CA112E2,Modern humans today are always on their phone....,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",val
1,A8445CABFECE,Drivers should not be able to use phones while...,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, ...",train
2,6B4F7A0165B9,The ability to stay connected to people we kno...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",train
3,E05C7F5C1156,People are debating whether if drivers should ...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",val
4,50B3435E475B,Over half of drivers in today's society have t...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",train


In [6]:
encoded_data_train = tokenizer.batch_encode_plus(
    list(train_data[train_data.data_type=='train'].text.values), 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=1024,
    truncation=True,
    return_tensors='pt'
)

print(encoded_data_train)

encoded_data_val = tokenizer.batch_encode_plus(
    list(train_data[train_data.data_type=='val'].text.values), 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=1024,
    truncation=True,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

# dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
# dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)



{'input_ids': tensor([[    0, 34002,  3697,  ...,     1,     1,     1],
        [    0,   133,  1460,  ...,     1,     1,     1],
        [    0, 10777,   457,  ...,     1,     1,     1],
        ...,
        [    0,  2895,    82,  ...,     1,     1,     1],
        [    0, 35634,  1268,  ...,     1,     1,     1],
        [    0,   970,    34,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [7]:
labels_train = torch.from_numpy(label_holder[train_data.data_type=='train'])
labels_val = torch.from_numpy(label_holder[train_data.data_type=='val'])

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [8]:
model = AutoModelForTokenClassification.from_pretrained('../input/allenailongformerbase4096/longformer',
                                  num_labels=8,
                                  output_attentions=False,
                                  output_hidden_states=False)

Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at ../input/allenailongformerbase4096/longformer and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 4

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [10]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 3

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [11]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='macro')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [13]:
# dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_small = TensorDataset(input_ids_train[:1000], attention_masks_train[:1000], labels_train[:1000])

dataloader_small = DataLoader(dataset_small, 
                              sampler=RandomSampler(dataset_small), 
                              batch_size=batch_size)

In [14]:
import random
from tqdm.notebook import tqdm

seed_val = 1
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0].long(),
                  'attention_mask': batch[1].long(),
                  'labels':         batch[2].long(),
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        # print(logits.shape)
        label_ids = inputs['labels'].cpu().numpy()
        # print(label_ids.shape)
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals
    
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0].long(),
                  'attention_mask': batch[1].long(),
                  'labels':         batch[2].long(),
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    # torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Macro): {val_f1}')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/3119 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.5945223220105965
Validation loss: 0.48720412367047405
F1 Score (Macro): 0.567414637097118


Epoch 2:   0%|          | 0/3119 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.42773188937021855
Validation loss: 0.4458125977562024
F1 Score (Macro): 0.5907011426372567


Epoch 3:   0%|          | 0/3119 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.370809661798387
Validation loss: 0.44444739441745557
F1 Score (Macro): 0.5940427493758625


In [15]:
accuracy_per_class(predictions, true_vals)

Class: No Pred
Accuracy: 103/1740160

Class: Lead
Accuracy: 106021/113743

Class: Position
Accuracy: 47126/65171

Class: Evidence
Accuracy: 757560/825428

Class: Claim
Accuracy: 129937/202687

Class: Counterclaim
Accuracy: 16469/31782

Class: Rebuttal
Accuracy: 13167/26849

Class: Concluding Statement
Accuracy: 168356/188036



In [16]:
def test_essay(test_file):
    '''
    generates tokenized essays for test data with corresponding list of essay IDs
    '''
    
    full_path = os.path.join(test_dir, test_file)
    with open(full_path) as f:
        data = f.read()
    text = data.lower()
    
    # lines_len = len(text)
    id_str = test_file.replace('.txt', '')
    
    return [text], id_str

In [17]:
def test_longform(model, essay, batch_size=1):
    
    encoded_data_test = tokenizer.batch_encode_plus(
    essay, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=1024,
    truncation=True,
    return_tensors='pt'
                                                    )

    input_ids_test = encoded_data_test['input_ids']
    attention_masks_test = encoded_data_test['attention_mask']
    # labels_val = torch.tensor(train_data[train_data.data_type=='val'].label.values)

    dataset_test = TensorDataset(input_ids_test, attention_masks_test)
    
    dataloader_test = DataLoader(dataset_test, 
                                 sampler=SequentialSampler(dataset_test), 
                                 batch_size=batch_size)
    
    model.eval()
    
    predictions = []
    
    for batch in dataloader_test:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1]
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
        
    predictions = np.concatenate(predictions, axis=0)
    
    return predictions

In [18]:
def build_submission(essay_id, output_labels):
    list_dicts = []
    # incr = 0
    for idx, label in enumerate(output_labels):
        # label = output_labels[idx]
        # essay_id = essay_ids[idx]
        if idx == 0:
            prev_label = label
            prev_id = essay_id
            # building_text += line
            # incr = 1
            new_incr = idx
            continue
        if label == prev_label and essay_id == prev_id:
            # building_text += line
            prev_label = label
            prev_id = essay_id
        else:
            # prediction_string, new_incr = create_predictionstring(building_text, incr)
            prediction_string = []
            for i in np.arange(new_incr, idx):
                prediction_string.append(str(i))
            prediction_string = " ".join(prediction_string)
            list_dicts.append({'id': prev_id, 'class': prev_label, 'predictionstring': prediction_string})
            if essay_id != prev_id:
                incr = 0
            else:
                new_incr = idx
                prev_label = label
                # building_text = line
    return list_dicts

In [19]:
test_dir = '../input/feedback-prize-2021/test'
concat = []
label_dict_inverse = {v: k for k, v in label_dict.items()}
for dirname, _ , filenames in os.walk(test_dir):
    for filename in filenames:
        raw_essay, id_str = test_essay(filename)
        predictions = test_longform(model, raw_essay)
        preds_flat = np.argmax(predictions, axis=2).flatten()
        labels = []
        for pred in preds_flat:
            labels.append(label_dict_inverse[pred])
        list_dicts = build_submission(id_str, labels)
        sub = pd.DataFrame(data=list_dicts)
        concat.append(sub)
        
fin_cat = pd.concat(concat, ignore_index = True)
# fin_cat.drop(columns = ['original text'], inplace = True)
fin_cat.to_csv('submission.csv', index=False)

