In [1]:
import numpy as np 
import pandas as pd 
import torch
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
import nltk.data
from nltk.tokenize import WordPunctTokenizer
from transformers import DistilBertModel, DistilBertConfig
import os

In [2]:
# Load the annotated discourse segments plus the sentence and word tokenizers.
train_data = pd.read_csv('../input/feedback-prize-2021/train.csv')
sent_tokenizer = nltk.data.load('../input/nltk-data/nltk_data/tokenizers/punkt/PY3/english.pickle')
wp_tokenizer = WordPunctTokenizer()

# Break every discourse segment into sentences; each sentence inherits the
# segment's discourse type, and `lengths` records its length in characters.
data = {'discourse_text': [], 'discourse_type': [], 'lengths': []}
for segment_text, segment_type in zip(train_data['discourse_text'], train_data['discourse_type']):
    sentences = sent_tokenizer.tokenize(segment_text)
    data['discourse_text'].extend(sentences)
    data['discourse_type'].extend([segment_type] * len(sentences))
    data['lengths'].extend(len(sentence) for sentence in sentences)

train_data = pd.DataFrame.from_dict(data)

In [3]:
# Number of sentences per discourse type after sentence splitting.
train_data['discourse_type'].value_counts()

Evidence                172748
Claim                    55441
Concluding Statement     42204
Lead                     28462
Position                 18220
Counterclaim              7530
Rebuttal                  6619
Name: discourse_type, dtype: int64

In [4]:
# Maximum number of sentences kept per discourse type. The dict order is the
# order in which the classes are concatenated below.
samples_per_class = {
    'Evidence': 70000,
    'Claim': 45000,
    'Concluding Statement': 35000,
    'Lead': 25000,
    'Position': 15000,
    'Counterclaim': 7530,
    'Rebuttal': 6619,
}

# A fixed seed makes the down-sampled training set identical on every run;
# without it `.sample()` draws from NumPy's unseeded global state.
SAMPLE_SEED = 42

sampled_parts = []
for discourse_type, n_samples in samples_per_class.items():
    class_rows = train_data[train_data.discourse_type.eq(discourse_type)]
    # Cap at the rows actually available so the cell does not raise when a
    # class has fewer rows than its budget.
    n_take = min(n_samples, len(class_rows))
    sampled_parts.append(
        class_rows.sample(n=n_take, random_state=SAMPLE_SEED, ignore_index=True)
    )

train_data = pd.concat(sampled_parts, ignore_index=True)

In [5]:
# Number of sentences per discourse type after down-sampling.
train_data['discourse_type'].value_counts()

Evidence                70000
Claim                   45000
Concluding Statement    35000
Lead                    25000
Position                15000
Counterclaim             7530
Rebuttal                 6619
Name: discourse_type, dtype: int64

In [6]:
# Label encoding: give every discourse type an integer id, numbered in the
# order the types first appear in the frame.
possible_labels = train_data.discourse_type.unique()
label_dict = {label_name: label_id for label_id, label_name in enumerate(possible_labels)}
label_dict

{'Evidence': 0,
 'Claim': 1,
 'Concluding Statement': 2,
 'Lead': 3,
 'Position': 4,
 'Counterclaim': 5,
 'Rebuttal': 6}

In [7]:
# Series.map does a direct dictionary lookup and yields an integer column.
# Series.replace with a str -> int mapping depends on implicit downcasting of
# the object column, which newer pandas versions deprecate.
train_data['label'] = train_data.discourse_type.map(label_dict)
train_data.head()

Unnamed: 0,discourse_text,discourse_type,lengths,label
0,"There are various, reasonable explanations for...",Evidence,88,0
1,They might go to college for it and get a job ...,Evidence,84,0
2,"If you are asking for advice, it is important ...",Evidence,77,0
3,"Although this assignment may seem tedious, sin...",Evidence,216,0
4,voting for an electoral group is like going to...,Evidence,156,0


In [8]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split over the frame's index labels, so both partitions
# keep the class proportions of the full frame.
X_train, X_val, y_train, y_val = train_test_split(
    train_data.index.values,
    train_data.label.values,
    test_size=0.2,
    random_state=42,
    stratify=train_data.label.values,
)

# Tag each row with the partition it fell into.
train_data['data_type'] = 'not_set'
train_data.loc[X_train, 'data_type'] = 'train'
train_data.loc[X_val, 'data_type'] = 'val'

train_data.groupby(['discourse_type', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,discourse_text,lengths
discourse_type,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1
Claim,1,train,36000,36000
Claim,1,val,9000,9000
Concluding Statement,2,train,28000,28000
Concluding Statement,2,val,7000,7000
Counterclaim,5,train,6024,6024
Counterclaim,5,val,1506,1506
Evidence,0,train,56000,56000
Evidence,0,val,14000,14000
Lead,3,train,20000,20000
Lead,3,val,5000,5000


In [9]:
tokenizer = AutoTokenizer.from_pretrained('../input/transformers/distilbert-base-uncased',
                                          do_lower_case=True)

# Every sentence is padded or truncated to this many tokens.
MAX_SEQ_LEN = 216


def _encode_split(split_name):
    """Tokenize the rows of `train_data` whose `data_type` equals `split_name`.

    Returns a tuple of the tokenizer output (holding the 'input_ids' and
    'attention_mask' tensors) and a tensor built from the rows' `label` column.
    """
    split_rows = train_data[train_data.data_type == split_name]
    encoded = tokenizer.batch_encode_plus(
        list(split_rows.discourse_text.values),
        add_special_tokens=True,
        return_attention_mask=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        max_length=MAX_SEQ_LEN,
        truncation=True,
        return_tensors='pt',
    )
    return encoded, torch.tensor(split_rows.label.values)


encoded_data_train, labels_train = _encode_split('train')
encoded_data_val, labels_val = _encode_split('val')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)



In [10]:
# Sequence-classification model loaded from the local DistilBERT checkpoint,
# with one output per entry in `label_dict`.
model = AutoModelForSequenceClassification.from_pretrained(
    "../input/transformers/distilbert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at ../input/transformers/distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ../input/transformers/distilbert-base-uncased and are newly initialized: ['classifier

In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(dataset_train)
val_sampler = SequentialSampler(dataset_val)

dataloader_train = DataLoader(dataset_train, sampler=train_sampler, batch_size=batch_size)
dataloader_validation = DataLoader(dataset_val, sampler=val_sampler, batch_size=batch_size)

In [12]:
from transformers import get_linear_schedule_with_warmup

# torch.optim.AdamW replaces transformers.AdamW, which is deprecated and no
# longer shipped in recent transformers releases. weight_decay=0.0 is passed
# explicitly to keep the old class's default (PyTorch's default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)

epochs = 3

# Linear decay of the learning rate over every optimizer step, with no warmup.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [13]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Macro-averaged F1 of the argmax class of `preds` against `labels`.

    `preds` holds one row of per-class scores per sample; `labels` holds the
    true class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='macro')

def accuracy_per_class(preds, labels):
    """Print, for each class present in `labels`, the number of its samples
    whose argmax prediction is correct, over the number of samples in that class.

    Class names are looked up through the module-level `label_dict`.
    """
    id_to_name = dict((class_id, name) for name, class_id in label_dict.items())

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        n_total = int(np.count_nonzero(in_class))
        n_correct = int(np.count_nonzero(predicted[in_class] == class_id))
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [14]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

print(device)

cuda


In [15]:
import random

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with one value.
seed_val = 1
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

def evaluate(dataloader_val):
    """Score the module-level `model` on every batch of `dataloader_val`.

    The model is switched to eval mode and run without gradient tracking.
    Each batch must provide input ids, attention mask and labels, in that order.

    Returns a tuple of:
      - the sum of the per-batch losses divided by the number of batches,
      - the logits of all batches concatenated along axis 0 (NumPy array),
      - the labels of all batches concatenated along axis 0 (NumPy array).
    """
    model.eval()

    running_loss = 0
    logits_per_batch = []
    labels_per_batch = []

    for batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in batch)
        input_ids, attention_mask, labels = on_device[0], on_device[1], on_device[2]

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

        # With labels supplied, the first output is the loss and the second the logits.
        running_loss += outputs[0].item()
        logits_per_batch.append(outputs[1].detach().cpu().numpy())
        labels_per_batch.append(labels.cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)
    all_logits = np.concatenate(logits_per_batch, axis=0)
    all_labels = np.concatenate(labels_per_batch, axis=0)

    return loss_val_avg, all_logits, all_labels
    
# Fine-tune for `epochs` passes over the training loader; after each pass,
# report the mean training loss and the validation loss / macro F1.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. `len(batch)` is the number of tensors in
        # the batch tuple (3), not the number of samples, so dividing by it
        # under-reported the loss by a constant factor.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    #torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Macro): {val_f1}')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/5104 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.2504809916743478
Validation loss: 1.1414338590396236
F1 Score (Macro): 0.517928127156863


Epoch 2:   0%|          | 0/5104 [00:00<?, ?it/s]


Epoch 2
Training loss: 1.1072655706009522
Validation loss: 1.1217178856037253
F1 Score (Macro): 0.5393380120540698


Epoch 3:   0%|          | 0/5104 [00:00<?, ?it/s]


Epoch 3
Training loss: 1.0527343580785402
Validation loss: 1.1177079552393347
F1 Score (Macro): 0.5423829976800828


In [16]:
# Per-class hit counts on the validation predictions of the last epoch.
accuracy_per_class(preds=predictions, labels=true_vals)

Class: Evidence
Accuracy: 10640/14000

Class: Claim
Accuracy: 4912/9000

Class: Concluding Statement
Accuracy: 3423/7000

Class: Lead
Accuracy: 2550/5000

Class: Position
Accuracy: 1651/3000

Class: Counterclaim
Accuracy: 711/1506

Class: Rebuttal
Accuracy: 390/1324



In [17]:
def test_sent(test_file):
    '''
    Read one test essay and split it into lowercased sentences.

    test_file: file name of the essay, resolved against the module-level `test_dir`.

    Returns a tuple (tokens, id_list, lines):
      - tokens: one list of word/punctuation tokens per sentence,
      - id_list: the essay id (file name without a trailing '.txt'),
        repeated once per sentence,
      - lines: the lowercased sentences.
    '''

    full_path = os.path.join(test_dir, test_file)
    # Explicit encoding so reading does not depend on the platform's locale.
    with open(full_path, encoding='utf-8') as f:
        data = f.read()

    # Split on the original-case text, as is done for the training data, and
    # lowercase afterwards; lowercasing first hides the capitalization the
    # sentence splitter sees.
    lines = [sentence.lower() for sentence in sent_tokenizer.tokenize(data)]

    tokens = [wp_tokenizer.tokenize(line) for line in lines]

    # Strip only a trailing '.txt'; str.replace would remove it anywhere in the name.
    essay_id = test_file[:-len('.txt')] if test_file.endswith('.txt') else test_file
    id_list = [essay_id] * len(lines)

    return tokens, id_list, lines

In [18]:
def test_bert(model, lines, batch_size=32):
    
    encoded_data_test = tokenizer.batch_encode_plus(
    lines, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=216,
    truncation=True,
    return_tensors='pt'
                                                    )

    input_ids_test = encoded_data_test['input_ids']
    attention_masks_test = encoded_data_test['attention_mask']
    #labels_val = torch.tensor(train_data[train_data.data_type=='val'].label.values)

    dataset_test = TensorDataset(input_ids_test, attention_masks_test)
    
    dataloader_test = DataLoader(dataset_test, 
                                 sampler=SequentialSampler(dataset_test), 
                                 batch_size=batch_size)
    
    model.eval()
    
    predictions = []
    
    for batch in dataloader_test:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1]
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
        
    predictions = np.concatenate(predictions, axis=0)
    
    return predictions

In [19]:
def create_predictionstring(text, incr):
    '''
    Number the whitespace-separated words of `text` consecutively, starting at `incr`.

    Returns a tuple: the word numbers joined by single spaces, and the first
    number not used (`incr` plus the number of words).
    '''
    word_count = len(text.split())
    numbered = [str(incr + offset) for offset in range(word_count)]
    return " ".join(numbered), incr + word_count

In [20]:
def _segment_row(essay_id, label, segment_lines, start_index):
    # Build one submission row from the sentences of a segment; returns the
    # row and the word index that follows the segment's last word.
    text = ' '.join(segment_lines)
    n_words = len(text.split())
    end_index = start_index + n_words
    row = {
        'id': essay_id,
        'class': label,
        'original text': text,
        'predictionstring': ' '.join(str(i) for i in range(start_index, end_index)),
    }
    return row, end_index


def build_submission(all_lines, essay_ids, output_labels):
    '''
    Merge runs of consecutive sentences that share an essay id and a predicted
    label into submission rows.

    all_lines: sentences in reading order.
    essay_ids: essay id of each sentence (parallel to `all_lines`).
    output_labels: predicted class name of each sentence (parallel to `all_lines`).

    Returns a list of dicts with keys 'id', 'class', 'original text' and
    'predictionstring'. 'predictionstring' holds the space-separated indices
    of the segment's whitespace-delimited words, counted from 0 at the start
    of each essay. Empty input yields an empty list.
    '''
    list_dicts = []
    segment_lines = []   # sentences of the segment currently being built
    segment_start = 0    # word index of that segment's first word
    prev_label = None
    prev_id = None

    for line, essay_id, label in zip(all_lines, essay_ids, output_labels):
        same_segment = label == prev_label and essay_id == prev_id
        if segment_lines and not same_segment:
            # Close the finished segment before starting the next one.
            row, next_start = _segment_row(prev_id, prev_label, segment_lines, segment_start)
            list_dicts.append(row)
            # Word numbering restarts at 0 whenever a new essay begins.
            segment_start = next_start if essay_id == prev_id else 0
            segment_lines = []
        segment_lines.append(line)
        prev_label = label
        prev_id = essay_id

    # Emit the segment that is still open when the input ends.
    if segment_lines:
        row, _ = _segment_row(prev_id, prev_label, segment_lines, segment_start)
        list_dicts.append(row)

    return list_dicts

In [21]:
test_dir = '../input/feedback-prize-2021/test'
label_dict_inverse = {v: k for k, v in label_dict.items()}

# Collect the rows of every essay first and build the frame once at the end.
submission_rows = []

# Only the '.txt' files directly inside `test_dir` are processed: `test_sent`
# resolves names against `test_dir` itself, so files in nested folders could
# not be opened. Sorting makes the row order reproducible.
for filename in sorted(os.listdir(test_dir)):
    if not filename.endswith('.txt'):
        continue
    tokens, id_list, lines = test_sent(filename)
    predictions = test_bert(model, lines)
    preds_flat = np.argmax(predictions, axis=1).flatten()
    labels = [label_dict_inverse[pred] for pred in preds_flat]
    submission_rows.extend(build_submission(lines, id_list, labels))

# Selecting the output columns here leaves out 'original text' and still
# yields a frame with the right header when no rows were produced.
fin_cat = pd.DataFrame(submission_rows, columns=['id', 'class', 'predictionstring'])
fin_cat.to_csv('submission.csv', index=False)

