In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support, roc_curve, auc
import spacy
import nltk
from nltk.corpus import stopwords
from IPython.display import clear_output 
from termcolor import colored
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


try:
    from transformers import BertTokenizer, AutoTokenizer, PreTrainedTokenizer
except:
    !pip install transformers
    from transformers import BertTokenizer, AutoTokenizer, PreTrainedTokenizer
    clear_output()

In [2]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(1)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = '/content/drive/MyDrive/nlp_data'
# Load dataset
pcl_col_names = ['paragraph_id', 'article_id', 'keyword', 'country_code', 'paragraph','label']
cat_col_names = ['paragraph_id', 'article_id', 'paragraph', 'keyword', 'country_code', 'span_start', 'span_end', 'span_text', 'category_label', 'number_of_annotators_agreeing_on_that_label']
df_pcl = pd.read_csv(os.path.join(path, 'dontpatronizeme_pcl.tsv'), sep='\t', skiprows=3, header=None, names=pcl_col_names, index_col='paragraph_id')
df_cat = pd.read_csv(os.path.join(path, 'dontpatronizeme_categories.tsv'), sep='\t', skiprows=3, header=None, names=cat_col_names)

df_pcl.dropna(subset=['paragraph'], inplace=True)
df_cat.dropna(subset=['paragraph'], inplace=True)
# 0,1 => No PCL, 2, 3, 4 => PCL
df_pcl['label'] = 1 * (df_pcl['label'] > 1)

# Train/test split based on official document
df_train_index = pd.read_csv(os.path.join(path, 'train_semeval_parids-labels.csv'))
df_test_index = pd.read_csv(os.path.join(path, 'dev_semeval_parids-labels.csv'))
df_train = df_pcl.reindex(df_train_index['par_id'])
df_test = df_pcl.reindex(df_test_index['par_id'])

In [4]:
df_train['para_len'] = df_train['paragraph'].apply(len)
df_train.groupby('label')['para_len'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,7581.0,265.90529,161.163166,8.0,168.0,232.0,328.0,5493.0
1,794.0,286.525189,180.4126,34.0,177.25,251.0,355.0,2776.0


In [5]:
# spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
try:
    nlp = spacy.load('en_core_web_md')
except:
    !python -m spacy download en_core_web_md
    clear_output()

In [6]:
from collections import defaultdict
from spacy.tokens import Token

In [7]:
def preprocess(paragraph):
    tokens_list = []
    sent_list = []
    num = 0
    email = 0
    url = 0
    bracket = 0
    quote = 0
    currency = 0
    oov = 0
    for doc in nlp.pipe(paragraph):
        tokens = defaultdict(list)
        sent_list.append(doc)
        for token in doc:
            if token.is_stop:
                continue
            tokens['tokens'].append(token)
            if token.like_num:
                tokens['num'].append(token)
                num += 1
            if token.like_email:
                tokens['email'].append(token)
                email += 1
            if token.like_url:
                url +=1
                tokens['url'].append(token)
            if token.is_bracket:
                bracket += 1
            if token.is_quote:
                quote += 1
            if token.is_currency:
                currency += 1
                tokens['currency'].append(token)
            if token.is_oov:
                oov += 1
                tokens['oov'].append(token)
            # if len(tokens) > 3:
            #     tokens.append(doc) 
            # print(token.text, token.like_num, nlp.vocab.strings[token.text])
        # break
        tokens_list.append(tokens)
    return tokens_list, (num,email,url,bracket,quote,currency,oov), sent_list
pcl_tokens, pcl_stats, pcl_sent_list = preprocess(df_pcl.paragraph)
cat_tokens, cat_stats, cat_sent_list = preprocess(df_cat.span_text)

In [161]:
doc1 = nlp('GOD WILL NOT DO FOR MAN WHAT MAN CAN DO FOR HIMSELF')
for token in doc1:
    print(token.is_stop)

False
True
True
True
True
False
True
False
True
True
True
True


In [198]:
tk = AutoTokenizer.from_pretrained("bert-base-cased")

class dataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tk = tokenizer
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
        
        data_encoded = self.tk(self.df['paragraph'].iloc[index], padding='max_length', max_length=512, return_tensors='pt')
        return data_encoded['input_ids'].squeeze(), data_encoded['attention_mask'].squeeze(), torch.LongTensor([self.df['label'].iloc[index]]).squeeze()

train_data = dataset(df_train,tk)
val_data = dataset(df_test,tk)
train_dataloader = DataLoader(dataset = train_data, batch_size=24, shuffle=True)
val_dataloader = DataLoader(dataset = val_data, batch_size=24, shuffle=False)

In [120]:
a,b,c = next(iter(train_dataloader))
print(a.shape, b.shape, c.shape)

torch.Size([1, 512]) torch.Size([1, 512]) torch.Size([1])


In [63]:
use_cuda = torch.cuda.is_available()
device = 'cuda' if use_cuda else 'cpu'

In [186]:
def cal_acc(y_pred, y_true):
    return torch.sum(torch.argmax(y_pred, axis=1) == y_true) / len(y_true)

In [187]:
from transformers import RobertaForSequenceClassification, get_linear_schedule_with_warmup, BertForSequenceClassification
class MyBertModel(nn.Module):
    def __init__(self):
        super(MyBertModel, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        
    def forward(self, input_ids, attention_mask):
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        out = bert_out.logits
        return out

class Trainer(object):
    def __init__(self, model, train_loader, val_loader):
        self.model = model.to(device)
        self.epochs = 20
        self.optimizer = torch.optim.AdamW(params=self.model.parameters(), lr=0.003)
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=self.epochs * len(train_loader))
        self.loss_fn = nn.CrossEntropyLoss()
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.metric = cal_acc

        
    


    def run_one_epoch(self, loader, logging_freq=10, eval=False ):
        # Moving average statistics
        epoch_loss = 0.0
        epoch_accuracy = 0.0

        batch_loss = []
        batch_accuracy = []
        for step, batch in enumerate(loader):
            input_ids, attention_mask, y_true = [x.to(device) for x in batch]
            y_pred = self.model(input_ids, attention_mask)

            if not eval:
                self.optimizer.zero_grad()

                loss = self.loss_fn(y_pred, y_true)
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()

                accuracy = self.metric(y_pred, y_true)
                loss = loss.cpu().item() if use_cuda else loss.item()
                accuracy = accuracy.cpu().item() if use_cuda else accuracy.item()     
                
                batch_loss.append(loss)
                batch_accuracy.append(accuracy)

            else:
                loss = self.loss_fn(y_pred.cpu(), y_true)
                accuracy = self.metric(y_pred, y_true)

                loss = loss.cpu().item() if use_cuda else loss.item()
                accuracy = accuracy.cpu().item() if use_cuda else accuracy.item()  

                batch_loss.append(loss)
                batch_accuracy.append(accuracy)
                
            epoch_loss = np.sum(batch_loss)
            epoch_accuracy += np.sum(batch_accuracy)
            mode = "Train" if not eval else "Eval"
            if (step+1) % logging_freq == 0: # Use 1-based index for logging
                print("Mode: {} | Step: {} | Loss: {} | Accuracy: {}".format(
                    mode, step + 1, 
                    np.mean(batch_loss[(step + 1 - logging_freq) : (step+1)]), 
                    np.mean(batch_accuracy[(step + 1 - logging_freq) : (step+1)])
                ))
            if (step) == len(loader)-1:
                print("Mode: {} | End of Epoch | Loss: {} | Accuracy: {}".format(
                    mode, 
                    epoch_loss/(step+1),
                    epoch_accuracy/(step+1)
                ))
            
        return epoch_loss, accuracy
    
    def train(self, val_freq=20):
        # Moving average statistics
        train_loss = 0.0
        train_accuracy = 0.0
        val_loss = 0.0
        val_accuracy = 0.0

        for i in range(self.epochs):
            self.model.train()
            print('-' * 30 + 'Train for Epoch {}'.format(i) + '-'*30 )
            epoch_loss, epoch_accuracy = self.run_one_epoch(self.train_loader, logging_freq=10, eval=False)
            
            train_loss += epoch_loss
            train_accuracy += epoch_accuracy

            print("Mode: Train | Epoch: {} | Loss: {} | Accuracy: {}".format(
                  i + 1, train_loss / (i+1), train_accuracy / (i+1)
            ))
            if i % val_freq == 0:
                self.model.eval()
                with torch.no_grad():
                    print('-' * 30 + 'Val at Epoch{}'.format(i) + '-'*30 )
                    epoch_loss, epoch_accuracy = self.run_one_epoch(self.val_loader, logging_freq=10, eval=True)

                    val_loss += epoch_loss
                    val_accuracy += epoch_accuracy
                    print("Mode: Eval | Epoch: {} | Loss: {} | Accuracy: {}".format(
                        i + 1, val_loss / (i+1), val_accuracy / (i+1)
                    ))

            

In [203]:
import gc
gc.collect()
torch.cuda.empty_cache()
!nvidia-smi

Mon Jan 31 19:11:05 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    27W /  70W |  13800MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [196]:
trainer = Trainer(MyBertModel(), train_dataloader, val_dataloader)
trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

------------------------------Train for Epoch 0------------------------------


RuntimeError: ignored

In [13]:
tk = AutoTokenizer.from_pretrained("bert-base-cased")

In [30]:
train_text = df_train['paragraph'].iloc[0]

In [31]:
token_enc = tk(train_text)['input_ids']
for ids in token_enc:
    print(ids, tk.decode(ids))

NotImplementedError: ignored

In [195]:
pcl_tokens[0]['tokens'][4].vector_norm

5.094723

In [8]:
train_text = []
for token_dict in pcl_tokens:
    train_text.append(' '.join([token.lemma_ for token in token_dict['tokens']]))


In [144]:
print('(num,email,url,bracket,quote,currency,oov)')
print(pcl_stats)
print(cat_stats)
print(pcl_tokens, pcl_sent_list)
print(cat_tokens, cat_sent_list)

(num,email,url,bracket,quote,currency,oov)
(6269, 1, 33, 3374, 7505, 166, 5432)
(169, 0, 0, 98, 316, 12, 248)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [148]:
cat_sent_list[1]

help them communicate with others

In [157]:
def highlight_text(words, sents, highlight_type):
    if isinstance(sents, list):
        for i, sent in enumerate(sents):
            if highlight_type in words[i].keys():
                result = " ".join(colored(t,'white','on_red') if t in words[i][highlight_type] else t.text for t in sent)
                print(colored(words[i][highlight_type],'white','on_blue'), result+'\n')
    else:
        if highlight_text in words.keys():
            result = " ".join(colored(t.text,'white','on_red') if t in words[highlight_text] else t.text for t in sents)

highlight_text(cat_tokens, cat_sent_list, 'oov')  

[44m[37m[Osterley][0m If they 're lucky and they come across COPE Galway and the ladies in [41m[37mOsterley[0m , then there 's hope

[44m[37m[Osterley][0m People do n't understand the hurt , people do n't understand the pain . I ' ve read about women with their children sleeping in cars , sleeping in hotel rooms and it 's criminal . If they 're lucky and they come across COPE Galway and the ladies in [41m[37mOsterley[0m , then there 's hope

[44m[37m[Osterley][0m If they 're lucky and they come across COPE Galway and the ladies in [41m[37mOsterley[0m , then there 's hope

[44m[37m[Sheepherding][0m [41m[37mSheepherding[0m in America has always been an immigrant 's job , too dirty , too cold and too lonely for anyone with options

[44m[37m[Karwan][0m organise [41m[37mKarwan[0m - e - Benazir rallies in all the provincial capitals and in Azad Jammu & amp ; Kashmir , Gilgit - Baltistan and Fata to celebrate the day and PPP 's achievements for empowerment of wom

In [52]:
df_train[df_train.paragraph.str.contains('GlobalBrain.net')]

Unnamed: 0_level_0,article_id,keyword,country_code,paragraph,label,para_len
par_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8172,@@14078360,hopeless,nz,Grant Ryan Grant is a hopelessly addicted inve...,0,604


## Evaluation Metric

In [3]:
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
import numpy as np

def evaluate(y_score, y_true):

    fpr, tpr, threshold = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')
       
    # Get accuracy over the test set
    y_pred = np.where(y_score >= 0.5, 1, 0)
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')
    print(classification_report(y_true, y_pred))
    # Plot ROC AUC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()



In [None]:

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir = '/content/drive/MyDrive/nlp_model',
    num_train_epochs=3,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 16,    
    per_device_eval_batch_size= 8,
    evaluation_strategy = "epoch",
    save_strategy= "epoch",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps = 8,
    fp16 = True,
    logging_dir= '/content/drive/MyDrive/nlp_log',
    dataloader_num_workers = 8,
    run_name = 'roberta-classification'
)

# def dummy_data_collector(features):
#     batch = {}
#     batch['input_ids'] = torch.stack([f[0] for f in features])
#     batch['attention_mask'] = torch.stack([f[1] for f in features])
#     batch['labels'] = torch.stack([f[2] for f in features])
    
#     return batch
    # encoded_texts = tokenizer(texts, padding = True, truncation = True, return_tensors = 'pt')
    # labels = torch.tensor(labels)
    # dataset = TensorDataset(encoded_texts['input_ids'], encoded_texts['attention_mask'], labels)
tk = AutoTokenizer.from_pretrained('bert-base-cased')
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tk
)
print(next(iter(trainer.get_train_dataloader())))