In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/My Drive/Colab Notebooks/TextClassification

/content/drive/My Drive/Colab Notebooks/TextClassification


In [3]:
!pip install pytorch-transformers
!pip install transformers==3



In [4]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

import seaborn as sns
import pandas as pd

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import torch
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup

import warnings
warnings.filterwarnings('ignore')

import logging
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

import csv

In [5]:
## Check if Cuda is Available
print(torch.cuda.is_available())

True


In [6]:
train_raw = pd.read_json("data/train.jsonl", lines=True, encoding="utf-8")
test_raw = pd.read_json("data/test.jsonl", lines=True, encoding="utf-8")

In [27]:
train_raw['conext_string'] = train_raw.context.apply(lambda x: ' '.join(x[::-1][:3]))
test_raw['conext_string'] = test_raw.context.apply(lambda x: ' '.join(x[::-1][:3]))

In [28]:
train_raw.head(10)

Unnamed: 0,label,response,context,conext_string
0,SARCASM,@USER @USER @USER I don't get this .. obviousl...,[A minor child deserves privacy and should be ...,@USER If your child isn't named Barron ... #Be...
1,SARCASM,@USER @USER trying to protest about . Talking ...,[@USER @USER Why is he a loser ? He's just a P...,@USER @USER having to make up excuses of why y...
2,SARCASM,@USER @USER @USER He makes an insane about of ...,[Donald J . Trump is guilty as charged . The e...,@USER I ’ ll remember to not support you at th...
3,SARCASM,@USER @USER Meanwhile Trump won't even release...,[Jamie Raskin tanked Doug Collins . Collins lo...,@USER But not half as stupid as Schiff looks ....
4,SARCASM,@USER @USER Pretty Sure the Anti-Lincoln Crowd...,[Man ... y ’ all gone “ both sides ” the apoca...,@USER They already did . Obama said many times...
5,SARCASM,@USER @USER @USER -> per your tag line : never...,[Donald Trump tapped into voters ’ populist sh...,@USER because these privileged white boys are ...
6,SARCASM,@USER @USER he does ! It excites him then he k...,[@USER @USER Coo-Coo . Keep on supporting fema...,@USER @USER do you masturbate to these videos ...
7,SARCASM,"Oh look , it's the #racist @USER offering soli...","[Hi , I'm Dennis , I'll be looking after lily'...",@USER Dennis please pass on my love and solida...
8,SARCASM,@USER @USER @USER As they are the biggest bull...,[Tips for children and young people from @USER...,@USER @USER @USER Please forward on to the Soc...
9,SARCASM,@USER @USER @USER responds to facts by tossing...,[The response of Sanders ' team to his quote f...,"@USER Careful , Bernie ’ s supporters get trig..."


In [29]:
encode_label = {'NOT_SARCASM' : 0, 'SARCASM' : 1}

train_raw['target'] = train_raw['label'].map(encode_label)
train_raw['all_string'] = train_raw['response'] + ". " + train_raw['conext_string']
test_raw['all_string'] = test_raw['response'] + ". " + test_raw['conext_string']

In [30]:
train_raw['all_string'] = train_raw['all_string'].apply(lambda x: x.lower())
test_raw['all_string'] = test_raw['all_string'].apply(lambda x: x.lower())

In [31]:
train_raw.head(10)

Unnamed: 0,label,response,context,conext_string,target,all_string
0,SARCASM,@USER @USER @USER I don't get this .. obviousl...,[A minor child deserves privacy and should be ...,@USER If your child isn't named Barron ... #Be...,1,@user @user @user i don't get this .. obviousl...
1,SARCASM,@USER @USER trying to protest about . Talking ...,[@USER @USER Why is he a loser ? He's just a P...,@USER @USER having to make up excuses of why y...,1,@user @user trying to protest about . talking ...
2,SARCASM,@USER @USER @USER He makes an insane about of ...,[Donald J . Trump is guilty as charged . The e...,@USER I ’ ll remember to not support you at th...,1,@user @user @user he makes an insane about of ...
3,SARCASM,@USER @USER Meanwhile Trump won't even release...,[Jamie Raskin tanked Doug Collins . Collins lo...,@USER But not half as stupid as Schiff looks ....,1,@user @user meanwhile trump won't even release...
4,SARCASM,@USER @USER Pretty Sure the Anti-Lincoln Crowd...,[Man ... y ’ all gone “ both sides ” the apoca...,@USER They already did . Obama said many times...,1,@user @user pretty sure the anti-lincoln crowd...
5,SARCASM,@USER @USER @USER -> per your tag line : never...,[Donald Trump tapped into voters ’ populist sh...,@USER because these privileged white boys are ...,1,@user @user @user -> per your tag line : never...
6,SARCASM,@USER @USER he does ! It excites him then he k...,[@USER @USER Coo-Coo . Keep on supporting fema...,@USER @USER do you masturbate to these videos ...,1,@user @user he does ! it excites him then he k...
7,SARCASM,"Oh look , it's the #racist @USER offering soli...","[Hi , I'm Dennis , I'll be looking after lily'...",@USER Dennis please pass on my love and solida...,1,"oh look , it's the #racist @user offering soli..."
8,SARCASM,@USER @USER @USER As they are the biggest bull...,[Tips for children and young people from @USER...,@USER @USER @USER Please forward on to the Soc...,1,@user @user @user as they are the biggest bull...
9,SARCASM,@USER @USER @USER responds to facts by tossing...,[The response of Sanders ' team to his quote f...,"@USER Careful , Bernie ’ s supporters get trig...",1,@user @user @user responds to facts by tossing...


In [32]:
test_raw.head(10)

Unnamed: 0,id,response,context,conext_string,all_string
0,twitter_1,"@USER @USER @USER My 3 year old , that just fi...","[Well now that ’ s problematic AF <URL>, @USER...",@USER @USER @USER No .. he actually in the gif...,"@user @user @user my 3 year old , that just fi..."
1,twitter_2,@USER @USER How many verifiable lies has he to...,[Last week the Fake News said that a section o...,@USER The mainstream media doesn't report the ...,@user @user how many verifiable lies has he to...
2,twitter_3,@USER @USER @USER Maybe Docs just a scrub of a...,[@USER Let ’ s Aplaud Brett When he deserves i...,@USER @USER He did try keep korkmaz in in the ...,@user @user @user maybe docs just a scrub of a...
3,twitter_4,@USER @USER is just a cover up for the real ha...,[Women generally hate this president . What's ...,@USER I've hated him before he was placed in o...,@user @user is just a cover up for the real ha...
4,twitter_5,@USER @USER @USER The irony being that he even...,"[Dear media Remoaners , you excitedly sharing ...",@USER @USER Quite an articulate and considered...,@user @user @user the irony being that he even...
5,twitter_6,@USER @USER Doesn't matter . Those guys weren'...,[Wilt Chamberlain rejects the skyhook twice in...,@USER plus he ’ s around 34 years old at that ...,@user @user doesn't matter . those guys weren'...
6,twitter_7,"@USER @USER @USER So , my #kindnesscascade are...",[I want to start something magical . I don ’ t...,@USER @USER @USER It really was . I'm packing ...,"@user @user @user so , my #kindnesscascade are..."
7,twitter_8,@USER @USER @USER They need to be an MSP to be...,[He ’ s finished . If true this is grooming an...,@USER @USER I think it will be Cherry & I susp...,@user @user @user they need to be an msp to be...
8,twitter_9,@USER @USER @USER In which Constitution is it ...,[Now students can ’ t bring stones in librarie...,@USER this one ? @USER aap to bahut logical ha...,@user @user @user in which constitution is it ...
9,twitter_10,@USER @USER ... he says while the GOP is overw...,[One of these things is not like the others . ...,@USER It's more diverse than the Democratic de...,@user @user ... he says while the gop is overw...


In [33]:
train_raw.to_csv("data/train_new.csv")
test_raw.to_csv("data/test_new.csv")

reference：
https://towardsdatascience.com/fine-tuning-bert-and-roberta-for-high-accuracy-text-classification-in-pytorch-c9e63cf64646
https://github.com/aramakus/ML-and-Data-Analysis/blob/master/RoBERTa%20for%20text%20classification.ipynb

In [7]:
# Set random seed and set device to GPU.
torch.manual_seed(17)

if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
else:
    device = torch.device('cpu')

print(device)

cuda:0


In [8]:
# Initialize tokenizer.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [41]:
# Set tokenizer hyperparameters.
MAX_SEQ_LEN = 256
BATCH_SIZE = 16
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)


# Define columns to read.
label_field = Field(sequential=False, use_vocab=False, batch_first=True)
text_field = Field(use_vocab=False, 
                   tokenize=tokenizer.encode, 
                   include_lengths=False, 
                   batch_first=True,
                   fix_length=MAX_SEQ_LEN, 
                   pad_token=PAD_INDEX, 
                   unk_token=UNK_INDEX)

fields = {'all_string' : ('all_string', text_field), 'target' : ('target', label_field)}


# Read preprocessed CSV into TabularDataset and split it into train, test and valid.
train_data, valid_data = TabularDataset(path="data/train_new.csv", 
                                        format='CSV', 
                                        fields=fields, 
                                        skip_header=False).split(split_ratio=[0.80, 0.2], 
                                        stratified=True, 
                                        strata_field='target')

# Create train and validation iterators.
train_iter, valid_iter = BucketIterator.splits((train_data, valid_data),
                                               batch_size=BATCH_SIZE,
                                               device=device,
                                               shuffle=True,
                                               sort_key=lambda x: len(x.all_string), 
                                               sort=True, 
                                               sort_within_batch=False)
id_field = Field(use_vocab=True, sequential=False)
fields2 = {'all_string' : ('all_string', text_field)}
test_data = TabularDataset(path="data/test_new.csv", 
                           format='CSV', 
                           fields=fields2, 
                           skip_header=False)

# Test iterator, no shuffling or sorting required.
test_iter = Iterator(test_data, batch_size=BATCH_SIZE, device=device, train=False, shuffle=False, sort=False)

In [36]:
# Model with extra layers on top of RoBERTa
class ROBERTAClassifier(torch.nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(ROBERTAClassifier, self).__init__()
        
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.d1 = torch.nn.Dropout(dropout_rate)
        self.l1 = torch.nn.Linear(768, 64)
        self.bn1 = torch.nn.LayerNorm(64)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(64, 2)
        
    def forward(self, input_ids, attention_mask):
        _, x = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = self.d1(x)
        x = self.l1(x)
        x = self.bn1(x)
        x = torch.nn.ReLU()(x)
        x = self.d2(x)
        x = self.l2(x)
        
        return x

In [15]:
# Training Function

def train(model,
          optimizer,
          train_iter,
          valid_iter,
          scheduler = None,
          num_epochs = 5,
          valid_period = len(train_iter),
          output_path = '/content/drive/My Drive/Colab Notebooks/TextClassification'):
    
    # Initialize losses and loss histories
    train_loss = 0.0
    valid_loss = 0.0
    train_loss_list = []
    valid_loss_list = []
    best_valid_loss = float('Inf')
    
    global_step = 0
    global_steps_list = []
    
    model.train()
    
    # Train loop
    for epoch in range(num_epochs):
        for (source, target), _ in train_iter:
            mask = (source != PAD_INDEX).type(torch.uint8)

            y_pred = model(input_ids=source,  
                           attention_mask=mask)
            #output = model(input_ids=source,
            #              labels=target,
            #              attention_mask=mask)
            
            loss = torch.nn.CrossEntropyLoss()(y_pred, target)
            #loss = output[0]
            
            loss.backward()
            
            #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            
            # Optimizer and scheduler step
            optimizer.step()    
            scheduler.step()
                
            optimizer.zero_grad()
            
            # Update train loss and global step
            train_loss += loss.item()
            global_step += 1

            # Validation loop. Save progress and evaluate model performance.
            if global_step % valid_period == 0:
                model.eval()
                pred = []
                actual = []
                
                with torch.no_grad():                    
                    for (source, target), _ in valid_iter:
                        mask = (source != PAD_INDEX).type(torch.uint8)

                        y_pred = model(input_ids=source, 
                                       attention_mask=mask)
                        #output = model(input_ids=source,
                        #               labels=target,
                        #               attention_mask=mask)
                        
                        loss = torch.nn.CrossEntropyLoss()(y_pred, target)
                        #loss = output[0]
                        
                        valid_loss += loss.item()
                        pred.extend(torch.argmax(y_pred, axis=-1).tolist())
                        actual.extend(target.tolist())

                # Store train and validation loss history
                train_loss = train_loss / valid_period
                valid_loss = valid_loss / len(valid_iter)
                train_loss_list.append(train_loss)
                valid_loss_list.append(valid_loss)
                global_steps_list.append(global_step)

                # print summary
                print('Epoch [{}/{}], global step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}, precision, recall, f1:'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_iter),
                              train_loss, valid_loss), precision_recall_fscore_support(actual, pred, average='macro'))
                
                # checkpoint
                if best_valid_loss > valid_loss:
                    best_valid_loss = valid_loss
                    save_checkpoint(output_path + '/model_RoBERTa_relu_nopretrain.pkl', model, best_valid_loss)
                    save_metrics(output_path + '/metric2.pkl', train_loss_list, valid_loss_list, global_steps_list)
                        
                train_loss = 0.0                
                valid_loss = 0.0
                model.train()
    
    save_metrics(output_path + '/metric2.pkl', train_loss_list, valid_loss_list, global_steps_list)
    print('Training done!')

In [21]:
# Functions for saving and loading model parameters and metrics.
def save_checkpoint(path, model, valid_loss):
    torch.save({'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}, path)

    
def load_checkpoint(path, model):    
    state_dict = torch.load(path, map_location=device)
    model.load_state_dict(state_dict['model_state_dict'], strict=False)
    
    return state_dict['valid_loss']


def save_metrics(path, train_loss_list, valid_loss_list, global_steps_list):   
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    
    torch.save(state_dict, path)


def load_metrics(path):    
    state_dict = torch.load(path, map_location=device)
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [37]:
NUM_EPOCHS = 12
steps_per_epoch = len(train_iter)

model = ROBERTAClassifier(0.3)
model = model.to(device)


print("======================= Start training =================================")
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=steps_per_epoch*2, 
                                            num_training_steps=steps_per_epoch*NUM_EPOCHS)

train(model=model, 
      train_iter=train_iter, 
      valid_iter=valid_iter, 
      optimizer=optimizer, 
      scheduler=scheduler, 
      num_epochs=NUM_EPOCHS)

Epoch [1/12], global step [250/3000], Train Loss: 0.6655, Valid Loss: 0.6959, precision, recall, f1: (0.25, 0.5, 0.3333333333333333, None)
Epoch [2/12], global step [500/3000], Train Loss: 0.5225, Valid Loss: 0.4991, precision, recall, f1: (0.7650920334784932, 0.762, 0.7613039623542248, None)
Epoch [3/12], global step [750/3000], Train Loss: 0.4469, Valid Loss: 0.4494, precision, recall, f1: (0.8144998085653339, 0.7989999999999999, 0.7964925851415288, None)
Epoch [4/12], global step [1000/3000], Train Loss: 0.3483, Valid Loss: 0.4443, precision, recall, f1: (0.8114756839922403, 0.798, 0.7957912784679089, None)
Epoch [5/12], global step [1250/3000], Train Loss: 0.2573, Valid Loss: 0.5105, precision, recall, f1: (0.7880184331797235, 0.788, 0.7879966079457271, None)
Epoch [6/12], global step [1500/3000], Train Loss: 0.1955, Valid Loss: 0.5394, precision, recall, f1: (0.7790904252977965, 0.779, 0.7789820975499016, None)
Epoch [7/12], global step [1750/3000], Train Loss: 0.1552, Valid Loss:

In [39]:
# Evaluation Function

def evaluate(model, test_loader):
    y_pred = []

    model.eval()
    with torch.no_grad():
        for (source), _ in test_loader:
                mask = (source != PAD_INDEX).type(torch.uint8)
                
                output = model(source, attention_mask=mask)

                y_pred.extend(torch.argmax(output, axis=-1).tolist())

    
    output = pd.DataFrame()
    output['Pred'] = y_pred

    return output

In [42]:
model = ROBERTAClassifier()
model = model.to(device)

load_checkpoint('/content/drive/My Drive/Colab Notebooks/TextClassification/model_RoBERTa_relu_nopretrain.pkl', model)

prediction=evaluate(model, test_iter)

In [43]:
prediction

Unnamed: 0,Pred
0,1
1,1
2,1
3,1
4,1
...,...
1795,0
1796,1
1797,1
1798,0


In [44]:
encode_label = {0 : 'NOT_SARCASM', 1 : 'SARCASM'}

test_raw['Pred']=prediction['Pred'].map(encode_label)
test_raw[['id', 'Pred']].to_csv('answer.txt', header=None, index=None, sep=',', quoting=csv.QUOTE_NONE, escapechar = ' ')