In [None]:
# Show the GPU(s) visible to this session before any heavy work starts.
!nvidia-smi

# Lib import
---

In [None]:
# !pip uninstall transformers -y &> /dev/null
!pip install transformers==3.5.1 &> /dev/null
# !pip uninstall torch &> /dev/null
!pip install torch==1.4.0 &> /dev/null
print("finished installing requirements")

In [None]:
# Confirm which transformers version is actually loaded (the install cell pins 3.5.1).
import transformers
print(transformers.__version__)

In [None]:
import numpy as np
import pandas as pd 
import os
import re
import time
import string
import random
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter
import nltk
from tqdm import tqdm
import os
import spacy
from spacy.util import compounding
from spacy.util import minibatch
import torch
from sklearn.model_selection import train_test_split
from torch import optim
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
# Repeats the cache release already done at the end of the imports cell.
torch.cuda.empty_cache()

# Data Load
---

In [None]:
# Load the full preprocessed CSV; later cells read its 'text' and 'hs_class' columns.
data = pd.read_csv("../input/indonesiahatespeechpreprocessed/notsoclean/data_preprocessed.csv")
# Display five randomly chosen rows as a quick sanity check.
data.sample(5)

# Model class section (Using IndoBert SmSA - Sentence-level Sentiment Analysis)
---
1. Required imports

In [None]:
import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

2. Include:
    - DocumentSentimentDataset
    - DocumentSentimentDataLoader

In [None]:
import numpy as np
import pandas as pd
import string
import torch
import re
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

class DocumentSentimentDataset(Dataset):
    """Map-style dataset over a CSV with 'hs_class' and 'text' columns.

    Each item is a tuple ``(subword_ids, label_index, raw_text)`` where
    ``subword_ids`` is a 1-D numpy array produced by ``tokenizer.encode`` and
    ``label_index`` is a 0-d numpy array holding the LABEL2INDEX value.
    """

    # Label vocabulary shared by the dataset and the prediction code.
    LABEL2INDEX = {'positive': 1, 'negative': 0}
    INDEX2LABEL = {1: 'positive', 0: 'negative'}
    NUM_LABELS = 2

    def load_dataset(self, path):
        """Read the CSV at ``path`` and replace 'hs_class' strings with indices.

        Raises:
            KeyError: if 'hs_class' contains a value that is not in LABEL2INDEX.
        """
        df = pd.read_csv(path)
        # Validate up front so a bad label is reported by name instead of
        # surfacing as a bare KeyError from deep inside a pandas apply.
        unknown = set(df['hs_class'].unique()) - set(self.LABEL2INDEX)
        if unknown:
            raise KeyError(
                "Unknown hs_class value(s) {} in {!r}; expected one of {}".format(
                    sorted(map(str, unknown)), path, sorted(self.LABEL2INDEX)
                )
            )
        df['hs_class'] = df['hs_class'].map(self.LABEL2INDEX)
        return df

    def __init__(self, dataset_path, tokenizer, no_special_token=False, *args, **kwargs):
        """
        Args:
            dataset_path: path of the CSV file to load.
            tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
            no_special_token: when True, encode without special tokens.
            *args, **kwargs: accepted for call-site compatibility and ignored
                (for example ``lowercase=True`` has no effect here).
        """
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer
        self.no_special_token = no_special_token

    def __getitem__(self, index):
        """Return ``(subword_ids, label_index, raw_text)`` for row ``index``."""
        # Positional lookup: samplers hand out positions in range(len(self)),
        # which only coincide with index labels for a default RangeIndex.
        row = self.data.iloc[index]
        text, hs_class = row['text'], row['hs_class']
        subwords = self.tokenizer.encode(text, add_special_tokens=not self.no_special_token)
        return np.array(subwords), np.array(hs_class), text

    def __len__(self):
        """Number of rows loaded from the CSV."""
        return len(self.data)
        
class DocumentSentimentDataLoader(DataLoader):
    """DataLoader that pads each batch of ``(subwords, label, text)`` items.

    Batches come back as numpy arrays: padded subword ids, a 0/1 mask over the
    real tokens, a column of labels, plus the list of raw texts.
    """

    def __init__(self, max_seq_len=512, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Upper bound on the padded width of any batch.
        self.max_seq_len = max_seq_len
        # Swap in the padding collate function after the base class is set up.
        self.collate_fn = self._collate_fn

    def _collate_fn(self, batch):
        """Pad/truncate a list of dataset items into fixed-width arrays."""
        n_rows = len(batch)
        # Width is the longest sequence in this batch, capped at max_seq_len.
        longest = max(len(item[0]) for item in batch)
        width = min(self.max_seq_len, longest)

        subword_batch = np.zeros((n_rows, width), dtype=np.int64)
        mask_batch = np.zeros((n_rows, width), dtype=np.float32)
        sentiment_batch = np.zeros((n_rows, 1), dtype=np.int64)
        seq_list = []

        for row, item in enumerate(batch):
            subwords, sentiment, raw_seq = item
            clipped = subwords[:width]
            used = len(clipped)
            subword_batch[row, :used] = clipped
            mask_batch[row, :used] = 1
            sentiment_batch[row, 0] = sentiment
            seq_list.append(raw_seq)

        return subword_batch, mask_batch, sentiment_batch, seq_list

3. Include:
    - forward_sequence_classification

In [None]:
def forward_sequence_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    """Run one batch through ``model`` and decode predictions and gold labels.

    Args:
        model: callable invoked as ``model(ids, attention_mask=..., token_type_ids=...,
            labels=...)`` whose result starts with ``(loss, logits)``.
        batch_data: ``(subwords, mask, labels)`` or
            ``(subwords, mask, token_types, labels)``; labels are indexed as
            ``labels[j][0]``, i.e. one column per example.
        i2w: mapping from label index to label string.
        is_test: accepted for API compatibility; not used.
        device: target device for the input tensors (e.g. 'cpu', 'cuda', 'cuda:0').
        **kwargs: ignored.

    Returns:
        ``(loss, list_hyp, list_label)`` where the two lists hold label strings.

    Raises:
        ValueError: if ``batch_data`` has neither 3 nor 4 elements.
    """
    # Unpack batch data
    if len(batch_data) == 3:
        subword_batch, mask_batch, label_batch = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        subword_batch, mask_batch, token_type_batch, label_batch = batch_data
    else:
        raise ValueError(
            "batch_data must have 3 or 4 elements, got {}".format(len(batch_data))
        )

    # Prepare input & label; `.to(device)` also honours values such as 'cuda:0'.
    subword_batch = torch.LongTensor(subword_batch).to(device)
    mask_batch = torch.FloatTensor(mask_batch).to(device)
    if token_type_batch is not None:
        token_type_batch = torch.LongTensor(token_type_batch).to(device)
    label_batch = torch.LongTensor(label_batch).to(device)

    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]

    # Top-1 class index per example, decoded to label strings.
    hyp = torch.topk(logits, 1)[1]
    list_hyp = [i2w[index] for index in hyp.view(-1).tolist()]
    list_label = [i2w[index] for index in label_batch[:, 0].tolist()]

    return loss, list_hyp, list_label

4. Metrics include:
    - document_sentiment_metrics_fn

In [None]:
import itertools
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

def document_sentiment_metrics_fn(list_hyp, list_label):
    """Score predictions against gold labels.

    Returns a dict with accuracy ("ACC") and macro-averaged F1 ("F1"),
    recall ("REC") and precision ("PRE"), in that key order.
    """
    return {
        "ACC": accuracy_score(list_label, list_hyp),
        "F1": f1_score(list_label, list_hyp, average='macro'),
        "REC": recall_score(list_label, list_hyp, average='macro'),
        "PRE": precision_score(list_label, list_hyp, average='macro'),
    }

5. Helper functions include:
    - get_lr
    - count_param
    - metrics_to_string
    - set_seed

In [None]:
import itertools
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

def get_lr(optimizer):
    """Return the 'lr' of the optimizer's first param group, or None if it has none."""
    groups = iter(optimizer.param_groups)
    try:
        first_group = next(groups)
    except StopIteration:
        return None
    return first_group['lr']
    
def count_param(module, trainable=False):
    """Count the scalar parameters of ``module``.

    With ``trainable=True`` only parameters whose ``requires_grad`` is set are
    counted; otherwise every parameter is included.
    """
    params = module.parameters()
    if trainable:
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)

def metrics_to_string(metric_dict):
    """Format metrics as space-separated 'NAME:value' pairs with two decimals."""
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

def set_seed(seed):
    """Seed Python's `random`, NumPy, and PyTorch (CPU and CUDA) with ``seed``."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)


# Fix the seed once for the whole notebook run.
set_seed(26092020)


# Pretrained Model Load
---

In [None]:
# Checkpoint shared by the tokenizer, the config and the model weights.
PRETRAINED_MODEL_NAME = 'indobenchmark/indobert-base-p1'

# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
config = BertConfig.from_pretrained(PRETRAINED_MODEL_NAME)
# Size the classification head to the dataset's label set.
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, config=config)

# To resume from a model saved by the "Saving Model to PKL" cell instead:
#     model = torch.load("./model.pkl")
# NOTE: torch.load unpickles arbitrary Python objects; only load files you trust.

In [None]:
# Total number of parameters in the model (trainable or not).
count_param(model)

In [None]:
# Read-only input directory holding the original splits.
INPUT_DIR = "../input/indonesiahatespeechpreprocessed/notsoclean"

# Cleaned copies are written to the working directory; every later cell reads
# the splits through these three path variables.
train_dataset_path = './train_new.csv'
test_dataset_path = './test_new.csv'
valid_dataset_path = './validate_new.csv'

# Drop rows with missing values from each split. `to_csv` returns None when
# given a path, so there is nothing useful to keep from these calls.
for raw_name, clean_path in (
    ("train_split.csv", train_dataset_path),
    ("test_split.csv", test_dataset_path),
    ("validate_split.csv", valid_dataset_path),
):
    pd.read_csv(f"{INPUT_DIR}/{raw_name}").dropna().to_csv(clean_path, index=False)

# Load the cleaned splits as DataFrames for inspection.
train_dataset = pd.read_csv(train_dataset_path)
test_dataset = pd.read_csv(test_dataset_path)
valid_dataset = pd.read_csv(valid_dataset_path)

In [None]:
# Preview the cleaned training split. The file is written with a header row, so
# let pandas parse it (header=None would turn the header into a data row) and
# rename the label column for display only.
df = pd.read_csv(train_dataset_path).rename(columns={'hs_class': 'sentiment'})
df.head()

In [None]:
# Wrap each cleaned CSV in a tokenizing dataset. These assignments replace any
# earlier objects bound to the same three names.
# NOTE(review): `lowercase=True` is swallowed by **kwargs; DocumentSentimentDataset
# never reads it, so it has no effect here.
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)

# Batches of 16, padded per batch and capped at 512 subwords; only the training
# loader shuffles.
# NOTE(review): num_workers=16 may exceed the CPUs available on the host — confirm.
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=16, num_workers=16, shuffle=True)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=16, num_workers=16, shuffle=False)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=16, num_workers=16, shuffle=False)


# Label <-> index lookups used when decoding predictions.
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

# Testing single row data without training
---

In [None]:
# Classify one sentence before any fine-tuning has happened.
# NOTE(review): the classification head is presumably still untrained at this
# point, so treat the predicted label as a smoke test only.
text = 'Woi dasar kau anjing babi antek komunis'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no need to record the autograd graph.
with torch.no_grad():
    logits = model(subwords)[0]
probs = F.softmax(logits, dim=-1).squeeze()
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

In [None]:
# Second pre-fine-tuning check, this time on a neutral sentence.
text = 'Budi pergi ke pondok indah mall membeli cakwe'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no need to record the autograd graph.
with torch.no_grad():
    logits = model(subwords)[0]
probs = F.softmax(logits, dim=-1).squeeze()
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

# Fine Tuning
---

In [None]:
# Move the model to the GPU first and only then build the optimizer over its
# parameters, which is the order the torch.optim documentation recommends.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [None]:
# Fine-tune for a fixed number of epochs; each epoch is one pass over
# train_loader followed by one evaluation pass over valid_loader.
n_epochs = 10
for epoch in range(n_epochs):
    # Training mode, with gradients switched back on (the validation phase of
    # the previous epoch turns them off globally).
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward pass; the last batch element (raw texts) is dropped.
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Accumulate predictions and gold labels for the epoch-level metrics.
        list_hyp += batch_hyp
        list_label += batch_label

        # Progress bar shows the running mean loss over the batches seen so far.
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Epoch-level training metrics. `i` is the last batch index, so (i+1) is the
    # number of batches; this relies on the loader yielding at least one batch.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation. Gradients are disabled process-wide here and are
    # only re-enabled at the top of the next epoch, so they remain off after the
    # final epoch finishes.
    model.eval()
    torch.set_grad_enabled(False)
    
    # NOTE(review): total_correct and total_labels are initialised but never used below.
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        # Raw texts of the batch; not used further in this loop.
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        
        # Accumulate the validation loss.
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Metrics are recomputed over everything seen so far on every batch,
        # purely to refresh the progress bar text.
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    # Final validation metrics for this epoch.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

## Test fine-tuned model on sample sentences

In [None]:
# Spot-check the fine-tuned model on a few random rows of the full dataset.
n_samples = 10
correct, incorrect = 0, 0
for _ in range(n_samples):
    print('='*40)
    single_row = data.sample()
    test_text = single_row['text'].values[0]
    test_res = single_row['hs_class'].values[0]
    print(f'test sentence:\n{test_text}\n')
    print(f'actual class : {test_res}')

    text = test_text

    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

    # Inference only: no need to record the autograd graph.
    with torch.no_grad():
        logits = model(subwords)[0]
    probs = F.softmax(logits, dim=-1).squeeze()
    # The batch holds a single sentence, so there is exactly one predicted label.
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

    print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')
    if i2w[label] == test_res:
        print("> CORRECT PREDICTION!\n")
        correct += 1
    else:
        print("> INCORRECT PREDICTION!\n")
        incorrect += 1
print(f'\n\ncorrect result: {correct}/{n_samples}')
print(f'incorect result: {incorrect}/{n_samples}')

In [None]:
def return_prediction_result(text):
    """Return the predicted label string for a single sentence.

    Uses the notebook-level ``tokenizer``, ``model`` and ``i2w``. The sentence
    is encoded as a batch of one and the top-scoring class is decoded via ``i2w``.
    """
    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

    # Inference only: no need to record the autograd graph.
    with torch.no_grad():
        logits = model(subwords)[0]

    # One sentence in, one label out.
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
    return i2w[label]

In [None]:
# Accuracy of the fine-tuned model over the whole cleaned test split.
test_data = pd.read_csv(test_dataset_path)
total_len, correct = len(test_data), 0
# Address the columns by name, as DocumentSentimentDataset does, rather than by
# position in the file.
for actual, text in zip(test_data['hs_class'], test_data['text']):
    pred = return_prediction_result(text)
    if actual == pred:
        correct += 1
# Guard the division so an empty split reports 0.0 instead of raising.
percentage = round(correct/total_len * 100, 2) if total_len else 0.0
print(f'Score based on test data: {correct}/{total_len}  Percentage:{percentage}%')

# Saving Model to PKL 
---

In [None]:
# Pickle the whole model object to disk, then load it straight back to check
# that the saved file is usable.
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted files.
torch.save(model, "model.pkl")
loaded_model = torch.load("./model.pkl")

# Pick one random row from the full dataset as the round-trip test input.
single_row = data.sample()
test_text = single_row['text'].values[0]
test_res = single_row['hs_class'].values[0]
print(f'test sentence:\n{test_text}\n')
print(f'actual class : {test_res}')

text = test_text

subwords = tokenizer.encode(text)
# NOTE(review): the tensor is placed on `model.device` although `loaded_model`
# consumes it; presumably both sit on the same device — confirm.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = loaded_model(subwords)[0]
# One top-1 class index per row of logits (a single row for this one-sentence batch).
labels = [torch.topk(logit, k=1, dim=-1)[1].squeeze().item() for logit in logits]

for i, label in enumerate(labels):
    print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')