# Preprocessing
External data for preprocessing and EDA:
1. https://github.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection
2. https://www.kaggle.com/oswinrh/indonesian-stoplist


Steps:
1. Lower casing all text, 
2. Remove non alpha-numeric characters 
3. Remove unnecessary characters such as "\n" and "\r"
4. Normalization using 'Alay' dictionary 
5. Remove Emojis

References:

[1] Muhammad Okky Ibrohim and Indra Budi. 2019. Multi-label Hate Speech and Abusive Language Detection in Indonesian Twitter. In ALW3: 3rd Workshop on Abusive Language Online, 46-57.  
[2] Tala, F. Z. (2003). A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia. M.Sc. Thesis. Master of Logic Project. Institute for Logic, Language and Computation. Universiteit van Amsterdam, The Netherlands.  

In [None]:
!git clone https://github.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection.git

In [None]:
import numpy as np
import pandas as pd

!ls '../input'

In [None]:
!ls ./*

## Load Data

In [None]:
# Competition data: the labelled training reviews and the test reviews.
data = pd.read_csv('../input/penyisihan-datavidia-7-0/train.csv')
data_test = pd.read_csv('../input/penyisihan-datavidia-7-0/test.csv')

# Slang ("alay") dictionary from the cloned repository. It is read with
# header=None, so the two positional columns are renamed to readable names.
alay_dict = pd.read_csv(
    'id-multi-label-hate-speech-and-abusive-language-detection/new_kamusalay.csv',
    encoding='latin-1',
    header=None,
).rename(columns={0: 'original', 1: 'replacement'})

# Indonesian stopword list, also read without a header row.
id_stopword_dict = pd.read_csv(
    '../input/indonesian-stoplist/stopwordbahasa.csv',
    header=None,
).rename(columns={0: 'stopword'})

### Text Data

In [None]:
data.head()

### "Alay" Dictionary

In [None]:
alay_dict.head()

### Indonesian Stopwords

In [None]:
id_stopword_dict.head()

## Preprocessing The Data

In [None]:
import re

def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text):
    """Replace line breaks with spaces and collapse runs of spaces.

    Every newline and every carriage return becomes a single space, after
    which any run of two or more spaces is squeezed into one. Leading and
    trailing spaces are not stripped.
    """
    substitutions = (
        ('\n', ' '),   # newlines
        ('\r', ' '),   # carriage returns
        ('  +', ' '),  # two or more consecutive spaces
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
    
def remove_nonaplhanumeric(text):
    """Replace each run of non-alphanumeric characters with one space.

    Only ASCII letters and digits are kept; anything else (punctuation,
    whitespace, non-ASCII symbols) is treated as a separator.
    """
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', text)

# Lookup table: slang spelling -> normalised replacement.
alay_dict_map = {
    slang: normal
    for slang, normal in zip(alay_dict['original'], alay_dict['replacement'])
}


def normalize_alay(text):
    """Replace every word found in ``alay_dict_map`` by its normalised form.

    The text is split on single spaces; words without a dictionary entry are
    kept unchanged, and the words are re-joined with single spaces.
    """
    normalized_words = []
    for word in text.split(' '):
        normalized_words.append(alay_dict_map.get(word, word))
    return ' '.join(normalized_words)


print("remove_nonaplhanumeric: ", remove_nonaplhanumeric("Halooo,,,,, duniaa!!"))
print("lowercase: ", lowercase("Halooo, duniaa!"))
print("remove_unnecessary_char: ", remove_unnecessary_char("Hehe\n\n \r\r apa kabs  hehe"))
print("normalize_alay: ", normalize_alay("aamiin adek abis"))

In [None]:
import emoji
def emoji_cleaning(text):
    """Spell out emojis as words and drop repeated tokens.

    1. ``emoji.demojize`` converts emojis to text; colons are then turned
       into spaces.
    2. The text is split on whitespace and only the first occurrence of each
       token is kept (order preserved). This applies to *every* token in the
       text, not only to the emoji names.
    3. Underscores and hyphens are replaced by spaces.
    """
    # Change emoji to text
    demojized = emoji.demojize(text).replace(":", " ")

    # Keep the first occurrence of each token; dict keys preserve insertion order.
    unique_tokens = dict.fromkeys(demojized.split())

    return ' '.join(unique_tokens).replace("_", " ").replace("-", " ")

In [None]:
def preprocess(text):
    """Run the full cleaning pipeline on one raw review string.

    The steps are applied in the order listed below and the cleaned string
    is returned.

    NOTE(review): step 2 already replaces every non-ASCII-alphanumeric
    character, so no emoji should reach step 5; there ``emoji_cleaning``
    presumably only de-duplicates tokens. Confirm this ordering is intended.
    """
    pipeline = (
        lowercase,                # 1. lower-case everything
        remove_nonaplhanumeric,   # 2. non-alphanumeric runs -> single space
        remove_unnecessary_char,  # 3. line breaks / repeated spaces
        normalize_alay,           # 4. slang normalisation
        emoji_cleaning,           # 5. emoji handling + token de-duplication
    )
    for step in pipeline:
        text = step(text)
    return text

In [None]:
data['review_text'] = data['review_text'].apply(preprocess)

In [None]:
data_test['review_text'] = data_test['review_text'].apply(preprocess)

In [None]:
data.head(15)

In [None]:
data_test.head(15)

In [None]:
data.to_csv('preprocessed_review_train.csv', index=False)
data_test.to_csv('preprocessed_review_test.csv', index=False)

In [None]:
# Build the plain-text corpus for MLM pre-training: one cleaned review per line.
all_text = pd.concat([data['review_text'], data_test['review_text']]).reset_index(drop=True)

# Reviews that became empty after cleaning would be written as `""` by to_csv
# and end up as a "sentence" of two quote characters, so drop them here.
all_text = all_text[all_text.str.strip() != '']

# header=False: otherwise the column name "review_text" is written as the
# first line of the file and is read back as a training sentence.
all_text.to_csv('all_text.txt', index=False, header=False)

# EDA

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

In [None]:
pos_len = data[data['category']==1].shape[0]
neg_len = data[data['category']==0].shape[0]

In [None]:
# Class balance: one bar per label, heights are the row counts computed above.
plt.rcParams['figure.figsize'] = (7, 5)
plt.bar(10,pos_len,3, label="Positive Reviews", color='blue')
plt.bar(15,neg_len,3, label="Negative Reviews", color='red')
plt.legend()
plt.ylabel('Number of examples')
plt.title('Proportion of examples')  # fixed typo ("Propertion")
plt.show()

In [None]:
data['length'] = data['review_text'].apply(lambda x: len(x))

In [None]:
# Overlaid histograms of review length (in characters) per class.
plt.rcParams['figure.figsize'] = (18.0, 6.0)
bins = 150
# Both histograms share the same explicit range so their bin edges coincide
# and all bins fall inside the displayed window. Without it, each class gets
# its own edges spanning its full length range and the bars are not comparable.
length_range = (0, 150)
plt.hist(data[data['category'] == 0]['length'], alpha = 0.6, bins=bins, range=length_range, label='Negative Reviews')
plt.hist(data[data['category'] == 1]['length'], alpha = 0.8, bins=bins, range=length_range, label='Positive Reviews')
plt.xlabel('length')
plt.ylabel('numbers')
plt.legend(loc='upper right')
plt.title('Characters in review')
plt.xlim(*length_range)
plt.grid()
plt.show()

In [None]:
# Side-by-side histograms of the number of characters per review, by class.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))
for ax, category, color, title in (
    (ax1, 1, 'blue', 'Positive Reviews'),
    (ax2, 0, 'red', 'Negative Reviews'),
):
    tweet_len = data[data['category'] == category]['review_text'].str.len()
    ax.hist(tweet_len, color=color)
    ax.set_title(title)
fig.suptitle('Characters in review')
plt.show()


In [None]:
# Side-by-side histograms of the number of whitespace-separated words per review.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))
for ax, category, color, title in (
    (ax1, 1, 'blue', 'Positive Reviews'),
    (ax2, 0, 'red', 'Negative Reviews'),
):
    tweet_len = data[data['category'] == category]['review_text'].str.split().map(len)
    ax.hist(tweet_len, color=color)
    ax.set_title(title)
fig.suptitle('Words in a Review')
plt.show()

In [None]:
# Distribution of the mean word length (in characters) of each review, by class.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# histplot/displot once the installed version is confirmed.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))
for ax, category, color, title in (
    (ax1, 1, 'blue', 'Positive Reviews'),
    (ax2, 0, 'red', 'Negative Reviews'),
):
    # One list of word lengths per review.
    word = data[data['category'] == category]['review_text'].str.split().apply(
        lambda tokens: [len(token) for token in tokens]
    )
    sns.distplot(word.map(np.mean), ax=ax, color=color)
    ax.set_title(title)
fig.suptitle('Average word length in each review')

In [None]:
from collections import defaultdict
def create_corpus(target):
    """Return a flat list of all tokens in reviews whose category is ``target``.

    Reads the module-level ``data`` frame, splits each ``review_text`` on
    whitespace and concatenates the tokens in row order.
    """
    token_lists = data[data['category'] == target]['review_text'].str.split()
    return [token for tokens in token_lists for token in tokens]

In [None]:
stop = list(id_stopword_dict['stopword'])

In [None]:
corpus=create_corpus(0)

# Count only the tokens that are stopwords. `stop` is a list, so testing
# membership against it scans the whole list for every token; a set makes
# each lookup O(1) without changing the result.
stopword_set = set(stop)
dic=defaultdict(int)
for word in corpus:
    if word in stopword_set:
        dic[word]+=1

# Ten most frequent stopwords, highest count first.
top=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:10]

# NOTE(review): the chart shows stopwords only, although the title says
# "Top Words" - confirm which one is intended.
plt.rcParams['figure.figsize'] = (18.0, 6.0)
plt.title('Top Words for negative reviews')
if top:  # zip(*[]) yields nothing to unpack into x, y
    x,y=zip(*top)
    plt.bar(x,y)

In [None]:
corpus=create_corpus(1)

# Count only the tokens that are stopwords. `stop` is a list, so testing
# membership against it scans the whole list for every token; a set makes
# each lookup O(1) without changing the result.
stopword_set = set(stop)
dic=defaultdict(int)
for word in corpus:
    if word in stopword_set:
        dic[word]+=1

# Ten most frequent stopwords, highest count first.
top=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:10]

# NOTE(review): the chart shows stopwords only, although the title says
# "Top Words" - confirm which one is intended.
plt.rcParams['figure.figsize'] = (18.0, 6.0)
plt.title('Top Words for positive reviews')
if top:  # zip(*[]) yields nothing to unpack into x, y
    x,y=zip(*top)
    plt.bar(x,y)

# Pretraining Language Model with MLM Training
Pretraining LM sebenarnya dilakukan di notebook terpisah, karena membutuhkan waktu yang lama. Berikut Kode yang digunakan untuk pretraining tetapi dimodifikasi untuk waktu running.

Pretraining dilakukan untuk mengadaptasi Language Model ke domain dari text yang diberikan.
Pretraining menggunakan initial weight dari  https://github.com/indobenchmark/indonlu

Reference:

[1] Bryan Wilie, Karissa Vincentio, Genta Indra Winata, Samuel Cahyawijaya, X. Li, Zhi Yuan Lim, S. Soleman, R. Mahendra, Pascale Fung, Syafri Bahar, & A. Purwarianti (2020). IndoNLU: Benchmark and Resources for Evaluating Indonesian Natural Language Understanding. In Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing.

In [None]:
import os, sys

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForMaskedLM, BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split

In [None]:
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seeder in seeders:
        seeder(seed)

In [None]:
set_seed(25012021)

In [None]:
# Tokenizer, config and weights all come from the same public IndoBERT checkpoint.
pretrained_name = 'indobenchmark/indobert-base-p1'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
config = BertConfig.from_pretrained(pretrained_name)
config.num_labels = 2  # presumably not used by the masked-LM head - TODO confirm

# Instantiate the masked-language-model variant for domain-adaptive pre-training
model = BertForMaskedLM.from_pretrained(pretrained_name, config=config)

In [None]:
model

In [None]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./all_text.txt",
    block_size=128,
)

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for the MLM pre-training run.
training_args = TrainingArguments(
    output_dir="./datavidia_lm",
    overwrite_output_dir=True,
    num_train_epochs=1, # the original notebook trained for 100 epochs
    # `per_gpu_train_batch_size` is deprecated; `per_device_train_batch_size`
    # is its replacement and gives the same per-GPU batch size.
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

# The collator applies the random masking; the dataset yields one tokenised
# review per line of all_text.txt.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
%%time
trainer.train()

In [None]:
trainer.save_model("./datavidia_lm")

# Classifier Training

## Indobert

In [None]:
df_train = pd.read_csv('./preprocessed_review_train.csv')

In [None]:
# Stratified 90/10 split over row positions. A fixed random_state (the same
# value passed to set_seed) makes the split identical on every run instead of
# depending on how much global NumPy RNG state earlier cells consumed.
train_split, valid_split = train_test_split(
    df_train.index,
    test_size=0.1,
    stratify=df_train['category'],
    random_state=25012021,
)

In [None]:
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)
def count_param(module, trainable=False):
    """Count the scalar parameters of ``module``.

    With ``trainable`` truthy only parameters whose ``requires_grad`` is set
    are counted; otherwise every parameter is counted.
    """
    return sum(
        param.numel()
        for param in module.parameters()
        if not trainable or param.requires_grad
    )
    

def metrics_to_string(metric_dict):
    """Format a metrics mapping as ``'NAME:0.12 OTHER:0.34'`` (two decimals)."""
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [None]:
set_seed(25012021)

In [None]:
# The tokenizer comes from the public IndoBERT checkpoint; config and weights
# are loaded from the locally stored MLM-pretrained model directory.
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
lm_checkpoint_dir = '../input/lm-training/datavidia_lm/'
config = BertConfig.from_pretrained(lm_checkpoint_dir)
config.num_labels = 2

# Instantiate the sequence-classification model from the pretrained checkpoint
model = BertForSequenceClassification.from_pretrained(lm_checkpoint_dir, config=config)

In [None]:
class DocumentSentimentDataset(Dataset):
    """Dataset of reviews yielding ``(subword_ids, label, text)`` per item.

    The rows come from a CSV with ``review_text`` and ``category`` columns.
    With ``train=False`` the ``category`` column is overwritten with 0.
    """

    # Number of sentiment classes.
    NUM_LABELS = 2

    def load_dataset(self, path, split):
        """Read the CSV at ``path`` and return the prepared DataFrame.

        When ``split`` is not None it is passed to ``DataFrame.iloc`` to
        select rows by position and the result is re-indexed from 0.
        ``review_text`` is cast to unicode strings (a missing value becomes
        the literal ``'nan'``) and lower-cased.
        """
        frame = pd.read_csv(path)
        if split is not None:
            frame = frame.iloc[split].reset_index(drop=True)
        frame['review_text'] = frame['review_text'].values.astype('U')
        frame['review_text'] = frame['review_text'].str.lower()
        return frame

    def __init__(self, dataset_path, tokenizer, no_special_token=False, train=True, split=None, *args, **kwargs):
        """Load the data and remember the tokenizer settings.

        Extra positional/keyword arguments are accepted and ignored.
        """
        self.tokenizer = tokenizer
        self.no_special_token = no_special_token
        self.data = self.load_dataset(dataset_path, split)
        if not train:
            # Every row gets label 0 so items keep the same structure.
            self.data['category'] = 0

    def __getitem__(self, index):
        """Return ``(np.array(token ids), np.array(label), text)`` for row ``index``."""
        row = self.data.loc[index, :]
        text = row['review_text']
        sentiment = row['category']
        use_special_tokens = not self.no_special_token
        token_ids = self.tokenizer.encode(text, add_special_tokens=use_special_tokens)
        return np.array(token_ids), np.array(sentiment), text

    def __len__(self):
        """Number of rows in the loaded frame."""
        return len(self.data)
        
class DocumentSentimentDataLoader(DataLoader):
    """DataLoader that pads variable-length token sequences into dense batches.

    Items are expected to be ``(subword_ids, label, text)`` triples. Each batch
    is returned as ``(subword_batch, mask_batch, sentiment_batch, seq_list)``
    where the first three are NumPy arrays and the last is a list of texts.
    """

    def __init__(self, max_seq_len=512, *args, **kwargs):
        """Forward all other arguments to ``DataLoader`` and install the collate function.

        ``max_seq_len`` caps the padded width of a batch; longer sequences are
        truncated.
        """
        super().__init__(*args, **kwargs)
        self.max_seq_len = max_seq_len
        # Always use the padding collate function defined below.
        self.collate_fn = self._collate_fn

    def _collate_fn(self, batch):
        """Pad/truncate the sequences of ``batch`` to a common width."""
        n_rows = len(batch)
        longest = max(len(item[0]) for item in batch)
        width = min(self.max_seq_len, longest)

        # Zero-initialised: id 0 fills the padded positions, mask 0 marks them.
        subword_batch = np.zeros((n_rows, width), dtype=np.int64)
        mask_batch = np.zeros((n_rows, width), dtype=np.float32)
        sentiment_batch = np.zeros((n_rows, 1), dtype=np.int64)

        seq_list = []
        for row, (token_ids, label, raw_text) in enumerate(batch):
            token_ids = token_ids[:width]
            n_tokens = len(token_ids)
            subword_batch[row, :n_tokens] = token_ids
            mask_batch[row, :n_tokens] = 1
            sentiment_batch[row, 0] = label
            seq_list.append(raw_text)

        return subword_batch, mask_batch, sentiment_batch, seq_list

In [None]:
train_dataset = DocumentSentimentDataset('./preprocessed_review_train.csv', tokenizer, split=train_split)
valid_dataset = DocumentSentimentDataset('./preprocessed_review_train.csv', tokenizer, split=valid_split)
test_dataset = DocumentSentimentDataset('./preprocessed_review_test.csv', tokenizer, train=False)

In [None]:
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=True)  
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)

In [None]:
# Move the model to the GPU first and only then build the optimizer over its
# parameters, as the PyTorch optimizer documentation recommends.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [None]:
def forward_sequence_classification(model, batch_data, is_test=False, device='cpu', **kwargs):
    """Run one batch through a sequence-classification model.

    Args:
        model: callable invoked as ``model(input_ids, attention_mask=...,
            token_type_ids=..., labels=...)``; the first two items of its
            output must be ``(loss, logits)``.
        batch_data: ``(subwords, mask, labels)`` or
            ``(subwords, mask, token_types, labels)``, each array-like.
            ``labels`` is indexed as ``labels[j][0]``.
        is_test: unused; kept so existing callers keep working.
        device: torch device on which to run, e.g. ``'cpu'``, ``'cuda'`` or
            ``'cuda:1'``.
        **kwargs: accepted and ignored.

    Returns:
        ``(loss, list_hyp, list_label, list_probs)``: the loss tensor, the
        predicted class index per row, the label per row, and the softmax
        probability of class 1 per row.

    Raises:
        ValueError: if ``batch_data`` has neither 3 nor 4 elements.
    """
    # Unpack batch data
    if len(batch_data) == 3:
        subword_batch, mask_batch, label_batch = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        subword_batch, mask_batch, token_type_batch, label_batch = batch_data
    else:
        raise ValueError(
            'batch_data must have 3 or 4 elements, got {}'.format(len(batch_data))
        )

    # Prepare input & label on the requested device
    device = torch.device(device)
    subword_batch = torch.as_tensor(subword_batch, dtype=torch.long).to(device)
    mask_batch = torch.as_tensor(mask_batch, dtype=torch.float).to(device)
    if token_type_batch is not None:
        token_type_batch = torch.as_tensor(token_type_batch, dtype=torch.long).to(device)
    label_batch = torch.as_tensor(label_batch, dtype=torch.long).to(device)

    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]
    # Explicit dim: softmax over the class axis (implicit dim is deprecated).
    probs = F.softmax(logits, dim=-1)

    # Generate prediction & label lists with one transfer per tensor instead
    # of one .item() call per element.
    list_hyp = torch.topk(logits, 1)[1].squeeze(-1).tolist()
    list_label = label_batch[:, 0].tolist()
    list_probs = probs[:, 1].tolist()

    return loss, list_hyp, list_label, list_probs
def document_sentiment_metrics_fn(list_hyp, list_label):
    """Compute accuracy and macro-averaged F1, recall and precision.

    ``list_hyp`` holds the predictions and ``list_label`` the reference
    labels. Returns a dict with the keys ``ACC``, ``F1``, ``REC`` and ``PRE``.
    """
    return {
        "ACC": accuracy_score(list_label, list_hyp),
        "F1": f1_score(list_label, list_hyp, average='macro'),
        "REC": recall_score(list_label, list_hyp, average='macro'),
        "PRE": precision_score(list_label, list_hyp, average='macro'),
    }

In [None]:
# Fine-tune the classifier: each epoch runs one pass over train_loader with
# gradient updates, then one pass over valid_loader without gradients.
n_epochs = 1
#n_epochs = 25 # we actually used 25 epochs to train the IndoBERT classifier
for epoch in range(n_epochs):
    # ---- Training phase: training mode on, gradients on ----
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label, list_probs = [], [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        # batch_data[:-1] drops the last batch element before the forward helper is called.
        loss, batch_hyp, batch_label, batch_probs = forward_sequence_classification(model, batch_data[:-1], device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        # Predictions/labels are accumulated over the whole epoch; the progress
        # bar shows the running mean loss over the batches seen so far.
        list_hyp += batch_hyp
        list_label += batch_label
        list_probs += batch_probs
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    # ---- Validation phase: eval mode, gradients globally disabled ----
    model.eval()
    torch.set_grad_enabled(False)
    
    # total_correct and total_labels are initialised but not used in this cell.
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label, list_probs = [], [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        # batch_seq (last batch element) is not used further in this loop.
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label, batch_probs = forward_sequence_classification(model, batch_data[:-1], device='cuda')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        # Metrics are recomputed over all predictions accumulated so far on
        # every batch, purely to refresh the progress-bar text.
        list_hyp += batch_hyp
        list_label += batch_label
        list_probs += batch_probs
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    # Final validation metrics for this epoch.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

In [None]:
# Inference on the test set: evaluation mode, gradients globally disabled.
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []
list_prob = []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    # The last batch element is dropped before calling the forward helper.
    model_inputs = batch_data[:-1]
    _, batch_hyp, _, batch_prob = forward_sequence_classification(model, model_inputs, device='cuda')
    list_hyp.extend(batch_hyp)
    list_prob.extend(batch_prob)

# Save prediction: one file with hard labels, one with class-1 probabilities.
for values, out_path in (
    (list_hyp, 'submission.csv'),
    (list_prob, 'submission_prob.csv'),
):
    sub = pd.read_csv('../input/penyisihan-datavidia-7-0/sample_submission.csv')
    sub['category'] = values
    sub.to_csv(out_path, index=False)

In [None]:
OOF_dataset = DocumentSentimentDataset('./preprocessed_review_train.csv', tokenizer)
OOF_loader = DocumentSentimentDataLoader(dataset=OOF_dataset, max_seq_len=512, batch_size=32, num_workers=16)

In [None]:
# Score every row served by OOF_loader with the current model.
# NOTE(review): despite the "OOF" name these look like in-sample predictions
# for the rows the model was trained on - confirm before using them as
# meta-features.
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []
list_prob_train = []

pbar = tqdm(OOF_loader, leave=True, total=len(OOF_loader))
for i, batch_data in enumerate(pbar):
    # The last batch element is dropped before calling the forward helper.
    model_inputs = batch_data[:-1]
    _, batch_hyp, _, batch_prob = forward_sequence_classification(model, model_inputs, device='cuda')
    list_hyp.extend(batch_hyp)
    list_prob_train.extend(batch_prob)

In [None]:
# Store the class-1 probability of every training row in place of its label.
train_probs = df_train[['review_id', 'review_text']].assign(category=list_prob_train)
train_probs.to_csv('train_probability_indobert_pretrained.csv')

## XLM-Roberta
Karena batasan waktu, kami tidak melakukan pretraining MLM pada model XLM Roberta

In [None]:
from transformers import XLMRobertaForSequenceClassification, XLMRobertaConfig, XLMRobertaTokenizer

In [None]:
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
config = XLMRobertaConfig.from_pretrained('xlm-roberta-base')
config.num_labels = 2

# Instantiate model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', config=config)

# The datasets created earlier keep a reference to the tokenizer they were
# built with; rebinding the name `tokenizer` above does not update them.
# Rebuild datasets and loaders so XLM-R receives ids from its own vocabulary.
train_dataset = DocumentSentimentDataset('./preprocessed_review_train.csv', tokenizer, split=train_split)
valid_dataset = DocumentSentimentDataset('./preprocessed_review_train.csv', tokenizer, split=valid_split)
test_dataset = DocumentSentimentDataset('./preprocessed_review_test.csv', tokenizer, train=False)
OOF_dataset = DocumentSentimentDataset('./preprocessed_review_train.csv', tokenizer)

train_loader = DocumentSentimentDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=True)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)
OOF_loader = DocumentSentimentDataLoader(dataset=OOF_dataset, max_seq_len=512, batch_size=32, num_workers=16)

In [None]:
# Move the model to the GPU first and only then build the optimizer over its
# parameters, as the PyTorch optimizer documentation recommends.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [None]:
# Fine-tune the classifier: each epoch runs one pass over train_loader with
# gradient updates, then one pass over valid_loader without gradients.
n_epochs = 1
#n_epochs = 5 # we actually used 5 epochs to train the XLM classifier
for epoch in range(n_epochs):
    # ---- Training phase: training mode on, gradients on ----
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label, list_probs = [], [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        # batch_data[:-1] drops the last batch element before the forward helper is called.
        loss, batch_hyp, batch_label, batch_probs = forward_sequence_classification(model, batch_data[:-1], device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        # Predictions/labels are accumulated over the whole epoch; the progress
        # bar shows the running mean loss over the batches seen so far.
        list_hyp += batch_hyp
        list_label += batch_label
        list_probs += batch_probs
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    # ---- Validation phase: eval mode, gradients globally disabled ----
    model.eval()
    torch.set_grad_enabled(False)
    
    # total_correct and total_labels are initialised but not used in this cell.
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label, list_probs = [], [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        # batch_seq (last batch element) is not used further in this loop.
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label, batch_probs = forward_sequence_classification(model, batch_data[:-1], device='cuda')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        # Metrics are recomputed over all predictions accumulated so far on
        # every batch, purely to refresh the progress-bar text.
        list_hyp += batch_hyp
        list_label += batch_label
        list_probs += batch_probs
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    # Final validation metrics for this epoch.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

In [None]:
# Inference on the test set: evaluation mode, gradients globally disabled.
# NOTE(review): this writes the same file names as the IndoBERT inference cell
# above, so running both in one session overwrites the earlier files - confirm
# that is acceptable.
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []
list_prob = []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    # The last batch element is dropped before calling the forward helper.
    model_inputs = batch_data[:-1]
    _, batch_hyp, _, batch_prob = forward_sequence_classification(model, model_inputs, device='cuda')
    list_hyp.extend(batch_hyp)
    list_prob.extend(batch_prob)

# Save prediction: one file with hard labels, one with class-1 probabilities.
for values, out_path in (
    (list_hyp, 'submission.csv'),
    (list_prob, 'submission_prob.csv'),
):
    sub = pd.read_csv('../input/penyisihan-datavidia-7-0/sample_submission.csv')
    sub['category'] = values
    sub.to_csv(out_path, index=False)

In [None]:
# Score every row served by OOF_loader with the current model.
# NOTE(review): despite the "OOF" name these look like in-sample predictions
# for the rows the model was trained on - confirm before using them as
# meta-features.
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []
list_prob_train = []

pbar = tqdm(OOF_loader, leave=True, total=len(OOF_loader))
for i, batch_data in enumerate(pbar):
    # The last batch element is dropped before calling the forward helper.
    model_inputs = batch_data[:-1]
    _, batch_hyp, _, batch_prob = forward_sequence_classification(model, model_inputs, device='cuda')
    list_hyp.extend(batch_hyp)
    list_prob_train.extend(batch_prob)

In [None]:
# Store the class-1 probability of every training row in place of its label.
train_probs = df_train[['review_id', 'review_text']].assign(category=list_prob_train)
train_probs.to_csv('train_probability_xlm_pretrained.csv')

## TF-IDF dengan SVM

In [None]:
df_train = pd.read_csv('./preprocessed_review_train.csv')
df_train['review_text'] = df_train['review_text'].values.astype('U')

In [None]:
train, valid = train_test_split(df_train, random_state = 0)

In [None]:
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

# TF-IDF features of the review text, classified by an RBF-kernel SVM.
svm = Pipeline([
    ('col_selector', ColumnSelector(cols=('review_text'),drop_axis=True)),
    ('tfidf', TfidfVectorizer()),
    # probability=True fits an extra calibration step that shuffles the data;
    # random_state pins it so predict_proba is reproducible across runs.
    ('classifier', SVC(kernel='rbf', probability=True, random_state=0)),
])


svm.fit(train,train['category'])

## TF-IDF dengan XGB

In [None]:
from xgboost import XGBClassifier

# TF-IDF features of the review text, classified by gradient-boosted trees
# with the library's default settings.
xgb_steps = [
    ('col_selector', ColumnSelector(cols=('review_text'),drop_axis=True)),
    ('tfidf', TfidfVectorizer()),
    ('classifier', XGBClassifier()),
]
xgb = Pipeline(xgb_steps)
xgb.fit(train,train['category'])

## Ensemble Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
xlm_proba = pd.read_csv('../input/datavidia-xlm-roberta/train_probability_xlm_pretrained.csv')
indobert_proba = pd.read_csv('../input/datavidia-bert-pretrained/train_probability_indobert_pretrained.csv')

In [None]:
def _stored_proba(frame, proba):
    """Return the stored class-1 probability for each review in ``frame``, in row order.

    The join key is explicit and the join is a left join, so the result always
    has exactly one value per row of ``frame``. ``validate`` raises if
    ``proba`` contains a ``review_id`` more than once.
    """
    merged = frame[['review_id']].merge(
        proba[['review_id', 'category']],
        on='review_id',
        how='left',
        validate='many_to_one',
    )
    if merged['category'].isna().any():
        raise ValueError('some review_id values have no stored probability')
    return merged['category'].to_numpy()


def _build_meta_features(frame):
    """Stack the class-1 probabilities of the four base models column-wise."""
    return pd.DataFrame({
        'svc': svm.predict_proba(frame)[:, -1],
        'xgb': xgb.predict_proba(frame)[:, -1],
        'indobert': _stored_proba(frame, indobert_proba),
        'xlm': _stored_proba(frame, xlm_proba),
    })


# NOTE(review): svm/xgb are scored on the same rows they were fitted on for
# df_meta, so these meta-features are in-sample - confirm this is intended.
df_meta = _build_meta_features(train)
df_meta_valid = _build_meta_features(valid)

In [None]:
meta_learner = LogisticRegression()
meta_learner.fit(df_meta, train['category'])

# Evaluasi Model

In [None]:
# F1 of each base model on the validation rows, thresholding its probability at 0.5.
print('model'.ljust(10), 'f1-score')
# The loop variable is deliberately not called `model`: that name holds the
# neural network used by the training/inference cells and must not be overwritten.
for model_name in df_meta_valid.columns:
    hard_pred = (df_meta_valid[model_name] > 0.5).astype(int)
    # scikit-learn's signature is f1_score(y_true, y_pred).
    print(model_name.ljust(10), f1_score(valid['category'], hard_pred))

In [None]:
# F1 of the stacked ensemble on the validation rows; scikit-learn's signature
# is f1_score(y_true, y_pred).
print('ensemble'.ljust(10), f1_score(valid['category'], meta_learner.predict(df_meta_valid)))

Hasil terbaik dari tim kami adalah ensemble dengan 3 model yaitu
1. SVM
2. XGBoost
3. Indobert

# Membuat Submission

In [None]:
df_test = pd.read_csv('./preprocessed_review_test.csv')
df_test['review_text'] = df_test['review_text'].values.astype('U')
indobert_test_prob = pd.read_csv('../input/datavidia-bert-pretrained/submission_prob.csv')
xlm_test_prob = pd.read_csv('../input/datavidia-xlm-roberta/submission_prob.csv')

In [None]:
df_meta_test = pd.DataFrame({
    'svc': svm.predict_proba(df_test)[:, -1],
    'xgb': xgb.predict_proba(df_test)[:, -1],
    'indobert': indobert_test_prob['category'],
    'xlm': xlm_test_prob['category']
})

In [None]:
# Copy the id column so adding `category` writes to an independent frame
# rather than to a slice of df_test (avoids SettingWithCopyWarning).
sub = df_test[['review_id']].copy()
sub['category'] = meta_learner.predict(df_meta_test)

In [None]:
sub.to_csv('final_submission.csv', index=False)