# Install Library

In [None]:
!pip install PySastrawi
!pip install catboost
!pip install -qq transformers
!pip install -U imbalanced-learn

In [None]:
!pip install numpy requests nlpaug wget
!pip install --upgrade --no-cache-dir gdown
!pip install sentence-transformers

# Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE

In [None]:
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier

In [None]:
import random

import torch
import torch.nn.functional as F
import torch.nn as nn

from transformers import AutoTokenizer, AutoModel, get_constant_schedule_with_warmup
from transformers import TrainingArguments, Trainer, AutoConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
seed_val = 2023
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

<torch._C.Generator at 0x7f2ee6b32530>

In [None]:
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(seed_val)

In [None]:
if torch.cuda.is_available():

    device = torch.device("cuda:0")
    print('Tersedia sejumlah %d GPU(s).' % torch.cuda.device_count())
    print('GPU yang akan digunakan:', torch.cuda.get_device_name(0))
else:
    print('Tidak mendukung GPU; hanya CPU')
    device = torch.device("cpu")

Tersedia sejumlah 1 GPU(s).
GPU yang akan digunakan: Tesla T4


In [None]:
import os
!git clone https://github.com/makcedward/nlpaug.git
os.environ["MODEL_DIR"] = '../model'

Cloning into 'nlpaug'...
remote: Enumerating objects: 5828, done.[K
remote: Counting objects: 100% (1355/1355), done.[K
remote: Compressing objects: 100% (443/443), done.[K
remote: Total 5828 (delta 965), reused 1247 (delta 902), pack-reused 4473[K
Receiving objects: 100% (5828/5828), 3.31 MiB | 6.67 MiB/s, done.
Resolving deltas: 100% (4127/4127), done.


In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action
from gensim.models.fasttext import FastText
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
from imblearn.over_sampling import ADASYN, BorderlineSMOTE

# Import Dataset

## Download dataset

In [None]:
!gdown --id "1HkCJzto0j5VN-Fg5WxVLAbVPDIu8QAfX"
!gdown --id "1um6RmGbv1Tz3pMGx7RCh-nuV_MOWURcQ"

## Transform to dataframe

In [None]:
doc_df = pd.read_csv('/content/anotasi_non_expert_dan_expert.csv', delimiter=",",index_col=False)

In [None]:
# Keep only rows that have a Category and whose expert label is not
# 'OtherEffect'. NaN never compares equal to anything (not even itself), so
# `!= np.nan` is True for every row; `notna()` is the real missing-value test.
doc_df = doc_df[doc_df['Category'].notna() & (doc_df['Expert'] != 'OtherEffect')]
doc_df = doc_df.reset_index(drop=True)

In [None]:
unlabelled_df = pd.read_csv('/content/unlabelled.csv')
unlabelled_df = unlabelled_df[unlabelled_df['Category'] == 'Question'][['Sentence']]

# Stopwords

In [None]:
def get_stopwords(df_used):
  """Build a corpus-specific stopword list from the 'Sentence' column.

  Every sentence is stripped of digits and punctuation, lower-cased and
  tokenized; the 5% most frequent and the 5% least frequent distinct words
  are returned (most frequent first, then least frequent).

  Args:
    df_used: DataFrame with a 'Sentence' column. Values are passed through
      `str()`, so missing values are tolerated.

  Returns:
    list[str]: the stopwords. Empty when the vocabulary has fewer than
    about ten distinct words.

  Raises:
    LookupError: propagated from `word_tokenize` when the NLTK tokenizer
      data is missing (the original silently returned an empty list).
  """
  word_counts = {}

  # Iterate over values rather than labels 0..n-1 so a frame whose index was
  # filtered but not reset still works.
  for sentence in df_used['Sentence']:
    res = re.sub(r"\d", "", str(sentence))
    res = re.sub(r"\s+", " ", res.lower())
    res = re.sub(r"[^\w\s]", "", res)

    for word in word_tokenize(res):
      word_counts[word] = word_counts.get(word, 0) + 1

  five_percent = round(len(word_counts) * 0.05)

  # sorted() is stable, so ties keep first-seen order in both directions.
  by_freq_desc = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
  by_freq_asc = sorted(word_counts.items(), key=lambda x: x[1])

  med_stopwords = [word for word, _ in by_freq_desc[:five_percent]]
  med_stopwords.extend(word for word, _ in by_freq_asc[:five_percent])

  return med_stopwords

In [None]:
med_stopwords = get_stopwords(doc_df)

# Pre-processing

In [None]:
# Class names in order of first appearance, excluding missing labels.
# The original dropped the first unique value positionally (`[1:]`), which is
# only correct when the missing-label marker happens to be the first value seen.
labels = doc_df['Expert'].dropna().unique()
map_name_id = {class_name: class_id for class_id, class_name in enumerate(labels)}

In [None]:
map_name_id

{'Information': 0,
 'Cause': 1,
 'Management': 2,
 'Diagnosis': 3,
 'PersonORG': 4,
 'Manifestation': 5,
 'Complication': 6,
 'Prognosis': 7,
 'Susceptibility': 8,
 'NotDisease': 9,
 'Anatomy': 10,
 'Other': 11}

In [None]:
bias_words = ["sinusitis", "kanker", "panu", "hepatitis", "wasir", "tumor", "jerawat", "kista", "demam", "hernia", "vertigo"]

In [None]:
def pre_processing(x, stem=False):
  """Normalize one sentence for the classifiers.

  Steps: cast to str, remove digits, collapse whitespace runs to one space,
  remove punctuation, lower-case. With `stem=True` the text is additionally
  tokenized, words in the global `med_stopwords` are dropped, and the
  remaining tokens are stemmed with the global Sastrawi `stemmer`.

  Args:
    x: the raw sentence (any object; converted with `str`).
    stem: when True, also remove stopwords and stem.

  Returns:
    str: the normalized text. In stem mode every kept token is preceded by
    a single space, so the result starts with a space when non-empty.
  """
  # Raw strings: "\d", "\s" and "\w" are invalid escapes in normal literals.
  res = re.sub(r"\d", "", str(x))
  res = re.sub(r"\s+", " ", res)
  res = re.sub(r"[^\w\s]", "", res)
  res = res.lower()
  if stem:
    kept = [token for token in word_tokenize(res) if token not in med_stopwords]
    res = "".join(" " + stemmer.stem(token.lower().strip()) for token in kept)
  return res

In [None]:
doc_df['Sentence'] = doc_df['Sentence'].apply(pre_processing)

In [None]:
doc_df['Expert'].value_counts()

Management        226
Diagnosis         223
Information       170
Cause             117
Complication       74
Prognosis          73
Manifestation      27
PersonORG          17
Susceptibility     12
NotDisease         10
Other               7
Anatomy             5
Name: Expert, dtype: int64

# Modelling

## Function

### Oversampling

In [None]:
# Keyword lists per class, used by keyword_oversampling() to pick unlabelled
# sentences that contain at least one of the class's keywords.
# Keys are the class names that also appear in `map_name_id`.
# NOTE(review): 'NotDisease' maps to [''] — presumably a placeholder meaning
# "no keyword"; an empty string is unlikely to ever equal a token, so this
# class would receive no keyword-based samples. Confirm this is intended.
dict_label_words = {
    'Management': ['obat','solusi','saran','atas', 'operasi','hilang'],
    'Diagnosis' : ['derita','gangguan','sebenarnya','wajar'],
    'Information': ['boleh', 'formula','program'],
    'Prognosis': ['sembuh','berapa','bisakah'],
    'Cause': ['sebab', 'faktor','kok'],
    'Susceptibility': ['kemungkinan'],
    'Anatomy': ['bagian','tubuh'],
    'Manifestation': ['ciriciri'],
    'PersonORG': ['spesialis', 'periksa','baik'],
    'Complication' : ['efek',  'bahaya'],
    'NotDisease' : [''],
    'Other': ['biaya', 'harga', 'taksir']
}

In [None]:
def oversampling_augmentation(df, states):
  """Top up minority classes with keyword-matched unlabelled sentences.

  For every class except the most frequent one, candidate sentences are
  fetched with `keyword_oversampling` and up to (majority count - class
  count) of them are sampled and appended.

  Args:
    df: DataFrame with 'Sentence' and 'Expert' columns. 'Expert' may hold
      class names or the integer ids from `map_name_id`.
    states: seed passed to `DataFrame.sample(random_state=...)`.

  Returns:
    DataFrame: `df` itself when nothing could be added, otherwise a new
    frame with a fresh 0..n-1 index. Appended rows carry the integer class
    id in 'Expert' (as produced by `keyword_oversampling`), even when `df`
    uses class names.
  """
  counts = df['Expert'].value_counts()
  if counts.empty:
    return df

  # value_counts() sorts descending, so the majority count is at position 0.
  # `counts[0]` would be a *label* lookup when the labels are integers.
  max_count = counts.iloc[0]
  id_to_name = {class_id: name for name, class_id in map_name_id.items()}

  extra_frames = []
  for label, count in counts.iloc[1:].items():
    # Keyword lists are keyed by class name; translate integer ids first.
    class_name = id_to_name.get(label, label)
    keywords = dict_label_words.get(class_name)
    to_add = max_count - count
    if not keywords or to_add <= 0:
      continue

    candidates = keyword_oversampling(class_name, keywords)
    if candidates.empty:
      continue

    # Take what is available instead of failing when the pool is too small.
    n_samples = min(to_add, len(candidates))
    extra_frames.append(candidates.sample(n=n_samples, random_state=states))

  if not extra_frames:
    return df
  return pd.concat([df] + extra_frames, ignore_index=True)

In [None]:
def keyword_oversampling(class_name, words):
  """Collect unlabelled sentences that contain a keyword of `class_name`.

  Each matching sentence from the global `unlabelled_df` is emitted once,
  stemmed token by token and re-joined with single spaces, and labelled
  with the integer id of `class_name`.

  Args:
    class_name: class name; must be a key of `map_name_id`.
    words: keywords that identify the class.

  Returns:
    DataFrame with columns ['Sentence', 'Expert'] (possibly empty).

  Raises:
    KeyError: if `class_name` is not in `map_name_id`.
  """
  class_id = map_name_id[class_name]
  keywords = set(words)
  other_keyword_lists = [kw for name, kw in dict_label_words.items() if name != class_name]

  rows = []
  seen = set()
  for data in unlabelled_df['Sentence']:
    # De-duplicate on the raw sentence. The original compared the raw text
    # with already-stemmed rows, so the check never matched and every hit was
    # appended once per other class.
    if data in seen:
      continue

    tokens = word_tokenize(data)
    # NOTE(review): the original accepts a keyword when at least one other
    # class does not list it, which is kept here. If the intent is "no other
    # class lists it", the inner any() should be all(); the two agree for the
    # current, non-overlapping keyword lists.
    is_match = any(
        token in keywords and any(token not in kw for kw in other_keyword_lists)
        for token in tokens
    )
    if not is_match:
      continue

    seen.add(data)
    rows.append({
        # Join with spaces so the result is still a tokenizable sentence.
        'Sentence': ' '.join(stemmer.stem(token) for token in tokens),
        'Expert': class_id
    })

  return pd.DataFrame(rows, columns=['Sentence', 'Expert'])

In [None]:
def pseudolabelling_oversampling(df, vectorizer):
  """Top up minority classes with pseudo-labelled sentences.

  A Perceptron is trained on `df`, used to predict a class for every
  sentence in the global `unlabelled_df`, and predicted sentences are
  appended (in corpus order) until each class reaches the size of the
  largest class or the pool is exhausted.

  Args:
    df: DataFrame with 'Sentence' and 'Expert' columns. Class names are
      converted to the integer ids of `map_name_id`; integer ids are kept.
      The caller's frame is not modified.
    vectorizer: an already fitted vectorizer exposing `transform`.

  Returns:
    DataFrame with integer ids in 'Expert' and a fresh 0..n-1 index.
  """
  # Copy first: `df` is usually a slice of a larger frame.
  df = df.copy()
  df['Expert'] = [map_name_id.get(label, label) for label in df['Expert']]

  y = df['Expert']
  X_vectors = vectorizer.transform(df['Sentence'].tolist())

  model = Perceptron(max_iter=300)
  model.fit(X_vectors, y)

  X_test = unlabelled_df['Sentence'].values
  y_pred = model.predict(vectorizer.transform(X_test))

  # value_counts() sorts descending; use .iloc so integer class ids are not
  # mistaken for positions. The majority class gets a quota of 0.
  counts = y.value_counts()
  max_count = counts.iloc[0]
  quota = {label: max_count - count for label, count in counts.items()}
  # Keyed by class id, like the predictions. The original keyed this counter
  # by class *name*, so every lookup raised KeyError and nothing was added.
  added = dict.fromkeys(quota, 0)

  rows = []
  for sentence, label in zip(X_test, y_pred):
    if added.get(label, 0) < quota.get(label, 0):
      rows.append({'Sentence': sentence, 'Expert': label})
      added[label] += 1

  if not rows:
    return df.reset_index(drop=True)

  pseudo_df = pd.DataFrame(rows, columns=['Sentence', 'Expert'])
  return pd.concat([df, pseudo_df], ignore_index=True)

### ML

In [None]:
# Display names for the n-gram ranges; the tuple keys are passed as `n` to
# table_visualization() and the names are used in output file titles.
# NOTE(review): train_ML() currently hard-codes ngram_range=(1,1), so only
# the unigram setting is actually trained — confirm before reporting
# bigram/combination tables.
n_gram = {
    (1,1): 'Unigram',
    (2,2): 'Bigram',
    (1,2): 'Kombinasi'
}

# Metric names in the column order of a classification report row; the list
# index is passed as `m` to table_visualization().
metrics = ['Precision', 'Recall', 'F1']

In [None]:
def evaluate(report):
  """Parse the text of a scikit-learn classification report into a dict.

  Per-class rows are keyed by the class id from `map_name_id` and hold all
  numeric columns of the row. Summary rows are keyed by their row label as
  it appears after splitting on double spaces (so 'macro avg' keeps its
  leading space) and hold every column except the last one.

  Args:
    report: string returned by `classification_report`.

  Returns:
    dict mapping row key -> list of floats.
  """
  sections = report.split('\n\n')
  parsed = {}

  # Section 1: one line per class -> name, precision, recall, f1, support.
  for line in sections[1].split('\n'):
    fields = [field for field in line.split(" ") if field]
    parsed[map_name_id[fields[0]]] = [float(value) for value in fields[1:]]

  # Section 2: summary rows. Split on double spaces so multi-word labels
  # stay together; the trailing line of the report is skipped.
  summary_lines = sections[2].split('\n')
  for line in summary_lines[:-1]:
    fields = [field for field in line.split("  ") if field]
    parsed[fields[0]] = [float(value) for value in fields[1:-1]]

  return parsed

In [None]:
def train_ML(df_used, model,n, states, oversampling_smote, oversampling_pseudolabelling, augmentation):
  """Train and evaluate `model` on `n` stratified 70/30 splits.

  Args:
    df_used: DataFrame with 'Sentence' and 'Expert' (class name) columns.
    model: estimator exposing `fit` and `predict`; it is refitted each run.
    n: number of runs.
    states: sequence with at least `n` seeds, one per split.
    oversampling_smote: oversample the training vectors with BorderlineSMOTE.
    oversampling_pseudolabelling: oversample with pseudo-labelled sentences.
    augmentation: oversample with keyword-matched sentences.
      The three flags are checked in that order; the first True one wins.

  Returns:
    (dict_avg, dict_stds, dict_scores): mean and standard deviation per
    report row as computed by `find_std`, and the parsed report of each run
    keyed by run index.
  """
  # Copy so the label mapping below does not write into a slice of df_used.
  dataset = df_used[['Sentence','Expert']].copy()
  y = df_used['Expert']
  dataset['Expert'] = [map_name_id[class_name] for class_name in y.values]

  # NOTE(review): the TF-IDF vocabulary/IDF is fitted on the full dataset,
  # i.e. including the sentences later used for validation.
  vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=True, ngram_range=(1,1))
  vectorizer.fit(dataset['Sentence'])

  dict_all_result = {}

  for run in range(n):
    # The same seed and stratification give the same partition in every mode.
    train, val = train_test_split(dataset, test_size=0.3, stratify=y, random_state=states[run])
    X_test = vectorizer.transform(val['Sentence'])
    y_test = val['Expert']

    if oversampling_smote:
      smote = BorderlineSMOTE(random_state=states[run], k_neighbors=2)
      X_train, y_train = smote.fit_resample(vectorizer.transform(train['Sentence']), train['Expert'])
    elif oversampling_pseudolabelling:
      train = pseudolabelling_oversampling(train, vectorizer)
      X_train = vectorizer.transform(train['Sentence'].tolist())
      y_train = list(train['Expert'])
    elif augmentation:
      # Vectorize the augmented training set as a whole. The original sliced
      # at the pre-augmentation size, which would have pushed any added rows
      # into the test split.
      train = oversampling_augmentation(train, states[run])
      X_train = vectorizer.transform(train['Sentence'])
      y_train = train['Expert'].astype(int).values
    else:
      X_train = vectorizer.transform(train['Sentence'])
      y_train = train['Expert']

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Pass `labels` explicitly so names and ids stay aligned even if a class
    # is absent from a split.
    report = classification_report(y_test, y_pred,
                                   labels=list(map_name_id.values()),
                                   target_names=list(map_name_id.keys()))
    dict_all_result[run] = evaluate(report)

  # Aggregate once, after all runs. find_std already returns means, so they
  # must not be divided by n again.
  dict_stds, dict_avg = find_std(dict_all_result, n)

  return dict_avg, dict_stds, dict_all_result

In [None]:
def model_ML(df_used, n, states, oversampling_smote=False, oversampling_pseudolabelling=False, augmentation=False):
  """Run `train_ML` for each classical model and collect the results.

  Models, in order: XGBoost, Naive Bayes, Decision Tree, Perceptron, MLP, SVM.

  Args:
    df_used: DataFrame with 'Sentence' and 'Expert' columns.
    n: number of runs per model.
    states: split seeds, forwarded to `train_ML`.
    oversampling_smote, oversampling_pseudolabelling, augmentation:
      oversampling switches, forwarded to `train_ML`.

  Returns:
    (dict_avg, dict_std, dict_scores), each keyed by model name.
  """
  # Factories rather than instances so each model is created right before
  # it is trained, as in the sequential version.
  model_factories = [
      ('XGBoost', lambda: xgb.XGBClassifier(objective="multi:softprob")),
      ('Naive Bayes', lambda: MultinomialNB()),
      ('Decision Tree', lambda: DecisionTreeClassifier(max_depth = 20)),
      ('Perceptron', lambda: Perceptron(max_iter=300)),
      ('MLP', lambda: MLPClassifier(hidden_layer_sizes=2, activation='relu', solver='adam', max_iter=300, learning_rate='invscaling')),
      ('SVM', lambda: svm.SVC(kernel='poly', degree=3, C=1)),
  ]

  dict_avg, dict_std, dict_scores = {}, {}, {}
  for model_name, make_model in model_factories:
    avg, std, scores = train_ML(df_used, make_model(), n, states,
                                oversampling_smote, oversampling_pseudolabelling, augmentation)
    dict_avg[model_name] = avg
    dict_std[model_name] = std
    dict_scores[model_name] = scores

  return dict_avg, dict_std, dict_scores

In [None]:
def find_std(dict_all_result,n):
  """Aggregate per-run report rows into mean and standard deviation.

  Only the first three values of each row are aggregated (precision,
  recall, F1 for class rows); rows with fewer values, such as 'accuracy',
  are aggregated over the values they have.

  Args:
    dict_all_result: dict run index -> {row key -> list of floats}.
    n: number of runs expected; run indices 0..n-1 are read. Runs that are
      not present yet are ignored instead of raising.

  Returns:
    (dict_std, dict_avg): for every row key of the first available run, a
    list of values rounded to 2 decimals. Both are empty when no run is
    available.
  """
  dict_std = {}
  dict_avg = {}

  runs = [dict_all_result[j] for j in range(n) if j in dict_all_result]
  if not runs:
    return dict_std, dict_avg

  for key in runs[0]:
    samples = [run[key] for run in runs if key in run]
    # Bound the metric index explicitly rather than relying on a bare except
    # to swallow the IndexError of short rows.
    n_metrics = min(3, min(len(values) for values in samples))

    dict_avg[key] = [round(np.mean([values[k] for values in samples]), 2) for k in range(n_metrics)]
    dict_std[key] = [round(np.std([values[k] for values in samples]), 2) for k in range(n_metrics)]

  return dict_std, dict_avg

In [None]:
def table_visualization(dict_avg,dict_std,n, m):
  """Build a 'mean +- std' table with one column per model.

  Args:
    dict_avg: dict model name -> results. The results are either keyed
      directly by report row (the shape `model_ML` returns) or nested one
      level deeper under an n-gram key.
    dict_std: same structure as `dict_avg`, holding standard deviations.
    n: n-gram key to select when the results are nested; ignored for the
      flat shape.
    m: metric index within a row (0 precision, 1 recall, 2 F1).

  Returns:
    DataFrame with a 'Label' column (class names plus 'Weighted Average')
    and one formatted column per model. The 'accuracy' and macro-average
    rows are left out.
  """
  skipped_rows = ('accuracy', ' macro avg')

  df = pd.DataFrame(columns=['Label', 'XGBoost', 'Naive Bayes', 'Perceptron', 'MLP', 'Decision Tree', 'SVM'])
  for model_name in dict_avg:
    model_score = dict_avg[model_name]
    std = dict_std[model_name]

    # Accept both shapes: descend only when `n` selects a nested dict.
    nested_score = model_score.get(n)
    if isinstance(nested_score, dict):
      model_score = nested_score
      std = std[n]

    df[model_name] = [
        f"{round(model_score[row][m],2):.2f} +- {round(std[row][m],2):.2f}"
        for row in model_score
        if row not in skipped_rows
    ]
  df['Label'] = list(map_name_id.keys()) + list(['Weighted Average'])
  return df

In [None]:
def convert_to_latex(df, title,n):
  """Render a results table as a LaTeX longtable and save it to disk.

  Args:
    df: DataFrame with a 'Label' column and the six model columns, whose
      cells are strings of the form 'mean +- std'.
    title: used for the table label and as the output file name
      ('<title>.tex', written to the current directory).
    n: weighting / n-gram name; lower-cased and inserted into the caption.

  Returns:
    str: the LaTeX source that was written. The last row of `df` is
    rendered as the summary row and closes the longtable environment.
  """
  model_columns = ['XGBoost', 'Naive Bayes', 'Perceptron', 'MLP', 'Decision Tree', 'SVM']

  # Raw strings keep the LaTeX backslashes literal without invalid escapes.
  res = r'''
  {
  \scriptsize
  \begin{longtable}{c c c c c c c c}
  '''
  res += r"\caption{\bo{Eksperimen 1}: Skor F1 dengan faktor pembobot " + n.lower() + r" dan tanpa \f{oversampling} untuk \f{input} 1}"
  res += r"\label{tab:" + title + r"} \\"
  res += r'''
      \hline
    \bo{Label} & \bo{XGBoost} & \bo{Naive Bayes} & \bo{Perceptron} & \bo{MLP} & \bo{Decision Tree} & \bo{SVM} \\ \hline
  '''

  # Close the table on whatever the last row is, not on a fixed index 12.
  last_row = df.index[-1] if len(df.index) else None
  for i in df.index:
    temp = df.loc[i]
    cells = ' & '.join(temp[column].replace('+-', r'$\pm$') for column in model_columns)
    if i != last_row:
      res += temp['Label'].upper() + ' & ' + cells + r'\\ ' + '\n'
    else:
      res += r'\hline \f{' + temp['Label'] + '} & ' + cells + r'\\ \hline ' + '\n' + r' \end{longtable} ' + '\n}'

  # The context manager closes the file even if the write fails.
  with open(f'{title}.tex', 'w') as files:
    files.write(res)
  return res

### BERT

In [None]:
#--------------------------------
#  Some Parameters
#--------------------------------
# Maximum number of token ids per sentence (longer inputs are truncated,
# shorter ones padded) and number of sentences per batch.
MAX_SEQ_LENGTH = 64
BATCH_SIZE = 32
# Only feeds the warm-up step count in train_bert(); the scheduler that
# would consume it is commented out there.
WARMUP_PROPORTION = 0.4
MODEL_NAME = "stevenwh/indobert-base-p2-finetuned-mer-80k"
# NOTE(review): 5e-4 is applied to the whole transformer as well as the
# classifier head; BERT fine-tuning commonly uses rates around 2e-5 to 5e-5.
# Confirm this value is intended.
LEARNING_RATE = 5e-4
EPOCH = 10
# Module-level encoder and tokenizer shared by create_data_loader() and
# train_bert(); train_bert() updates `transformer` in place.
transformer = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of the model checkpoint at stevenwh/indobert-base-p2-finetuned-mer-80k were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at stevenwh/indobert-base-p2-finetuned-mer-80k and are newly initialized: ['bert.poo

Downloading (…)okenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/709k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
def create_data_loader(instances, label_map):
  """Tokenize (sentence, label) pairs and wrap them in a batched DataLoader.

  Args:
    instances: iterable of (sentence, label) pairs.
    label_map: dict label -> target vector (list of numbers).

  Returns:
    DataLoader yielding (token ids, attention mask, target) batches of
    `BATCH_SIZE` in the original order; all tensors live on `device`.
  """
  token_id_rows = []   # token ids per sentence
  mask_rows = []       # attention mask per sentence
  target_rows = []     # target vector per sentence

  for text, label in instances:
    token_ids = tokenizer.encode(text,
                                 add_special_tokens = True,
                                 max_length = MAX_SEQ_LENGTH,
                                 padding = "max_length",
                                 truncation = True)
    token_id_rows.append(token_ids)
    # Token ids greater than 0 are attended to; id 0 is treated as padding.
    mask_rows.append([1 if token_id > 0 else 0 for token_id in token_ids])
    target_rows.append(label_map[label])

  # Convert to tensors on the target device, then bundle into a dataset.
  tensors = [torch.tensor(rows).to(device) for rows in (token_id_rows, mask_rows, target_rows)]
  dataset = TensorDataset(*tensors)

  # Sequential (unshuffled) batches.
  return DataLoader(dataset,
                    sampler = SequentialSampler(dataset),
                    batch_size = BATCH_SIZE)

In [None]:
def model_bert(df_used, states, n, pseudolabelling=False, augmentation=False):
  """Fine-tune and evaluate the BERT classifier on `n` stratified splits.

  Args:
    df_used: DataFrame with 'Sentence' and 'Expert' (class name) columns.
    states: sequence with at least `n` seeds, one per split.
    n: number of runs.
    pseudolabelling: oversample the training split with pseudo-labels.
    augmentation: oversample the training split with keyword matches
      (takes precedence over `pseudolabelling`).

  Returns:
    (dict_final, dict_std, dict_all_result): mean and standard deviation
    per report row, and the parsed report of every run keyed by run index.
  """
  dataset = df_used[['Sentence','Expert']]
  y = dataset['Expert']

  config = AutoConfig.from_pretrained(MODEL_NAME)
  dict_all_result = {}

  vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=True, ngram_range=(1,1))
  vectorizer.fit(dataset['Sentence'])

  inv_map = {v: k for k, v in map_name_id.items()}

  # One-hot targets in map_name_id order, so an argmax index is the class id
  # and matches the label order given to classification_report below.
  # (pd.get_dummies orders columns alphabetically, which mislabelled rows.)
  num_classes = len(map_name_id)
  label_map = {
      name: [int(position == class_id) for position in range(num_classes)]
      for name, class_id in map_name_id.items()
  }

  # Freshly loaded encoder weights, restored before each run so runs are
  # independent: otherwise run k starts from weights already trained on
  # sentences that are in run k's validation split.
  pretrained_state = AutoModel.from_pretrained(MODEL_NAME).state_dict()

  for i in range(n):
    transformer.load_state_dict(pretrained_state)
    classifier = Classifier(input_size = config.hidden_size, output_size = num_classes)

    train, val = train_test_split(dataset, test_size=0.3, stratify=y, random_state=states[i])

    if(augmentation):
      train = oversampling_augmentation(train, states[i])
    elif(pseudolabelling):
      train = pseudolabelling_oversampling(train, vectorizer)

    if augmentation or pseudolabelling:
      # Oversampled rows carry integer ids; convert them back to class names
      # and leave rows that already hold a name untouched.
      train = train.assign(Expert=[inv_map.get(label, label) for label in train['Expert']])

    train_docs = list(train.itertuples(index=False, name=None))
    valid_docs = list(val.itertuples(index=False, name=None))

    y_pred, y_true = train_bert(train_docs, valid_docs, label_map, classifier)
    report = classification_report(y_true, y_pred,
                                   labels=list(map_name_id.values()),
                                   target_names=list(map_name_id.keys()))

    dict_all_result[i] = evaluate(report)

  dict_std, dict_final = find_std(dict_all_result,n)
  return dict_final, dict_std, dict_all_result

In [None]:
# Classifier is the layer stacked on top of BERT that performs the
# classification; its input is BERT's CLS vector.
class Classifier(nn.Module):
    """Single linear layer followed by LeakyReLU, dropout and softmax.

    Args:
        input_size: size of the incoming feature vector.
        output_size: number of classes.
        dropout_rate: dropout probability applied before the softmax.
    """

    def __init__(self, input_size = 512, output_size = 12, dropout_rate = 0.1):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, output_size),
            nn.LeakyReLU(0.2, inplace = True),
            nn.Dropout(dropout_rate),
            nn.Softmax(dim = -1),
        )

    def forward(self, input):
        """Return class probabilities for `input` (softmax over the last dim)."""
        return self.layers(input)

In [None]:
def _one_hot_cross_entropy(probs, one_hot_labels, eps = 1e-12):
  """Mean categorical cross-entropy of probabilities against one-hot targets.

  Probabilities are clamped to `eps` before the log: log(0) is -inf and
  0 * -inf is NaN, which would otherwise poison the loss and the gradients.
  """
  return -torch.mean(torch.sum(one_hot_labels * torch.log(probs.clamp_min(eps)), dim = 1))


def train_bert(train_docs, valid_docs, label_map, classifier):
  """Fine-tune the global `transformer` plus `classifier` and validate.

  Trains for `EPOCH` epochs with AdamW at `LEARNING_RATE`, evaluating on the
  validation documents after every epoch.

  Args:
    train_docs: list of (sentence, label) pairs used for training.
    valid_docs: list of (sentence, label) pairs used for validation.
    label_map: dict label -> one-hot target vector.
    classifier: module mapping BERT's pooled output to class probabilities.

  Returns:
    (pred_labels, true_labels): numpy arrays with the predicted and true
    class indices on the validation set after the last epoch.
  """
  train_instances = create_data_loader(train_docs,label_map)
  valid_instances = create_data_loader(valid_docs,label_map)

  all_vars = list(transformer.parameters()) + list(classifier.parameters())

  # Put everything on the GPU if available
  if torch.cuda.is_available():
    classifier.cuda()
    transformer.cuda()

  # optimizer (no learning-rate scheduler is used)
  optimizer = torch.optim.AdamW(all_vars, lr = LEARNING_RATE)

  for epoch_i in range(0, EPOCH):
    print(f"Epoch {epoch_i + 1} / {EPOCH}")
    # Reset the total loss for this epoch.
    tr_loss = 0

    # Put the model into training mode.
    transformer.train()
    classifier.train()

    # For every batch of the training data
    for step, batch in enumerate(train_instances):
      print(f">>>> Batch {step + 1}")

      # Unpack a batch from the DataLoader
      batch_input_ids = batch[0]
      batch_input_att_mask = batch[1]
      # Float targets so integer or boolean one-hot vectors both work.
      batch_label = batch[2].float()

      # Encode the batch with the transformer and classify the pooled output
      bert_outputs = transformer(batch_input_ids, attention_mask = batch_input_att_mask)
      cls_hidden_states = bert_outputs.pooler_output
      probs = classifier(cls_hidden_states)

      # Categorical cross-entropy
      loss = _one_hot_cross_entropy(probs, batch_label)

      # Clear gradients so they do not accumulate; required before
      # computing new gradients with loss.backward()
      optimizer.zero_grad()

      # compute gradients
      loss.backward()

      # update parameters
      optimizer.step()

      # accumulate the loss over the epoch
      tr_loss += loss.item()

    # average loss over the epoch
    avg_loss = tr_loss / len(train_instances)

    print(f"Average loss: {avg_loss}")

    ### Evaluate on the validation data
    print("Uji coba di Validation Data ...")

    # eval mode, so that stochastic layers such as dropout are disabled
    classifier.eval()
    transformer.eval()

    val_loss = 0
    pred_batches = []
    true_batches = []

    # For every batch of the validation data
    for step, batch in enumerate(valid_instances):

      # Unpack a batch from the DataLoader
      batch_input_ids = batch[0].to(device)
      batch_input_att_mask = batch[1].to(device)
      batch_label = batch[2].to(device).float()

      # do not track gradients: this is evaluation, not training
      with torch.no_grad():
        bert_outputs = transformer(batch_input_ids, attention_mask = batch_input_att_mask)
        cls_hidden_states = bert_outputs.pooler_output
        probs = classifier(cls_hidden_states)

        # Categorical cross-entropy, accumulated over the validation set
        loss = _one_hot_cross_entropy(probs, batch_label)
        val_loss += loss.item()

      # accumulate predictions; torch.max returns (values, indices)
      _, pred_l = torch.max(probs, dim = 1)
      pred_batches.append(pred_l.detach().cpu())
      _, true_l = torch.max(batch_label, dim = 1)
      true_batches.append(true_l.detach().cpu())

    # average loss
    avg_val_loss = val_loss / len(valid_instances)

    # accuracy
    pred_labels = torch.cat(pred_batches).numpy()
    true_labels = torch.cat(true_batches).numpy()
    val_accuracy = np.sum(pred_labels == true_labels) / len(pred_labels)
    print(f"Average loss di validation data: {avg_val_loss}")
    print(f"Accuracy di validation data: {val_accuracy}")

    print("")
  return pred_labels,true_labels


## Scenario 1: Question Only

In [None]:
question_df = doc_df[doc_df['Category']=='Question'][['Sentence','Expert']]

### ML

In [None]:
# `states` holds one train/validation split seed per run. It is used by every
# experiment below but is not defined anywhere above, so a fresh
# Restart & Run All would stop here with a NameError.
# NOTE(review): these seeds are derived from seed_val; replace them with the
# seeds of the original experiments if those need to be reproduced exactly.
N_RUNS = 20
states = [seed_val + run for run in range(N_RUNS)]

dict_avg, dict_std, dict_scores  = model_ML(question_df, N_RUNS, states, oversampling_smote=False, oversampling_pseudolabelling=True, augmentation=False)

In [None]:
dict_std

{'XGBoost': {0: [0.05, 0.08, 0.06],
  1: [0.08, 0.09, 0.08],
  2: [0.05, 0.05, 0.04],
  3: [0.04, 0.06, 0.05],
  4: [0.36, 0.15, 0.19],
  5: [0.28, 0.13, 0.15],
  6: [0.09, 0.1, 0.09],
  7: [0.09, 0.1, 0.09],
  8: [0.24, 0.08, 0.11],
  9: [0.24, 0.12, 0.15],
  10: [0.11, 0.11, 0.11],
  11: [0.33, 0.34, 0.29],
  'accuracy': [0.02],
  ' macro avg': [0.05, 0.03, 0.04],
  'weighted avg': [0.02, 0.02, 0.02]},
 'Naive Bayes': {0: [0.08, 0.06, 0.06],
  1: [0.11, 0.03, 0.05],
  2: [0.03, 0.02, 0.02],
  3: [0.02, 0.03, 0.02],
  4: [0.0, 0.0, 0.0],
  5: [0.0, 0.0, 0.0],
  6: [0.45, 0.03, 0.06],
  7: [0.43, 0.04, 0.07],
  8: [0.0, 0.0, 0.0],
  9: [0.0, 0.0, 0.0],
  10: [0.0, 0.0, 0.0],
  11: [0.0, 0.0, 0.0],
  'accuracy': [0.02],
  ' macro avg': [0.04, 0.01, 0.01],
  'weighted avg': [0.04, 0.02, 0.02]},
 'Decision Tree': {0: [0.03, 0.1, 0.04],
  1: [0.1, 0.06, 0.06],
  2: [0.05, 0.06, 0.05],
  3: [0.05, 0.06, 0.03],
  4: [0.45, 0.14, 0.18],
  5: [0.37, 0.13, 0.17],
  6: [0.14, 0.06, 0.08],
  7: [

In [None]:
dict_scores['XGBoost'][(1,1)][19]

{0: [0.25, 0.31, 0.28, 51.0],
 1: [0.45, 0.4, 0.42, 35.0],
 2: [0.66, 0.75, 0.7, 68.0],
 3: [0.57, 0.58, 0.58, 67.0],
 4: [0.0, 0.0, 0.0, 5.0],
 5: [0.0, 0.0, 0.0, 8.0],
 6: [0.46, 0.55, 0.5, 22.0],
 7: [0.57, 0.36, 0.44, 22.0],
 8: [0.0, 0.0, 0.0, 4.0],
 9: [0.0, 0.0, 0.0, 3.0],
 10: [0.0, 0.0, 0.0, 2.0],
 11: [0.5, 1.0, 0.67, 2.0],
 'accuracy': [0.49],
 ' macro avg': [0.29, 0.33, 0.3],
 'weighted avg': [0.47, 0.49, 0.48]}

In [None]:
# Persist the ML results of scenario 1 as Python-literal text.
res = "dict_avg = " + str(dict_avg) + "\ndict_std = " + str(dict_std) + "\ndict_scores = " + str(dict_scores)
# The context manager closes the file even if the write fails.
with open('Eksperimen 2 - Input 1.txt', 'w') as files:
  files.write(res)

### BERT

In [None]:
dict_final, dict_std, dict_scores = model_bert(question_df, states, 20, augmentation=True, pseudolabelling=False)

In [None]:
dict_std

{0: [0.0, 0.22, 0.01],
 1: [0.0, 0.0, 0.0],
 2: [0.0, 0.0, 0.0],
 3: [0.0, 0.0, 0.0],
 4: [0.0, 0.0, 0.0],
 5: [0.02, 0.22, 0.03],
 6: [0.0, 0.0, 0.0],
 7: [0.01, 0.46, 0.01],
 8: [0.01, 0.4, 0.01],
 9: [0.01, 0.43, 0.01],
 10: [0.01, 0.36, 0.01],
 11: [0.0, 0.0, 0.0],
 'accuracy': [0.01],
 ' macro avg': [0.0, 0.0, 0.0],
 'weighted avg': [0.0, 0.01, 0.0]}

In [None]:
metrics = ['Precision', 'Recall', 'F1']

In [None]:
# Persist the BERT results of scenario 1 as Python-literal text.
res = "dict_avg = " + str(dict_final) + "\ndict_std = " + str(dict_std) + "\ndict_scores = " + str(dict_scores)
# The context manager closes the file even if the write fails.
with open('Eksperimen 1 - Input 1 - BERT - Oversampling - Keyword-Based.txt', 'w') as files:
  files.write(res)

## Scenario 2: Concat Background + Ignore

In [None]:
def scenario_2(df_used):
  """Build one sample per question: the question plus its document context.

  For every question row of a document, all rows of that document except
  the *other* questions are joined (in row order, separated by spaces) into
  a single text. Rows of every remaining category are kept as context.

  Args:
    df_used: DataFrame with 'Document ID', 'Category', 'Sentence' (strings)
      and 'Expert' columns and unique index labels.

  Returns:
    DataFrame with columns ['Sentence', 'Expert'], one row per question,
    labelled with that question's 'Expert' value.
  """
  records = []
  for doc_id in df_used['Document ID'].unique():
    # Filter with df_used's own column; the original used the global doc_df,
    # which is only correct when df_used happens to be doc_df.
    doc_rows = df_used[df_used['Document ID'] == doc_id]
    question_idx = doc_rows[doc_rows['Category'] == 'Question'].index

    for question in question_idx:
      other_questions = question_idx[question_idx != question]
      kept_rows = doc_rows.drop(index=other_questions)
      records.append({
          'Sentence': ' '.join(kept_rows['Sentence']),
          'Expert': doc_rows.loc[question, 'Expert']
      })

  # Build the frame once instead of concatenating row by row.
  return pd.DataFrame(records, columns=['Sentence', 'Expert'])

In [None]:
sc2_df = scenario_2(doc_df)

### ML

In [None]:
dict_avg2, dict_std2, dict_scores2  = model_ML(sc2_df, 20, states, oversampling_smote=False, oversampling_pseudolabelling=False, augmentation=False)

In [None]:
for n in n_gram:
  for m in metrics:
    id_m = metrics.index(m)
    t = table_visualization(dict_avg2,dict_std2, n, id_m)

    title = f'Eksperimen 2 - Input 2 - {n_gram[n]} - {m}'
    t.to_csv(f'{title}.csv')

    a = convert_to_latex(t, title, n_gram[n])


In [None]:
# Persist the ML results of scenario 2 as Python-literal text.
res = "dict_avg = " + str(dict_avg2) + "\ndict_std = " + str(dict_std2) + "\ndict_scores = " + str(dict_scores2)
# The context manager closes the file even if the write fails.
with open('Eksperimen 2 - Input 2.txt', 'w') as files:
  files.write(res)

### BERT

In [None]:
dict_final2, dict_std2, dict_scores2 = model_bert(sc2_df, states, 20, augmentation=False, pseudolabelling=False)

In [None]:
dict_std2

{0: [0.1, 0.43, 0.16],
 1: [0.07, 0.3, 0.11],
 2: [0.1, 0.43, 0.16],
 3: [0.08, 0.36, 0.14],
 4: [0.06, 0.3, 0.1],
 5: [0.07, 0.3, 0.11],
 6: [0.05, 0.22, 0.08],
 7: [0.0, 0.0, 0.0],
 8: [0.0, 0.0, 0.0],
 9: [0.0, 0.0, 0.0],
 10: [0.0, 0.0, 0.0],
 11: [0.0, 0.0, 0.0],
 'accuracy': [0.02],
 ' macro avg': [0.0, 0.0, 0.0],
 'weighted avg': [0.01, 0.02, 0.01]}

In [None]:
# Persist the BERT results of scenario 2 as Python-literal text.
# NOTE(review): the file name says "Oversampling - Pseudolabelling", but the
# model_bert call for scenario 2 passes pseudolabelling=False — confirm the
# name matches the experiment that was run.
res = "dict_avg = " + str(dict_final2) + "\ndict_std = " + str(dict_std2) + "\ndict_scores = " + str(dict_scores2)
# The context manager closes the file even if the write fails.
with open('Eksperimen 1 - Input 2 - BERT - Oversampling - Pseudolabelling.txt', 'w') as files:
  files.write(res)

## Scenario 3: Concat Background

In [None]:
def scenario_3(df_used):
  """Build one row per question, concatenated with its document's other sentences.

  For every value of ``Document ID`` in ``df_used``, rows whose ``Category``
  is ``'Ignore'`` are discarded. Then, for each remaining row whose
  ``Category`` is ``'Question'``, every *other* question row of that document
  is left out and the remaining sentences are joined with single spaces in
  their original row order.

  Args:
    df_used: DataFrame with the columns ``Document ID``, ``Category``,
      ``Sentence`` and ``Expert``. Only this frame is read; the function no
      longer depends on a global ``doc_df``.

  Returns:
    DataFrame with columns ``Sentence`` (the concatenated text) and
    ``Expert`` (the ``Expert`` value of the question row), one row per
    question, indexed 0..n-1. Empty (but with both columns) when there are
    no questions.
  """
  # Collect plain records and build the frame once; concatenating inside the
  # loop copies the accumulated frame on every iteration.
  records = []
  for doc_id in df_used['Document ID'].unique():
    # The mask is derived from df_used itself so it always aligns with the
    # frame being filtered.
    in_doc = (df_used['Document ID'] == doc_id) & (df_used['Category'] != 'Ignore')
    doc = df_used[in_doc]
    is_question = (doc['Category'] == 'Question').to_numpy()
    for q_pos in np.flatnonzero(is_question):
      # Keep every non-question row plus the current question.
      keep = ~is_question
      keep[q_pos] = True
      records.append({
        'Sentence': ' '.join(doc.loc[keep, 'Sentence']),
        'Expert': doc['Expert'].iloc[q_pos],
      })
  return pd.DataFrame(records, columns=['Sentence', 'Expert'])

In [None]:
# Build the scenario-3 dataset by applying scenario_3 to doc_df.
sc3_df = scenario_3(doc_df)

### ML

In [None]:
# Run model_ML on the scenario-3 dataset with SMOTE oversampling,
# pseudolabelling oversampling and augmentation all switched off.
# `20` and `states` are positional; presumably the number of runs and their
# random states -- confirm against model_ML's signature.
dict_avg3, dict_std3, dict_scores3 = model_ML(
  sc3_df,
  20,
  states,
  oversampling_smote=False,
  oversampling_pseudolabelling=False,
  augmentation=False,
)

In [None]:
# Export one table per (n-gram key, metric) pair: a CSV file named after the
# title, followed by a convert_to_latex call on the same table.
for n in n_gram:
  # enumerate yields the metric position directly; metrics.index(m) re-scanned
  # the list on every iteration and would return the first match on duplicates.
  for id_m, m in enumerate(metrics):
    t = table_visualization(dict_avg3, dict_std3, n, id_m)

    title = f'Eksperimen 2 - Input 3 - {n_gram[n]} - {m}'
    t.to_csv(f'{title}.csv')

    convert_to_latex(t, title, n_gram[n])


In [None]:
# Persist the scenario-3 ML results as plain text (str() of each dict).
# The text is built before the file is opened so a failure here cannot leave
# behind a truncated, still-open file.
res = "dict_avg = " + str(dict_avg3) + "\ndict_std = " + str(dict_std3) + "\ndict_scores = " + str(dict_scores3)
# The context manager closes the handle even if the write raises.
with open('Eksperimen 2 - Input 3.txt', 'w') as files:
  files.write(res)

### BERT

In [None]:
# Run model_bert on the scenario-3 dataset with augmentation and
# pseudolabelling switched off.
# Note: this rebinds dict_std3 and dict_scores3, replacing the values returned
# by the model_ML call for this scenario.
# NOTE(review): `states` and `20` are passed in the opposite positional order
# to the model_ML call -- confirm against model_bert's signature.
dict_final3, dict_std3, dict_scores3 = model_bert(
  sc3_df,
  states,
  20,
  augmentation=False,
  pseudolabelling=False,
)

In [None]:
# Display dict_std3 as returned by the model_bert run on sc3_df.
dict_std3

{0: [0.08, 0.36, 0.14],
 1: [0.08, 0.36, 0.14],
 2: [0.09, 0.4, 0.14],
 3: [0.09, 0.43, 0.15],
 4: [0.06, 0.3, 0.1],
 5: [0.06, 0.3, 0.1],
 6: [0.05, 0.22, 0.08],
 7: [0.0, 0.0, 0.0],
 8: [0.0, 0.0, 0.0],
 9: [0.0, 0.0, 0.0],
 10: [0.0, 0.0, 0.0],
 11: [0.0, 0.0, 0.0],
 'accuracy': [0.02],
 ' macro avg': [0.0, 0.0, 0.0],
 'weighted avg': [0.01, 0.02, 0.02]}

In [None]:
# Persist the scenario-3 BERT results as plain text (str() of each dict).
# NOTE(review): the file name says 'Eksperimen 1 ... Oversampling -
# Pseudolabelling', but model_bert was called with pseudolabelling=False and
# the ML outputs of this scenario are named 'Eksperimen 2' -- confirm the
# intended name before relying on this file.
res = "dict_avg = " + str(dict_final3) + "\ndict_std = " + str(dict_std3) + "\ndict_scores = " + str(dict_scores3)
# The context manager closes the handle even if the write raises.
with open('Eksperimen 1 - Input 3 - BERT - Oversampling - Pseudolabelling.txt', 'w') as files:
  files.write(res)

## Scenario 4: Concat only background sentences that appear before the question

In [None]:
def scenario_4(df_used):
  """Build one row per question, prefixed by the sentences that precede it.

  For every value of ``Document ID`` in ``df_used``, rows whose ``Category``
  is ``'Ignore'`` are discarded. Then, for each remaining row whose
  ``Category`` is ``'Question'``, the non-question sentences located *before*
  that question in the document are joined with the question itself, using
  single spaces and the original row order. Other questions and everything
  after the current question are left out.

  Args:
    df_used: DataFrame with the columns ``Document ID``, ``Category``,
      ``Sentence`` and ``Expert``. Only this frame is read; the function no
      longer depends on a global ``doc_df``.

  Returns:
    DataFrame with columns ``Sentence`` (the concatenated text) and
    ``Expert`` (the ``Expert`` value of the question row), one row per
    question, indexed 0..n-1. Empty (but with both columns) when there are
    no questions.
  """
  # Collect plain records and build the frame once; concatenating inside the
  # loop copies the accumulated frame on every iteration.
  records = []
  for doc_id in df_used['Document ID'].unique():
    # The mask is derived from df_used itself so it always aligns with the
    # frame being filtered.
    in_doc = (df_used['Document ID'] == doc_id) & (df_used['Category'] != 'Ignore')
    doc = df_used[in_doc]
    is_question = (doc['Category'] == 'Question').to_numpy()
    for q_pos in np.flatnonzero(is_question):
      # Work with the question's position inside the document. The previous
      # `[:i + 1]` sliced positionally with an index *label*, which let
      # sentences after the question through whenever label != position.
      keep = ~is_question
      keep[q_pos] = True
      keep[q_pos + 1:] = False
      records.append({
        'Sentence': ' '.join(doc.loc[keep, 'Sentence']),
        'Expert': doc['Expert'].iloc[q_pos],
      })
  return pd.DataFrame(records, columns=['Sentence', 'Expert'])

In [None]:
# Build the scenario-4 dataset by applying scenario_4 to doc_df, then preview
# its first rows.
sc4_df = scenario_4(doc_df)
sc4_df.head()

Unnamed: 0,Sentence,Expert
0,saya pernah kali melahirkan secara caesar ter...,Information
1,saya pernah kali melahirkan secara caesar ter...,Information
2,saya sudah tahun ini olahraga rutin tiga hari ...,Cause
3,saya sudah tahun ini olahraga rutin tiga hari ...,Management
4,saya sering mengalami pusing sebelah atau selu...,Diagnosis


### ML

In [None]:
# Run model_ML on the scenario-4 dataset with SMOTE oversampling,
# pseudolabelling oversampling and augmentation all switched off.
# `20` and `states` are positional; presumably the number of runs and their
# random states -- confirm against model_ML's signature.
dict_avg4, dict_std4, dict_scores4 = model_ML(
  sc4_df,
  20,
  states,
  oversampling_smote=False,
  oversampling_pseudolabelling=False,
  augmentation=False,
)

In [None]:
# Export one table per (n-gram key, metric) pair: a CSV file named after the
# title, followed by a convert_to_latex call on the same table.
for n in n_gram:
  # enumerate yields the metric position directly; metrics.index(m) re-scanned
  # the list on every iteration and would return the first match on duplicates.
  for id_m, m in enumerate(metrics):
    t = table_visualization(dict_avg4, dict_std4, n, id_m)

    title = f'Eksperimen 2 - Input 4 - {n_gram[n]} - {m}'
    t.to_csv(f'{title}.csv')

    convert_to_latex(t, title, n_gram[n])


In [None]:
# Persist the scenario-4 ML results as plain text (str() of each dict).
# The text is built before the file is opened so a failure here cannot leave
# behind a truncated, still-open file.
res = "dict_avg = " + str(dict_avg4) + "\ndict_std = " + str(dict_std4) + "\ndict_scores = " + str(dict_scores4)
# The context manager closes the handle even if the write raises.
with open('Eksperimen 2 - Input 4.txt', 'w') as files:
  files.write(res)

In [None]:
# Inspect a single entry of the ML scores: key 'XGBoost', then (1,1), then
# index 19 (presumably the unigram setting and the last of the 20 runs --
# confirm against model_ML). This must run before the BERT cell below, which
# rebinds dict_scores4.
dict_scores4['XGBoost'][(1,1)][19]

{0: [0.36, 0.39, 0.37, 51.0],
 1: [0.53, 0.46, 0.49, 35.0],
 2: [0.54, 0.68, 0.6, 68.0],
 3: [0.39, 0.45, 0.42, 67.0],
 4: [0.0, 0.0, 0.0, 5.0],
 5: [0.17, 0.12, 0.14, 8.0],
 6: [0.39, 0.32, 0.35, 22.0],
 7: [0.31, 0.18, 0.23, 22.0],
 8: [0.0, 0.0, 0.0, 4.0],
 9: [0.0, 0.0, 0.0, 3.0],
 10: [0.0, 0.0, 0.0, 2.0],
 11: [0.0, 0.0, 0.0, 2.0],
 'accuracy': [0.43],
 ' macro avg': [0.22, 0.22, 0.22],
 'weighted avg': [0.4, 0.43, 0.41]}

### BERT

In [None]:
# Run model_bert on the scenario-4 dataset with augmentation and
# pseudolabelling switched off.
# Note: this rebinds dict_std4 and dict_scores4, replacing the values returned
# by the model_ML call for this scenario.
# NOTE(review): `states` and `20` are passed in the opposite positional order
# to the model_ML call -- confirm against model_bert's signature.
dict_final4, dict_std4, dict_scores4 = model_bert(
  sc4_df,
  states,
  20,
  augmentation=False,
  pseudolabelling=False,
)

In [None]:
# Display dict_final4, the first value returned by the model_bert run on sc4_df.
dict_final4

{0: [0.02, 0.1, 0.04],
 1: [0.05, 0.2, 0.08],
 2: [0.05, 0.2, 0.08],
 3: [0.06, 0.25, 0.1],
 4: [0.01, 0.05, 0.02],
 5: [0.01, 0.05, 0.02],
 6: [0.02, 0.1, 0.04],
 7: [0.01, 0.05, 0.02],
 8: [0.0, 0.0, 0.0],
 9: [0.0, 0.0, 0.0],
 10: [0.0, 0.0, 0.0],
 11: [0.0, 0.0, 0.0],
 'accuracy': [0.24],
 ' macro avg': [0.02, 0.08, 0.03],
 'weighted avg': [0.06, 0.24, 0.09]}

In [None]:
# Display dict_std4 as returned by the model_bert run on sc4_df.
dict_std4

{0: [0.07, 0.3, 0.11],
 1: [0.09, 0.4, 0.15],
 2: [0.1, 0.4, 0.15],
 3: [0.1, 0.43, 0.16],
 4: [0.05, 0.22, 0.08],
 5: [0.05, 0.22, 0.08],
 6: [0.07, 0.3, 0.11],
 7: [0.05, 0.22, 0.08],
 8: [0.0, 0.0, 0.0],
 9: [0.0, 0.0, 0.0],
 10: [0.0, 0.0, 0.0],
 11: [0.0, 0.0, 0.0],
 'accuracy': [0.0],
 ' macro avg': [0.0, 0.0, 0.0],
 'weighted avg': [0.0, 0.0, 0.0]}

In [None]:
# For each metric, build a table with table_visualization, save it as
# '<title>.csv' and pass it to convert_to_latex.
# NOTE(review): this reads `dict_avg` / `dict_std`, while the BERT cell of
# this scenario binds `dict_final4` / `dict_std4` -- this looks like state left
# over from another experiment; confirm which dicts are intended.
# NOTE(review): convert_to_latex receives four arguments here but three in
# the ML export cells -- confirm against its signature.
for m in metrics:
  n = 'Pseudolabelling'
  id_m = metrics.index(m)
  t = table_visualization(dict_avg, dict_std, n, id_m)

  title = f'Eksperimen 1 - Input 4 - BERT - {m} - {n} Oversampling'
  t.to_csv(f'{title}.csv')

  a = convert_to_latex(t, title, n, m)

In [None]:
# Persist the scenario-4 BERT results as plain text (str() of each dict).
# NOTE(review): the file name says 'Eksperimen 1 ... Oversampling -
# Pseudolabelling', but model_bert was called with pseudolabelling=False and
# the ML outputs of this scenario are named 'Eksperimen 2' -- confirm the
# intended name before relying on this file.
res = "dict_avg = " + str(dict_final4) + "\ndict_std = " + str(dict_std4) + "\ndict_scores = " + str(dict_scores4)
# The context manager closes the handle even if the write raises.
with open('Eksperimen 1 - Input 4 - BERT - Oversampling - Pseudolabelling.txt', 'w') as files:
  files.write(res)