# Library dan Konfigurasi

## Install dan Import Library

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip3 install wandb --upgrade --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.8/252.8 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Dependencies for nlpaug
!pip install numpy requests nlpaug --quiet
# Dependencies for BackTranslationAug, ContextualWordEmbsAug
!pip install torch>=1.6.0 transformers>=4.11.3 sentencepiece --quiet
# Dependencies for WordEmbsAug (word2vec, glove or fasttext)
!pip install gensim>=4.1.2 --quiet
!pip install sacremoses --quiet
!pip install sentencepiece --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [1]:
# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch
import numpy as np
import os
from tqdm.notebook import tqdm
import random
from sklearn import metrics, model_selection, preprocessing
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import ast

# Models

import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
'''''''''
import torch

# Create a sample tensor with floating-point values
input_data = torch.tensor([0.3, 0.7, 0.2, 0.8, 0.1])

# Applying the .round() function
output_data = input_data.round()

# Display the output tensor
print(output_data)

tensor([0., 1., 0., 1., 0.])


In [None]:
# Import the nlpaug module and its methods
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.dev
device

device(type='cuda')

In [None]:
!nvidia-smi

In [None]:
n_labels = 5
bert_model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

## Config (Sweep)

In [None]:
sweep_config = {
    'method': 'grid', #grid, random, bayesian
    'name': '',
    'metric': {
      'name': 'auc_score',
      'goal': 'maximize'
    },
    'parameters': {

        'learning_rate': {
            'values': [1e-5, 1e-4, 1e-3]
        },
        'batch_size': {
            'values': [2, 4, 8]
        },
        'epochs':{
            'values': [5, 10, 15]
        },
        'dropout':{
            'values': [0.3, 0.4, 0.5]
        },
        'tokenizer_max_len': {
            'values': [512, 256, 128]
        },
    }
}


sweep_defaults = {
    'method': 'grid', #grid, random, bayesian
    'name': 'bert_no_aug',
    'metric': {
      'name': 'auc_score',
      'goal': 'maximize'
    },
    'parameters': {

        'learning_rate': {
            'values': [1e-5]
        },
        'batch_size': {
            'values': [4]
        },
        'epochs':{
            'values': [5]
        },
        'dropout':{
            'values': [0.3]
        },
        'tokenizer_max_len': {
            'values': [512]
        },
    }
}

In [None]:
#sweep_id = wandb.sweep(sweep_config, project='hfacs')
sweep_id = wandb.sweep(sweep_defaults, project='hfacs')

Create sweep with ID: id11xdex
Sweep URL: https://wandb.ai/rhraihanhaikal/hfacs/sweeps/id11xdex


# Preprocessing

## Preprocess 1 (Data Preparation)

In [None]:
# Membaca file Excel
data = pd.read_excel('/content/drive/MyDrive/Skripsi/Dataset/Subclass/HFACS Label Full Manual_Subclass_Pisah.xlsx', sheet_name='Sheet1')

In [None]:
data.to_csv('/content/drive/MyDrive/Skripsi/Dataset/Subclass/dataset_knkt_subclass_pisah.csv', index=False)

In [None]:
# Membaca file csv
data = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Subclass/dataset_knkt_subclass_pisah.csv", engine="python")
data.head()

Unnamed: 0,Teks,ER (LVL1),VIO (LVL1),EF (LVL2),CO (LVL2),PF (LVL2)
0,Berdasarkan hasil investigasi dan analisis dap...,,1.0,,,
1,Faktor penyebab fatalitas awak truk trailer ta...,,,1.0,,
2,Berdasarkan hasil investigasi dan analisis dap...,,,1.0,,
3,Fatalitas korban terjadi karena pengemudi meng...,,,,1.0,
4,Pembantu pengemudi mengalami tidur sesaat / mi...,,,,,1.0


In [None]:
#data = data.drop(columns=['Alasan'])
data = data.replace('-', 0)
data = data.replace('?', 0)
data = data.replace('--', 0)
data = data.fillna(0)

In [None]:
data['ER (LVL1)'] = data['ER (LVL1)'].astype(float)
data['VIO (LVL1)'] = data['VIO (LVL1)'].astype(float)
data['EF (LVL2)'] = data['EF (LVL2)'].astype(float)
data['CO (LVL2)'] = data['CO (LVL2)'].astype(float)
data['PF (LVL2)'] = data['PF (LVL2)'].astype(float)

In [None]:
data['TARGET_LIST'] = data[['ER (LVL1)', 'VIO (LVL1)', 'EF (LVL2)',
       'CO (LVL2)', 'PF (LVL2)']].values.tolist()

In [None]:
value_counts = data['TARGET_LIST'].apply(tuple).value_counts()

In [None]:
print(value_counts)

(1.0, 0.0, 0.0, 0.0, 0.0)    64
(0.0, 1.0, 0.0, 0.0, 0.0)    53
(0.0, 0.0, 1.0, 0.0, 0.0)    51
(0.0, 0.0, 0.0, 0.0, 0.0)    35
(0.0, 0.0, 0.0, 0.0, 1.0)    27
(0.0, 0.0, 0.0, 1.0, 0.0)    21
Name: TARGET_LIST, dtype: int64


In [None]:
class_names = {
    (0.0, 0.0, 0.0, 0.0, 0.0): 'Neutral',
    (1.0, 0.0, 0.0, 0.0, 0.0): 'ER (LVL1)',
    (0.0, 1.0, 0.0, 0.0, 0.0): 'VIO (LVL1)',
    (0.0, 0.0, 1.0, 0.0, 0.0): 'EF (LVL2)',
    (0.0, 0.0, 0.0, 1.0, 0.0): 'CO (LVL2)',
    (0.0, 0.0, 0.0, 0.0, 1.0): 'PF (LVL2)'
}

In [None]:
sorted_value_counts = sorted(value_counts.items(), key=lambda x: class_names[x[0]])

for class_tuple, count in sorted_value_counts:
    class_name = class_names[class_tuple]
    print(f"{class_name} {count}")

CO (LVL2) 21
EF (LVL2) 51
ER (LVL1) 64
Neutral 35
PF (LVL2) 27
VIO (LVL1) 53


In [None]:
data = data.drop(columns=['ER (LVL1)', 'VIO (LVL1)', 'EF (LVL2)',
       'CO (LVL2)', 'PF (LVL2)'])

In [None]:
data = data.rename(columns={'Teks': 'TEXT'})
data = data.rename(columns={'target_list': 'TARGET_LIST'})

In [None]:
# train test split
train_dataset, test_dataset = train_test_split(data, test_size=0.2,
stratify=data.TARGET_LIST, random_state=1)

print(f'Train shape: {train_dataset.shape}')
print(f'Test shape: {test_dataset.shape}')
# export to csv
train_dataset.to_csv('/content/drive/MyDrive/Skripsi/Dataset/Subclass/train_dataset_pisah.csv', index=False)
test_dataset.to_csv('/content/drive/MyDrive/Skripsi/Dataset/Subclass/test_dataset_pisah.csv', index=False)

Train shape: (200, 2)
Test shape: (51, 2)


## Preprocessing 3 (Tokenisasi dan Encoding)

In [None]:
# Membaca file Excel
data = pd.read_excel('/content/drive/MyDrive/Skripsi/Dataset/Subclass/modified_train_dataset_pisah_with_target.xlsx', sheet_name='Sheet1')

In [None]:
data.to_csv('/content/drive/MyDrive/Skripsi/Dataset/Subclass/modified_train_dataset_pisah_with_target.csv', index=False)

In [None]:
#train = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Dataset/Aug After Split/train_dataset_aug.csv", engine="python") # with aug
train = pd.read_csv('/content/drive/MyDrive/Skripsi/Dataset/Subclass/modified_train_dataset_pisah_with_target.csv', engine="python") # no aug
#valid = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Dataset/Aug After Split/val_dataset.csv", engine="python")
test = pd.read_csv("/content/drive/MyDrive/Skripsi/Dataset/Subclass/test_dataset_pisah.csv", engine="python")

In [None]:
train.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
train.head()

Unnamed: 0,TEXT,TARGET_LIST
0,"Dengan demikian, kemungkinan rangkaian penyeba...","[1.0, 0.0, 0.0, 0.0, 0.0]"
1,Penyebab utama kecelakaan ini meliputi ketidak...,"[1.0, 0.0, 0.0, 0.0, 0.0]"
2,Kecelakaan ini kemungkinan disebabkan oleh ket...,"[1.0, 0.0, 0.0, 0.0, 0.0]"
3,Rangkaian penyebab kecelakaan mungkin termasuk...,"[1.0, 0.0, 0.0, 0.0, 0.0]"
4,Kemungkinan penyebab kecelakaan ini adalah pen...,"[1.0, 0.0, 0.0, 0.0, 0.0]"


In [None]:
value_counts_train = train['TARGET_LIST'].value_counts()
value_counts_test = test['TARGET_LIST'].value_counts()

In [None]:
print(value_counts_train)
#total 6169 dari 199 kalimat asal, masing2 30 augmentasi menggunakan kemampuan generatif chatgpt

[1.0, 0.0, 0.0, 0.0, 0.0]    1581
[0.0, 1.0, 0.0, 0.0, 0.0]    1302
[0.0, 0.0, 1.0, 0.0, 0.0]    1271
[0.0, 0.0, 0.0, 0.0, 0.0]     837
[0.0, 0.0, 0.0, 0.0, 1.0]     651
[0.0, 0.0, 0.0, 1.0, 0.0]     527
Name: TARGET_LIST, dtype: int64


In [None]:
print(value_counts_test)

[1.0, 0.0, 0.0, 0.0, 0.0]    13
[0.0, 1.0, 0.0, 0.0, 0.0]    11
[0.0, 0.0, 1.0, 0.0, 0.0]    10
[0.0, 0.0, 0.0, 0.0, 0.0]     7
[0.0, 0.0, 0.0, 0.0, 1.0]     6
[0.0, 0.0, 0.0, 1.0, 0.0]     4
Name: TARGET_LIST, dtype: int64


In [None]:
train['TEXT'] = train['TEXT'].astype(str)
train['TEXT'] = train['TEXT'].tolist()

### Class Dataset

In [None]:
class CustomDataset:
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = [torch.tensor(ast.literal_eval(label), dtype=torch.long) for label in labels]

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        inputs = self.tokenizer.__call__(text,
                                        None,
                                        add_special_tokens=True,
                                        max_length=self.max_len,
                                        padding="max_length",
                                        truncation=True,
                                        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "labels": label
        }

In [None]:
class CustomDataset2:
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = [torch.tensor(ast.literal_eval(label), dtype=torch.long) for label in labels]
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        # Simplified tokenizer call
        inputs = self.tokenizer(text, add_special_tokens=True, max_length=self.max_len, padding="max_length", truncation=True)

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "labels": label
        }

In [None]:
train_dataset = CustomDataset(train.TEXT.tolist(), train.TA

                              GET_LIST.values.tolist(), tokenizer, max_len = 512)

In [None]:
test_dataset = CustomDataset(test.TEXT.tolist(), test.TARGET_LIST.values.tolist(), tokenizer, max_len = 512)

In [None]:
#debugging
import pandas as pd

# Assuming df is your pandas DataFrame
# Convert all elements in the text column to strings
train['TEXT'] = train['TEXT'].astype(str)

# Handle missing or null values if necessary
# For example, replacing null values with a placeholder string:
train['TEXT'].fillna('Missing Text', inplace=True)

# Now your texts list can be created from this column
texts = train['TEXT'].tolist()

In [None]:
labels = train['TARGET_LIST'].values.tolist()

In [None]:
max_len = 512

In [None]:
class CustomDataset3:
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts  # This should now be a list of strings
        self.labels = [torch.tensor(ast.literal_eval(label), dtype=torch.long) for label in labels]
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        inputs = self.tokenizer(text, add_special_tokens=True, max_length=self.max_len, padding="max_length", truncation=True)

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "labels": label
        }

In [None]:
# Create an instance of CustomDataset
train_dataset = CustomDataset3(texts, labels, tokenizer, max_len)

# Assuming you have a DataLoader setup
train_data_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Test iterating over the DataLoader
for batch in train_data_loader:
    print(batch)
    break  # Just to test the first batch

{'ids': tensor([[    2,  5320,    92,  ...,     0,     0,     0],
        [    2,   562, 15530,  ...,     0,     0,     0],
        [    2,  2363,  5320,  ...,     0,     0,     0],
        [    2, 12156,   292,  ...,     0,     0,     0]]), 'mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0]])}


### Fungsi Pembuat Dataset

In [None]:
def build_dataset(tokenizer_max_len):

    train_dataset = CustomDataset3(train.TEXT.tolist(), train.TARGET_LIST.values.tolist(), tokenizer, tokenizer_max_len)
    test_dataset = CustomDataset3(test.TEXT.tolist(), test.TARGET_LIST.values.tolist(), tokenizer, tokenizer_max_len)

    return train_dataset, test_dataset

def build_dataloader(train_dataset, test_dataset, batch_size):
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_data_loader, test_data_loader

In [None]:
#debugging
train_dataset, test_dataset = build_dataset(tokenizer_max_len = 512)
train_data_loader, test_data_loader = build_dataloader(train_dataset, test_dataset, batch_size = 4)

In [None]:
for batch in train_data_loader:
    print(batch)
    break  # Just to test the first batch

# Klasifikasi BERT

## Class BERT Model

In [None]:
class BERT(nn.Module):
    def __init__(self, n_train_steps, n_classes, do_prob, bert_model):
        super(BERT, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(do_prob)
        self.out = nn.Linear(768, n_classes)
        self.n_train_steps = n_train_steps
        self.step_scheduler_after = "batch"

    def forward(self, ids, mask):
        output_1 = self.bert(ids, attention_mask=mask)["pooler_output"]
        output_2 = self.dropout(output_1)
        output = self.out(output_2)
        return output

## Fungsi Pemanggil Model

In [None]:
def ret_model(n_train_steps, do_prob):
  model = BERT(n_train_steps, n_labels, do_prob, bert_model=bert_model)
  return model

## Fungsi Pemanggil Optimizer, Scheduler, dan Loss

In [None]:
def ret_optimizer(model):

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    opt = AdamW(optimizer_parameters, lr=wandb.config.learning_rate)
    return opt

def ret_scheduler(optimizer, num_train_steps):
    sch = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    return sch

def loss_fn(outputs, labels):
    if labels is None:
        return None
    return nn.BCEWithLogitsLoss()(outputs, labels.float())

## Fungsi Metrik Evaluasi

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

def log_metrics(preds, labels):
    preds = torch.stack(preds)
    preds = preds.cpu().detach().numpy()
    labels = torch.stack(labels)
    labels = labels.cpu().detach().numpy()


    fpr_micro, tpr_micro, _ = metrics.roc_curve(labels.ravel(), preds.ravel())

    auc_micro = metrics.auc(fpr_micro, tpr_micro)

    return {"auc_micro": auc_micro}

In [None]:
def log_metrics2(preds, labels):
    preds = torch.stack(preds)
    preds = preds.cpu().detach().numpy()
    labels = torch.stack(labels)
    labels = labels.cpu().detach().numpy()


    # FPR dan TPR mungkin tidak dapat dihitung untuk multi-label dengan cara ini
    # fpr_micro, tpr_micro, _ = metrics.roc_curve(labels.ravel(), preds.ravel())
    # auc_micro = metrics.auc(fpr_micro, tpr_micro)

    jacc_micro = metrics.jaccard_score(labels, preds, average="micro")
    recall_micro = metrics.recall_score(labels, preds, average='micro')
    precision_micro = metrics.precision_score(labels, preds, average='micro')

    return jacc_micro, recall_micro, precision_micro

In [None]:
preds = np.array([[0, 0, 1, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]])
x = preds.ravel()

In [None]:
print(x)

[0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1]


## Fungsi Training

In [None]:
def train_fn(data_loader, model, optimizer, device, scheduler):

    train_loss = 0.0
    model.train()
    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        targets = d["labels"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask)

        loss = loss_fn(outputs, targets)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        scheduler.step()
    return train_loss

## Fungsi Evaluasi

In [None]:
def eval_fn(data_loader, model, device):

    eval_loss = 0.0
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            mask = d["mask"]
            targets = d["labels"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask)
            loss = loss_fn(outputs, targets)
            eval_loss += loss.item()
            fin_targets.extend(targets)
            fin_outputs.extend(torch.sigmoid(outputs))
    return eval_loss, fin_outputs, fin_targets

In [None]:
def eval_fn2(data_loader, model, device, threshold=0.5):
    eval_loss = 0.0
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            mask = d["mask"]
            targets = d["labels"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask)
            loss = loss_fn(outputs, targets)
            eval_loss += loss.item()

            # Menggunakan softmax untuk mendapatkan probabilitas
            probs = torch.softmax(outputs, dim=1)
            fin_targets.extend(targets.cpu())

            # Mengubah probabilitas menjadi prediksi biner berdasarkan threshold
            preds = (probs > threshold).float()
            fin_outputs.extend(preds.cpu())

    return eval_loss, fin_outputs, fin_targets

## Fungsi Utama

In [None]:
def main(config=None):
    with wandb.init(config=config):
        config = wandb.config

        train_dataset, valid_dataset = build_dataset(config.tokenizer_max_len)
        train_data_loader, valid_data_loader = build_dataloader(train_dataset, valid_dataset, config.batch_size)
        print("Length of Train Dataloader: ", len(train_data_loader))
        print("Length of Valid Dataloader: ", len(valid_data_loader))

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        n_train_steps = int(len(train_dataset) / config.batch_size * 10)

        model = ret_model(n_train_steps, config.dropout)
        optimizer = ret_optimizer(model)
        scheduler = ret_scheduler(optimizer, n_train_steps)
        model.to(device)
        model = nn.DataParallel(model)
        wandb.watch(model)

        n_epochs = config.epochs

        best_val_loss = 100
        for epoch in tqdm(range(n_epochs)):
            train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
            eval_loss, preds, labels = eval_fn2(valid_data_loader, model, device)

            jacc_score, recall_score, precision_score = log_metrics2(preds, labels)

            print("JACC score: ", jacc_score)
            print("RECALL score: ", recall_score)
            print("PRECISION score: ", precision_score)

            avg_train_loss, avg_val_loss = train_loss / len(train_data_loader), eval_loss / len(valid_data_loader)
            wandb.log({
                "epoch": epoch + 1,
                "train_loss": avg_train_loss,
                "val_loss": avg_val_loss,
                "jacc_score": jacc_score,
                "recall_score": recall_score,
                "precision_score": precision_score,
            })
            print("Average Train loss: ", avg_train_loss)
            print("Average Valid loss: ", avg_val_loss)

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(model.state_dict(), "/content/drive/MyDrive/Skripsi/Model BERT/best_model.pt")
                print("Model saved as current val_loss is: ", best_val_loss)

In [None]:
wandb.agent(sweep_id, function=main, count=1) #random

[34m[1mwandb[0m: Agent Starting Run: egj1z0gw with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	tokenizer_max_len: 512


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112881177773993, max=1.0…

Length of Train Dataloader:  1543
Length of Valid Dataloader:  13




  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1543 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

JACC score:  0.16666666666666666
RECALL score:  0.29545454545454547
PRECISION score:  0.2765957446808511
Average Train loss:  0.20689909264797796
Average Valid loss:  0.68976963024873
Model saved as current val_loss is:  0.68976963024873


  0%|          | 0/1543 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

JACC score:  0.1875
RECALL score:  0.3409090909090909
PRECISION score:  0.29411764705882354
Average Train loss:  0.06693425354785006
Average Valid loss:  0.7895665512635157


  0%|          | 0/1543 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

JACC score:  0.1625
RECALL score:  0.29545454545454547
PRECISION score:  0.2653061224489796
Average Train loss:  0.044805140990211025
Average Valid loss:  0.8738672137260437


  0%|          | 0/1543 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

JACC score:  0.19230769230769232
RECALL score:  0.3409090909090909
PRECISION score:  0.30612244897959184
Average Train loss:  0.03298901507814432
Average Valid loss:  0.8491349930946643


  0%|          | 0/1543 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

JACC score:  0.15584415584415584
RECALL score:  0.2727272727272727
PRECISION score:  0.26666666666666666
Average Train loss:  0.024495453967900534
Average Valid loss:  0.888947858260228


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▃▅▆█
jacc_score,▃▇▂█▁
precision_score,▃▆▁█▁
recall_score,▃█▃█▁
train_loss,█▃▂▁▁
val_loss,▁▅▇▇█

0,1
epoch,5.0
jacc_score,0.15584
precision_score,0.26667
recall_score,0.27273
train_loss,0.0245
val_loss,0.88895


In [None]:
wandb.agent(sweep_id, function=trainer2) #grid

[34m[1mwandb[0m: Agent Starting Run: xhpd8ll5 with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	tokenizer_max_len: 512
[34m[1mwandb[0m: Currently logged in as: [33mrhraihanhaikal[0m. Use [1m`wandb login --relogin`[0m to force relogin


Length of Train Dataloader:  1543
Length of Valid Dataloader:  13




  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1543 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

JACC score:  0.19230769230769232
RECALL score:  0.3409090909090909
PRECISION score:  0.30612244897959184
Average Train loss:  0.2104519608280273
Average Valid loss:  0.6979921414301946
Model saved as current val_loss is:  0.6979921414301946


  0%|          | 0/1543 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

JACC score:  0.19230769230769232
RECALL score:  0.3409090909090909
PRECISION score:  0.30612244897959184
Average Train loss:  0.06981004422442409
Average Valid loss:  0.7880208182793397


  0%|          | 0/1543 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

JACC score:  0.19480519480519481
RECALL score:  0.3409090909090909
PRECISION score:  0.3125
Average Train loss:  0.04289564378776878
Average Valid loss:  0.8234052062034607


  0%|          | 0/1543 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

JACC score:  0.19736842105263158
RECALL score:  0.3409090909090909
PRECISION score:  0.3191489361702128
Average Train loss:  0.03302961003113169
Average Valid loss:  0.7918248210962002


  0%|          | 0/1543 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

JACC score:  0.20833333333333334
RECALL score:  0.3409090909090909
PRECISION score:  0.3488372093023256
Average Train loss:  0.025072153330685805
Average Valid loss:  0.7450489206955984


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▃▅▆█
jacc_score,▁▁▂▃█
precision_score,▁▁▂▃█
recall_score,▁▁▁▁▁
train_loss,█▃▂▁▁
val_loss,▁▆█▆▄

0,1
epoch,5.0
jacc_score,0.20833
precision_score,0.34884
recall_score,0.34091
train_loss,0.02507
val_loss,0.74505


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


# Reversed Engineer/Kalkulasi Manual

In [None]:
teks_manual = "Berdasarkan hasil investigasi dan analisis dapat disimpulkan bahwa penyebab truk tronton meluncur tak terkendali di bentang menurun dan terjadi tabrakan beruntun adalah gagalnya perlambatan dikarenakan kondisi jalan turunan panjang dan curam yang cenderung memaksa pengguna jalan untuk melakukan pengereman menggunakan rem utama berkali-kali, tanpa diawali teknik pengereman mesin (engine braking)"
target_list = [0, 0, 0, 0, 0]

data = {'TEXT': [teks_manual], 'TARGET_LIST': [target_list]}
df_manual = pd.DataFrame(data)
df_manual_dataset = CustomDataset(df_manual, tokenizer, MAX_LEN)

In [None]:
sample = df_manual_dataset[0]

print('input_ids:', sample['input_ids'])
print('attention_mask:', sample['attention_mask'])
print('token_type_ids:', sample['token_type_ids'])

In [None]:
input_ids = sample['input_ids']
tokens = tokenizer.convert_ids_to_tokens(input_ids)

print(tokens)

['[CLS]', 'berdasarkan', 'hasil', 'investigasi', 'dan', 'analisis', 'dapat', 'disimpulkan', 'bahwa', 'penyebab', 'truk', 'tro', '##nt', '##on', 'meluncur', 'tak', 'terkendali', 'di', 'bentang', 'menurun', 'dan', 'terjadi', 'tabrakan', 'beruntun', 'adalah', 'gagal', '##nya', 'perl', '##ambatan', 'dikarenakan', 'kondisi', 'jalan', 'turunan', 'panjang', 'dan', 'curam', 'yang', 'cenderung', 'memaksa', 'pengguna', 'jalan', 'untuk', 'melakukan', 'pengereman', 'menggunakan', 'rem', 'utama', 'berkali', '-', 'kali', ',', 'tanpa', 'diawali', 'teknik', 'pengereman', 'mesin', '(', 'engine', 'bra', '##king', ')', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD

In [None]:
ManualDataLoader = torch.utils.data.DataLoader(
    df_manual_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [None]:
def evaluatemanual(model, test_loader):

    logittensor, _ = evaluate(model, test_loader)

    return prediksi


In [None]:
evaluatemanual(model, ManualDataLoader)

[[1.0, 0.0, 0.0, 0.0, 0.0]]