In [7]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [3]:
label_mapping = {-1: 0, 0: 1, 1: 2}

data = pd.read_csv('./balanced_twitter_dataset_indo.csv')
data = data[['Tweet', 'sentimen']]
data.columns = ['Teks', 'Label'] 

# Konversi label ke angka
data['Label'] = data['Label'].map(label_mapping)


In [18]:
#Split data into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(data['Teks'], data['Label'], test_size=0.2)

In [4]:
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
test_dataset = Dataset.from_dict({'text': test_texts, 'label': test_labels})

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p2')



In [5]:
# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)



Map: 100%|██████████| 6220/6220 [00:01<00:00, 4600.25 examples/s]
Map: 100%|██████████| 1556/1556 [00:00<00:00, 4865.93 examples/s]


In [15]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Load pre-trained model for classification
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p2', num_labels=3)

#Define training arguments
training_args = TrainingArguments(
    output_dir='./hasilTrain',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()


                                                  
 33%|███▎      | 389/1167 [08:02<12:51,  1.01it/s]

{'eval_loss': 0.7551379203796387, 'eval_runtime': 40.7594, 'eval_samples_per_second': 38.175, 'eval_steps_per_second': 2.404, 'epoch': 1.0}


 43%|████▎     | 500/1167 [11:58<22:32,  2.03s/it]  

{'loss': 0.7285, 'grad_norm': 6.585900783538818, 'learning_rate': 1.1431019708654672e-05, 'epoch': 1.29}


                                                  
 67%|██████▋   | 778/1167 [20:30<08:49,  1.36s/it]

{'eval_loss': 0.7491735816001892, 'eval_runtime': 30.9123, 'eval_samples_per_second': 50.336, 'eval_steps_per_second': 3.17, 'epoch': 2.0}


 86%|████████▌ | 1000/1167 [26:14<03:54,  1.41s/it] 

{'loss': 0.4049, 'grad_norm': 20.17035675048828, 'learning_rate': 2.8620394173093403e-06, 'epoch': 2.57}


                                                   
100%|██████████| 1167/1167 [31:02<00:00,  1.60s/it]

{'eval_loss': 0.8920438885688782, 'eval_runtime': 28.9791, 'eval_samples_per_second': 53.694, 'eval_steps_per_second': 3.382, 'epoch': 3.0}
{'train_runtime': 1862.0117, 'train_samples_per_second': 10.021, 'train_steps_per_second': 0.627, 'train_loss': 0.5283720609767885, 'epoch': 3.0}





TrainOutput(global_step=1167, training_loss=0.5283720609767885, metrics={'train_runtime': 1862.0117, 'train_samples_per_second': 10.021, 'train_steps_per_second': 0.627, 'total_flos': 1227424093701120.0, 'train_loss': 0.5283720609767885, 'epoch': 3.0})

In [17]:
# Save the trained model
model.save_pretrained("./trained_model1")
tokenizer.save_pretrained("./trained_model1")

('./trained_model1\\tokenizer_config.json',
 './trained_model1\\special_tokens_map.json',
 './trained_model1\\vocab.txt',
 './trained_model1\\added_tokens.json')