In [1]:
# Installing Transformers and Datasets library
!pip install transformers==4.27.0 --q
!pip install datasets --q

[0m

In [2]:
# Import Library
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
from datasets import load_metric

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
# Install gdown, the command-line tool the next cell uses to download the dataset
! pip install gdown

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
Successfully installed gdown-4.7.1
[0m

In [4]:
# Download the dataset by its Google Drive file ID
# (the recorded output shows it being saved as cleaned_dataset.csv)
!gdown 1jGWDqO2NOhKy_WaUF9p-UsmCiRVbOwYW

Downloading...
From: https://drive.google.com/uc?id=1jGWDqO2NOhKy_WaUF9p-UsmCiRVbOwYW
To: /kaggle/working/cleaned_dataset.csv
100%|█████████████████████████████████████████| 818k/818k [00:00<00:00, 115MB/s]


In [5]:
# Read the dataset and keep the review text with the "System" aspect as label.
# Empty strings are turned into NaN first so that dropna() removes those rows too.
df = (
    pd.read_csv("cleaned_dataset.csv")
    .rename(columns={"text_clean": "text"})
    .replace("", float("NaN"))
    .dropna()
    .assign(label=lambda frame: frame["System"])
    [["text", "label"]]
)
df.sample(5)

Unnamed: 0,text,label
514,aplikasinya membantu memanage uang fitur autod...,1
1235,baguu baru sehari invest beli portofolio hari ...,1
1633,pencairannya banget udah hari keluar notifikas...,2
866,cukup membantu fiturnya mudah dipahami pemula ...,1
821,investasi sebaiknya dipikir teman trauma kena ...,2


### **Tokenization**

In [6]:
from transformers import AutoTokenizer

# Tokenizer for the same IndoBERT checkpoint that the classifier is loaded from
PRETRAINED_CHECKPOINT = "indobenchmark/indobert-base-p2"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
# function to tokenize dataset
def tokenize_function(text):
    """Tokenize a batch of examples to a fixed length of 256 tokens.

    Args:
        text: Mapping with a "text" entry holding the raw review string(s);
            this is the batch handed over by `Dataset.map(..., batched=True)`.

    Returns:
        The output of the module-level `tokenizer` for those strings, with
        every sequence padded and truncated to `max_length=256`.
    """
    # Padding alone only lengthens short inputs. Without truncation a review
    # longer than 256 tokens keeps its full length, so examples would have
    # unequal sizes and could not be stacked into one batch.
    return tokenizer(
        text["text"],
        padding='max_length',
        truncation=True,
        max_length=256,
    )

### **Train Model**

In [49]:
# Free cached CUDA memory, then choose the first GPU when one is available
import torch

torch.cuda.empty_cache()

if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [65]:
# Hyperparameters handed to the Trainer when fine-tuning the model
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="test_trainer",
    # optimisation
    num_train_epochs=4,
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # step-based logging and evaluation
    logging_steps=58,
    evaluation_strategy="steps",
)

In [66]:
# Accuracy metric reported by the Trainer at every evaluation
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair of (logits, labels) to be scored.

    Returns:
        The result of `metric.compute` for the arg-max class of each row of
        logits against the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=gold_labels,
    )


torch.cuda.empty_cache()

# K-fold Cross Validation

In [67]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support as score
from datasets import load_dataset
from sklearn.metrics import confusion_matrix

i = 0  # fold counter; the loop below sets it to the 1-based fold number
n = 10
kf = KFold(n_splits=n, random_state=30, shuffle=True)

results = []  # accuracy on the eval split, one entry per fold
RANDOM_SEED = 32
f1_scores = []  # macro-averaged F1 on the test split, one entry per fold

# enumerate(..., start=1) numbers the folds 1..n. Previously a counter was
# incremented before printing `i+1`, so the report ran from "Fold 2" to "Fold 11".
for i, (train_index, test_index) in enumerate(kf.split(df), start=1):

  # splitting Dataframe (dataset not included)
  df_cv_train = df.iloc[train_index]
  df_cv_holdout = df.iloc[test_index]

  # Split the held-out fold once more: the first part (75%) is the eval split
  # used while training, the remaining 25% is the test split scored below.
  # test_size=0.25 is scikit-learn's default, written out here for clarity.
  df_cv_eval, df_cv_test = train_test_split(
    df_cv_holdout,
    test_size=0.25,
    random_state=RANDOM_SEED,
  )

  # Round-trip through CSV so the three splits can be loaded as one DatasetDict
  df_cv_train.to_csv("cv_train.csv", index=False)
  df_cv_test.to_csv("cv_test.csv", index=False)
  df_cv_eval.to_csv("cv_eval.csv", index=False)

  # Gold labels of the test split, in the same row order as cv_test.csv
  actual_label = df_cv_test['label']

  files = {
    "train": "cv_train.csv",
    "test": "cv_test.csv",
    "eval": "cv_eval.csv",
  }

  dataset_cv = load_dataset('csv', data_files=files)

  tokenized_datasets_cv = dataset_cv.map(tokenize_function, batched=True)

  torch.cuda.empty_cache()

  cv_train_dataset = tokenized_datasets_cv["train"]
  cv_test_dataset = tokenized_datasets_cv["test"]
  cv_eval_dataset = tokenized_datasets_cv["eval"]

  # Load a fresh model for every fold so no weights carry over between folds
  cv_model = AutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p2", num_labels=3)

  cv_trainer = Trainer(
    model=cv_model,
    args=training_args,
    train_dataset=cv_train_dataset,
    eval_dataset=cv_eval_dataset,
    compute_metrics=compute_metrics,
  )

  # train
  training_history = cv_trainer.train()

  # evaluate on the eval split and record its accuracy
  evaluation_history_cv = cv_trainer.evaluate()
  results.append(evaluation_history_cv['eval_accuracy'])

  # predict on the test split; arg-max over the class axis gives the label ids
  prediction = cv_trainer.predict(cv_test_dataset)
  prediction = prediction.predictions.argmax(1)

  precision, recall, fscore, support = score(actual_label, prediction, average='macro')
  f1_scores.append(fscore)

  conf_matrix = confusion_matrix(actual_label, prediction)
  print(f"Fold {i} Confusion Matrix:\n{conf_matrix}\n")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-f3fbd4b528472e6f/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-f3fbd4b528472e6f/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
58,0.6495,0.648833,0.746667
116,0.3962,0.660497,0.737778






Fold 2 Confusion Matrix:
[[16 11  5]
 [ 6 15  1]
 [ 4  0 18]]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-41fd52d9c9267688/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-41fd52d9c9267688/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
58,0.6542,0.625636,0.76
116,0.3916,0.712528,0.755556






Fold 3 Confusion Matrix:
[[13  2  6]
 [ 1 28  3]
 [ 4  1 18]]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-d8fb3edc1c4759ce/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-d8fb3edc1c4759ce/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
58,0.6218,0.593908,0.786667
116,0.3932,0.588511,0.777778






Fold 4 Confusion Matrix:
[[10  7  3]
 [ 3 22  2]
 [ 4  2 23]]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-a7de5cfac9d32d63/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-a7de5cfac9d32d63/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
58,0.6403,0.608165,0.764444
116,0.3795,0.638996,0.777778






Fold 5 Confusion Matrix:
[[11  8  1]
 [ 6 23  1]
 [ 2  2 22]]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-95c59e9ea43ff802/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-95c59e9ea43ff802/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
58,0.6631,0.64555,0.742222
116,0.4255,0.714888,0.715556






Fold 6 Confusion Matrix:
[[ 9  6  4]
 [ 8 24  1]
 [ 0  0 23]]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1e967bfc1c10dcf7/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1e967bfc1c10dcf7/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
58,0.6533,0.69005,0.715556
116,0.3754,0.775852,0.693333






Fold 7 Confusion Matrix:
[[10  6  2]
 [ 9 28  0]
 [ 4  1 15]]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-0c422fa7a440bc77/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-0c422fa7a440bc77/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
58,0.652,0.646909,0.72
116,0.3823,0.717097,0.737778






Fold 8 Confusion Matrix:
[[12 10  1]
 [ 5 27  1]
 [ 0  2 17]]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-de761198f5fb32d4/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-de761198f5fb32d4/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
58,0.6448,0.581562,0.782222
116,0.378,0.680182,0.786667






Fold 9 Confusion Matrix:
[[ 9  5  5]
 [ 2 31  1]
 [ 4  2 16]]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-83f9f9f759d49ffd/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-83f9f9f759d49ffd/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
58,0.6522,0.58567,0.751111
116,0.408,0.617001,0.728889






Fold 10 Confusion Matrix:
[[13  8  2]
 [ 3 28  2]
 [ 3  1 15]]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-3f76daddff7c8ea2/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-3f76daddff7c8ea2/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
58,0.6497,0.665353,0.746667
116,0.4315,0.546834,0.786667






Fold 11 Confusion Matrix:
[[11  2  3]
 [ 3 35  1]
 [ 3  1 16]]



In [68]:
# Macro-F1 of every fold, followed by the average over all folds
mean_f1 = sum(f1_scores) / len(f1_scores)
print(f1_scores)
print(f"Mean-F1: {mean_f1}")

[0.653110944527736, 0.7585185185185184, 0.7020595913517873, 0.7247537647537646, 0.7212885154061625, 0.6921311555457897, 0.7482456140350878, 0.7059318676965735, 0.7361737677527153, 0.791919191919192]
Mean-F1: 0.7234132931507327


In [69]:
# Eval-split accuracy of every fold, followed by the average over all folds
mean_validation = sum(results) / len(results)
print("results", results)
print(f"Mean-Validation: {mean_validation}")

results [0.7511111111111111, 0.7288888888888889, 0.7866666666666666, 0.7911111111111111, 0.7066666666666667, 0.7333333333333333, 0.7333333333333333, 0.7777777777777778, 0.7555555555555555, 0.7911111111111111]
Mean-Validation: 0.7555555555555555


# Unlabeled Data & Save Model

In [14]:
# !gdown 1Zv7RekcojCASu_tKDcqjHJYZ5MY5e1dt
# unlabeled_data = pd.read_csv('cleaned_unlabel_dataset.csv')

In [15]:
# unlabeled_data.sample(5)

In [16]:
# cv_model.save_pretrained("model")

In [17]:
# cv_model = AutoModelForSequenceClassification.from_pretrained("model")

In [18]:
# trainer = Trainer(model=cv_model)

In [19]:
# unlabeled_data['sentiments'] = 0

In [20]:
# def predict(text):
#     tokenized = tokenizer(text, padding='max_length', max_length=256)
#     label = trainer.predict([tokenized]).predictions.argmax(1)[0]
#     if label == 0:
#         print(f'Predicted: Netral [{label}]')
#     elif label == 1:
#         print(f'Predicted: Positif [{label}]')
#     else:
#         print(f'Predicted: Negatif [{label}]')

In [21]:
# for i, row in unlabeled_data.iterrows():
#     text = row['text_clean']
#     sentiment = predict(text)
#     unlabeled_data.at[i, 'sentiments'] = sentiment

In [22]:
# !gdown 1ytl7w06dJkbHqnzGEMzi6tuvCf7OQVk3

In [23]:
# # Tokenize data using the same tokenizer as the one used in the model
# unlabeled_data_tokenized = unlabeled_data["Ulasan"].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=256, truncation=True, padding='max_length')))


In [24]:
# from torch.utils.data import Dataset

# class CustomDataset(Dataset):
#     def __init__(self, data):
#         self.data = data

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         return {"input_ids": torch.tensor(self.data[idx], dtype=torch.long)}
        
# unlabeled_dataset = CustomDataset(unlabeled_data_tokenized)


In [25]:
# from torch.utils.data import DataLoader

# cv_model.eval()
# predictions = []

# with torch.no_grad():
#     for batch in DataLoader(unlabeled_dataset, batch_size=16, shuffle=False):
#         inputs = batch["input_ids"].to(device)
#         outputs = cv_model(inputs)[0]
#         predictions.append(outputs.argmax(-1).tolist())

# # Concatenate all predictions into a single list
# predictions = [item for sublist in predictions for item in sublist]


In [26]:
# from collections import Counter

# # Count the number of occurrences of each label
# label_counts = Counter(predictions)

# print(label_counts)