## Data Loading


In [None]:
import pandas as pd
# Read the modelling dataset (CSV, path relative to the notebook) into a DataFrame.
model_data = pd.read_csv(filepath_or_buffer="../data/model-data/dataset.csv")

# Modeling Tasks


## Split and Tokenization


In [16]:
# Pull the message texts and their labels out of the frame as plain Python lists.
X = model_data["msg_cased"].tolist()
y = model_data["label"].tolist()

# Display the number of samples.
len(X)


5000

In [17]:
from sklearn.model_selection import train_test_split

# First split: 70% train / 30% hold-out, stratified on the labels.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.30, random_state=42, shuffle=True, stratify=y
)

# Second split: halve the hold-out into test and validation, again stratified.
# The hold-out has its own names instead of overwriting X / y, so re-running
# this cell always starts from the full dataset and reproduces the same splits.
X_test, X_validation, y_test, y_validation = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.50,
    random_state=42,
    shuffle=True,
    stratify=y_holdout,
)

# Sanity check: the three splits must partition the full dataset.
assert len(X_train) + len(X_test) + len(X_validation) == len(X)

# Column-oriented dicts, ready for pandas / datasets.
train = {"label": y_train, "msg": X_train}
test = {"label": y_test, "msg": X_test}
validation = {"label": y_validation, "msg": X_validation}


In [23]:
# Persist each split as a CSV file, without writing the DataFrame index.
split_outputs = (
    (train, "../data/dataset/train.csv"),
    (test, "../data/dataset/test.csv"),
    (validation, "../data/dataset/validation.csv"),
)
for split_columns, csv_path in split_outputs:
    pd.DataFrame.from_dict(split_columns).to_csv(csv_path, index=False)

In [28]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

# One checkpoint name drives both the classifier and its tokenizer.
pretrained = "jcblaise/roberta-tagalog-base"

# Sequence-classification model configured with 5 output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained,
    num_labels=5,
)
tokenizer = AutoTokenizer.from_pretrained(pretrained)

from datasets import Dataset

# Wrap each column-oriented split dict in a Hugging Face Dataset.
train_dataset, test_dataset, validation_dataset = (
    Dataset.from_dict(split_columns)
    for split_columns in (train, test, validation)
)


Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'class

In [29]:
# Display the number of training examples.
len(train_dataset)


3500

In [30]:
def tokenize_function(examples):
    """Tokenize the ``"msg"`` entry of ``examples`` with the notebook's tokenizer.

    Every sequence is padded to ``max_length`` (128) and truncated with the
    ``"longest_first"`` strategy; the tokenizer is asked for ``"pt"`` tensors.

    Args:
        examples: Mapping with a ``"msg"`` key holding the text(s) to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for these options.
    """
    texts = examples["msg"]
    encoding_options = {
        "padding": "max_length",
        "truncation": "longest_first",
        "max_length": 128,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encoding_options)


In [None]:
# Tokenize every split in batches, then shuffle each one with the same fixed seed.
(
    tokenized_train_dataset,
    tokenized_test_dataset,
    tokenized_validation_dataset,
) = (
    split_dataset.map(tokenize_function, batched=True).shuffle(seed=100)
    for split_dataset in (train_dataset, test_dataset, validation_dataset)
)


In [32]:
from transformers import TrainingArguments

# `training_args` is rebound by the following cell before the Trainer is built.
# Both definitions are kept identical (including evaluation_strategy="epoch")
# so the configuration does not depend on which of the two cells ran last.
training_args = TrainingArguments(
    output_dir="test_trainer", evaluation_strategy="epoch"
)


In [33]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() during evaluation.
metric = evaluate.load("accuracy")

# Trainer configuration: outputs go to "test_trainer", evaluation strategy is "epoch".
training_args = TrainingArguments(
    evaluation_strategy="epoch",
    output_dir="test_trainer",
)



In [34]:

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the module-level ``metric``.

    Args:
        eval_pred: A two-item iterable ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        ``logits`` (taken over the last axis) against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)


# Wire the model, its training configuration, the tokenized train/validation
# splits and the metric callback into a single Trainer.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train_dataset,
    "eval_dataset": tokenized_validation_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)


In [35]:

# Run fine-tuning and keep the returned summary so it can be inspected later.
train_output = trainer.train()
train_output




  0%|          | 0/1314 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

{'eval_loss': 1.228860855102539, 'eval_accuracy': 0.45866666666666667, 'eval_runtime': 231.2406, 'eval_samples_per_second': 3.243, 'eval_steps_per_second': 0.407, 'epoch': 1.0}
{'loss': 1.3101, 'learning_rate': 3.097412480974125e-05, 'epoch': 1.14}


  0%|          | 0/94 [00:00<?, ?it/s]

{'eval_loss': 1.341127634048462, 'eval_accuracy': 0.4613333333333333, 'eval_runtime': 225.5329, 'eval_samples_per_second': 3.325, 'eval_steps_per_second': 0.417, 'epoch': 2.0}
{'loss': 0.917, 'learning_rate': 1.1948249619482495e-05, 'epoch': 2.28}


  0%|          | 0/94 [00:00<?, ?it/s]

{'eval_loss': 1.5943032503128052, 'eval_accuracy': 0.468, 'eval_runtime': 236.4019, 'eval_samples_per_second': 3.173, 'eval_steps_per_second': 0.398, 'epoch': 3.0}
{'train_runtime': 9989.8706, 'train_samples_per_second': 1.051, 'train_steps_per_second': 0.132, 'train_loss': 0.9924126310072715, 'epoch': 3.0}


TrainOutput(global_step=1314, training_loss=0.9924126310072715, metrics={'train_runtime': 9989.8706, 'train_samples_per_second': 1.051, 'train_steps_per_second': 0.132, 'train_loss': 0.9924126310072715, 'epoch': 3.0})

In [None]:
from transformers import (
    XLNetTokenizer,
    XLNetForSequenceClassification,
    AdamW,
    XLNetConfig,
    get_linear_schedule_with_warmup,
)


## Pretrained Pipeline Experiments


In [None]:
from transformers import pipeline


In [None]:
# Sentiment-analysis pipeline backed by the
# distilbert-base-uncased-finetuned-sst-2-english checkpoint.
classifier3 = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)


In [None]:
# Text-classification pipeline on top of the jcblaise/roberta-tagalog-large checkpoint.
# NOTE(review): the checkpoint name does not indicate a fine-tuned classification
# head -- confirm that its predictions are meaningful before relying on them.
classifier4 = pipeline(
    task="text-classification",
    model="jcblaise/roberta-tagalog-large",
)


In [None]:
# Sentiment-analysis pipeline backed by the
# nlptown/bert-base-multilingual-uncased-sentiment checkpoint.
classifier2 = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)


In [None]:
# No task is passed here, so pipeline() has to determine the task from the model.
# NOTE(review): confirm which task is actually selected for this checkpoint.
classifier = pipeline(model="jcblaise/roberta-tagalog-base")


In [None]:
# Run the classifier4 pipeline (jcblaise/roberta-tagalog-large) on one hard-coded
# message; the cell displays whatever the pipeline returns for it.
classifier4(
    """
           XL ORDER KO HINDI SMALL AT NAVY BLUE ORDER KO HINDI MAROON NOT RECOMENDED SELLER WA KAYO BIBILI DITO Hindi nasunod kulay na order ko Hindi nasunod kulay na order ko Hindi nasunod kulay na order ko Hindi nasunod kulay na order ko Hindi nasunod kulay na order ko Hindi nasunod kulay na order ko
"""
)


In [None]:
# Show the rows whose label equals 1. Uses `model_data` (the frame read from
# dataset.csv): no variable named `data` is defined in this notebook, so the
# lookup would raise NameError on a fresh kernel.
model_data[model_data.label == 1]


In [None]:
# Inspect the cased message stored under index label 37. Uses `model_data` (the
# frame read from dataset.csv): no variable named `data` is defined in this
# notebook, so the lookup would raise NameError on a fresh kernel.
model_data.msg_cased[37]
