## Data Loading


In [5]:
import pandas as pd

model_data = pd.read_csv("../data/model-data/dataset2.csv")


# Modeling Tasks


## Split


In [6]:
y = model_data.label.tolist()
X = model_data.msg_cased.tolist()
len(X)


15000

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, shuffle=True, stratify=y
)

X = X_test
y = y_test

X_test, X_validation, y_test, y_validation = train_test_split(
    X, y, test_size=0.50, random_state=42, shuffle=True, stratify=y
)

train = {"label": y_train, "msg": X_train}
test = {"label": y_test, "msg": X_test}
validation = {"label": y_validation, "msg": X_validation}


In [4]:
pd.DataFrame.from_dict(train).to_csv("../data/dataset/train2.csv", index=False)
pd.DataFrame.from_dict(test).to_csv("../data/dataset/test2.csv", index=False)
pd.DataFrame.from_dict(validation).to_csv("../data/dataset/validation2.csv", index=False)


## Tokenize


In [6]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

pretrained = "jcblaise/roberta-tagalog-base"
tokenizer = AutoTokenizer.from_pretrained(pretrained)
model = AutoModelForSequenceClassification.from_pretrained(pretrained, num_labels=5)

from datasets import Dataset

train_dataset = Dataset.from_dict(train)
test_dataset = Dataset.from_dict(test)
validation_dataset = Dataset.from_dict(validation)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.out_proj.bias', 'classifier

In [7]:
len(train_dataset["label"])


3500

In [8]:
def tokenize_function(examples):
    return tokenizer(
        examples["msg"],
        padding="max_length",
        truncation="longest_first",
        max_length=128,
        return_tensors="pt",
    ).to("cuda")


In [9]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True).shuffle(
    seed=100
)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True).shuffle(
    seed=100
)
tokenized_validation_dataset = validation_dataset.map(
    tokenize_function, batched=True
).shuffle(seed=100)


                                                                  

## Modeling


In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")


In [11]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

training_args = TrainingArguments(
    output_dir="test_trainer", evaluation_strategy="epoch"
)

metric = evaluate.load("accuracy")


In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics,
)


In [16]:
trainer.train()


                                                  
 33%|███▎      | 438/1314 [02:11<03:45,  3.89it/s]

{'eval_loss': 1.2273324728012085, 'eval_accuracy': 0.46266666666666667, 'eval_runtime': 7.1564, 'eval_samples_per_second': 104.802, 'eval_steps_per_second': 13.135, 'epoch': 1.0}


 38%|███▊      | 500/1314 [02:28<03:50,  3.53it/s]

{'loss': 1.2932, 'learning_rate': 3.097412480974125e-05, 'epoch': 1.14}


                                                  
 67%|██████▋   | 876/1314 [04:24<01:54,  3.82it/s]

{'eval_loss': 1.284609079360962, 'eval_accuracy': 0.4826666666666667, 'eval_runtime': 7.262, 'eval_samples_per_second': 103.277, 'eval_steps_per_second': 12.944, 'epoch': 2.0}


 76%|███████▌  | 1000/1314 [05:01<01:32,  3.40it/s]

{'loss': 0.8843, 'learning_rate': 1.1948249619482495e-05, 'epoch': 2.28}


                                                   
100%|██████████| 1314/1314 [06:43<00:00,  3.26it/s]

{'eval_loss': 1.5203909873962402, 'eval_accuracy': 0.49733333333333335, 'eval_runtime': 7.5352, 'eval_samples_per_second': 99.533, 'eval_steps_per_second': 12.475, 'epoch': 3.0}
{'train_runtime': 403.5547, 'train_samples_per_second': 26.019, 'train_steps_per_second': 3.256, 'train_loss': 0.9651067725055293, 'epoch': 3.0}





TrainOutput(global_step=1314, training_loss=0.9651067725055293, metrics={'train_runtime': 403.5547, 'train_samples_per_second': 26.019, 'train_steps_per_second': 3.256, 'train_loss': 0.9651067725055293, 'epoch': 3.0})

In [17]:
trainer.save_model("roberta-tagalog-base-sent")


# Inference


In [25]:
model = AutoModel.from_pretrained(
    "./roberta-tagalog-base-sent", local_files_only=True
).to("cuda")


Some weights of the model checkpoint at ./roberta-tagalog-base-sent were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ./roberta-tagalog-base-sent and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from transformers import pipeline

classifier = pipeline(
    "sentiment-analysis",
    model="./roberta-tagalog-base-sent",
    tokenizer=tokenizer,
)


In [42]:
pred = classifier(X_test
)

pred 

[{'label': 'LABEL_1', 'score': 0.558783233165741},
 {'label': 'LABEL_1', 'score': 0.628821074962616},
 {'label': 'LABEL_0', 'score': 0.946861982345581},
 {'label': 'LABEL_3', 'score': 0.5846469402313232},
 {'label': 'LABEL_2', 'score': 0.761692225933075},
 {'label': 'LABEL_2', 'score': 0.7333346605300903},
 {'label': 'LABEL_4', 'score': 0.707918643951416},
 {'label': 'LABEL_4', 'score': 0.7651559114456177},
 {'label': 'LABEL_1', 'score': 0.597825825214386},
 {'label': 'LABEL_1', 'score': 0.7104532718658447},
 {'label': 'LABEL_3', 'score': 0.554987907409668},
 {'label': 'LABEL_2', 'score': 0.6620049476623535},
 {'label': 'LABEL_0', 'score': 0.9908666610717773},
 {'label': 'LABEL_0', 'score': 0.939798891544342},
 {'label': 'LABEL_0', 'score': 0.7251824736595154},
 {'label': 'LABEL_2', 'score': 0.8104661107063293},
 {'label': 'LABEL_3', 'score': 0.8303515911102295},
 {'label': 'LABEL_0', 'score': 0.9703061580657959},
 {'label': 'LABEL_4', 'score': 0.9152721166610718},
 {'label': 'LABEL_4'

In [None]:
import torch

s1 = "Ayon sa mga respondents, nahihirapan daw ang pag-rescue sa mga biktima dahil sa flash flood."
s2 = "Kumuha ng tulong ang respondents sa Philippine Red Cross para sa mga lifeboat."
tokens = tokenizer(
    X_test,
    padding="max_length",
    truncation="longest_first",
    max_length=128,
    return_tensors="pt",
).to("cuda")

with torch.no_grad():
    out = model(**tokens)[0]


In [None]:
out.argmax(1).item()


In [None]:
from transformers import pipeline
import torch

with torch.no_grad():
    out = model(**tokenized_validation_dataset[0])[0]


In [None]:
from transformers import (
    XLNetTokenizer,
    XLNetForSequenceClassification,
    AdamW,
    XLNetConfig,
    get_linear_schedule_with_warmup,
)


## Training


In [None]:
from transformers import pipeline


In [None]:
classifier3 = pipeline(
    "sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english"
)


In [None]:
classifier4 = pipeline("text-classification", model="jcblaise/roberta-tagalog-large")


In [None]:
classifier2 = pipeline(
    "sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment"
)


In [None]:
classifier = pipeline(model="jcblaise/roberta-tagalog-base")


In [None]:
classifier4(
    """
           XL ORDER KO HINDI SMALL AT NAVY BLUE ORDER KO HINDI MAROON NOT RECOMENDED SELLER WA KAYO BIBILI DITO Hindi nasunod kulay na order ko Hindi nasunod kulay na order ko Hindi nasunod kulay na order ko Hindi nasunod kulay na order ko Hindi nasunod kulay na order ko Hindi nasunod kulay na order ko
"""
)


In [None]:
data[data.label == 1]


In [None]:
data.msg_cased[37]
