# New flow

In [21]:
# Notebook bootstrap: move into the repository checkout and put it on sys.path.
# NOTE(review): `fire` is not used by any cell visible in this notebook.
import fire
import os
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
os.chdir('/home/jupyter/gitlab/advertorial-classifier')
import sys
# The new working directory goes first on sys.path; presumably this is what
# makes the `advertorial` package imported below resolvable — confirm.
sys.path.insert(0, os.getcwd())

# Earlier, relative-path variant of the same bootstrap (disabled, incomplete):
#os.chdir('../../advertorial-classifier/')
#import sys
#sys.path.insert(0, )

# %%
from advertorial import dataset
from advertorial import utils
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import wandb
import numpy as np
import evaluate

from datasets import Dataset, DatasetDict

# Run configuration — the values a reader is expected to tune.
epochs: int = 10         # training epochs; passed to TrainingArguments and recorded in the wandb config
envfile: str = '.env'    # file handed to utils.check_env
use_wandb: bool = True   # False makes wandb.init run in "disabled" mode

In [22]:
# Train the advertorial classifier and log the run to Weights & Biases.
#
# Inputs come from the configuration cell: `envfile` (environment-variable file,
# '.env' by default), `use_wandb` and `epochs`.
# Data comes from dataset.train_valid_test_from_file(); the recorded run of this
# cell shows it querying the BigQuery train and test tables.

# Check or set the environment variables.
# NOTE(review): the recorded output of this cell shows every variable being
# echoed, WANDB_KEY included — clear the cell output before sharing the notebook.
utils.check_env(envfile)
today = utils.set_today()
log_dir = utils.get_based_path('log/')
prebuilt_dir = utils.get_based_path('prebuilt_model/')

print(f'use_wandb:{use_wandb}')
if use_wandb:
    # Authenticate only when tracking is requested, so a run with
    # use_wandb=False needs neither credentials nor access to the wandb host.
    wandb.login(key=os.environ['WANDB_KEY'],
                host=os.environ['WANDB_BASE_URL'])
    wandb_project = os.environ['WANDB_PROJECT']
else:
    # In "disabled" mode wandb calls are no-ops; the project name is optional.
    wandb_project = os.environ.get('WANDB_PROJECT')
wandb.init(
    mode="online" if use_wandb else "disabled",
    project=wandb_project,
    config={'epochs': epochs}
)

advertorial_dataset = dataset.train_valid_test_from_file()

# Binary task: class 0 = "no", class 1 = "yes".
id2label = {0: "no", 1: "yes"}
label2id = {label: idx for idx, label in id2label.items()}

pretrain_model = "hfl/chinese-bert-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(pretrain_model)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrain_model, num_labels=len(id2label), id2label=id2label, label2id=label2id)

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores with the class axis last; ``labels`` holds the
            reference class ids.

    Returns:
        dict: ``{"accuracy": float}`` — the fraction of rows whose arg-max
        class equals the reference label.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=-1)
    # Plain numpy rather than evaluate.load("accuracy"): loading the metric
    # inside this function would be repeated on every evaluation pass for what
    # is a one-line mean, and the returned dict has the same shape.
    accuracy = np.mean(predicted_ids == np.asarray(labels))
    return {"accuracy": float(accuracy)}


def preprocess_function(examples):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: Batch handed over by ``map(..., batched=True)``; only its
            ``"text"`` entry is read.

    Returns:
        The tokenizer output for ``examples["text"]``, truncated to at most
        512 tokens and left unpadded.
    """
    # No padding at this stage: padding inside a batched map stretches every
    # text to the longest one in its map batch, whereas DataCollatorWithPadding
    # pads each training batch only to that batch's own longest sequence.
    return tokenizer(examples["text"], truncation=True, max_length=512)


# Tokenize every split once; the collator pads each batch when it is assembled.
tokenized_advertorial = advertorial_dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hyper-parameters gathered in one mapping so they can be read at a glance.
# Evaluation and checkpointing both happen once per epoch, and
# save_total_limit=1 caps how many checkpoints are kept under log_dir.
training_options = {
    "output_dir": log_dir,
    "learning_rate": 3e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": epochs,
    "weight_decay": 0.01,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 1,
}
training_args = TrainingArguments(**training_options)

# NOTE(review): the per-epoch evaluation runs on the "test" split.
trainer_options = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_advertorial["train"],
    "eval_dataset": tokenized_advertorial["test"],
    "tokenizer": tokenizer,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_options)

trainer.train()



env var: MODEL_PROJECT=advertorial-post-classifier
env var: GCP_PROJECT=milelens-dev
env var: GCP_REGION=asia-east1
env var: GCP_BQ_DATASET=ML
env var: GCP_BQ_TRAIN_TABLE=advertorial_post_classifier_train
env var: GCP_BQ_TEST_TABLE=advertorial_post_classifier_test
env var: GCP_BQ_META_TABLE=advertorial_post_classifier_meta
env var: GCS_BUCKET=milelens_ml
env var: GCS_MODEL_URI_BASE=gs://milelens_ml/advertorial_post_classification/
env var: GCV_AI_PIPELINE=gs://milelens_ml/advertorial_post_classification/pipeline
env var: WANDB_BASE_URL=http://34.134.95.35
env var: WANDB_KEY=<redacted>
env var: WANDB_PROJECT=advertorial-post-classifier
use_wandb:True




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


env var: MODEL_PROJECT=advertorial-post-classifier
env var: GCP_PROJECT=milelens-dev
env var: GCP_REGION=asia-east1
env var: GCP_BQ_DATASET=ML
env var: GCP_BQ_TRAIN_TABLE=advertorial_post_classifier_train
env var: GCP_BQ_TEST_TABLE=advertorial_post_classifier_test
env var: GCP_BQ_META_TABLE=advertorial_post_classifier_meta
env var: GCS_BUCKET=milelens_ml
env var: GCS_MODEL_URI_BASE=gs://milelens_ml/advertorial_post_classification/
env var: GCV_AI_PIPELINE=gs://milelens_ml/advertorial_post_classification/pipeline
env var: WANDB_BASE_URL=http://34.134.95.35
env var: WANDB_KEY=<redacted>
env var: WANDB_PROJECT=advertorial-post-classifier
SELECT post_text AS text, cate AS label FROM `milelens-dev.ML.advertorial_post_classifier_train`
SELECT post_text AS text, cate AS label FROM `milelens-dev.ML.advertorial_post_classifier_test`


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-bert-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 49198/49198 [00:19<00:00, 2532.78 examples/s]
Map: 100%|██████████| 12048/12048 [00:04<00:00, 2518.31 examples/s]
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


# Old flow

In [None]:
import os
# Work from the repository checkout: the relative './data/...' and
# 'prebuilt_model/log' paths used in later cells resolve against this directory.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
os.chdir('/home/jupyter/gitlab/advertorial-classifier/')

In [None]:
from advertorial import dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

#import wandb
import numpy as np
import evaluate
from datetime import date


In [None]:
# Split the labelled CSV into train/validation(/test), keep a dated CSV copy of
# each split that is actually produced, then load the tokenizer and model.
# An earlier run loaded './data/milelens_advertorial_dataset_formatted.csv'
# without passing explicit split ratios.
train_ratio, validation_ratio, test_ratio = 0.8, 0.2, 0
advertorial_dataset = dataset.train_valid_test_from_file(
    csv_file_path='./data/milelens_advertorial_dataset_formatted_23634.csv',
    train_ratio=train_ratio,
    validation_ratio=validation_ratio,
    test_ratio=test_ratio,
)
today = date.today()

train = advertorial_dataset['train']
train.to_csv(f'./data/train_set_{today}.csv')

# A split is only exported when its ratio is non-zero.
# The validation split is read under the key "validation", the same key the
# Trainer cell uses for eval_dataset.
# NOTE(review): confirm the key names against dataset.train_valid_test_from_file.
if validation_ratio:
    valid = advertorial_dataset['validation']
    valid.to_csv(f'./data/valid_set_{today}.csv')

# The test export must read the "test" split, not the validation split.
if test_ratio:
    test = advertorial_dataset['test']
    test.to_csv(f'./data/test_set_{today}.csv')


# Binary task: class 0 = "no", class 1 = "yes".
id2label = {0: "no", 1: "yes"}
label2id = {"no": 0, "yes": 1}

pretrain_model = "hfl/chinese-bert-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(pretrain_model)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrain_model, num_labels=2, id2label=id2label, label2id=label2id)

In [None]:
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions against the labels.

    Args:
        eval_pred: Pair ``(predictions, labels)`` as supplied by the Trainer;
            ``predictions`` carries per-class scores on its last axis and
            ``labels`` the reference class ids.

    Returns:
        dict: ``{"accuracy": float}`` with the share of correctly classified rows.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=-1)
    # Computed with numpy so the metric is not re-loaded through
    # evaluate.load("accuracy") each time the Trainer evaluates.
    accuracy = np.mean(predicted_ids == np.asarray(labels))
    return {"accuracy": float(accuracy)}


def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of a batch.

    Args:
        examples: Batch handed over by ``map(..., batched=True)``.

    Returns:
        The tokenizer output, truncated to at most 512 tokens; no padding is
        requested here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

# The collator pads each batch at assembly time, so the tokenized splits
# produced by the map below can stay unpadded.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenize every split in batches.
tokenized_advertorial = advertorial_dataset.map(preprocess_function, batched=True)

In [None]:
# Hyper-parameters for the run: evaluate and checkpoint once per epoch.
# Options that were tried and are currently disabled: logging_steps=10000,
# save_steps=10000, evaluation_strategy="steps", fp16=True,
# load_best_model_at_end=True, push_to_hub=True.
training_options = dict(
    output_dir="prebuilt_model/log",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
training_args = TrainingArguments(**training_options)

# Evaluation uses the "validation" split. A disabled variant of this Trainer
# evaluated on the "train" split instead.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_advertorial["train"],
    eval_dataset=tokenized_advertorial["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

trainer.train()