# Clean data for Vertex AI training

In [1]:
import os
os.chdir('/home/bonzo_yang/gitlab/advertorial-classifier/')

In [2]:
from advertorial import dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import wandb
import numpy as np
import evaluate

In [3]:
advertorial_dataset = dataset.train_valid_test_from_file(csv_file_path= './data/milelens_advertorial_dataset_formatted.csv')
train, validation, test = advertorial_dataset['train'], advertorial_dataset['validation'], advertorial_dataset['test'] 
id2label = {0: "no", 1: "yes"}
label2id = {"no": 0, "yes": 1}

pretrain_model ="hfl/chinese-bert-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(pretrain_model)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrain_model, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at hfl/chinese-bert-wwm-ext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkp

In [4]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = evaluate.load("accuracy")
    return accuracy.compute(predictions=predictions, references=labels)

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)
tokenizer = AutoTokenizer.from_pretrained(pretrain_model)


tokenized_advertorial = advertorial_dataset.map(preprocess_function, batched=True)

# Data collator that will dynamically pad the inputs received
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

use_wandb = True
report_args = []
if use_wandb:
    wandb.init(project="milelens-ml-advertorial")

    report_args.append("wandb")

Map:   0%|          | 0/3680 [00:00<?, ? examples/s]

Map:   0%|          | 0/460 [00:00<?, ? examples/s]

Map:   0%|          | 0/460 [00:00<?, ? examples/s]

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mbonzo-yang-cloudmile[0m ([33mcm-ml-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
training_args = TrainingArguments(
    #logging_steps=10000,
    #save_steps=10000,
    output_dir="prebuilt_model/log",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    #evaluation_strategy="steps"
    evaluation_strategy="epoch",
    save_strategy="epoch",
    #fp16=True,
    #load_best_model_at_end=True,
    #push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_advertorial["train"],
    eval_dataset=tokenized_advertorial["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3680
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2300
  Number of trainable parameters = 102269186
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.402016,0.845652
2,No log,0.425779,0.808696
3,0.358600,0.52027,0.852174
4,0.358600,0.525738,0.847826


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 460
  Batch size = 16
***** Running Evaluation *****
  Num examples = 460
  Batch size = 16
Saving model checkpoint to prebuilt_model/log/checkpoint-230
Configuration saved in prebuilt_model/log/checkpoint-230/config.json
Saving model checkpoint to prebuilt_model/log/checkpoint-230
Configuration saved in prebuilt_model/log/checkpoint-230/config.json
Model weights saved in prebuilt_model/log/checkpoint-230/pytorch_model.bin
tokenizer config file saved in prebuilt_model/log/checkpoint-230/tokenizer_config.json
Special tokens file saved in prebuilt_model/log/checkpoint-230/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSe

TrainOutput(global_step=2300, training_loss=0.1599029072471287, metrics={'train_runtime': 1217.0343, 'train_samples_per_second': 30.237, 'train_steps_per_second': 1.89, 'total_flos': 9588613746027840.0, 'train_loss': 0.1599029072471287, 'epoch': 10.0})

In [3]:
from advertorial.inference import AdvertorialModel
adv = AdvertorialModel(use_gpu=True)

/home/bonzo_yang/gitlab/advertorial-classifier


In [11]:
import torch.nn.functional as F

In [None]:
prediction, np.array([0, 1])

In [21]:
a = torch.randn(4, 4)
a
torch.argmax(a)

tensor(2)

In [42]:
import pandas as pd
df = pd.read_csv('./tests/test.csv')
texts = df['text'].tolist()

In [54]:
(F.softmax(adv(texts, return_logit=True).logits, dim=1) > 0.5).int().argmax(dim=1)

tensor([1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0], device='cuda:0')

In [56]:
df.prediction.tolist()

[0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1]

In [4]:
text = ['三年沒來日本 第一站先衝迪士尼🇯🇵', '拉麵王子推薦新宿拉麵看了嗎？吃個日本泡麵解拉麵癮']
# 0, 1
prediction, probs = adv(text, return_logit=False)

In [9]:
import numpy as np
(prediction.tolist() == np.array([0, 1])).tolist()#.all()

[True, True]

In [27]:
outputs_ =  (F.softmax(outputs.logits, dim=1) > 0.6).int()

In [37]:
outputs_.argmax(dim=1).cpu().numpy()

array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1])

In [32]:
a, b = adv(text)
a

array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1])

In [8]:
import torch
text = ['三年沒來日本 第一站先衝迪士尼🇯🇵', '拉麵王子推薦新宿拉麵看了嗎？吃個日本泡麵解拉麵癮']*17
outputs = adv(text, return_logit=True)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397],
        [ 4.3773, -4.4424],
        [-3.7513,  3.7397]], devi

In [10]:
np.argmax(outputs)

0

In [90]:
import pandas as pd
def perf_report(model, dataset, name='train'):
    from tqdm import tqdm
    N = len(dataset)
    step = 20
    ones = 0
    zeros = 0
    hits = 0
    miss = 0
    predictions = []
    for s in tqdm(range(0, N, step)):
        s, e = s, s+step
        prediction, probs = model(dataset[s:e]['text'])
        
        hits += np.sum(dataset[s:e]['label'] == prediction)
        miss += np.sum(dataset[s:e]['label'] != prediction)
        zeros += np.sum(dataset[s:e]['label'] == np.array(0))
        ones += np.sum(dataset[s:e]['label'] == np.array(1))
        predictions.append(prediction) 

    accuracy = hits/N
    print(f'accuracy:{accuracy:.2f}, positive samples:{ones}, negative samples:{zeros}')  
    performance_df = pd.DataFrame({'dataset':[name], 
                                   'records':[N], 
                                   'positive samples':[ones], 
                                   'negative samples':[zeros], 
                                   'hit':[hits],
                                   'miss':[miss],
                                   'accuracy':[accuracy], 'miss rate':[1-accuracy]})

    predictions = np.concatenate(predictions)
    error_ids = predictions != dataset['label']
    error_df = pd.DataFrame({'text':np.array(dataset['text'])[error_ids], 'label':np.array(dataset['label'])[error_ids], 'prediction':predictions[error_ids]})
    return error_df, performance_df

In [91]:
train_error, train_perf = perf_report(adv, train, 'train')
validation_error, validation_perf = perf_report(adv, validation, 'validation')
test_error, test_perf = perf_report(adv, test, 'test')

100%|██████████| 184/184 [00:36<00:00,  5.00it/s]


accuracy:0.97, positive samples:1733, negative samples:1947


100%|██████████| 23/23 [00:04<00:00,  4.92it/s]


accuracy:0.98, positive samples:214, negative samples:246


100%|██████████| 23/23 [00:04<00:00,  4.99it/s]


accuracy:0.96, positive samples:203, negative samples:257


In [116]:
pd.concat([train_perf, validation_perf, test_perf]).reset_index(drop=True)#.to_csv('performance.csv', index=False)

Unnamed: 0,dataset,records,positive samples,negative samples,hit,miss,accuracy,miss rate
0,train,3680,1733,1947,3570,110,0.970109,0.029891
0,validation,460,214,246,451,9,0.980435,0.019565
0,test,460,203,257,442,18,0.96087,0.03913


In [109]:
train_error.to_csv('train_error.csv', index=False)

In [110]:
test_error.to_csv('test_error.csv', index=False)

In [113]:
np.sum(train_error.label==1), np.sum(train_error.label==0)

(65, 45)

In [114]:
np.sum(validation_error.label==1), np.sum(validation_error.label==0)

(4, 5)

In [115]:
np.sum(test_error.label==1), np.sum(test_error.label==0)

(9, 9)

In [38]:
N = len(validation)
step = 10
ones = 0
zeros = 0
hits = 0
for s in tqdm(range(0, N, step)):
    s, e = s, s+step
    prediction, probs = adv(validation[s:e]['text'])
    
    hits += np.sum(validation[s:e]['label'] == prediction)
    zeros = np.sum(validation[s:e]['label'] == 0)
    ones = np.sum(validation[s:e]['label']==1)


10

In [11]:
train['text'][0:5]

['瑋瑋：我們要去香港啦！大約下午一點抵達，這次偷偷租了一個地方跟三年沒見的大家見面，沒有告訴哲哲🤪🤪🤪 歡迎有空的香港觀眾來玩！  地點：荃灣德士古道120號安泰國際中心20樓2003室（THE HOUSE攝影棚） 入場時間：2023/01/18 晚上六點進場 活動開始：18：30-19：30 費用：免費！你們人來就好💞💞💞 來看看我們超臨時的聚會，會有多少香港觀眾來呢？',
 '#drgracieofficial  謝謝你們今天來看直播。 祝大家有美好的每一天！  過好當下、讓自己快樂，不影響別人、維持健康； 這樣就好。  晚安😴  G. Hsu  一直以為文章發出去了 結果沒有😭',
 '今天要住的飯店是… 半年前才訂得到的Disney hotel 🏰  相隔三年來東京覺得超級陌生 完全沒坐功課…. 電車要怎麼搭都完全忘了 也忘記日本有什麼好吃的食物\U0001f979 大家有推薦的話留言告訴我一下 很需要被推薦….',
 '新年快樂！2023一起努力🥰',
 'Red Bull飛行日的影片上線囉～ 真的很好玩  團隊的大家辛苦了～  好可惜我覺得我們的飛行器還有很多可以加強的地方 本來是想說用把飛行員丟出去這個概念來呈現飛行這個主題 但效果好像不太理想ˊ<__ˋ 我們再接再厲 再來挑戰更有趣的東西  下次見ˊVˋ']

In [12]:
train['label'][0:5]

[0, 0, 0, 0, 0]

In [136]:
from datasets import load_dataset, DatasetDict
from typing import Any

import pandas as pd
from datasets import Dataset, DatasetDict

def train_valid_test_from_file(csv_file_path= './data/milelens_advertorial_dataset_formatted.csv', train_ratio=0.8, validation_ratio=0.1):
    test_size = 1-train_ratio
    validation_size = validation_ratio/test_size

    # Load the CSV file using pandas
    data = pd.read_csv(csv_file_path)

    # Convert the pandas DataFrame into a Hugging Face Dataset
    dataset = Dataset.from_pandas(data)

    if 0 < test_size < 1 :
        # Split the dataset into train, validation, and test splits
        dataset = dataset.train_test_split(test_size=test_size)  # Adjust the test_size as needed

        # Assign the resulting splits to their respective variables
        train_dataset = dataset['train']
        test_dataset = dataset['test']

        if 0 < validation_size < 1:
            # Further split the train_dataset into train and validation splits
            test_dataset = test_dataset.train_test_split(test_size=1-validation_size)  # Adjust the test_size as needed

            # Assign the resulting splits to their respective variables
            validation_dataset = test_dataset['train']
            test_dataset = test_dataset['test']

            # Create a DatasetDict to store the splits
            dataset_dict = DatasetDict({'train': train_dataset, 'validation': validation_dataset, 'test': test_dataset})
        elif validation_size >= 1:
            print(f'Wrong validation ratio:{validation_size} should be < 1')
            dataset_dict = DatasetDict({'train': train_dataset, 'validation': test_dataset})
        else:
            print(f'Wrong validation ratio:{validation_size} should be > 0 ')
            dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

    else:
        print(f'Wrong train ratio:{test_size} should be >0 1')
        dataset_dict = DatasetDict({'train': dataset})
        
    return dataset_dict

In [10]:
dataset.train_valid_test_from_file(train_ratio=0.8, validation_ratio=0.1)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3680
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 460
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 460
    })
})

In [8]:
dataset.train_valid_test_from_file(train_ratio=0.8, validation_ratio=0)

Wrong validation ratio:0.00 should be > 0


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3680
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 920
    })
})

In [7]:
dataset.train_valid_test_from_file(train_ratio=1)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4600
    })
})

In [12]:
dataset.train_valid_test_from_file(train_ratio=0, validation_ratio=1)

Wrong train ratio:0.00 should be > 0


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4600
    })
})

In [9]:
dataset.train_valid_test_from_file(train_ratio=0.8, validation_ratio=0.8)

Wrong validation ratio:4.00 should be < 1


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3680
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 920
    })
})

In [11]:
dataset.train_valid_test_from_file(train_ratio=0.8, validation_ratio=0.05)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3680
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 230
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 690
    })
})

In [None]:
label_dict = {"no": 0, "yes": 1}


#def train_milelens_model(pretrain_model: str = "hfl/chinese-bert-wwm-ext", use_wandb: bool = True):
pretrain_model = "hfl/chinese-bert-wwm-ext"
use_wandb = True
if True:
    """

    Args:
        use_wandb:
            Determine to use wandb to track training metrics
    Returns:

    """
    # milelens_ds = LoadDatasets(
    #     'ChnSentiCorp', 'fb_reply_230103', 'imdb', 'weibo_senti_100k',
    #     'online_shopping_10_cats', 'waimai_10k', 'JD_pos', 'JD_neg', 'ntu_train_data_230428'
    # )
    #print(milelens_ds.data_infos)
    #ds = milelens_ds.get_dataset_dict()
    ds = dataset.train_valid_test_from_file(csv_file_path= './data/milelens_advertorial_dataset_formatted.csv')

    id2label = dict(zip(label_dict.values(), label_dict.keys()))
    label2id = label_dict

    tokenizer = AutoTokenizer.from_pretrained(pretrain_model)
    modeler = AutoModelForSequenceClassification.from_pretrained(
        pretrain_model, num_labels=2, id2label=id2label, label2id=label2id)

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True, max_length=512)


    tokenized_ds = ds.map(preprocess_function, batched=True)

    # Data collator that will dynamically pad the inputs received
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Use wandb to track training metrics
    report_args = []
    if use_wandb:
        wandb.init(project="milelens-ml-advertorial")

        report_args.append("wandb")

    training_args = TrainingArguments(
        learning_rate=3e-5,
        per_device_train_batch_size=10,
        per_device_eval_batch_size=50,
        num_train_epochs=10,
        logging_steps=10000,
        save_steps=10000,
        weight_decay=0.01,
        warmup_ratio=0.05,
        evaluation_strategy="steps",
        output_dir="prebuilt_model/log",
    )

    trainer = Trainer(
        model=modeler,
        args=training_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

In [None]:
tokenized_ds['train']

In [None]:
tokenized_ds['train']

In [None]:
tokenized_ds

In [20]:
import unicodedata

def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False

In [21]:
def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


In [23]:
def _clean_text(text):
    """Performs invalid character removal and whitespace cleanup on text."""
    output = []
    for char in text:
        cp = ord(char)
        if cp == 0 or cp == 0xfffd or _is_control(char):
            continue
        if _is_whitespace(char):
            output.append(" ")
        else:
            output.append(char)
    return "".join(output)

In [25]:
_clean_text(tokenized_advertorial["train"][2292]['text'])

'😮💨😮💨😮💨'

In [12]:
from datetime import date

today = date.today()
print(today)

2023-05-31


In [7]:
import os
import sys
os.chdir('/home/jupyter/gitlab/advertorial-classifier/')
print(os.getcwd())
#sys.path.insert(0, os.getcwd())
#os.chdir('../../advertorial-classifier/')
#import sys
#sys.path.insert(0, )

# %%
from advertorial import dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
#import wandb
import numpy as np
import evaluate

# %%
# advertorial_dataset = dataset.train_valid_test_from_file(csv_file_path= './data/milelens_advertorial_dataset_formatted.csv')
# train, validation, test = advertorial_dataset['train'], advertorial_dataset['validation'], advertorial_dataset['test'] 
# id2label = {0: "no", 1: "yes"}
# label2id = {"no": 0, "yes": 1}

# pretrain_model ="hfl/chinese-bert-wwm-ext"
# tokenizer = AutoTokenizer.from_pretrained(pretrain_model)
# model = AutoModelForSequenceClassification.from_pretrained(
#     pretrain_model, num_labels=2, id2label=id2label, label2id=label2id)


advertorial_dataset = dataset.train_valid_test_from_file(csv_file_path= './data/milelens_advertorial_dataset_formatted_23634.csv', train_ratio=0.8, validation_ratio=0.2)
train = advertorial_dataset['train']
valid = advertorial_dataset['validation']
id2label = {0: "no", 1: "yes"}
label2id = {"no": 0, "yes": 1}

/home/jupyter/gitlab/advertorial-classifier
Wrong validation ratio:1.00 should be < 1
