In [None]:
import os
# Work from the repository root so the relative paths used later in the notebook
# (./data/..., ./prebuilt_model/...) resolve.
# Set ADVERTORIAL_PROJECT_ROOT to point at another checkout instead of editing this
# cell; the default is the original hard-coded location.
PROJECT_ROOT = os.environ.get(
    'ADVERTORIAL_PROJECT_ROOT',
    '/home/bonzo_yang/gitlab/advertorial-classifier/',
)
os.chdir(PROJECT_ROOT)

In [None]:
from advertorial import dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import wandb
import numpy as np
import evaluate

In [None]:
# advertorial_dataset = dataset.train_valid_test_from_file(csv_file_path= './data/milelens_advertorial_dataset_formatted.csv')
# train, validation, test = advertorial_dataset['train'], advertorial_dataset['validation'], advertorial_dataset['test'] 
# id2label = {0: "no", 1: "yes"}
# label2id = {"no": 0, "yes": 1}

# pretrain_model ="hfl/chinese-bert-wwm-ext"
# tokenizer = AutoTokenizer.from_pretrained(pretrain_model)
# model = AutoModelForSequenceClassification.from_pretrained(
#     pretrain_model, num_labels=2, id2label=id2label, label2id=label2id)


# Load the formatted CSV with train_ratio=1; only the 'train' entry is read here.
advertorial_dataset = dataset.train_valid_test_from_file(
    csv_file_path='./data/milelens_advertorial_dataset_formatted.csv',
    train_ratio=1,
)
train = advertorial_dataset['train']

# Two-class label vocabulary; label2id is the inverse of id2label.
id2label = {0: "no", 1: "yes"}
label2id = {label: idx for idx, label in id2label.items()}

# Tokenizer and sequence-classification model, both loaded from the same checkpoint name.
pretrain_model = "hfl/chinese-bert-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(pretrain_model)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrain_model,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
from advertorial.inference import AdvertorialModel
# Build the inference wrapper from the prebuilt model directory, with use_gpu=True.
adv = AdvertorialModel(
    model_path='./prebuilt_model/230525_chinese_bert_wwm_ext',
    use_gpu=True,
)

In [None]:
import pandas as pd
def perf_report(model, dataset, name='train', batch_size=20):
    """Score ``model`` on ``dataset`` and collect the rows it misclassifies.

    Parameters
    ----------
    model : callable
        Called as ``model(texts)`` with one batch of texts; must return a
        ``(predictions, probs)`` pair. Only ``predictions`` is used.
    dataset
        Object supporting ``len(dataset)``, slice indexing that yields a
        mapping with a ``'text'`` entry, and column access through
        ``dataset['text']`` / ``dataset['label']``. Labels are counted as
        positive when equal to 1 and negative when equal to 0.
    name : str
        Value stored in the ``dataset`` column of the summary frame.
    batch_size : int
        Number of rows passed to ``model`` per call. Defaults to 20, the
        step that was previously hard-coded.

    Returns
    -------
    (error_df, performance_df) : tuple of pandas.DataFrame
        ``error_df`` has columns ``text``, ``label``, ``prediction`` for every
        row whose prediction differs from its label. ``performance_df`` is a
        single row with columns ``dataset``, ``records``, ``positive samples``,
        ``negative samples``, ``hit``, ``miss``, ``accuracy``, ``miss rate``.
        For an empty dataset the accuracy and miss rate are NaN.

    Raises
    ------
    ValueError
        If the model returns a different number of predictions than there
        are rows in ``dataset``.
    """
    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; keep the report usable without tqdm.
        def tqdm(iterable):
            return iterable

    N = len(dataset)
    # Materialise the columns once; converting to arrays guarantees element-wise
    # comparisons even when the model hands back plain Python lists.
    labels = np.asarray(dataset['label'])
    texts = np.asarray(dataset['text'])

    batch_predictions = []
    for start in tqdm(range(0, N, batch_size)):
        # Slice the dataset a single time per batch.
        prediction, _probs = model(dataset[start:start + batch_size]['text'])
        batch_predictions.append(np.atleast_1d(prediction))

    # np.concatenate rejects an empty list, so handle the empty dataset explicitly.
    if batch_predictions:
        predictions = np.concatenate(batch_predictions)
    else:
        predictions = np.empty(0, dtype=int)

    if len(predictions) != N:
        raise ValueError(
            f'model returned {len(predictions)} predictions for {N} records'
        )

    error_ids = predictions != labels
    hits = int(np.sum(predictions == labels))
    miss = int(np.sum(error_ids))
    zeros = int(np.sum(labels == 0))
    ones = int(np.sum(labels == 1))

    # Avoid ZeroDivisionError on an empty dataset.
    accuracy = hits / N if N else float('nan')
    print(f'accuracy:{accuracy:.2f}, positive samples:{ones}, negative samples:{zeros}')
    performance_df = pd.DataFrame({'dataset': [name],
                                   'records': [N],
                                   'positive samples': [ones],
                                   'negative samples': [zeros],
                                   'hit': [hits],
                                   'miss': [miss],
                                   'accuracy': [accuracy],
                                   'miss rate': [1 - accuracy]})

    error_df = pd.DataFrame({'text': texts[error_ids],
                             'label': labels[error_ids],
                             'prediction': predictions[error_ids]})
    return error_df, performance_df

In [None]:
def _evaluate_optional_split(split_name):
    """Run perf_report on ``advertorial_dataset[split_name]`` when that split is usable.

    Returns ``(None, None)`` when the split is absent or empty, so a train-only
    load still runs top to bottom on a fresh kernel.
    """
    try:
        split = advertorial_dataset[split_name]
    except KeyError:
        # Assumes a missing split raises KeyError, as a dict would - TODO confirm
        # against the return type of train_valid_test_from_file.
        return None, None
    if len(split) == 0:
        return None, None
    return perf_report(adv, split, split_name)


train_error, train_perf = perf_report(adv, train, 'train')
# The active dataset-loading code only binds `train`, so `validation` / `test` are
# looked up on advertorial_dataset rather than taken from leftover kernel variables.
# Splits that are not available yield None; pd.concat drops None entries silently.
validation_error, validation_perf = _evaluate_optional_split('validation')
test_error, test_perf = _evaluate_optional_split('test')

In [None]:
# Stack the one-row summaries of the evaluated splits into a single table with a
# fresh 0..n-1 index. Append .to_csv('performance.csv', index=False) to export it;
# for a train-only summary pass [train_perf] alone.
pd.concat(
    [train_perf, validation_perf, test_perf],
    ignore_index=True,
)