In [62]:
from core_pro.ultilities import make_sync_folder, update_df
import polars as pl
from pathlib import Path
from transformers import AutoTokenizer
from datetime import datetime
from huggingface_hub import login, HfApi
import sys

# Put the local `model_train` checkout on the import path so that the
# `src.model_train` modules below can be resolved.
sys.path.append(str(Path.home() / 'PycharmProjects' / 'model_train'))
from src.model_train.data_loading import TrainDistribution
from src.model_train.pipeline_train import Pipeline
from src.model_train.func import training_report

# Working folder for this project; every data/model path below hangs off it.
path = make_sync_folder('cx/buyer_listening')

In [63]:
# Load the cleaned dataset and derive the two-level category label.
file = path / 'raw_cleaned.parquet'
df = pl.read_parquet(file).with_columns(
    # "<l1> >> <l2>" is the label the classifier is trained on.
    combine_category=pl.concat_str([pl.col('l1'), pl.col('l2')], separator=' >> ')
)
df = df.with_columns(
    # Number of ' >> '-separated parts in the combined label (sanity check).
    len_category=pl.col('combine_category').str.split(' >> ').list.len()
)
print(df.shape)
df

(330566, 9)


index,text,l1,l2,sentiment,text_clean,text_clean_word_count,combine_category,len_category
u32,str,str,str,str,str,u32,str,u32
0,"""Trả ngay và liền ạ""","""Commercial""","""Games/Minigames""","""neutral""","""trả ngay và liền ạ""",5,"""Commercial >> Games/Minigames""",2
1,"""Nhờ thằng te tò te nên app đc …","""Others""","""Cannot defined""","""""","""nhờ thằng te tò te nên app đc …",11,"""Others >> Cannot defined""",2
2,"""Lười trả lời""","""Others""","""Cannot defined""","""healthy""","""lười trả lời""",3,"""Others >> Cannot defined""",2
3,"""‼️‼️‼️ GÓC CẢNH BÁO ‼️ ‼️‼️ ‼️…","""Others""","""Scam""","""negative""","""em bảo gửi e ck trực giác cho …",40,"""Others >> Scam""",2
4,"""Đặt shoppe food hơn 1 tiếng đồ…","""Feature""","""Digital Product""","""""","""đặt shoppe food hơn 1 tiếng đồ…",31,"""Feature >> Digital Product""",2
…,…,…,…,…,…,…,…,…
330561,"""💥 Chọn số Trúng Voucher Shopee…","""Commercial""","""Shopee Programs""","""neutral""","""chọn số trúng voucher shopee m…",25,"""Commercial >> Shopee Programs""",2
330562,"""Rất đúng theo tiêu chuẩn""","""Buyer complained seller""","""Sellers packed fake orders""","""""","""rất đúng theo tiêu chuẩn""",5,"""Buyer complained seller >> Sel…",2
330563,"""Mỹ cấm TikTok: Facebook, Googl…","""Others""","""Cannot defined""","""neutral""","""mỹ cấm tiktok: facebook, googl…",7,"""Others >> Cannot defined""",2
330564,"""Vì có vài món đồ tôi mua và đa…","""Delivery""","""Delivery status/info""","""poor""","""vì có vài món đồ tôi mua và đa…",26,"""Delivery >> Delivery status/in…",2


In [64]:
# Distribution of the number of parts per combined label.
df.get_column('len_category').value_counts()

len_category,count
u32,u32
2,330566


In [65]:
# Summary statistics of the cleaned-text word count, including upper-tail percentiles.
df.get_column('text_clean_word_count').describe(percentiles=(0.25, 0.5, 0.75, 0.9, 0.99))

statistic,value
str,f64
"""count""",330566.0
"""null_count""",0.0
"""mean""",26.900755
"""std""",55.753364
"""min""",2.0
…,…
"""50%""",13.0
"""75%""",25.0
"""90%""",51.0
"""99%""",265.0


In [66]:
# Target column and the list of distinct labels used to set up the splitter.
label = 'combine_category'
# Sort the distinct labels: `Series.unique()` does not guarantee any ordering
# unless `maintain_order=True`, so without the sort `label_list` can differ
# between runs. NOTE(review): presumably `id2label` / `label2id` are derived
# from the order of `label_list` — confirm in TrainDistribution.
label_list = df[label].unique().sort().to_list()
dist_check = TrainDistribution(df, col_label=label, col_item='text_clean', label_list=label_list)
# Result is a dict of splits; the 'test' entry is inspected in the next cell.
dict_ = dist_check.split_train_valid_test(test_size=.2)

In [67]:
# Display the 'test' entry of the dict returned by split_train_valid_test.
dict_['test']

index,text,l1,l2,sentiment,text_clean,text_clean_word_count,combine_category,len_category
u32,str,str,str,str,str,u32,str,u32
1071,"""https://shp.ee/xrjjey5xdbq Tướ…","""Others""","""Seller""","""neutral""","""https://shp.ee/xrjjey5xdbq tướ…",11,"""Others >> Seller""",2
299630,"""Tránh trường hợp nhiều người k…","""Payment""","""Payment issues""","""moderate""","""tránh trường hợp nhiều người k…",14,"""Payment >> Payment issues""",2
237359,"""Chú trọng xây dựng văn hóa tiê…","""Others""","""Cannot defined""","""neutral""","""chú trọng xây dựng văn hóa tiê…",10,"""Others >> Cannot defined""",2
174784,""" #acheinashopee #achadinhossho…","""Others""","""Cannot defined""","""neutral""","""#acheinashopee #achadinhosshop…",7,"""Others >> Cannot defined""",2
254796,"""Tôi gọi điện thoại của bạn hỏi…","""Others""","""Cannot defined""","""""","""tôi gọi điện thoại của bạn hỏi…",9,"""Others >> Cannot defined""",2
…,…,…,…,…,…,…,…,…
117389,"""Còn lại quá hoàn hảo""","""Others""","""Cannot defined""","""healthy""","""còn lại quá hoàn hảo""",5,"""Others >> Cannot defined""",2
199413,"""Tôi không thích hàng nhái hoặc…","""Others""","""Seller""","""poor""","""tôi không thích hàng nhái hoặc…",14,"""Others >> Seller""",2
307409,"""Tôi muốn trả hàng vì mua phải ""","""Return/Refund""","""Request RR""","""moderate""","""tôi muốn trả hàng vì mua phải""",7,"""Return/Refund >> Request RR""",2
98467,"""Tìm 1 đằng ra kết quả 1 nẻo""","""Buyer complained seller""","""Sellers packed fake orders""","""moderate""","""tìm 1 đằng ra kết quả 1 nẻo""",8,"""Buyer complained seller >> Sel…",2


In [68]:
# Columns handed to df_to_dataset: row index, cleaned text and the label column.
col = ['index', 'text_clean', label]
# The return value is discarded on purpose (`_`).
# NOTE(review): presumably the datasets are kept on `dist_check` itself, since
# ds_tokenize is later called without them — verify in TrainDistribution.
_ = dist_check.df_to_dataset(col, show_index=3)

In [69]:
# Hugging Face model id; reused below for the training pipeline and the output folder name.
pretrain_name = 'bkai-foundation-models/vietnamese-bi-encoder'
# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrain_name)

In [70]:
# Tokenize the datasets; the result is indexed by 'train', 'valid' and 'test' in later cells.
dict_train = dist_check.ds_tokenize(tokenizer, show_index=1)

Map:   0%|          | 0/211561 [00:00<?, ? examples/s]

Map:   0%|          | 0/52891 [00:00<?, ? examples/s]

Map:   0%|          | 0/66114 [00:00<?, ? examples/s]

In [9]:
# Build the training pipeline for the pretrained checkpoint, passing the label
# mappings held by `dist_check`.
pipe = Pipeline(
    pretrain_name=pretrain_name,
    id2label=dist_check.id2label,
    label2id=dist_check.label2id,
    bf16=True,
    flash_attention_2=False,
)

# Each run writes to its own timestamped folder: model/<model-name>/<YYYYmmddHHMMSS>.
time_now = datetime.now().strftime("%Y%m%d%H%M%S")
# Resolve the model name outside the f-string: reusing the enclosing quote
# character inside a replacement field only parses on Python >= 3.12 (PEP 701)
# and is a SyntaxError on older interpreters.
model_name = pretrain_name.split('/')[-1]
folder = path / f'model/{model_name}/{time_now}'

# Training settings forwarded as keyword arguments to `pipe.train`.
config = dict(
    log_step=500,
    num_train_epochs=5,
    learning_rate=1e-4,
)
trainer = pipe.train(
    folder=folder,
    train=dict_train['train'],
    val=dict_train['valid'],
    **config
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at bkai-foundation-models/vietnamese-bi-encoder and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1,Accuracy
500,2.107,1.087032,0.726797,0.726797
1000,0.9479,0.885942,0.773288,0.773288
1500,0.8071,0.867224,0.77758,0.77758
2000,0.7713,0.861821,0.778885,0.778885


***** train metrics *****
  epoch                    =        5.0
  total_flos               = 25327846GF
  train_loss               =     1.1451
  train_runtime            = 0:07:13.71
  train_samples_per_second =   2438.919
  train_steps_per_second   =      4.773


In [72]:
# Score the trained model on the 'test' split.
test_output = trainer.predict(dict_train['test'])
y_true = test_output.label_ids
# Predicted class = index of the largest score along the last axis.
y_pred = test_output.predictions.argmax(axis=-1)

# NOTE(review): the recorded run of this cell failed with
# "TypeError: training_report() got an unexpected keyword argument 'id2label'".
# Check the current signature of training_report in src/model_train/func.py
# and adjust this call; it is left unchanged here because the expected
# arguments are not visible from this notebook.
df_report = training_report(y_true=y_true, y_pred=y_pred, id2label=dist_check.id2label)

# Push the report to the 'train_report' tab of the target spreadsheet.
sh = '1TsAxRmQDPIuL_enHMyHZSsb1aZZs9VCSzOYyXo83uZA'
update_df(df_report, 'train_report', sh)

TypeError: training_report() got an unexpected keyword argument 'id2label'

In [25]:
# Local import keeps this cell self-contained; `os` is only needed to read the token.
import os

# Flip to True to publish the freshly trained model folder to the Hugging Face Hub.
upload = False
if upload:
    # Never hard-code access tokens in a notebook: read it from the environment.
    # NOTE(review): the token that used to be committed in this cell is exposed
    # in the file history and should be revoked.
    hf_token = os.environ.get('HF_TOKEN')
    if not hf_token:
        raise RuntimeError('Set the HF_TOKEN environment variable before uploading.')
    login(token=hf_token)

    repo = 'kevinkhang2909/buyer_listening'
    api = HfApi()
    # Upload the run folder, skipping files/folders that match 'checkpoint*'.
    api.upload_folder(
        folder_path=folder,
        repo_id=repo,
        commit_message='model updated',
        ignore_patterns=['checkpoint*']
    )

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/kevinkhang2909/buyer_listening/commit/c5eb32ddf808ca01d0d63ec813bbf3b6befd3243', commit_message='model updated', commit_description='', oid='c5eb32ddf808ca01d0d63ec813bbf3b6befd3243', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kevinkhang2909/buyer_listening', endpoint='https://huggingface.co', repo_type='model', repo_id='kevinkhang2909/buyer_listening'), pr_revision=None, pr_num=None)