In [1]:
import csv

# Load the user profiles: user_id -> {"gender", "occupation_titles", "interests"}.
with open('data/users.csv', newline='') as csvfile:
    # DictReader turns every CSV row into a dictionary keyed by the header names.
    users = {
        row["user_id"]: {
            "gender": row["gender"],
            "occupation_titles": row["occupation_titles"],
            "interests": row["interests"],
        }
        for row in csv.DictReader(csvfile)
    }

# Number the subgroups by their row position in the file and keep both
# directions of the mapping (subgroup_id <-> integer index).
with open('data/subgroups.csv', newline='') as csvfile:
    subgroup_ids = [row['subgroup_id'] for row in csv.DictReader(csvfile)]
id2group = dict(enumerate(subgroup_ids))
group2id = {subgroup_id: index for index, subgroup_id in id2group.items()}
len(group2id)

91

In [2]:
#### train dataset
# train_dataset: one record per user that has at least one subgroup label.
# train_users_subgroup: user_id -> list of subgroup indices ([] when the row has no label).
train_dataset = []
train_users_subgroup = {}
with open('data/train_group.csv', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        user_id = row["user_id"]
        if row["subgroup"] == "":
            # Unlabelled users are remembered but do not become training examples.
            train_users_subgroup[user_id] = []
            continue

        profile = users[user_id]
        # Input text: the interests string, with the occupation titles appended when present.
        text = profile["interests"]
        if profile["occupation_titles"] != "":
            text = text + ',' + profile["occupation_titles"]

        # The "subgroup" column holds space-separated subgroup ids.
        subgroup_indices = [group2id[name] for name in row["subgroup"].split(" ")]
        train_users_subgroup[user_id] = subgroup_indices
        train_dataset.append({
            "user_id": user_id,
            "interests": text,
            "subgroup": subgroup_indices,
        })

train_dataset[0]

{'user_id': '5bdecbfffec014002166796a',
 'interests': '職場技能_創業,設計_平面設計,藝術_電腦繪圖,藝術_繪畫與插畫,手作_刺繡,攝影_影像創作,手作_手作小物,服務業',
 'subgroup': [26]}

In [3]:
#### validation dataset
# Same record layout as the training set; rows without a subgroup label are skipped.
validation_dataset = []
with open('data/val_unseen_group.csv', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        if row["subgroup"] == "":
            continue

        profile = users[row["user_id"]]
        occupation = profile["occupation_titles"]
        # Append the occupation titles to the interests text only when there are any.
        if occupation != "":
            text = ','.join((profile["interests"], occupation))
        else:
            text = profile["interests"]

        validation_dataset.append({
            "user_id": row["user_id"],
            "interests": text,
            "subgroup": [group2id[name] for name in row["subgroup"].split(" ")],
        })

validation_dataset[0]

{'user_id': '612c1fcd560d8100069aa5ba',
 'interests': '生活品味_寵物,手作_手作小物,生活品味_親子教育,手作_刺繡,生活品味_烹飪料理與甜點,金融業',
 'subgroup': [7, 68, 69]}

In [4]:
from datasets import Dataset
from datasets import load_dataset

# dataset = load_dataset("csv", data_files={"train":"new_data/train_group.csv", "validation":"new_data/val_seen_group.csv"})
# dataset["train"][1]
# Wrap the in-memory record lists as `datasets.Dataset` objects, keyed by split name.
dataset = {
    'train': Dataset.from_list(train_dataset),
    'validation': Dataset.from_list(validation_dataset),
}
dataset['train']

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['user_id', 'interests', 'subgroup'],
    num_rows: 59032
})

In [5]:
from transformers import AutoTokenizer
import numpy as np

# Tokenizer loaded from the "hfl/chinese-bert-wwm-ext" checkpoint; the preprocessing functions use it to encode the interests text.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-bert-wwm-ext")

def preprocess_data(examples):
    """Tokenize a record's interests text and attach its multi-hot label vector.

    Parameters
    ----------
    examples : mapping
        Must provide "interests" (the text to encode) and "subgroup"
        (an iterable of integer label indices).

    Returns
    -------
    encoding
        The tokenizer output, padded/truncated to 128 tokens, with an added
        "labels" entry: a list of len(group2id) floats that is 1.0 at every
        index listed in "subgroup" and 0.0 elsewhere.
    """
    encoding = tokenizer(
        examples["interests"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # One slot per known subgroup; switch on the slots this record belongs to.
    multi_hot = np.zeros(len(group2id))
    for label_index in examples["subgroup"]:
        multi_hot[label_index] = 1

    encoding["labels"] = multi_hot.tolist()
    return encoding

In [6]:
# Encode both splits with preprocess_data; the original columns are removed so only
# the fields returned by preprocess_data remain.
encoded_train_dataset = dataset['train'].map(preprocess_data, remove_columns=dataset['train'].column_names)
encoded_validation_dataset = dataset['validation'].map(preprocess_data, remove_columns=dataset['validation'].column_names)
# Keep the first encoded training record around for inspection in this and the next cell.
example = encoded_train_dataset[0]
# example = encoded_dataset['validation'][0]
example["input_ids"]


100%|██████████| 59032/59032 [00:18<00:00, 3155.02ex/s]
100%|██████████| 11526/11526 [00:03<00:00, 3325.04ex/s]


[101,
 5480,
 1842,
 2825,
 5543,
 142,
 1201,
 3511,
 117,
 6257,
 6243,
 142,
 2398,
 7481,
 6257,
 6243,
 117,
 5971,
 6123,
 142,
 7442,
 5582,
 5257,
 1756,
 117,
 5971,
 6123,
 142,
 5257,
 4529,
 5645,
 2991,
 4529,
 117,
 2797,
 868,
 142,
 1173,
 5255,
 117,
 3109,
 2512,
 142,
 2512,
 1008,
 1201,
 868,
 117,
 2797,
 868,
 142,
 2797,
 868,
 2207,
 4289,
 117,
 3302,
 1243,
 3511,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [7]:
# Switch both encoded datasets to the "torch" output format for training.
encoded_train_dataset.set_format("torch")
encoded_validation_dataset.set_format("torch")
# `example` was fetched in the previous cell, before set_format was called,
# so its labels are displayed as a plain list of floats.
example['labels']
#dataset['train'][0]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [8]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model built from the same checkpoint as the tokenizer,
# with one output per entry in group2id.
# NOTE(review): problem_type is left unset (commented out below) while the labels are
# float multi-hot vectors — confirm the library selects a multi-label loss in that
# case, or set problem_type explicitly.
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-bert-wwm-ext", 
                                                           #problem_type="multi_label_classification",
                                                           #ignore_mismatched_sizes=True,
                                                           num_labels=len(group2id))
     

Some weights of the model checkpoint at hfl/chinese-bert-wwm-ext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkp

In [9]:
from transformers import TrainingArguments, Trainer

# Per-device batch size, shared by training and evaluation below.
batch_size = 16

# Training configuration. Evaluation and checkpoint saving both happen once per epoch.
# Best-model selection is disabled (the two options at the end are commented out).
args = TrainingArguments(
    f"bert-finetuned-subgroup111",  # first positional argument: presumably the output directory — confirm against the TrainingArguments signature
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    #load_best_model_at_end=True,
    #metric_for_best_model=metric_name,
)

2022-12-29 21:36:39.241465: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [10]:
#from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
# def multi_label_metrics(predictions, labels, threshold=0.5):
#     # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
#     sigmoid = torch.nn.Sigmoid()
#     probs = sigmoid(torch.Tensor(predictions))
#     # next, use threshold to turn them into integer predictions
#     y_pred = np.zeros(probs.shape)
#     y_pred[np.where(probs >= threshold)] = 1
#     # finally, compute metrics
#     y_true = labels
#     f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
#     roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
#     accuracy = accuracy_score(y_true, y_pred)
#     # return as dictionary
#     metrics = {'f1': f1_micro_average,
#                'roc_auc': roc_auc,
#                'accuracy': accuracy}
#     return metrics

def apk(actual, predicted, k=50):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items. Only the first occurrence of a repeated prediction can count as a
    hit; later duplicates are ignored but still occupy their rank.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter).
             Elements must be hashable.
    predicted : list
                A list of predicted elements (order does matter).
                Elements must be hashable.
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists. 0.0 when
            `actual` is empty or when `k` is not positive.

    """
    # Nothing can be scored without relevant items or without a positive cut-off;
    # returning early also avoids dividing by zero (k == 0) below.
    if k <= 0 or len(actual) == 0:
        return 0.0

    relevant = set(actual)
    seen = set()  # predictions already examined, so duplicates are counted once
    score = 0.0
    num_hits = 0

    for rank, item in enumerate(predicted[:k], start=1):
        if item in seen:
            continue
        seen.add(item)
        if item in relevant:
            num_hits += 1
            # Precision at this rank, accumulated only at ranks where a hit occurs.
            score += num_hits / rank

    # Normalise by the best achievable number of hits within the first k ranks.
    return score / min(len(actual), k)

def mapk(actual, predicted, k=50):
    """
    Computes the mean average precision at k.

    This function computes the mean average precision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    result : dict
             A single-entry dictionary ``{'map@<k>': score}`` where ``score``
             is the mean of ``apk`` over the paired input lists. With the
             default ``k`` the key is ``'map@50'``.

    """
    scores = [apk(a, p, k) for a, p in zip(actual, predicted)]
    # Name the metric after the cut-off actually used, so a non-default k is
    # not reported under the 'map@50' label.
    return {f'map@{k}': np.mean(scores)}

def compute_metrics(p: EvalPrediction):
    """Turn raw evaluation outputs into the metric dictionary returned by mapk.

    Each row of predictions is converted to label indices sorted from the
    highest to the lowest score, each row of label_ids to the indices whose
    value equals 1, and the two are passed to mapk with its default cut-off.
    """
    scores = p.predictions
    if isinstance(scores, tuple):
        # When predictions is a tuple, only its first element is used.
        scores = scores[0]

    ranked_labels = [np.argsort(row)[::-1].tolist() for row in scores]
    true_labels = [np.where(row == 1)[0].tolist() for row in p.label_ids]
    return mapk(actual=true_labels, predicted=ranked_labels)
     

In [11]:
# Assemble the Trainer from the model, the training arguments, the encoded
# train/validation splits and the metric function defined above, then start training.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

***** Running training *****
  Num examples = 59032
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 18450
  0%|          | 0/18450 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

In [None]:
#### test unseen dataset
# Test rows carry no labels: each record only has the user id and the input text.
test_dataset = []
with open('data/test_unseen_group.csv', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        user_id = row["user_id"]
        profile = users[user_id]
        # Interests text, with the occupation titles appended when present.
        text = profile["interests"]
        if profile["occupation_titles"] != "":
            text = text + ',' + profile["occupation_titles"]
        test_dataset.append({"user_id": user_id, "interests": text})

dataset['test'] = Dataset.from_list(test_dataset)

In [None]:
def preprocess_test_data(examples):
    """Tokenize the "interests" text of an unlabelled record.

    Uses the same tokenizer settings as preprocess_data (padding and
    truncation to 128 tokens) but adds no "labels" entry.
    """
    return tokenizer(
        examples["interests"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Encode the test split; the original columns are removed so only the tokenizer outputs remain.
encoded_test_dataset = dataset['test'].map(preprocess_test_data, remove_columns=dataset['test'].column_names)
encoded_test_dataset[0]

100%|██████████| 11097/11097 [00:02<00:00, 3710.02ex/s]


{'input_ids': [101,
  2832,
  6536,
  4415,
  6512,
  142,
  4415,
  6512,
  117,
  4923,
  2466,
  142,
  7030,
  1265,
  1146,
  3358,
  117,
  2832,
  6536,
  4415,
  6512,
  142,
  2832,
  6536,
  6223,
  2573,
  117,
  5480,
  1842,
  2825,
  5543,
  142,
  3126,
  4372,
  2990,
  1285,
  117,
  2832,
  6536,
  4415,
  6512,
  142,
  7032,
  6084,
  1555,
  1501,
  117,
  5480,
  1842,
  2825,
  5543,
  142,
  943,
  782,
  1501,
  4277,
  5195,
  4245,
  117,
  6182,
  6863,
  3511,
  117,
  3302,
  1243,
  3511,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [None]:
# Run the trainer on the encoded test set.
encoded_test_dataset.set_format("torch")
predicts = trainer.predict(encoded_test_dataset)
# For every test row, list the label indices from the highest to the lowest predicted score.
predicts_list = [np.argsort(pred)[::-1].tolist() for pred in predicts.predictions]
predicts_list[0]

***** Running Prediction *****
  Num examples = 11097
  Batch size = 16


[50,
 58,
 0,
 63,
 65,
 6,
 68,
 36,
 49,
 28,
 70,
 71,
 82,
 60,
 35,
 74,
 39,
 75,
 84,
 56,
 32,
 34,
 69,
 5,
 7,
 2,
 62,
 22,
 54,
 14,
 64,
 59,
 78,
 52,
 31,
 80,
 4,
 18,
 72,
 55,
 51,
 83,
 12,
 3,
 53,
 23,
 57,
 33,
 27,
 67,
 13,
 61,
 40,
 38,
 48,
 77,
 37,
 86,
 15,
 29,
 76,
 20,
 66,
 46,
 24,
 79,
 41,
 44,
 42,
 45,
 11,
 8,
 47,
 89,
 90,
 88,
 73,
 87,
 21,
 25,
 1,
 19,
 26,
 43,
 81,
 9,
 30,
 10,
 85,
 17,
 16]

In [None]:
import csv
import os

# Write the ranked subgroup predictions for the unseen-group test users.
output_path = 'predict_outputs/bertchinese_unseen_subgroup111.csv'
# Create the output directory if it is missing, so the write cannot fail after a long run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# test_dataset and predicts_list are built by separate cells, and test_dataset is
# rebuilt later for the seen-group run, so check they still describe the same rows.
assert len(test_dataset) == len(predicts_list), "test_dataset and predicts_list are out of sync"

with open(output_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user_id', 'subgroup'])

    # The loop variable is deliberately not named `predicts`: that name holds the
    # trainer.predict output and must not be overwritten by a list of indices.
    for test_user, ranked_ids in zip(test_dataset, predicts_list):
        # Map label indices back to subgroup ids, best prediction first, space-separated.
        subgroup = " ".join(str(id2group[label_id]) for label_id in ranked_ids)
        writer.writerow([test_user["user_id"], subgroup])
print("unseen subgroup done")

unseen subgroup done


In [None]:
#### test seen dataset
# Rebuilds test_dataset, dataset['test'] and encoded_test_dataset for the seen-group users.
test_dataset = []
with open('data/test_seen_group.csv', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        profile = users[row["user_id"]]
        occupation = profile["occupation_titles"]
        # Interests text, with the occupation titles appended when present.
        if occupation != "":
            text = profile["interests"] + ',' + occupation
        else:
            text = profile["interests"]
        test_dataset.append({"user_id": row["user_id"], "interests": text})

dataset['test'] = Dataset.from_list(test_dataset)
test_columns = dataset['test'].column_names
encoded_test_dataset = dataset['test'].map(preprocess_test_data, remove_columns=test_columns)
encoded_test_dataset[0]

100%|██████████| 7205/7205 [00:02<00:00, 3393.09ex/s]


{'input_ids': [101,
  4495,
  3833,
  1501,
  1456,
  142,
  2187,
  4289,
  117,
  6121,
  7077,
  142,
  3152,
  3428,
  117,
  6257,
  6243,
  142,
  2398,
  7481,
  6257,
  6243,
  117,
  6257,
  6243,
  142,
  1240,
  2706,
  6257,
  6243,
  117,
  3109,
  2512,
  142,
  1555,
  3511,
  3109,
  2512,
  117,
  3109,
  2512,
  142,
  2527,
  6182,
  1198,
  6744,
  117,
  6121,
  7077,
  142,
  3149,
  855,
  6121,
  7077,
  117,
  6257,
  6243,
  142,
  5206,
  7514,
  6257,
  6243,
  117,
  5971,
  3152,
  6257,
  6243,
  117,
  2451,
  1440,
  1001,
  3064,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [None]:
# Run the trainer on the encoded seen-group test set.
encoded_test_dataset.set_format("torch")
predicts = trainer.predict(encoded_test_dataset)
# For every test row, list the label indices from the highest to the lowest predicted score.
predicts_list = [np.argsort(pred)[::-1].tolist() for pred in predicts.predictions]
predicts_list[0]

***** Running Prediction *****
  Num examples = 7205
  Batch size = 16


[2,
 65,
 50,
 71,
 49,
 39,
 6,
 5,
 22,
 58,
 60,
 70,
 0,
 4,
 35,
 28,
 18,
 52,
 33,
 72,
 38,
 37,
 36,
 64,
 82,
 77,
 24,
 3,
 84,
 76,
 59,
 32,
 7,
 68,
 78,
 54,
 66,
 55,
 34,
 53,
 12,
 69,
 31,
 63,
 56,
 14,
 61,
 8,
 21,
 23,
 40,
 13,
 27,
 48,
 89,
 29,
 26,
 86,
 75,
 74,
 20,
 45,
 44,
 62,
 80,
 15,
 19,
 51,
 11,
 83,
 57,
 73,
 88,
 46,
 90,
 25,
 85,
 1,
 47,
 87,
 42,
 43,
 41,
 67,
 79,
 16,
 10,
 9,
 17,
 81,
 30]

In [None]:
import os

# Write the ranked subgroup predictions for the seen-group test users.
output_path = 'predict_outputs/bertchinese_seen_group111.csv'
# Create the output directory if it is missing, so the write cannot fail after a long run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# test_dataset and predicts_list are built by separate cells; make sure they
# describe the same rows before pairing them up.
assert len(test_dataset) == len(predicts_list), "test_dataset and predicts_list are out of sync"

with open(output_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user_id', 'subgroup'])

    # The loop variable is deliberately not named `predicts`: that name holds the
    # trainer.predict output and must not be overwritten by a list of indices.
    for test_user, ranked_ids in zip(test_dataset, predicts_list):
        user_id = test_user["user_id"]
        # Subgroups already recorded for this user in the training data are left out.
        # A user without a training record has nothing to exclude.
        already_in_train = set(train_users_subgroup.get(user_id, ()))
        subgroup = " ".join(
            str(id2group[label_id])
            for label_id in ranked_ids
            if label_id not in already_in_train
        )
        writer.writerow([user_id, subgroup])
print("seen subgroup done")

seen subgroup done
