In [1]:
import csv

# Per-user profile fields, keyed by user_id. Every value is kept as the raw
# CSV string.
users = {}
# The profile columns hold Chinese text, so decode explicitly as UTF-8 rather
# than relying on the platform's locale encoding (which is not UTF-8 everywhere).
with open('data/users.csv', newline='', encoding='utf-8') as csvfile:
    # DictReader reads the CSV contents and turns each row into a dictionary
    # keyed by the header names.
    for user in csv.DictReader(csvfile):
        users[user["user_id"]] = {
            "gender": user["gender"],
            "occupation_titles": user["occupation_titles"],
            "interests": user["interests"],
        }

# Two-way mapping between course_id strings and dense integer label indices.
# The index is the row position in courses.csv, so it is stable as long as the
# file's row order does not change.
course2id = {}
id2course = {}
# Explicit encoding so parsing does not depend on the machine's locale settings.
with open('data/courses.csv', newline='', encoding='utf-8') as csvfile:
    for index, course in enumerate(csv.DictReader(csvfile)):
        course2id[course['course_id']] = index
        id2course[index] = course['course_id']
# Number of distinct course ids, i.e. the size of the label space.
len(course2id)

728

In [21]:
#### train dataset
# train_dataset: one record per CSV row (user_id, model input text, label indices).
# train_users_course: user_id -> label indices from that user's row; a later row
# for the same user replaces the earlier entry.
train_dataset = []
train_users_course = {}
with open('data/train.csv', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        uid = row["user_id"]
        profile = users[uid]
        # Model input text: interests, with occupation titles appended after a
        # comma when the user has any.
        if profile["occupation_titles"] != "":
            text = profile["interests"] + ',' + profile["occupation_titles"]
        else:
            text = profile["interests"]
        # course_id is a space-separated list of course id strings.
        course_indices = [course2id[name] for name in row["course_id"].split(" ")]
        train_users_course[uid] = course_indices
        train_dataset.append({
            "user_id": uid,
            "interests": text,
            "course_id": course_indices,
        })

# Show the label indices recorded for the last user read from the file.
train_users_course[row["user_id"]]

[185, 295]

In [3]:
#### validation dataset
# Same record layout as the training records: user_id, input text, label indices.
validation_dataset = []
with open('data/val_seen.csv', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        profile = users[row["user_id"]]
        # Interests, plus occupation titles after a comma when present.
        if profile["occupation_titles"] != "":
            text = profile["interests"] + ',' + profile["occupation_titles"]
        else:
            text = profile["interests"]
        validation_dataset.append({
            "user_id": row["user_id"],
            "interests": text,
            "course_id": [course2id[name] for name in row["course_id"].split(" ")],
        })

# Peek at the first validation record.
validation_dataset[0]

{'user_id': '56dae2b74e3ef90900b7bd0e',
 'interests': '程式_程式入門,程式_資料科學,職場技能_求職,語言_英文,程式_程式語言,資訊科技',
 'course_id': [247]}

In [4]:
from datasets import Dataset
from datasets import load_dataset

# dataset = load_dataset("csv", data_files={"train":"new_data/train_group.csv", "validation":"new_data/val_seen_group.csv"})
# dataset["train"][1]
# Wrap the in-memory record lists as Hugging Face datasets, keyed by split name.
dataset = {
    'train': Dataset.from_list(train_dataset),
    'validation': Dataset.from_list(validation_dataset),
}
dataset['train']

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['user_id', 'interests', 'course_id'],
    num_rows: 59737
})

In [5]:
from transformers import AutoTokenizer
import numpy as np

# Tokenizer for the same checkpoint name the classifier is initialised from.
PRETRAINED_CHECKPOINT = "hfl/chinese-bert-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

def preprocess_data(examples):
    """Tokenize one record's interest text and attach its multi-hot labels.

    This is applied through ``Dataset.map`` without ``batched=True``, so
    despite the parameter name it receives a single record, not a batch.

    Parameters
    ----------
    examples : dict
        A record with an ``"interests"`` string and a ``"course_id"`` list of
        integer course indices.

    Returns
    -------
    encoding
        The tokenizer output (padded/truncated to 128 tokens) with an extra
        ``"labels"`` entry: a list of ``len(course2id)`` floats that is 1.0 at
        each of the record's course indices and 0.0 elsewhere.
    """
    encoding = tokenizer(
        examples["interests"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # Multi-hot target vector over the whole course vocabulary.
    multi_hot = np.zeros(len(course2id))
    for course_index in examples["course_id"]:
        multi_hot[course_index] = 1
    encoding["labels"] = multi_hot.tolist()

    return encoding

In [6]:
# Tokenize and label both splits. The raw columns are dropped so that only the
# fields produced by preprocess_data remain.
train_columns = dataset['train'].column_names
encoded_train_dataset = dataset['train'].map(preprocess_data, remove_columns=train_columns)

validation_columns = dataset['validation'].column_names
encoded_validation_dataset = dataset['validation'].map(preprocess_data, remove_columns=validation_columns)

# Inspect the token ids of the first encoded training record.
example = encoded_train_dataset[0]
example["input_ids"]


100%|██████████| 59737/59737 [00:20<00:00, 2878.11ex/s]
100%|██████████| 7748/7748 [00:02<00:00, 2713.46ex/s]


[101,
 5480,
 1842,
 2825,
 5543,
 142,
 1201,
 3511,
 117,
 6257,
 6243,
 142,
 2398,
 7481,
 6257,
 6243,
 117,
 5971,
 6123,
 142,
 7442,
 5582,
 5257,
 1756,
 117,
 5971,
 6123,
 142,
 5257,
 4529,
 5645,
 2991,
 4529,
 117,
 2797,
 868,
 142,
 1173,
 5255,
 117,
 3109,
 2512,
 142,
 2512,
 1008,
 1201,
 868,
 117,
 2797,
 868,
 142,
 2797,
 868,
 2207,
 4289,
 117,
 3302,
 1243,
 3511,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [7]:
# Switch both encoded splits to the "torch" output format.
for encoded_split in (encoded_train_dataset, encoded_validation_dataset):
    encoded_split.set_format("torch")
# `example` was fetched before the format switch, so its labels still display
# as a plain Python list.
example['labels']

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [8]:
from transformers import AutoModelForSequenceClassification

# Sequence classifier with one output logit per course index.
model = AutoModelForSequenceClassification.from_pretrained(
    "hfl/chinese-bert-wwm-ext",
    num_labels=len(course2id),
    # The targets built by preprocess_data are multi-hot float vectors, so ask
    # for the multi-label loss explicitly. When problem_type is left unset the
    # model infers it from the label dtype on the first forward pass, which
    # would silently switch to single-label cross-entropy if the labels ever
    # became integer tensors.
    problem_type="multi_label_classification",
)
     

Some weights of the model checkpoint at hfl/chinese-bert-wwm-ext were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkp

In [9]:
from transformers import TrainingArguments, Trainer

# Per-device batch size, shared by training and evaluation.
batch_size = 16

# Evaluate and checkpoint once per epoch. load_best_model_at_end is not set
# here, so no best-checkpoint selection is requested.
args = TrainingArguments(
    output_dir="bert-finetuned-course0",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

2022-12-29 21:37:17.918688: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [10]:
#from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
# def multi_label_metrics(predictions, labels, threshold=0.5):
#     # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
#     sigmoid = torch.nn.Sigmoid()
#     probs = sigmoid(torch.Tensor(predictions))
#     # next, use threshold to turn them into integer predictions
#     y_pred = np.zeros(probs.shape)
#     y_pred[np.where(probs >= threshold)] = 1
#     # finally, compute metrics
#     y_true = labels
#     f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
#     roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
#     accuracy = accuracy_score(y_true, y_pred)
#     # return as dictionary
#     metrics = {'f1': f1_micro_average,
#                'roc_auc': roc_auc,
#                'accuracy': accuracy}
#     return metrics

def apk(actual, predicted, k=50):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items. Items must be hashable (they are integer course indices in this
    notebook).

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            `actual` is empty.

    """
    # Nothing relevant to find: the score is defined as 0.0.
    if len(actual) == 0:
        return 0.0

    # Set lookups replace the repeated list scans / `predicted[:i]` slices.
    relevant = set(actual)
    already_seen = set()

    score = 0.0
    num_hits = 0

    # Only the first k predictions count; rank is 1-based.
    for rank, item in enumerate(predicted[:k], start=1):
        # A relevant item is credited only the first time it is predicted.
        if item in relevant and item not in already_seen:
            num_hits += 1
            score += num_hits / rank
        already_seen.add(item)

    # Normalise by the best achievable number of hits within k predictions.
    return score / min(len(actual), k)

def mapk(actual, predicted, k=50):
    """
    Computes the mean average precision at k.

    This function computes the mean average precision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    metrics : dict
              A single-entry dict ``{'map@<k>': score}`` where ``score`` is the
              mean of the per-pair average precision at k. With the default
              ``k=50`` the key is ``'map@50'``. With no pairs at all,
              ``np.mean`` of the empty list yields ``nan``.

    """
    per_user_scores = [apk(a, p, k) for a, p in zip(actual, predicted)]
    # Name the metric after the cutoff actually used rather than a fixed 50.
    return {f'map@{k}': np.mean(per_user_scores)}

def compute_metrics(p: EvalPrediction):
    """Turn raw evaluation outputs into the ranking metric reported by mapk.

    Parameters
    ----------
    p : EvalPrediction
        ``p.predictions`` holds one score vector per example (the first
        element is used when it is a tuple); ``p.label_ids`` holds the
        matching multi-hot label vectors.

    Returns
    -------
    dict
        Whatever ``mapk`` returns for the ranked predictions, using its
        default cutoff.
    """
    scores = p.predictions
    if isinstance(scores, tuple):
        scores = scores[0]

    # Label indices ordered from highest to lowest score for each example.
    ranked = [np.argsort(row)[::-1].tolist() for row in scores]
    # Indices whose label value is exactly 1 are the ground-truth courses.
    truth = [np.where(row == 1)[0].tolist() for row in p.label_ids]

    return mapk(actual=truth, predicted=ranked)
     

In [11]:
# Fine-tune on the encoded training split; the validation split is scored with
# compute_metrics at each evaluation.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

***** Running training *****
  Num examples = 59737
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 18670
  0%|          | 0/18670 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  3%|▎         | 501/18670 [01:11<43:58,  6.89it/s]

{'loss': 0.1572, 'learning_rate': 1.9464381360471345e-05, 'epoch': 0.13}


  5%|▌         | 1001/18670 [02:24<43:30,  6.77it/s]

{'loss': 0.0273, 'learning_rate': 1.892876272094269e-05, 'epoch': 0.27}


  8%|▊         | 1501/18670 [03:38<42:53,  6.67it/s]

{'loss': 0.0204, 'learning_rate': 1.8393144081414034e-05, 'epoch': 0.4}


 11%|█         | 2001/18670 [04:53<41:20,  6.72it/s]

{'loss': 0.0185, 'learning_rate': 1.785752544188538e-05, 'epoch': 0.54}


 13%|█▎        | 2501/18670 [06:08<40:37,  6.63it/s]

{'loss': 0.0178, 'learning_rate': 1.7321906802356724e-05, 'epoch': 0.67}


 16%|█▌        | 3001/18670 [07:23<39:21,  6.64it/s]

{'loss': 0.0175, 'learning_rate': 1.6786288162828067e-05, 'epoch': 0.8}


 19%|█▉        | 3501/18670 [08:38<38:34,  6.55it/s]

{'loss': 0.0172, 'learning_rate': 1.6250669523299413e-05, 'epoch': 0.94}


 20%|█▉        | 3733/18670 [09:13<36:38,  6.79it/s]***** Running Evaluation *****
  Num examples = 7748
  Batch size = 16
                                                    
 20%|██        | 3734/18670 [09:36<36:38,  6.79it/s]Saving model checkpoint to bert-finetuned-course0/checkpoint-3734
Configuration saved in bert-finetuned-course0/checkpoint-3734/config.json


{'eval_loss': 0.021309418603777885, 'eval_map@50': 0.06162341986375953, 'eval_runtime': 23.6594, 'eval_samples_per_second': 327.481, 'eval_steps_per_second': 20.499, 'epoch': 1.0}


Model weights saved in bert-finetuned-course0/checkpoint-3734/pytorch_model.bin
tokenizer config file saved in bert-finetuned-course0/checkpoint-3734/tokenizer_config.json
Special tokens file saved in bert-finetuned-course0/checkpoint-3734/special_tokens_map.json
 21%|██▏       | 4001/18670 [10:31<37:24,  6.53it/s]   

{'loss': 0.0174, 'learning_rate': 1.5715050883770756e-05, 'epoch': 1.07}


 24%|██▍       | 4501/18670 [11:45<35:30,  6.65it/s]

{'loss': 0.0172, 'learning_rate': 1.5179432244242101e-05, 'epoch': 1.21}


 27%|██▋       | 5001/18670 [13:00<34:17,  6.64it/s]

{'loss': 0.0169, 'learning_rate': 1.4643813604713446e-05, 'epoch': 1.34}


 29%|██▉       | 5501/18670 [14:15<33:09,  6.62it/s]

{'loss': 0.0171, 'learning_rate': 1.4108194965184789e-05, 'epoch': 1.47}


 32%|███▏      | 6001/18670 [15:30<31:49,  6.63it/s]

{'loss': 0.017, 'learning_rate': 1.3572576325656135e-05, 'epoch': 1.61}


 35%|███▍      | 6501/18670 [16:45<30:32,  6.64it/s]

{'loss': 0.0171, 'learning_rate': 1.3036957686127478e-05, 'epoch': 1.74}


 37%|███▋      | 7001/18670 [18:00<29:29,  6.60it/s]

{'loss': 0.017, 'learning_rate': 1.2501339046598821e-05, 'epoch': 1.87}


 40%|███▉      | 7467/18670 [19:10<27:46,  6.72it/s]***** Running Evaluation *****
  Num examples = 7748
  Batch size = 16
                                                    
 40%|████      | 7468/18670 [19:34<27:45,  6.72it/s]Saving model checkpoint to bert-finetuned-course0/checkpoint-7468
Configuration saved in bert-finetuned-course0/checkpoint-7468/config.json


{'eval_loss': 0.022877763956785202, 'eval_map@50': 0.05899716669401654, 'eval_runtime': 24.1572, 'eval_samples_per_second': 320.732, 'eval_steps_per_second': 20.077, 'epoch': 2.0}


Model weights saved in bert-finetuned-course0/checkpoint-7468/pytorch_model.bin
tokenizer config file saved in bert-finetuned-course0/checkpoint-7468/tokenizer_config.json
Special tokens file saved in bert-finetuned-course0/checkpoint-7468/special_tokens_map.json
 40%|████      | 7501/18670 [19:53<28:11,  6.60it/s]   

{'loss': 0.017, 'learning_rate': 1.1965720407070168e-05, 'epoch': 2.01}


 43%|████▎     | 8001/18670 [21:08<26:19,  6.75it/s]

{'loss': 0.0172, 'learning_rate': 1.1430101767541511e-05, 'epoch': 2.14}


 46%|████▌     | 8501/18670 [22:22<25:09,  6.74it/s]

{'loss': 0.0169, 'learning_rate': 1.0894483128012856e-05, 'epoch': 2.28}


 48%|████▊     | 9001/18670 [23:36<24:07,  6.68it/s]

{'loss': 0.0168, 'learning_rate': 1.0358864488484199e-05, 'epoch': 2.41}


 51%|█████     | 9501/18670 [24:51<22:58,  6.65it/s]

{'loss': 0.0169, 'learning_rate': 9.823245848955545e-06, 'epoch': 2.54}


 54%|█████▎    | 10001/18670 [26:06<21:51,  6.61it/s]

{'loss': 0.0168, 'learning_rate': 9.28762720942689e-06, 'epoch': 2.68}


 56%|█████▌    | 10501/18670 [27:21<20:31,  6.63it/s]

{'loss': 0.0169, 'learning_rate': 8.752008569898233e-06, 'epoch': 2.81}


 59%|█████▉    | 11001/18670 [28:37<19:15,  6.64it/s]

{'loss': 0.0168, 'learning_rate': 8.216389930369578e-06, 'epoch': 2.95}


 60%|█████▉    | 11201/18670 [29:07<18:49,  6.61it/s]***** Running Evaluation *****
  Num examples = 7748
  Batch size = 16
                                                     
 60%|██████    | 11202/18670 [29:31<18:49,  6.61it/s]Saving model checkpoint to bert-finetuned-course0/checkpoint-11202
Configuration saved in bert-finetuned-course0/checkpoint-11202/config.json


{'eval_loss': 0.024334268644452095, 'eval_map@50': 0.0619876636643889, 'eval_runtime': 24.0724, 'eval_samples_per_second': 321.863, 'eval_steps_per_second': 20.148, 'epoch': 3.0}


Model weights saved in bert-finetuned-course0/checkpoint-11202/pytorch_model.bin
tokenizer config file saved in bert-finetuned-course0/checkpoint-11202/tokenizer_config.json
Special tokens file saved in bert-finetuned-course0/checkpoint-11202/special_tokens_map.json
 62%|██████▏   | 11501/18670 [30:30<17:49,  6.70it/s]   

{'loss': 0.017, 'learning_rate': 7.68077129084092e-06, 'epoch': 3.08}


 64%|██████▍   | 12001/18670 [31:45<16:45,  6.64it/s]

{'loss': 0.0167, 'learning_rate': 7.145152651312266e-06, 'epoch': 3.21}


 67%|██████▋   | 12501/18670 [33:00<15:20,  6.70it/s]

{'loss': 0.0168, 'learning_rate': 6.60953401178361e-06, 'epoch': 3.35}


 70%|██████▉   | 13001/18670 [34:14<14:24,  6.56it/s]

{'loss': 0.0167, 'learning_rate': 6.073915372254955e-06, 'epoch': 3.48}


 72%|███████▏  | 13501/18670 [35:29<12:55,  6.67it/s]

{'loss': 0.0166, 'learning_rate': 5.538296732726299e-06, 'epoch': 3.62}


 75%|███████▍  | 14001/18670 [36:44<11:49,  6.58it/s]

{'loss': 0.0168, 'learning_rate': 5.002678093197644e-06, 'epoch': 3.75}


 78%|███████▊  | 14501/18670 [38:00<10:30,  6.61it/s]

{'loss': 0.0168, 'learning_rate': 4.4670594536689885e-06, 'epoch': 3.88}


 80%|███████▉  | 14935/18670 [39:05<09:19,  6.68it/s]***** Running Evaluation *****
  Num examples = 7748
  Batch size = 16
                                                     
 80%|████████  | 14936/18670 [39:30<09:19,  6.68it/s]Saving model checkpoint to bert-finetuned-course0/checkpoint-14936
Configuration saved in bert-finetuned-course0/checkpoint-14936/config.json


{'eval_loss': 0.02544930763542652, 'eval_map@50': 0.06723627386326213, 'eval_runtime': 24.2448, 'eval_samples_per_second': 319.574, 'eval_steps_per_second': 20.004, 'epoch': 4.0}


Model weights saved in bert-finetuned-course0/checkpoint-14936/pytorch_model.bin
tokenizer config file saved in bert-finetuned-course0/checkpoint-14936/tokenizer_config.json
Special tokens file saved in bert-finetuned-course0/checkpoint-14936/special_tokens_map.json
 80%|████████  | 15001/18670 [39:54<09:12,  6.63it/s]  

{'loss': 0.0166, 'learning_rate': 3.931440814140332e-06, 'epoch': 4.02}


 83%|████████▎ | 15501/18670 [41:09<08:03,  6.56it/s]

{'loss': 0.0164, 'learning_rate': 3.3958221746116767e-06, 'epoch': 4.15}


 86%|████████▌ | 16001/18670 [42:24<06:39,  6.68it/s]

{'loss': 0.0166, 'learning_rate': 2.860203535083021e-06, 'epoch': 4.28}


 88%|████████▊ | 16501/18670 [43:39<05:25,  6.67it/s]

{'loss': 0.0165, 'learning_rate': 2.3245848955543654e-06, 'epoch': 4.42}


 91%|█████████ | 17001/18670 [44:53<04:11,  6.63it/s]

{'loss': 0.0163, 'learning_rate': 1.78896625602571e-06, 'epoch': 4.55}


 94%|█████████▎| 17501/18670 [46:09<02:56,  6.63it/s]

{'loss': 0.0166, 'learning_rate': 1.2533476164970543e-06, 'epoch': 4.69}


 96%|█████████▋| 18001/18670 [47:23<01:40,  6.63it/s]

{'loss': 0.0166, 'learning_rate': 7.177289769683986e-07, 'epoch': 4.82}


 99%|█████████▉| 18501/18670 [48:38<00:25,  6.65it/s]

{'loss': 0.0166, 'learning_rate': 1.821103374397429e-07, 'epoch': 4.95}


100%|█████████▉| 18669/18670 [49:03<00:00,  6.73it/s]***** Running Evaluation *****
  Num examples = 7748
  Batch size = 16
                                                     
100%|██████████| 18670/18670 [49:27<00:00,  6.73it/s]Saving model checkpoint to bert-finetuned-course0/checkpoint-18670
Configuration saved in bert-finetuned-course0/checkpoint-18670/config.json


{'eval_loss': 0.026051407679915428, 'eval_map@50': 0.06778748196609237, 'eval_runtime': 24.1123, 'eval_samples_per_second': 321.33, 'eval_steps_per_second': 20.114, 'epoch': 5.0}


Model weights saved in bert-finetuned-course0/checkpoint-18670/pytorch_model.bin
tokenizer config file saved in bert-finetuned-course0/checkpoint-18670/tokenizer_config.json
Special tokens file saved in bert-finetuned-course0/checkpoint-18670/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 18670/18670 [49:35<00:00,  6.27it/s]

{'train_runtime': 2975.9515, 'train_samples_per_second': 100.366, 'train_steps_per_second': 6.274, 'train_loss': 0.021052546799981024, 'epoch': 5.0}





TrainOutput(global_step=18670, training_loss=0.021052546799981024, metrics={'train_runtime': 2975.9515, 'train_samples_per_second': 100.366, 'train_steps_per_second': 6.274, 'train_loss': 0.021052546799981024, 'epoch': 5.0})

In [12]:
#### test unseen dataset
# Records carry only user_id and the model input text; no labels are read here.
test_dataset = []
with open('data/test_unseen.csv', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        profile = users[row["user_id"]]
        # Interests, plus occupation titles after a comma when present.
        if profile["occupation_titles"] != "":
            text = profile["interests"] + ',' + profile["occupation_titles"]
        else:
            text = profile["interests"]
        test_dataset.append({"user_id": row["user_id"], "interests": text})

dataset['test'] = Dataset.from_list(test_dataset)

In [13]:
def preprocess_test_data(examples):
    """Tokenize the ``"interests"`` text of a test record (no labels attached).

    Uses the same tokenizer settings as the training preprocessing: padding to
    ``max_length``, truncation enabled, ``max_length=128``.
    """
    return tokenizer(
        examples["interests"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Encode the test split, dropping the raw columns, then show the first record.
test_columns = dataset['test'].column_names
encoded_test_dataset = dataset['test'].map(preprocess_test_data, remove_columns=test_columns)
encoded_test_dataset[0]

100%|██████████| 11097/11097 [00:03<00:00, 3665.29ex/s]


{'input_ids': [101,
  2832,
  6536,
  4415,
  6512,
  142,
  4415,
  6512,
  117,
  4923,
  2466,
  142,
  7030,
  1265,
  1146,
  3358,
  117,
  2832,
  6536,
  4415,
  6512,
  142,
  2832,
  6536,
  6223,
  2573,
  117,
  5480,
  1842,
  2825,
  5543,
  142,
  3126,
  4372,
  2990,
  1285,
  117,
  2832,
  6536,
  4415,
  6512,
  142,
  7032,
  6084,
  1555,
  1501,
  117,
  5480,
  1842,
  2825,
  5543,
  142,
  943,
  782,
  1501,
  4277,
  5195,
  4245,
  117,
  6182,
  6863,
  3511,
  117,
  3302,
  1243,
  3511,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [14]:
# Score the encoded test split with the fine-tuned model.
encoded_test_dataset.set_format("torch")
predicts = trainer.predict(encoded_test_dataset)

# For each user, course indices ordered from highest to lowest score.
predicts_list = []
for user_scores in predicts.predictions:
    predicts_list.append(np.argsort(user_scores)[::-1].tolist())
predicts_list[0]

***** Running Prediction *****
  Num examples = 11097
  Batch size = 16
100%|██████████| 694/694 [00:32<00:00, 21.25it/s]


[533,
 560,
 603,
 599,
 500,
 502,
 501,
 425,
 644,
 569,
 631,
 496,
 563,
 426,
 573,
 391,
 557,
 620,
 586,
 424,
 270,
 314,
 531,
 185,
 530,
 624,
 288,
 613,
 416,
 462,
 591,
 590,
 70,
 509,
 226,
 625,
 638,
 524,
 616,
 287,
 53,
 202,
 589,
 652,
 418,
 295,
 516,
 164,
 595,
 598,
 653,
 112,
 182,
 126,
 430,
 522,
 159,
 581,
 614,
 632,
 580,
 610,
 145,
 593,
 514,
 374,
 300,
 579,
 385,
 654,
 341,
 477,
 605,
 645,
 618,
 130,
 512,
 310,
 627,
 567,
 403,
 626,
 575,
 92,
 124,
 594,
 291,
 493,
 592,
 503,
 177,
 231,
 67,
 643,
 64,
 639,
 191,
 583,
 161,
 572,
 157,
 648,
 378,
 611,
 621,
 332,
 454,
 206,
 132,
 84,
 228,
 218,
 343,
 281,
 401,
 122,
 552,
 264,
 87,
 100,
 308,
 559,
 255,
 634,
 117,
 246,
 604,
 506,
 376,
 353,
 622,
 597,
 362,
 241,
 602,
 609,
 155,
 617,
 446,
 189,
 372,
 585,
 196,
 174,
 550,
 115,
 148,
 555,
 55,
 405,
 66,
 267,
 10,
 494,
 574,
 420,
 394,
 293,
 27,
 600,
 34,
 452,
 472,
 259,
 492,
 548,
 465,
 354,
 423

In [15]:
import csv
# Export one row per test user: the user id and the full ranked list of course
# ids, space-separated, best first.
with open('predict_outputs/bertchinese_unseen_course999.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user_id', 'course_id'])

    for row_index, ranked_indices in enumerate(predicts_list):
        # Map label indices back to the original course id strings.
        courses = " ".join(str(id2course[course_index]) for course_index in ranked_indices)
        writer.writerow([test_dataset[row_index]["user_id"], courses])
print("unseen course done")

unseen course done


In [16]:
#### test seen dataset
# Rebuilds test_dataset / dataset['test'] / encoded_test_dataset for the
# seen-user split, replacing the unseen-user versions created earlier.
test_dataset = []
with open('data/test_seen.csv', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        profile = users[row["user_id"]]
        # Interests, plus occupation titles after a comma when present.
        if profile["occupation_titles"] != "":
            text = profile["interests"] + ',' + profile["occupation_titles"]
        else:
            text = profile["interests"]
        test_dataset.append({"user_id": row["user_id"], "interests": text})

dataset['test'] = Dataset.from_list(test_dataset)
test_columns = dataset['test'].column_names
encoded_test_dataset = dataset['test'].map(preprocess_test_data, remove_columns=test_columns)
encoded_test_dataset[0]

100%|██████████| 7205/7205 [00:02<00:00, 3306.44ex/s]


{'input_ids': [101,
  4495,
  3833,
  1501,
  1456,
  142,
  2187,
  4289,
  117,
  6121,
  7077,
  142,
  3152,
  3428,
  117,
  6257,
  6243,
  142,
  2398,
  7481,
  6257,
  6243,
  117,
  6257,
  6243,
  142,
  1240,
  2706,
  6257,
  6243,
  117,
  3109,
  2512,
  142,
  1555,
  3511,
  3109,
  2512,
  117,
  3109,
  2512,
  142,
  2527,
  6182,
  1198,
  6744,
  117,
  6121,
  7077,
  142,
  3149,
  855,
  6121,
  7077,
  117,
  6257,
  6243,
  142,
  5206,
  7514,
  6257,
  6243,
  117,
  5971,
  3152,
  6257,
  6243,
  117,
  2451,
  1440,
  1001,
  3064,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [17]:
# Score the seen-user test split with the fine-tuned model.
encoded_test_dataset.set_format("torch")
predicts = trainer.predict(encoded_test_dataset)

# For each user, course indices ordered from highest to lowest score.
predicts_list = []
for user_scores in predicts.predictions:
    predicts_list.append(np.argsort(user_scores)[::-1].tolist())
predicts_list[0]

***** Running Prediction *****
  Num examples = 7205
  Batch size = 16
100%|██████████| 451/451 [00:21<00:00, 21.27it/s]


[501,
 533,
 500,
 560,
 425,
 502,
 603,
 426,
 524,
 424,
 573,
 599,
 616,
 652,
 531,
 226,
 624,
 563,
 530,
 391,
 631,
 614,
 416,
 126,
 70,
 644,
 627,
 593,
 569,
 557,
 496,
 159,
 314,
 124,
 586,
 287,
 620,
 288,
 132,
 270,
 185,
 589,
 613,
 430,
 590,
 122,
 206,
 509,
 610,
 182,
 653,
 591,
 632,
 164,
 462,
 155,
 625,
 580,
 174,
 202,
 385,
 506,
 595,
 514,
 522,
 191,
 654,
 92,
 100,
 477,
 638,
 454,
 112,
 341,
 516,
 53,
 567,
 300,
 246,
 148,
 295,
 228,
 418,
 581,
 255,
 67,
 622,
 626,
 634,
 597,
 177,
 598,
 648,
 575,
 291,
 592,
 145,
 611,
 84,
 583,
 493,
 267,
 645,
 579,
 310,
 423,
 403,
 594,
 332,
 643,
 419,
 130,
 55,
 66,
 621,
 605,
 639,
 150,
 512,
 374,
 117,
 351,
 465,
 141,
 473,
 494,
 362,
 394,
 231,
 572,
 457,
 602,
 630,
 189,
 258,
 31,
 372,
 618,
 161,
 452,
 405,
 402,
 401,
 574,
 472,
 552,
 378,
 585,
 196,
 635,
 559,
 562,
 490,
 587,
 335,
 241,
 219,
 115,
 64,
 338,
 343,
 308,
 154,
 547,
 570,
 353,
 187,
 600,
 

In [28]:
# Export one row per seen test user: the user id and the ranked course ids
# (space-separated, best first), leaving out courses that appear in that
# user's training row.
with open('predict_outputs/bertchinese_seen_course999.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user_id', 'course_id'])

    for row_index, ranked_indices in enumerate(predicts_list):
        user_id = test_dataset[row_index]["user_id"]
        # Look the training courses up once per user and use a set for the
        # membership test. A user with no entry in train_users_course simply
        # has nothing filtered out, instead of raising KeyError halfway
        # through and leaving a truncated CSV behind.
        already_taken = set(train_users_course.get(user_id, ()))
        courses = " ".join(
            str(id2course[course_index])
            for course_index in ranked_indices
            if course_index not in already_taken
        )
        writer.writerow([user_id, courses])
print("seen course done")

seen course done
