In [1]:
# Reference: Hugging Face Transformers guide "Fine-tuning with custom datasets" — https://huggingface.co/transformers/custom_datasets.html

In [1]:
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version 1.7 --apt-packages libomp5 libopenblas-dev
!pip install numpy

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    14  100    14    0     0     84      0 --:--:-- --:--:-- --:--:--    84
  File "/kaggle/working/pytorch-xla-env-setup.py", line 1
    404: Not Found
    ^^^
SyntaxError: illegal target for annotation


In [2]:
!wget https://s3-ap-southeast-1.amazonaws.com/he-public-data/dataset52a7b21.zip
!unzip dataset52a7b21.zip
!rm dataset/.~lock.train.csv#
!rm dataset52a7b21.zip

--2024-07-23 21:23:42--  https://s3-ap-southeast-1.amazonaws.com/he-public-data/dataset52a7b21.zip
Resolving s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)... 52.219.124.202, 52.219.40.206, 52.219.41.34, ...
Connecting to s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)|52.219.124.202|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1061576029 (1012M) [binary/octet-stream]
Saving to: 'dataset52a7b21.zip'


2024-07-23 21:24:34 (20.0 MB/s) - 'dataset52a7b21.zip' saved [1061576029/1061576029]

Archive:  dataset52a7b21.zip
   creating: dataset/
  inflating: dataset/train.csv       
  inflating: dataset/sample_submission.csv  
  inflating: dataset/test.csv        
  inflating: dataset/.~lock.train.csv#  


In [3]:
import csv
import pickle
import pandas as pd
import numpy as np
# Load the raw training table. Quote characters are not treated specially
# (QUOTE_NONE); a backslash is the escape character instead.
train = pd.read_csv(
    "dataset/train.csv",
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)

In [4]:
# Count rows per BROWSE_NODE_ID, then discard every row whose class occurs
# fewer than 50 times.
class_counts = train["BROWSE_NODE_ID"].value_counts()
drop_indices = class_counts[class_counts.lt(50)].index
train = train.loc[~train["BROWSE_NODE_ID"].isin(drop_indices)]
train.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4
5,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,"[Color: Blue,Sleeve: Full Sleeve,Material: Cot...",Bhavya Enterprise,5


In [6]:
from sklearn.model_selection import train_test_split
from transformers import AlbertTokenizerFast

# Hold out 5% of the rows for validation. A fixed random_state keeps the split
# identical across kernel restarts, so validation losses from different runs
# are measured on the same rows.
train_split, val_split = train_test_split(train, test_size=0.05, random_state=42)

# Tokenizer belonging to the pretrained 'albert-base-v2' checkpoint.
tokenizer = AlbertTokenizerFast.from_pretrained('albert-base-v2')

In [8]:
# Give every distinct BROWSE_NODE_ID a consecutive integer index, following the
# order in which unique() returns the values.
label_map = {
    node_id: position
    for position, node_id in enumerate(train.BROWSE_NODE_ID.unique())
}

# Persist the mapping to disk with the highest pickle protocol available.
with open('lable_map.pickle', 'wb') as handle:
    pickle.dump(label_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that turns product rows into fixed-length token tensors.

    The text of an item is ``TITLE + ' ~ ' [+ DESCRIPTION] + ' ~ ' [+ BULLET_POINTS]``.
    DESCRIPTION and BULLET_POINTS are each included only when a fresh
    ``torch.rand(1)`` draw exceeds 0.5, so reading the same index twice can
    produce different inputs. The draws happen regardless of ``is_train``.

    Args:
        df: DataFrame with TITLE, DESCRIPTION and BULLET_POINTS columns holding
            strings (missing values must be filled by the caller), plus
            BROWSE_NODE_ID when ``is_train`` is true.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``; it is the tokenizer used for
            every item of this dataset.
        is_train: When true, labels are derived from BROWSE_NODE_ID and added
            to every item under the key ``'labels'``.
        label_map: Mapping from BROWSE_NODE_ID value to class index. Needed
            when ``is_train`` is true; ``None`` is treated as an empty mapping.
        max_length: Length of the returned ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, df, tokenizer, is_train=True, label_map=None, max_length=128):
        # Use None rather than a shared mutable ``{}`` as the default value.
        if label_map is None:
            label_map = {}
        self.df = df
        self.title = df.TITLE.values
        self.desc = df.DESCRIPTION.values
        # Values that start with '[' lose their first and last character
        # (the surrounding brackets of the list-like text).
        self.bullets = df.BULLET_POINTS.apply(
            lambda x: x[1:-1] if len(x) > 0 and x[0] == '[' else x
        ).values
        self.tokenizer = tokenizer
        if is_train:
            # Raises KeyError for any BROWSE_NODE_ID missing from label_map.
            self.labels = df.BROWSE_NODE_ID.apply(lambda x: label_map[x]).values
            self.label_map = label_map
        self.is_train = is_train
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and, if training, ``labels``."""
        req_string = self.title[idx] + ' ~ '
        if torch.rand(1) > 0.5:
            req_string += self.desc[idx]
        req_string += ' ~ '
        if torch.rand(1) > 0.5:
            req_string += self.bullets[idx]

        # Use the tokenizer handed to this instance, not a module-level global,
        # so the dataset works with whichever tokenizer it was built with.
        # The whole string is tokenized before truncation.
        tokenized_data = self.tokenizer.tokenize(req_string)
        # Reserve two positions for the [CLS] / [SEP] markers.
        to_append = ["[CLS]"] + tokenized_data[:self.max_length - 2] + ["[SEP]"]
        input_ids = self.tokenizer.convert_tokens_to_ids(to_append)
        input_mask = [1] * len(input_ids)

        # Right-pad both sequences with zeros up to max_length.
        padding = [0] * (self.max_length - len(input_ids))
        input_ids += padding
        input_mask += padding

        item = {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(input_mask, dtype=torch.long)
        }
        if self.is_train:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

# Wrap both splits, replacing missing values with "" first. The validation
# split also uses is_train=True so that its items carry labels.
train_dataset = Dataset(
    df=train_split.fillna(""),
    tokenizer=tokenizer,
    is_train=True,
    label_map=label_map,
)
val_dataset = Dataset(
    df=val_split.fillna(""),
    tokenizer=tokenizer,
    is_train=True,
    label_map=label_map,
)

In [30]:
from transformers import AlbertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    # Output locations and log reporting.
    output_dir='./results',
    logging_dir='./logs',
    report_to="tensorboard",

    # Training length, batch sizes and regularisation.
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    dataloader_num_workers=4,

    # Evaluate every 100 steps. Checkpointing is step-based as well; no
    # save_steps is given here, so the save interval is the library's own.
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    load_best_model_at_end=True,
    prediction_loss_only=True,
)


# Size the classification head through num_labels so that model.config records
# the class count as well. Replacing model.classifier by hand leaves the config
# at its default label count, so saved checkpoints would describe a head of a
# different size than the weights they contain.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_map),
)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Bundle the model, its training arguments and both datasets, then run training.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

trainer.train()

Token indices sequence length is longer than the specified maximum sequence length for this model (641 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (631 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (634 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (758 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss
100,No log,6.835758


Token indices sequence length is longer than the specified maximum sequence length for this model (594 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (933 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1151 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (600 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (792 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th