In [1]:
import sys
# Make modules inside the local "run" directory importable; the `base_utils`
# import below presumably resolves from there -- confirm.
sys.path.append("run")
# NOTE(review): star import. Names that later cells use without an explicit
# import in the visible cells (torch, nn, JoModule, Trainer, batchify,
# load_model) presumably come from here -- confirm against base_utils.
from base_utils import *
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from functools import partial
from transformers import AutoTokenizer, AutoModel, BartForConditionalGeneration, BertForSequenceClassification
import argparse
from nltk.tokenize import sent_tokenize
import os
# Set before any tokenizer is created; presumably turns off the tokenizers
# library's internal parallelism -- confirm against the library docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sanity check: display the index of the CUDA device currently selected by torch
# (the recorded output of this cell is 0).
torch.cuda.current_device()

0

In [18]:
class Processor():
    """Turn one raw dataset row into the tensors used for classification.

    The row must provide two fields:
      * "input"  -- passed to the tokenizer; its "input_ids" become the features.
      * "target" -- any truthy value maps to label 1, any falsy value to label 0.
    """

    def __init__(self, tokenizer):
        """Store the callable used to tokenize the "input" field.

        Args:
            tokenizer: callable returning a mapping that contains "input_ids".
        """
        self.tokenizer = tokenizer

    def __call__(self, sample):
        """Encode a single row.

        Args:
            sample: mapping with at least the keys "input" and "target";
                any additional keys are ignored.

        Returns:
            dict with "input_ids" (tensor of token ids) and "target"
            (one-element tensor holding 0 or 1).

        Raises:
            KeyError: if "input" or "target" is absent from ``sample``.
        """
        # Fail with an explicit message instead of an unbound-local error
        # when a required column is missing from the row.
        missing = [key for key in ("input", "target") if key not in sample]
        if missing:
            raise KeyError(f"sample is missing required field(s): {missing}")

        token_ids = self.tokenizer(sample["input"])["input_ids"]
        label = 1 if sample["target"] else 0
        return {"input_ids": torch.tensor(token_ids),
                "target": torch.tensor([label])}
    
# Tokenizer for the same checkpoint name that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Resolve the three CSV splits inside the data directory.
path = "./train_classifier_data/"
train, valid, test = (
    os.path.join(path, file_name)
    for file_name in ("new_train.csv", "new_valid.csv", "new_test.csv")
)
data = load_dataset("csv", data_files=dict(train=train, valid=valid, test=test))

batch_size = 128
processor = Processor(tokenizer)
# Encode every row of every split, then expose the columns as torch tensors.
encoded_data = data.map(lambda sample: processor(sample))
encoded_data.set_format("torch")

# Batchify the encoded data: one loader per split, built in train/valid/test order.
train_dataloader, valid_dataloader, test_dataloader = (
    batchify(encoded_data[split]["input_ids"], encoded_data[split]["target"],
             batch_size=batch_size)
    for split in ("train", "valid", "test")
)

In [9]:
import time

In [12]:
class Classifier(JoModule):
    """Sequence classifier trained with cross-entropy on top of ``base``.

    ``base`` is called with a batch of token ids and must return an object
    exposing ``.logits``.
    """

    def __init__(self, base):
        """Wrap ``base`` and set up the cross-entropy loss."""
        super().__init__()
        self.base = base
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids):
        """Return the class logits for a batch of token ids."""
        # NOTE(review): no attention_mask is passed, so if batches are padded
        # the padding positions are attended to like real tokens -- confirm
        # how `batchify` pads and whether a mask should be supplied.
        logits = self.base(input_ids).logits
        return logits   # in shape (N, 2)

    def _logits_and_target(self, batch, device):
        """Move a batch to ``device`` and return ``(logits, flat_target)``."""
        input_ids, target = batch
        input_ids = input_ids.to(device)
        # Cross-entropy expects one class index per example, hence the flatten.
        target = target.reshape(-1).to(device)
        return self.forward(input_ids), target

    def training_step(self, batch, device):
        """Compute the cross-entropy loss for one ``(input_ids, target)`` batch."""
        logits, target = self._logits_and_target(batch, device)
        return self.loss_fn(logits, target)

    def validation_step(self, batch, device, metrics=("loss", "precision")):
        """Compute the validation metrics for one ``(input_ids, target)`` batch.

        Args:
            batch: pair ``(input_ids, target)``.
            device: device the tensors are moved to.
            metrics: accepted for interface compatibility; both metrics are
                always computed regardless of this value.

        Returns:
            dict with "loss" and "precision".
        """
        logits, target = self._logits_and_target(batch, device)
        loss = self.loss_fn(logits, target)
        # NOTE(review): this is the fraction of correct predictions
        # (i.e. accuracy); the "precision" key is kept because the trainer
        # is configured with that metric name.
        predicted = logits.argmax(dim=-1)
        precision = (predicted == target).sum() / len(target)
        return {"loss": loss, "precision": precision}
# The classification head of the checkpoint is freshly (randomly) initialised,
# so fix the seed to make that initialisation repeatable between runs.
torch.manual_seed(0)

# perf_counter is monotonic, unlike the wall clock, so the elapsed time cannot
# be distorted by system clock adjustments.
start = time.perf_counter()
base = BertForSequenceClassification.from_pretrained("bert-base-uncased")
model = Classifier(base)
# Reuse the batch size the dataloaders were built with instead of repeating
# the literal, so the two cannot drift apart.
trainer = Trainer(batch_size=batch_size,
                  max_epochs=5,
                  optimizer_method="Adam",
                  lr=2e-6,
                  save_model="exp_classifier_new",
                  logging="exp_classifier.log",
                  use_amp=False,
                  warmup=False,
                  accelerator="cuda:0",
                  valid_metrics=["loss", "precision"],
                  efficient_valid=True,
                  )
trainer.fit(model, train_dataloader, valid_dataloader)
end = time.perf_counter()
# Elapsed time covers loading the checkpoint as well as the training loop.
print("Time taken:" + str(end - start) + " seconds")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training...
The total number of parameters is: 109.48M
current learning rate: 2e-06


                                                                                                                                                              

------------------------- epoch: 0 -------------------------
on train data
loss: 0.7017771601676941
precision: 0.4909762144088745
------------------------- epoch: 0 -------------------------
on valid data
loss: 0.67826247215271
precision: 0.5450066328048706
current learning rate: 2e-06


                                                                                                                                                              

------------------------- epoch: 1 -------------------------
on train data
loss: 0.6633497476577759
precision: 0.6096197366714478
------------------------- epoch: 1 -------------------------
on valid data
loss: 0.648210883140564
precision: 0.7734954357147217
current learning rate: 2e-06


                                                                                                                                                              

------------------------- epoch: 2 -------------------------
on train data
loss: 0.6125339865684509
precision: 0.7670373916625977
------------------------- epoch: 2 -------------------------
on valid data
loss: 0.6069641709327698
precision: 0.7939589023590088
current learning rate: 2e-06


                                                                                                                                                              

------------------------- epoch: 3 -------------------------
on train data
loss: 0.5285905599594116
precision: 0.7923558950424194
------------------------- epoch: 3 -------------------------
on valid data
loss: 0.5451961755752563
precision: 0.8159466981887817
current learning rate: 2e-06


                                                                                                                                                              

------------------------- epoch: 4 -------------------------
on train data
loss: 0.46761685609817505
precision: 0.8470154404640198
------------------------- epoch: 4 -------------------------
on valid data
loss: 0.48416590690612793
precision: 0.8179308772087097
Time taken:48.646119356155396 seconds


In [17]:
# Time the evaluation with a monotonic clock.
start = time.perf_counter()
# Rebuild the wrapper around `base` and load the saved weights into it.
model = Classifier(base)
model = load_model(model, "./saved/exp_classifier_new_dict")
# Keep the result: previously it was discarded, so this cell reported only the
# timing and never the test metrics.
# NOTE(review): assumes validIter returns the computed metrics -- confirm; the
# meaning of the final positional `False` is not visible from this notebook.
test_metrics = model.validIter(test_dataloader, "cuda:0", ["loss", "precision"], False)
end = time.perf_counter()
if test_metrics is not None:
    print(test_metrics)
print("Time taken:" + str(end - start) + " seconds")

                                                                                                                                                              

Time taken:0.5799984931945801 seconds
