In [13]:
import numpy as np 
import pandas as pd 
import torch
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertModel, BertConfig, DistilBertTokenizer
from types import SimpleNamespace
from torch.utils.data import DataLoader
import os
import csv
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [14]:
MODEL = "distilbert-base-uncased"

In [15]:
tokenizer = DistilBertTokenizer.from_pretrained(MODEL, max_length=512, padding=True, truncation = True, return_tensors="pt")

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [28]:
args = SimpleNamespace(
    # general options
    #train_path = '../input/covid2/train',        # train data folder
    #valid_path = '../input/covid2/valid',        # valid data folder
    #test_path = '../input/covid2/test',          # test data folder
    batch_size = 128,                         # training and valid batch size
    test_batch_size = 128,                       # batch size for testing
    epochs = 10,                                 # maximum number of epochs to train
    lr = 0.05,                                 # learning rate
    momentum = 0.9,                              # SGD momentum, for SGD only
    optimizer = 'adam',                          # optimization method: sgd | adam
    log_interval = 5,                            # how many batches to wait before logging training status
    patience = 5,                                # how many epochs of no loss improvement should we wait before stop training
    checkpoint = '../models',                    # checkpoints directory
    seed = 42,
    train = True,                                # train before testing
    cuda = True,                                 # use gpu
    num_workers = 2,                             # how many subprocesses to use for data loading
    adapter_hidden_size = 64
)

In [29]:
# Given the list with the classes for each comment, returns the output with the desired format
def prepare_target(sentiments):
  target = []
  for sent in sentiments:
    if sent == 0:
      target.append([1.0,0.0,0.0])
    elif sent == 1:
      target.append([0.0,1.0,0.0])
    else:
      target.append([0.0,0.0,1.0])
  return torch.tensor(target)


In [30]:
file = "../data/external/comments.csv"
df = pd.read_csv(file)
df = df.dropna()
comments = df['Comment'].tolist() # The tokenizer recieves a list as input 
target = prepare_target(df['Sentiment']) 
X_train, X_test, y_train, y_test = train_test_split(comments, target, test_size=0.33, random_state=args.seed)

In [31]:
class Loader(torch.utils.data.Dataset):
  def __init__(self, comments, sentiments):
    self.data= tokenizer(comments, padding=True, truncation = True, max_length=512,return_tensors="pt")['input_ids']
    self.target = sentiments
    
  def __getitem__(self, index):
    data = self.data[index]
    target = self.target[index]
    return data, target
  
  def __len__(self):
        return len(self.target)


In [32]:
class DistilBERTforSentiment(nn.Module):
    def __init__(self, adapter_hidden_size=64):
        super().__init__()

        self.distilbert = DistilBertModel.from_pretrained(MODEL)
        
        hidden_size = self.distilbert.config.hidden_size

        self.adaptor = nn.Sequential(
            nn.Linear(hidden_size, adapter_hidden_size),
            nn.ReLU(True),           
            nn.Dropout(0.1),            
            nn.Linear(adapter_hidden_size, hidden_size),
        )  
        
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, adapter_hidden_size),
            nn.ReLU(True),           
            nn.Dropout(0.1),            
            nn.Linear(adapter_hidden_size, 3),
        )
        
        
    def forward(self, inputs):
        outputs = self.distilbert(input_ids = inputs, return_dict=False)
        # B x seq_length x H
        x = self.adaptor(outputs[0])
        
        x,_ = x.max(dim=1)
        # B x H
        
        results = self.classifier(x)
        return results

In [33]:
args.cuda = args.cuda and torch.cuda.is_available()
if args.cuda:
    print('Using CUDA with {0} GPUs'.format(torch.cuda.device_count()))

# build model
model = DistilBERTforSentiment(adapter_hidden_size=args.adapter_hidden_size)

for param in model.distilbert.parameters():
    param.requires_grad = False
    
if args.cuda:
    model.cuda()

# Define criterion
criterion = nn.CrossEntropyLoss()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
def train_one_epoch(trainloader, model, criterion, optimizer, epoch_index, cuda,max_norm=1):
    model.train()
    running_loss = 0
    accumulation_steps = 40 # effective 40 batch  
    for i, (input_ids,target) in enumerate(trainloader, 0):
        if cuda:
            input_ids, target = input_ids.cuda(), target.cuda()
        #optimizer.zero_grad()
        output = model(input_ids)
        loss = criterion(output, target)
        #print(torch.sigmoid(output), target)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        if (i+1) % accumulation_steps == 0:
            optimizer.step()                 # Now we can do an optimizer step
            optimizer.zero_grad()
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(trainloader) + i + 1
            print('Loss/train', last_loss, tb_x)
            running_loss = 0.
    return running_loss / i

In [35]:
def test_one_epoch(test_loader,model,criterion, cuda, avg_loss):
    running_vloss = 0.0
    best_vloss = 99999
    for i, (input_ids,target) in enumerate(test_loader, 0):
        if cuda:
            input_ids, target = input_ids.cuda(), target.cuda()
        output = model(input_ids)
        loss = criterion(output, target)
        running_vloss += loss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
    return avg_vloss

In [36]:
# num_workers set to 8 as recommended when training in local
training_set = Loader(X_train, y_train) #X['attention_mask']
train_loader = torch.utils.data.DataLoader(training_set, batch_size=1,
                                          shuffle=True, num_workers=8)

test_set = Loader(X_test, y_test) #X['attention_mask']
test_loader = torch.utils.data.DataLoader(test_set, batch_size=1,
                                          shuffle=True, num_workers=8)

In [37]:
optimizer = optim.Adam(model.parameters(), lr=args.lr)
epoch = 0
best_valid_loss = 9999
while (epoch < args.epochs + 1):
        train_loss = train_one_epoch(train_loader, model, criterion, optimizer, epoch, args.cuda)
        valid_loss = test_one_epoch(test_loader, model, criterion, args.cuda, train_loss)
        if not os.path.isdir(args.checkpoint):
            os.mkdir(args.checkpoint)
        torch.save(model.state_dict(), '../models/{}/model{:03d}.pt'.format(args.checkpoint, epoch))
        if valid_loss <= best_valid_loss:
            print('Saving state')
            best_valid_loss = valid_loss
            best_epoch = epoch
            state = {
                'valid_loss': valid_loss,
                'epoch': epoch,
            }
            if not os.path.isdir(args.checkpoint):
                os.mkdir(args.checkpoint)
            torch.save(state, './{}/ckpt.pt'.format(args.checkpoint))
        print("End epoch ", epoch)
        epoch += 1

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/polvilavella/opt/anaconda3/envs/taed/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/polvilavella/opt/anaconda3/envs/taed/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'Loader' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/polvilavella/opt/anaconda3/envs/taed/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/polvilavella/opt/anaconda3/envs/taed/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'Loader' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", li

RuntimeError: DataLoader worker (pid(s) 35045) exited unexpectedly

In [None]:
'''class BertforSentiment(nn.Module):
  def __init__(self, adapter_hidden_size=64):
        super().__init__()
        self.distilbert = Distilbert.from_pretrained(MODEL)
        self.hidden_size = self.distilhubert.config.hidden_size
        self.adaptor = nn.Sequential(
            nn.Linear(3*self.hidden_size, adapter_hidden_size),
            nn.ReLU(True),           
            nn.Dropout(0.1),            
            nn.Linear(adapter_hidden_size, self.hidden_size*3),
        )  
        
        self.classifier = nn.Sequential(
            nn.Linear(3*self.hidden_size, adapter_hidden_size),
            nn.ReLU(True),           
            nn.Dropout(0.1),            
            nn.Linear(adapter_hidden_size, 1),
        )
        num_layers = self.distilhubert.config.num_hidden_layers + 1 # transformer layers + input embeddings
        self.layer_weights = nn.Parameter(torch.tensor([1/3, 1/3, 1/3], dtype = torch.float32), requires_grad = True)'''

