In [1]:
import numpy as np 
import pandas as pd 
import torch
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertModel, BertConfig, DistilBertTokenizer
from types import SimpleNamespace
from torch.utils.data import DataLoader
import os
import csv
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split

In [2]:
# Hugging Face checkpoint name, passed to DistilBertTokenizer.from_pretrained
# and DistilBertModel.from_pretrained further down in this notebook.
MODEL = "distilbert-base-uncased"

In [3]:
# Load the tokenizer that matches the MODEL checkpoint.
# Encoding options (padding, truncation, max_length, return_tensors) are
# arguments of the tokenizer *call*, not of from_pretrained, so they are given
# where the tokenizer is actually invoked (the Loader dataset) and not here.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL)

In [4]:
# Experiment configuration, collected in a single attribute-style namespace.
# NOTE(review): lr=0.05 looks high for Adam — confirm it is intentional.
_settings = {
    # --- batching / schedule ---
    "batch_size": 32,            # batch size for training and validation
    "test_batch_size": 32,       # batch size for testing
    "epochs": 4,                 # maximum number of training epochs
    # --- optimisation ---
    "lr": 0.05,                  # learning rate
    "momentum": 0.9,             # momentum, relevant for SGD only
    "optimizer": 'adam',         # optimisation method: sgd | adam
    # --- logging / stopping / checkpoints ---
    "log_interval": 5,           # batches between training status logs
    "patience": 5,               # epochs without loss improvement before stopping
    "checkpoint": '../models',   # directory where checkpoints are written
    # --- runtime ---
    "seed": 42,                  # random seed
    "train": True,               # train before testing
    "cuda": True,                # use the GPU
    "num_workers": 2,            # subprocesses used for data loading
    # --- model ---
    "adapter_hidden_size": 64,   # hidden width of the adapter / classifier
}
args = SimpleNamespace(**_settings)

In [5]:
from data import make_dataset

# Read the raw comments CSV (its first column becomes the index) and pass it
# through make_dataset.clean_data, which lives in the project's data package
# and is not shown in this notebook.
data_raw = "../data/raw/comments.csv"
df_raw = pd.read_csv(filepath_or_buffer=data_raw, index_col=0)
df_clean = make_dataset.clean_data(df_raw)
df_clean

Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,wAZZ-UWGVHI,Lets not forget that Apple Pay in 2014 require...,95.0,1.0
1,wAZZ-UWGVHI,Here in NZ 50 of retailers don’t even have con...,19.0,0.0
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161.0,2.0
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8.0,0.0
4,wAZZ-UWGVHI,Apple Pay is so convenient secure and easy to ...,34.0,2.0
...,...,...,...,...
18403,cyLWtMSry58,I really like the point about engineering tool...,0.0,2.0
18404,cyLWtMSry58,I’ve just started exploring this field And thi...,20.0,2.0
18405,cyLWtMSry58,Excelente video con una pregunta filosófica pr...,1.0,1.0
18406,cyLWtMSry58,Hey Daniel just discovered your channel a coup...,35.0,2.0


In [7]:
from features import build_features

# Path of the processed CSV handed to build_features.preprocess.
data_clean = '../data/processed/comments_clean.csv'
# preprocess is project code that is not visible here; presumably it returns
# the comment texts and their sentiment labels in matching order — TODO confirm.
# NOTE(review): the recorded run of this cell ended in a pandas ParserError
# ("Buffer overflow caught - possible malformed input file"), so the processed
# CSV could not be parsed at that point — verify the file before relying on
# the split variables defined below.
comments, target = build_features.preprocess(data_clean=data_clean, text_col='Comment', target_col='Sentiment')
# Hold out 33% of the samples for testing; the shuffle is seeded with args.seed.
X_train, X_test, y_train, y_test = train_test_split(comments, target, test_size=0.33, random_state=args.seed)

ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.


In [10]:
# Load the raw comments CSV as-is, using its first column as the index.
file = "../data/raw/comments.csv"
df = pd.read_csv(filepath_or_buffer=file, index_col=0)
df

Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,wAZZ-UWGVHI,Let's not forget that Apple Pay in 2014 requir...,95.0,1.0
1,wAZZ-UWGVHI,Here in NZ 50% of retailers don’t even have co...,19.0,0.0
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161.0,2.0
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8.0,0.0
4,wAZZ-UWGVHI,"Apple Pay is so convenient, secure, and easy t...",34.0,2.0
...,...,...,...,...
18404,cyLWtMSry58,I really like the point about engineering tool...,0.0,2.0
18405,cyLWtMSry58,I’ve just started exploring this field. And th...,20.0,2.0
18406,cyLWtMSry58,Excelente video con una pregunta filosófica pr...,1.0,1.0
18407,cyLWtMSry58,"Hey Daniel, just discovered your channel a cou...",35.0,2.0


In [15]:
# Deterministic positional split: inside every run of ten consecutive row
# positions, those whose remainder is below 10*0.7 go to the training ids and
# the rest go to the test ids.
_row_positions = range(len(df.index))
df_train_ids = [pos for pos in _row_positions if pos % 10 < 10*0.7]
df_test_ids = [pos for pos in _row_positions if pos % 10 >= 10*0.7]

In [19]:
# df_train_ids holds row *positions* (it was built from range(len(df.index))),
# so the rows are selected with the positional indexer .iloc. The label-based
# .loc only returns the same rows when the index is exactly 0..n-1 and raises
# KeyError as soon as a label is missing.
df_train = df.iloc[df_train_ids].copy().reset_index(drop=True)
df_train

Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,wAZZ-UWGVHI,Let's not forget that Apple Pay in 2014 requir...,95.0,1.0
1,wAZZ-UWGVHI,Here in NZ 50% of retailers don’t even have co...,19.0,0.0
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161.0,2.0
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8.0,0.0
4,wAZZ-UWGVHI,"Apple Pay is so convenient, secure, and easy t...",34.0,2.0
...,...,...,...,...
12882,cyLWtMSry58,"I come from a physics background, and usually ...",5.0,2.0
12883,cyLWtMSry58,Back when I was learning to code I didn’t know...,33.0,1.0
12884,cyLWtMSry58,I really like the point about engineering tool...,0.0,2.0
12885,cyLWtMSry58,I’ve just started exploring this field. And th...,20.0,2.0


In [10]:
# Inspect one raw comment (index label 33) to see emojis and line breaks.
df.loc[33, 'Comment']

'Liquid nitrogen never ends🤗\nM4tech poli😎😎'

In [26]:
import emoji
import re

# Demonstration of the emoji clean-up on a sample string: each emoji is turned
# into its textual name delimited by spaces, then runs of spaces are collapsed
# into a single space.
text = "game is on 🔥"
# text = df['Comment'][33]
print(text)
text_clean = re.sub(' +', ' ', emoji.demojize(text, delimiters=(" ", " ")))
print(text_clean)


game is on 🔥
game is on fire 


In [45]:
def _demojize_comment(text):
    """Replace emojis in ``text`` by their space-delimited names and collapse repeated spaces."""
    return re.sub(' +', ' ', emoji.demojize(text, delimiters=(" ", " ")))


# Drop rows with missing values, renumber the index, and clean the whole
# 'Comment' column in one vectorised pass. This replaces the row-by-row
# iterrows loop with per-row .loc writes and yields the same frame.
df_clean = (
    df.dropna()
      .reset_index(drop=True)
      .assign(Comment=lambda frame: frame['Comment'].map(_demojize_comment))
)

df_clean

Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,wAZZ-UWGVHI,Let's not forget that Apple Pay in 2014 requir...,95.0,1.0
1,wAZZ-UWGVHI,Here in NZ 50% of retailers don’t even have co...,19.0,0.0
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161.0,2.0
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8.0,0.0
4,wAZZ-UWGVHI,"Apple Pay is so convenient, secure, and easy t...",34.0,2.0
...,...,...,...,...
18403,cyLWtMSry58,I really like the point about engineering tool...,0.0,2.0
18404,cyLWtMSry58,I’ve just started exploring this field. And th...,20.0,2.0
18405,cyLWtMSry58,Excelente video con una pregunta filosófica pr...,1.0,1.0
18406,cyLWtMSry58,"Hey Daniel, just discovered your channel a cou...",35.0,2.0


In [7]:
class Loader(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenized comments with their sentiment labels.

  All comments are tokenized once at construction time with the notebook-level
  ``tokenizer`` (padded to the longest sequence, truncated to 512 tokens).
  Only the ``input_ids`` tensor is kept; the attention mask produced by the
  tokenizer is discarded.

  Args:
    comments: a single string or an iterable of strings (list, pandas Series, ...).
    sentiments: labels aligned with ``comments`` (sequence, pandas Series or tensor).
  """

  def __init__(self, comments, sentiments):
    # The tokenizer expects a str or a list of str; other iterables such as a
    # pandas Series are materialised into a plain list first.
    texts = comments if isinstance(comments, str) else list(comments)
    self.data = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")['input_ids']

    # Store labels as a tensor so that __getitem__ is purely positional. A
    # pandas Series coming out of a shuffled split keeps its original index
    # labels, and indexing it with 0..n-1 would hit the wrong row or raise.
    labels = sentiments if torch.is_tensor(sentiments) else torch.as_tensor(list(sentiments))
    if labels.dim() == 1:
      # 1-D labels are class indices; nn.CrossEntropyLoss (the criterion used
      # in this notebook) requires them as int64.
      labels = labels.long()
    self.target = labels

  def __getitem__(self, index):
    """Return ``(input_ids, label)`` for the sample at position ``index``."""
    return self.data[index], self.target[index]

  def __len__(self):
    """Number of samples (one per label)."""
    return len(self.target)


In [8]:
class DistilBERTforSentiment(nn.Module):
    """DistilBERT encoder followed by an adapter, token max-pooling and a 3-way head.

    The encoder output (one vector per token) goes through a bottleneck
    ``adaptor`` MLP, is max-pooled over the sequence dimension and is then
    classified into 3 classes by ``classifier``.

    Args:
        adapter_hidden_size: width of the hidden layer used by both the
            adapter and the classifier.
    """

    def __init__(self, adapter_hidden_size=64):
        super().__init__()

        self.distilbert = DistilBertModel.from_pretrained(MODEL)

        hidden_size = self.distilbert.config.hidden_size

        # Bottleneck MLP applied to every token vector: H -> adapter -> H.
        self.adaptor = nn.Sequential(
            nn.Linear(hidden_size, adapter_hidden_size),
            nn.ReLU(True),
            nn.Dropout(0.1),
            nn.Linear(adapter_hidden_size, hidden_size),
        )

        # Classification head applied to the pooled vector: H -> adapter -> 3.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, adapter_hidden_size),
            nn.ReLU(True),
            nn.Dropout(0.1),
            nn.Linear(adapter_hidden_size, 3),
        )

    def forward(self, inputs, attention_mask=None):
        """Compute class logits for a batch of token id sequences.

        Args:
            inputs: token ids, B x seq_length.
            attention_mask: optional B x seq_length mask, non-zero for real
                tokens. When omitted it is derived from ``inputs`` by marking
                every position that differs from the encoder's configured
                ``pad_token_id`` (all positions if no pad id is configured).

        Returns:
            Logits of shape B x 3.
        """
        if attention_mask is None:
            pad_token_id = getattr(self.distilbert.config, 'pad_token_id', None)
            if pad_token_id is None:
                attention_mask = torch.ones_like(inputs)
            else:
                attention_mask = (inputs != pad_token_id).to(inputs.dtype)

        # Without a mask the encoder would attend to padding tokens.
        outputs = self.distilbert(input_ids=inputs, attention_mask=attention_mask, return_dict=False)
        # B x seq_length x H
        x = self.adaptor(outputs[0])

        # Exclude padded positions from the max-pool by pushing them to the
        # lowest representable value (finite, so no NaNs can appear).
        padded = attention_mask.unsqueeze(-1) == 0
        x = x.masked_fill(padded, torch.finfo(x.dtype).min)

        x, _ = x.max(dim=1)
        # B x H

        results = self.classifier(x)
        return results

In [9]:
# Use the GPU only when it was requested and is actually available.
args.cuda = args.cuda and torch.cuda.is_available()
if args.cuda:
    print('Using CUDA with {0} GPUs'.format(torch.cuda.device_count()))

# Seed PyTorch before the model is built so that the randomly initialised
# adapter and classifier weights are the same on every run.
torch.manual_seed(args.seed)

# build model
model = DistilBERTforSentiment(adapter_hidden_size=args.adapter_hidden_size)

# Freeze the pretrained encoder: only the adapter and the classifier are trained.
for param in model.distilbert.parameters():
    param.requires_grad = False

if args.cuda:
    model.cuda()

# Define criterion
criterion = nn.CrossEntropyLoss()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
def train_one_epoch(trainloader, model, criterion, optimizer, epoch_index, cuda, max_norm=1,
                    accumulation_steps=40, log_interval=1000):
    """Train ``model`` for one pass over ``trainloader`` with gradient accumulation.

    Gradients of ``accumulation_steps`` consecutive batches are accumulated
    (each loss is scaled by ``1 / accumulation_steps``) before a single
    clipped optimizer step. A trailing group with fewer batches is stepped as
    well, so no gradient is left behind for the next epoch.

    Args:
        trainloader: iterable yielding ``(input_ids, target)`` batches.
        model: module called as ``model(input_ids)``; put into train mode.
        criterion: loss called as ``criterion(output, target)``.
        optimizer: optimizer over the model parameters.
        epoch_index: zero-based epoch number, only used in the log line.
        cuda: when true, each batch is moved to the GPU.
        max_norm: maximum gradient norm applied right before each step.
        accumulation_steps: number of batches accumulated per optimizer step.
        log_interval: number of batches between two progress log lines.

    Returns:
        The mean per-batch loss over the whole epoch as a float, or NaN when
        the loader yields no batch.

    Raises:
        ValueError: if ``accumulation_steps`` or ``log_interval`` is < 1.
    """
    if accumulation_steps < 1:
        raise ValueError('accumulation_steps must be >= 1')
    if log_interval < 1:
        raise ValueError('log_interval must be >= 1')

    def _apply_step():
        # Clip once, on the fully accumulated gradient, then update and reset.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()
        optimizer.zero_grad()

    model.train()
    optimizer.zero_grad()  # start the epoch from clean gradients

    epoch_loss = 0.0    # sum of batch losses over the whole epoch (return value)
    window_loss = 0.0   # sum of batch losses since the last log line
    n_batches = 0
    pending = 0         # batches accumulated since the last optimizer step

    for i, (input_ids, target) in enumerate(trainloader, 0):
        if cuda:
            input_ids, target = input_ids.cuda(), target.cuda()
        output = model(input_ids)
        loss = criterion(output, target)
        # Scale so that the accumulated gradient is an average over the group.
        (loss / accumulation_steps).backward()
        pending += 1
        if pending == accumulation_steps:
            _apply_step()
            pending = 0

        batch_loss = loss.item()
        epoch_loss += batch_loss
        window_loss += batch_loss
        n_batches += 1

        if (i + 1) % log_interval == 0:
            last_loss = window_loss / log_interval  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(trainloader) + i + 1
            print('Loss/train', last_loss, tb_x)
            window_loss = 0.

    if pending:
        # Flush the last, incomplete accumulation group.
        _apply_step()

    if n_batches == 0:
        return float('nan')
    return epoch_loss / n_batches

In [11]:
def test_one_epoch(test_loader,model,criterion, cuda, avg_loss):
    running_vloss = 0.0
    best_vloss = 99999
    for i, (input_ids,target) in enumerate(test_loader, 0):
        if cuda:
            input_ids, target = input_ids.cuda(), target.cuda()
        output = model(input_ids)
        loss = criterion(output, target)
        running_vloss += loss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
    return avg_vloss

In [12]:
# num_workers set to 8 as recommended when training in local
training_set = Loader(X_train, y_train) #X['attention_mask']
train_loader = torch.utils.data.DataLoader(training_set, batch_size=1,
                                          shuffle=True, num_workers=8)

test_set = Loader(X_test, y_test) #X['attention_mask']
test_loader = torch.utils.data.DataLoader(test_set, batch_size=1,
                                          shuffle=True, num_workers=8)

In [14]:
# Display the loader object; this shows its repr only, no batch is drawn.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x144b4c280>

In [15]:
# Train for at most args.epochs epochs. After every epoch the model weights
# are written to the checkpoint directory, and the best validation loss seen
# so far is recorded in ckpt.pt. Training stops early once the validation
# loss has not improved for args.patience consecutive epochs.
optimizer = optim.Adam(model.parameters(), lr=args.lr)

# Create the checkpoint directory once, including missing parents.
os.makedirs(args.checkpoint, exist_ok=True)

best_valid_loss = float('inf')
best_epoch = None
epochs_without_improvement = 0

for epoch in range(args.epochs):
    train_loss = train_one_epoch(train_loader, model, criterion, optimizer, epoch, args.cuda)
    # float(): accepts a plain number as well as a 0-dim tensor.
    valid_loss = float(test_one_epoch(test_loader, model, criterion, args.cuda, train_loss))

    torch.save(model.state_dict(), os.path.join(args.checkpoint, 'model{:03d}.pt'.format(epoch)))

    if valid_loss <= best_valid_loss:
        print('Saving state')
        best_valid_loss = valid_loss
        best_epoch = epoch
        state = {
            'valid_loss': valid_loss,
            'epoch': epoch,
        }
        torch.save(state, os.path.join(args.checkpoint, 'ckpt.pt'))
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    print("End epoch ", epoch)

    if epochs_without_improvement >= args.patience:
        print('Early stopping: no improvement for {} epochs'.format(args.patience))
        break

Traceback (most recent call last):
  File "<string>", line 1, in <module>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/polvilavella/opt/anaconda3/envs/taed/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
  File "/Users/polvilavella/opt/anaconda3/envs/taed/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/polvilavella/opt/anaconda3/envs/taed/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'Loader' on <module '__main__' (built-in)>
    exitcode = _main(fd, parent_sentinel)
  File "/Users/polvilavella/opt/anaconda3/envs/taed/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'Loader' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", li

RuntimeError: DataLoader worker (pid(s) 36354, 36357) exited unexpectedly

In [None]:
'''class BertforSentiment(nn.Module):
  def __init__(self, adapter_hidden_size=64):
        super().__init__()
        self.distilbert = Distilbert.from_pretrained(MODEL)
        self.hidden_size = self.distilhubert.config.hidden_size
        self.adaptor = nn.Sequential(
            nn.Linear(3*self.hidden_size, adapter_hidden_size),
            nn.ReLU(True),           
            nn.Dropout(0.1),            
            nn.Linear(adapter_hidden_size, self.hidden_size*3),
        )  
        
        self.classifier = nn.Sequential(
            nn.Linear(3*self.hidden_size, adapter_hidden_size),
            nn.ReLU(True),           
            nn.Dropout(0.1),            
            nn.Linear(adapter_hidden_size, 1),
        )
        num_layers = self.distilhubert.config.num_hidden_layers + 1 # transformer layers + input embeddings
        self.layer_weights = nn.Parameter(torch.tensor([1/3, 1/3, 1/3], dtype = torch.float32), requires_grad = True)'''

