In [None]:
import torch
# Report the CUDA caching allocator's counters: first the bytes currently
# held by tensors, then the bytes reserved by the allocator.
for memory_probe in (torch.cuda.memory_allocated, torch.cuda.memory_reserved):
    print(memory_probe())

In [None]:
import pandas as pd

In [None]:
# Load the preprocessed CSV. Later cells read the 'comment_text', 'score'
# and 'non-toxic' columns from this frame, and drop 'Unnamed: 0' /
# 'Unnamed: 0.1', so the file is expected to contain all of them.
df = pd.read_csv('jigsaw_toxicity_processed3.csv')

In [None]:
# Pack the two target columns into one list-valued column, one
# [score, non-toxic] pair per row, so each example carries a 2-element label.
label_columns = ['score', 'non-toxic']
df['labels'] = df[label_columns].to_numpy().tolist()

In [None]:
# Peek at the first row to check the newly added 'labels' column.
df.head(1)

In [None]:
import datasets
# Wrap the DataFrame in a `datasets.Dataset` so it can be tokenized with
# `.map(...)` and fed to a DataLoader in the cells below.
dataset = datasets.Dataset.from_pandas(df)

In [None]:
# Number of examples in the dataset.
print(len(dataset))

In [None]:
# Inspect the first example before tokenization.
dataset[0]

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _tokenize_batch(examples):
    """Tokenize a batch of comments, padded/truncated to exactly 256 tokens."""
    return tokenizer(
        examples['comment_text'],
        max_length=256,
        padding='max_length',
        truncation=True,
    )


# batched=True hands the helper a dict of column lists rather than one row.
dataset = dataset.map(_tokenize_batch, batched=True)


In [None]:
# Drop the raw text plus the two 'Unnamed: *' columns (presumably stray
# index columns written into the CSV -- confirm) in a single call.
dataset = dataset.remove_columns(['comment_text', 'Unnamed: 0', 'Unnamed: 0.1'])

# Have the dataset hand back torch tensors when indexed; set_format works in place.
dataset.set_format(type='torch', output_all_columns=True)

In [None]:
# Display the dataset's repr after tokenization and column cleanup.
dataset

In [None]:
import torch
batch_size = 16
# This loader feeds the training loop, so reshuffle the examples every epoch.
# Without shuffle=True every epoch replays the batches in CSV row order,
# which biases SGD if the file is grouped or sorted in any way.
loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
from transformers import BertForSequenceClassification, BertModel

# Pretrained BERT encoder with a classification head producing two logits,
# matching the two-element `labels` vectors built from the DataFrame.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

In [None]:
# # Get all of the model's parameters as a list of tuples.
# params = list(model.named_parameters())

# print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# print('==== Embedding Layer ====\n')

# for p in params[0:5]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# print('\n==== First Transformer ====\n')

# for p in params[5:21]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# print('\n==== Output Layer ====\n')

# for p in params[-4:]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [None]:
# # define mean pooling function
# def mean_pool(token_embeds, attention_mask):
#     # reshape attention_mask to cover 768-dimension embeddings
#     in_mask = attention_mask.unsqueeze(-1).expand(
#         token_embeds.size()
#     ).float()
#     # perform mean-pooling but exclude padding tokens (specified by in_mask)
#     pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
#         in_mask.sum(1), min=1e-9
#     )
#     return pool

In [None]:
#loss_func = torch.nn.BCEWithLogitsLoss()#CrossEntropyLoss()
#linear = torch.nn.Linear(768,1)

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and per-class targets.

    Args:
        outputs: Logits tensor (no sigmoid applied).
        targets: Tensor of the same shape as ``outputs`` holding the target
            value for each logit. Integer targets are accepted and are cast
            to the dtype of ``outputs`` before the loss is computed.

    Returns:
        A scalar tensor: the loss averaged over all elements.
    """
    # BCE-with-logits requires floating-point targets; casting here keeps the
    # call working when the label columns were loaded as integers. The cast
    # is a no-op when the dtypes already match.
    targets = targets.to(dtype=outputs.dtype)
    # Functional form avoids constructing a new loss module on every call.
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Module.to() moves the model's parameters in place.
model.to(device)
print(f'moved to {device}')

In [None]:
#print(torch.cuda.memory_allocated())
#print(torch.cuda.memory_reserved())

In [None]:
from transformers.optimization import get_linear_schedule_with_warmup

# initialize Adam optimizer
optim = torch.optim.Adam(model.parameters(), lr=2e-5)

# Number of passes over the data that the schedule must cover. The
# training-loop cell assigns `epochs` again; keep the two values in sync.
epochs = 2

# The schedule is stepped once per batch, so it has to span every optimizer
# step of the whole run: batches per epoch (len(loader) also counts the final
# partial batch) times the number of epochs. `num_training_steps` is the
# TOTAL step count including warmup -- the learning rate decays linearly to 0
# at that step, so undercounting it silently freezes training early.
steps_per_epoch = len(loader)
total_steps = steps_per_epoch * epochs
# warm up over the first ~10% of steps
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optim, num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

In [None]:
# Per-step training history, filled by the training loop: loss value,
# batch index within its epoch, and epoch number (three independent lists).
all_loss, all_batch, all_epoch = [], [], []

In [None]:
from tqdm.auto import tqdm

# Number of full passes over the training data.
epochs = 2
for epoch in range(epochs):
    model.train()  # make sure model is in training mode
    # wrap the dataloader in a tqdm progress bar
    loop = tqdm(loader, leave=True)
    # the description only depends on the epoch, so set it once per epoch
    loop.set_description(f'Epoch {epoch}')
    for batch_idx, batch in enumerate(loop):
        # zero all gradients on each new step
        optim.zero_grad()

        # move only the tensors the forward pass and the loss actually use
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # no labels are passed to the model: the loss is computed by loss_fn
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        batch_loss = loss_fn(outputs.logits, labels)
        batch_loss.backward()
        optim.step()
        scheduler.step()  # the LR schedule advances once per batch

        # read the scalar back from the device once and reuse it everywhere
        loss = batch_loss.item()
        loop.set_postfix(loss=loss)
        all_loss.append(loss)
        all_batch.append(batch_idx)
        all_epoch.append(epoch)
        if batch_idx % 10 == 0:
            print(f'Epoch {epoch} Batch {batch_idx} Loss {loss}')
            
        

In [None]:
import matplotlib.pyplot as plt

# x axis: 0-based position of each recorded loss across the whole run.
step_index = list(range(len(all_loss)))
plt.plot(step_index, all_loss)

In [None]:
import os

model_path = './model_save3'

# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race, is a no-op when the directory is already there,
# and also creates missing parent directories if the path is ever nested.
os.makedirs(model_path, exist_ok=True)

# Write the fine-tuned model into the directory.
model.save_pretrained(model_path)

In [None]:
# Display every recorded per-batch loss value.
# NOTE(review): this dumps the whole list (one entry per training step);
# a slice such as all_loss[-10:] would keep the output readable.
all_loss

In [None]:
# Persist the loss history: one row per training step in a single 'loss'
# column (to_csv also writes the default integer index as the first column).
loss_df = pd.DataFrame({'loss': all_loss})
loss_df.to_csv('loss.csv')