In [3]:
import torch
import random

# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [4]:

!pip install pytorch-transformers



In [5]:
#@markdown Utility function (run on each Runtime restart)

from IPython.display import clear_output
import os
import sys

def download_from_gdrive(gdrive_id, filename):
    !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id='$gdrive_id -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id="$gdrive_id -O $filename && rm -rf /tmp/cookies.txt

import torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('DEVICE:', DEVICE)

DEVICE: cuda


In [6]:
DATA_DIR = 'memes900k'
CAPTIONS_FILE = os.path.join(DATA_DIR, 'captions.txt')

In [7]:
#@markdown Load and process the file with data and checkpoints Google Drive IDs

GDRIVE_ID = '1S4QwcuznRxLlxkIT0Lb6vIuqDTib41B3'
FILE_IDS_NAME = 'file_ids.txt'

download_from_gdrive(GDRIVE_ID, FILE_IDS_NAME)

FILE_IDS = {}
with open(FILE_IDS_NAME, 'r') as f:
    for line in f:
        name, gid = line.strip().split('\t')
        FILE_IDS[name] = gid

clear_output()

In [8]:
#@title Load dataset

# full dataset
print('Loading the dataset from Google Drive')
fname = f'{DATA_DIR}.zip'
download_from_gdrive(FILE_IDS[fname], fname)
!unzip -o {DATA_DIR}
clear_output()

In [9]:
from __future__ import absolute_import, division, print_function

import glob
import logging
import os
import random
import json

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import random
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm_notebook, trange


from pytorch_transformers import (WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer,
                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer, 
                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
                                  RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

from pytorch_transformers import AdamW, WarmupLinearSchedule



logging.basicConfig(level=logging.INFO)

In [10]:
args = {
    'data_dir': 'memes900k/',
    'model_type':  'roberta',
    'model_name': 'roberta-base',
    'task_name': 'binary',
    'output_dir': 'outputs/',
    'cache_dir': 'cache/',
    'do_train': True,
    'do_eval': True,
    'fp16': False,
    'fp16_opt_level': 'O1',
    'max_seq_length': 128,
    'output_mode': 'classification',
    'train_batch_size': 8,
    'eval_batch_size': 8,

    'gradient_accumulation_steps': 1,
    'num_train_epochs': 1,
    'weight_decay': 0,
    'learning_rate': 4e-5,
    'adam_epsilon': 1e-8,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,

    'logging_steps': 50,
    'evaluate_during_training': False,
    'save_steps': 2000,
    'eval_all_checkpoints': True,

    'overwrite_output_dir': False,
    'reprocess_input_data': False,
    
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
with open('args.json', 'w') as f:
    json.dump(args, f)

In [12]:
MODEL_CLASSES = {
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
}

config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]

In [13]:
config = config_class.from_pretrained(args['model_name'], num_labels=300)
tokenizer = tokenizer_class.from_pretrained(args['model_name'],num_labels=300)

INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at /root/.cache/torch/pytorch_transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.117c81977c5979de8c088352e74ec6e70f5c66096c28b61d3c50101609b39690
INFO:pytorch_transformers.modeling_utils:Model config {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 300,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 1,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "

In [14]:
model = model_class.from_pretrained(args['model_name'],num_labels=300)

INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at /root/.cache/torch/pytorch_transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.117c81977c5979de8c088352e74ec6e70f5c66096c28b61d3c50101609b39690
INFO:pytorch_transformers.modeling_utils:Model config {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 300,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 1,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "

In [15]:
def prepare_features(seq_1, max_seq_length = 20, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)
   
    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
  
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [16]:
msg = "My dog is cute!"
print(prepare_features(msg)[0])

tensor([[    0,  1308,  2335,    16, 11962,   328,     2,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])


# Data preprocessing

In [17]:
x_train = []
y_train = []
# select from github dataset
with open("memes900k/captions.txt") as file:
  for line in file.readlines():
    l,_,text = line.split('\t')
    y_train.append(l.strip())
    text = text.strip().replace('<sep>',"[SEP]")
    x_train.append(text)



In [18]:
templates_map = dict(zip(set(y_train),range(300)))
templates_map_reverse = {v:k for k,v in templates_map.items()}
len(templates_map)

300

In [19]:
y_train = [templates_map[i] for i in y_train if i ]

In [20]:
x_test = []
y_test = []
with open("memes900k/captions_test.txt") as file:
  for line in file.readlines():
    l,_,text = line.split('\t')
    if l in templates_map:
      y_test.append(templates_map[l])
      text = text.strip().replace('<sep>',"[SEP]")
      x_test.append(text)

In [21]:
len(set(y_train))

300

In [22]:
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in x_train:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    
    encoded_dict,attention = prepare_features(sent)
    
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict)
    attention_masks.append(torch.Tensor(attention))
    

# Convert the lists into tensors.



# Print sentence 0, now as a list of IDs.
print('Original: ', x_train[3])
print('Token IDs:', input_ids[3])


Original:  TED [SEP] y u no tell us how you met their mother
Token IDs: tensor([[    0, 32690,   646,  3388,   510,   742,  1423,  1717,   117,  1137,
           201,   141,    47,  1145,    49,   985,     2,     0,     0,     0]])


In [24]:
# Convert the lists into tensors.

input_ids_t = torch.cat(input_ids, dim=0)

In [28]:
attention_masks_t = torch.stack(attention_masks, dim=0)

In [37]:
from torch.utils.data import DataLoader, random_split,WeightedRandomSampler, TensorDataset
from collections import Counter
from sklearn.model_selection import train_test_split

input_train,input_val,label_train,label_val= train_test_split(input_ids_t,torch.tensor(y_train), test_size=0.01)

train_dataset = TensorDataset(input_train, label_train)
val_dataset = TensorDataset(input_val, label_val)


In [40]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 64

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = 120 # Evaluate with this batch size.
        )

# Build Model

In [42]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-03
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [None]:
max_epochs = 3
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, j in enumerate(train_dataloader):
        
        optimizer.zero_grad()
        sent=j[0]
        label=j[1]
      
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent
          label = label

        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        #print("training",loss)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for k in validation_dataloader:
                sent=k[0]
                label=k[1]
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent
                  label = label
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
                #print("validation")
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

EPOCH -- 0
training tensor(5.7090, grad_fn=<NllLossBackward>)
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
Iteration: 0. Loss: 5.709024906158447. Accuracy: 0.24444444444444444%
training tensor(5.7152, grad_fn=<NllLossBac

In [None]:
torch.save(model.state_dict(), 'roberat.pth')

In [None]:
import pandas as pd

# Display floats with two decimal places.
pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=bc.training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3.09,2.7,0.4,1:44:41,0:03:42
2,2.49,2.56,0.42,1:44:56,0:03:42
3,2.11,2.54,0.43,1:44:43,0:03:42
4,1.74,2.59,0.44,1:44:46,0:03:42


# Predict

In [None]:
x_test1 = ["a group of people sitting on a bench","a dog is playing with a toy toy","a cople of women walking down a street","a man riding a wave on top of a surf board"]

In [None]:
def get_reply(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    input_msg = input_msg
  output = model(input_msg)[0]
  _, pred_label = torch.max(output.data, 1)
  prediction=list(templates_map_reverse.keys())[pred_label]
  return prediction



In [None]:
for i in x_test1:
  print(get_reply(i))