In [1]:
import torch
import random

# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [2]:

!pip install transformers



In [3]:
#@markdown Utility function (run on each Runtime restart)

from IPython.display import clear_output
import os
import sys

def download_from_gdrive(gdrive_id, filename):
    !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id='$gdrive_id -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id="$gdrive_id -O $filename && rm -rf /tmp/cookies.txt

import torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('DEVICE:', DEVICE)

DEVICE: cuda


In [4]:
DATA_DIR = 'memes900k'
CAPTIONS_FILE = os.path.join(DATA_DIR, 'captions.txt')

In [5]:
#@markdown Load and process the file with data and checkpoints Google Drive IDs

GDRIVE_ID = '1S4QwcuznRxLlxkIT0Lb6vIuqDTib41B3'
FILE_IDS_NAME = 'file_ids.txt'

download_from_gdrive(GDRIVE_ID, FILE_IDS_NAME)

FILE_IDS = {}
with open(FILE_IDS_NAME, 'r') as f:
    for line in f:
        name, gid = line.strip().split('\t')
        FILE_IDS[name] = gid

clear_output()

In [6]:
#@title Load dataset

# full dataset
print('Loading the dataset from Google Drive')
fname = f'{DATA_DIR}.zip'
download_from_gdrive(FILE_IDS[fname], fname)
!unzip -o {DATA_DIR}
clear_output()

In [7]:
from __future__ import absolute_import, division, print_function

import glob
import logging
import os
import random
import json

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import random
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm_notebook, trange


from transformers import (WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer,
                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer, 
                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
                                  RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

from pytorch_transformers import AdamW, WarmupLinearSchedule



logging.basicConfig(level=logging.INFO)

In [8]:
args = {
    'data_dir': 'memes900k/',
    'model_type':  'xlnet',
    'model_name': 'xlnet-base-cased',
    'task_name': 'binary',
    'output_dir': 'outputs/',
    'cache_dir': 'cache/',
    'do_train': True,
    'do_eval': True,
    'fp16': False,
    'fp16_opt_level': 'O1',
    'max_seq_length': 20,
    'output_mode': 'classification',
    'train_batch_size': 8,
    'eval_batch_size': 8,

    'gradient_accumulation_steps': 1,
    'num_train_epochs': 1,
    'weight_decay': 0,
    'learning_rate': 4e-5,
    'adam_epsilon': 1e-8,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,

    'logging_steps': 50,
    'evaluate_during_training': False,
    'save_steps': 2000,
    'eval_all_checkpoints': True,

    'overwrite_output_dir': False,
    'reprocess_input_data': False,
    
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
with open('args.json', 'w') as f:
    json.dump(args, f)

In [10]:
MODEL_CLASSES = {
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
}

config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]

In [11]:
config = config_class.from_pretrained(args['model_name'], num_labels=300)
tokenizer = tokenizer_class.from_pretrained(args['model_name'],num_labels=300)



In [12]:
print(model_class)

<class 'transformers.modeling_xlnet.XLNetForSequenceClassification'>


In [13]:
model =  XLNetForSequenceClassification.from_pretrained('xlnet-base-cased',num_labels=300)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [14]:
def prepare_features(seq_1, max_seq_length = 20, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)
   
    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
  
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [15]:
msg = "My dog is cute!"
print(prepare_features(msg)[0])

tensor([[    3,   631,  2288,    27, 10920,   136,     4,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])


# Data preprocessing

In [16]:
x_train = []
y_train = []
# select from github dataset
with open("memes900k/captions.txt") as file:
  for line in file.readlines():
    l,_,text = line.split('\t')
    y_train.append(l.strip())
    text = text.strip().replace('<sep>',"[SEP]")
    x_train.append(text)



In [17]:
templates_map = dict(zip(set(y_train),range(300)))
templates_map_reverse = {v:k for k,v in templates_map.items()}
len(templates_map)

300

In [18]:
y_train = [templates_map[i] for i in y_train if i ]

In [19]:
x_test = []
y_test = []
with open("memes900k/captions_test.txt") as file:
  for line in file.readlines():
    l,_,text = line.split('\t')
    if l in templates_map:
      y_test.append(templates_map[l])
      text = text.strip().replace('<sep>',"[SEP]")
      x_test.append(text)

In [20]:
len(set(y_train))

300

In [21]:
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in x_train:

    encoded_dict,attention = prepare_features(sent)
    
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict)
    attention_masks.append(torch.Tensor(attention))
    

# Print sentence 0, now as a list of IDs.
print('Original: ', x_train[3])
print('Token IDs:', input_ids[3])


Original:  TED [SEP] y u no tell us how you met their mother
Token IDs: tensor([[    3,    17, 19425,  4145,    83,  8186,  3158,    17,   117,    17,
           660,   116,   759,   211,   160,    44,  1033,    58,   831,     4]])


In [22]:
# Convert the lists into tensors.

input_ids_t = torch.cat(input_ids, dim=0)

In [23]:
attention_masks_t = torch.stack(attention_masks, dim=0)

In [24]:
from torch.utils.data import DataLoader, random_split,WeightedRandomSampler, TensorDataset
from collections import Counter
from sklearn.model_selection import train_test_split

input_train,input_val,label_train,label_val= train_test_split(input_ids_t,torch.tensor(y_train), test_size=0.01)

train_dataset = TensorDataset(input_train, label_train)
val_dataset = TensorDataset(input_val, label_val)


In [25]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


batch_size =128


# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = 120 # Evaluate with this batch size.
        )

# Build Model

In [26]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-03
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [None]:
max_epochs = 3
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, j in enumerate(train_dataloader):
        
        optimizer.zero_grad()
        sent=j[0]
        label=j[1]
      
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent
          label = label

        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        print("training",loss)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for k in validation_dataloader:
                sent=k[0]
                label=k[1]
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent
                  label = label
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
                print("validation")
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

EPOCH -- 0
training tensor(5.8621, grad_fn=<NllLossBackward>)
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
validation
Iteration: 0. Loss: 5.862068176269531. Accuracy: 0.2222222222222222%
training tensor(5.7601, grad_fn=<NllLossBack

In [1]:
torch.save(model.state_dict(), 'roberat.pth')

NameError: ignored

In [None]:
import pandas as pd

# Display floats with two decimal places.
pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=bc.training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3.09,2.7,0.4,1:44:41,0:03:42
2,2.49,2.56,0.42,1:44:56,0:03:42
3,2.11,2.54,0.43,1:44:43,0:03:42
4,1.74,2.59,0.44,1:44:46,0:03:42


# Predict

In [None]:
x_test1 = ["a group of people sitting on a bench","a dog is playing with a toy toy","a cople of women walking down a street","a man riding a wave on top of a surf board"]

In [None]:
def get_reply(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    input_msg = input_msg
  output = model(input_msg)[0]
  _, pred_label = torch.max(output.data, 1)
  prediction=list(templates_map_reverse.keys())[pred_label]
  return prediction



In [None]:
for i in x_test1:
  print(get_reply(i))