In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [2]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable


You should consider upgrading via the 'c:\program files\python37\python.exe -m pip install --upgrade pip' command.


In [3]:
from tqdm import tqdm

In [9]:
from os import path
train_path = path.join("train.csv")
df = pd.read_csv(train_path, delimiter="\t")
df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,0,"Dang dog, thanks"
1,1,0,to summon the powers of the flying spaghetti m...
2,2,0,i did that 3rd last 1 by accident last night
3,3,0,"He's insane, used him in DC, better than Blake..."
4,4,0,"Forgot about him, he's a pretty pointless card..."


In [11]:
df = df.iloc[: , 1:]

In [13]:
sentences = df.text.values
labels = df.label.values

In [14]:
from transformers import BertTokenizer
import transformers

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case = True)

In [15]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in tqdm(sentences):
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        # This function also supports truncation and conversion
                        # to pytorch tensors, but we need to do padding, so we
                        # can't use these features :( .
                        #max_length = 128,          # Truncate all sentences.
                        #return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

  0%|▏                                                                        | 3609/1125670 [00:01<05:44, 3254.63it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1778 > 512). Running this sequence through the model will result in indexing errors
100%|██████████████████████████████████████████████████████████████████████| 1125670/1125670 [05:48<00:00, 3234.54it/s]

Original:  Dang dog, thanks
Token IDs: [101, 4907, 2290, 3899, 1010, 4283, 102]





In [16]:
print('Max sentence length: ', max([len(sen) for sen in input_ids]))

Max sentence length:  9827


In [17]:
from keras.preprocessing.sequence import pad_sequences

# Set the maximum sequence length.
# I've chosen 64 somewhat arbitrarily. It's slightly larger than the
# maximum training sentence length of 47...
MAX_LEN = 64

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                          value=0, truncating="post", padding="post")

print('\nDone.')


Padding/truncating all sentences to 64 values...

Padding token: "[PAD]", ID: 0

Done.


In [18]:
# Create attention masks
attention_masks = []

# For each sentence...
for sent in tqdm(input_ids):
    
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]
    
    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

100%|█████████████████████████████████████████████████████████████████████| 1125670/1125670 [00:30<00:00, 36502.26it/s]


In [19]:
# Use train_test_split to split our data into train and validation sets for
# training
from sklearn.model_selection import train_test_split

# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
                                             random_state=2018, test_size=0.1)

In [20]:
import torch

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [21]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

batch_size = 32

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [22]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [23]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [24]:
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 1e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [29]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [30]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [31]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

device = 'cuda'

In [32]:
import random



# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of  31,660.    Elapsed: 0:00:14.
  Batch    80  of  31,660.    Elapsed: 0:00:28.
  Batch   120  of  31,660.    Elapsed: 0:00:42.
  Batch   160  of  31,660.    Elapsed: 0:00:57.
  Batch   200  of  31,660.    Elapsed: 0:01:11.
  Batch   240  of  31,660.    Elapsed: 0:01:24.
  Batch   280  of  31,660.    Elapsed: 0:01:38.
  Batch   320  of  31,660.    Elapsed: 0:01:52.
  Batch   360  of  31,660.    Elapsed: 0:02:06.
  Batch   400  of  31,660.    Elapsed: 0:02:19.
  Batch   440  of  31,660.    Elapsed: 0:02:33.
  Batch   480  of  31,660.    Elapsed: 0:02:46.
  Batch   520  of  31,660.    Elapsed: 0:03:00.
  Batch   560  of  31,660.    Elapsed: 0:03:14.
  Batch   600  of  31,660.    Elapsed: 0:03:27.
  Batch   640  of  31,660.    Elapsed: 0:03:41.
  Batch   680  of  31,660.    Elapsed: 0:03:55.
  Batch   720  of  31,660.    Elapsed: 0:04:08.
  Batch   760  of  31,660.    Elapsed: 0:04:22.
  Batch   800  of  31,660.    Elapsed: 0:04:36.
  Batch   840  of  31,660. 

  Batch 6,840  of  31,660.    Elapsed: 0:39:30.
  Batch 6,880  of  31,660.    Elapsed: 0:39:44.
  Batch 6,920  of  31,660.    Elapsed: 0:39:58.
  Batch 6,960  of  31,660.    Elapsed: 0:40:12.
  Batch 7,000  of  31,660.    Elapsed: 0:40:26.
  Batch 7,040  of  31,660.    Elapsed: 0:40:40.
  Batch 7,080  of  31,660.    Elapsed: 0:40:54.
  Batch 7,120  of  31,660.    Elapsed: 0:41:08.
  Batch 7,160  of  31,660.    Elapsed: 0:41:22.
  Batch 7,200  of  31,660.    Elapsed: 0:41:36.
  Batch 7,240  of  31,660.    Elapsed: 0:41:50.
  Batch 7,280  of  31,660.    Elapsed: 0:42:04.
  Batch 7,320  of  31,660.    Elapsed: 0:42:18.
  Batch 7,360  of  31,660.    Elapsed: 0:42:32.
  Batch 7,400  of  31,660.    Elapsed: 0:42:46.
  Batch 7,440  of  31,660.    Elapsed: 0:43:00.
  Batch 7,480  of  31,660.    Elapsed: 0:43:15.
  Batch 7,520  of  31,660.    Elapsed: 0:43:29.
  Batch 7,560  of  31,660.    Elapsed: 0:43:43.
  Batch 7,600  of  31,660.    Elapsed: 0:43:57.
  Batch 7,640  of  31,660.    Elapsed: 0

  Batch 13,600  of  31,660.    Elapsed: 1:17:57.
  Batch 13,640  of  31,660.    Elapsed: 1:18:10.
  Batch 13,680  of  31,660.    Elapsed: 1:18:24.
  Batch 13,720  of  31,660.    Elapsed: 1:18:37.
  Batch 13,760  of  31,660.    Elapsed: 1:18:51.
  Batch 13,800  of  31,660.    Elapsed: 1:19:05.
  Batch 13,840  of  31,660.    Elapsed: 1:19:18.
  Batch 13,880  of  31,660.    Elapsed: 1:19:32.
  Batch 13,920  of  31,660.    Elapsed: 1:19:45.
  Batch 13,960  of  31,660.    Elapsed: 1:19:59.
  Batch 14,000  of  31,660.    Elapsed: 1:20:12.
  Batch 14,040  of  31,660.    Elapsed: 1:20:26.
  Batch 14,080  of  31,660.    Elapsed: 1:20:39.
  Batch 14,120  of  31,660.    Elapsed: 1:20:53.
  Batch 14,160  of  31,660.    Elapsed: 1:21:06.
  Batch 14,200  of  31,660.    Elapsed: 1:21:20.
  Batch 14,240  of  31,660.    Elapsed: 1:21:33.
  Batch 14,280  of  31,660.    Elapsed: 1:21:47.
  Batch 14,320  of  31,660.    Elapsed: 1:22:00.
  Batch 14,360  of  31,660.    Elapsed: 1:22:14.
  Batch 14,400  of  

  Batch 20,320  of  31,660.    Elapsed: 1:56:07.
  Batch 20,360  of  31,660.    Elapsed: 1:56:20.
  Batch 20,400  of  31,660.    Elapsed: 1:56:34.
  Batch 20,440  of  31,660.    Elapsed: 1:56:47.
  Batch 20,480  of  31,660.    Elapsed: 1:57:01.
  Batch 20,520  of  31,660.    Elapsed: 1:57:14.
  Batch 20,560  of  31,660.    Elapsed: 1:57:28.
  Batch 20,600  of  31,660.    Elapsed: 1:57:41.
  Batch 20,640  of  31,660.    Elapsed: 1:57:55.
  Batch 20,680  of  31,660.    Elapsed: 1:58:08.
  Batch 20,720  of  31,660.    Elapsed: 1:58:22.
  Batch 20,760  of  31,660.    Elapsed: 1:58:36.
  Batch 20,800  of  31,660.    Elapsed: 1:58:49.
  Batch 20,840  of  31,660.    Elapsed: 1:59:03.
  Batch 20,880  of  31,660.    Elapsed: 1:59:16.
  Batch 20,920  of  31,660.    Elapsed: 1:59:30.
  Batch 20,960  of  31,660.    Elapsed: 1:59:43.
  Batch 21,000  of  31,660.    Elapsed: 1:59:57.
  Batch 21,040  of  31,660.    Elapsed: 2:00:10.
  Batch 21,080  of  31,660.    Elapsed: 2:00:24.
  Batch 21,120  of  

  Batch 27,040  of  31,660.    Elapsed: 2:33:56.
  Batch 27,080  of  31,660.    Elapsed: 2:34:10.
  Batch 27,120  of  31,660.    Elapsed: 2:34:23.
  Batch 27,160  of  31,660.    Elapsed: 2:34:36.
  Batch 27,200  of  31,660.    Elapsed: 2:34:50.
  Batch 27,240  of  31,660.    Elapsed: 2:35:03.
  Batch 27,280  of  31,660.    Elapsed: 2:35:16.
  Batch 27,320  of  31,660.    Elapsed: 2:35:30.
  Batch 27,360  of  31,660.    Elapsed: 2:35:43.
  Batch 27,400  of  31,660.    Elapsed: 2:35:57.
  Batch 27,440  of  31,660.    Elapsed: 2:36:10.
  Batch 27,480  of  31,660.    Elapsed: 2:36:23.
  Batch 27,520  of  31,660.    Elapsed: 2:36:37.
  Batch 27,560  of  31,660.    Elapsed: 2:36:50.
  Batch 27,600  of  31,660.    Elapsed: 2:37:04.
  Batch 27,640  of  31,660.    Elapsed: 2:37:17.
  Batch 27,680  of  31,660.    Elapsed: 2:37:30.
  Batch 27,720  of  31,660.    Elapsed: 2:37:44.
  Batch 27,760  of  31,660.    Elapsed: 2:37:57.
  Batch 27,800  of  31,660.    Elapsed: 2:38:10.
  Batch 27,840  of  

  Batch 2,000  of  31,660.    Elapsed: 0:11:32.
  Batch 2,040  of  31,660.    Elapsed: 0:11:46.
  Batch 2,080  of  31,660.    Elapsed: 0:12:00.
  Batch 2,120  of  31,660.    Elapsed: 0:12:13.
  Batch 2,160  of  31,660.    Elapsed: 0:12:27.
  Batch 2,200  of  31,660.    Elapsed: 0:12:40.
  Batch 2,240  of  31,660.    Elapsed: 0:12:54.
  Batch 2,280  of  31,660.    Elapsed: 0:13:08.
  Batch 2,320  of  31,660.    Elapsed: 0:13:21.
  Batch 2,360  of  31,660.    Elapsed: 0:13:35.
  Batch 2,400  of  31,660.    Elapsed: 0:13:49.
  Batch 2,440  of  31,660.    Elapsed: 0:14:02.
  Batch 2,480  of  31,660.    Elapsed: 0:14:16.
  Batch 2,520  of  31,660.    Elapsed: 0:14:29.
  Batch 2,560  of  31,660.    Elapsed: 0:14:43.
  Batch 2,600  of  31,660.    Elapsed: 0:14:57.
  Batch 2,640  of  31,660.    Elapsed: 0:15:10.
  Batch 2,680  of  31,660.    Elapsed: 0:15:24.
  Batch 2,720  of  31,660.    Elapsed: 0:15:37.
  Batch 2,760  of  31,660.    Elapsed: 0:15:51.
  Batch 2,800  of  31,660.    Elapsed: 0

  Batch 8,840  of  31,660.    Elapsed: 0:50:18.
  Batch 8,880  of  31,660.    Elapsed: 0:50:31.
  Batch 8,920  of  31,660.    Elapsed: 0:50:45.
  Batch 8,960  of  31,660.    Elapsed: 0:50:58.
  Batch 9,000  of  31,660.    Elapsed: 0:51:12.
  Batch 9,040  of  31,660.    Elapsed: 0:51:26.
  Batch 9,080  of  31,660.    Elapsed: 0:51:39.
  Batch 9,120  of  31,660.    Elapsed: 0:51:53.
  Batch 9,160  of  31,660.    Elapsed: 0:52:06.
  Batch 9,200  of  31,660.    Elapsed: 0:52:20.
  Batch 9,240  of  31,660.    Elapsed: 0:52:33.
  Batch 9,280  of  31,660.    Elapsed: 0:52:47.
  Batch 9,320  of  31,660.    Elapsed: 0:53:00.
  Batch 9,360  of  31,660.    Elapsed: 0:53:14.
  Batch 9,400  of  31,660.    Elapsed: 0:53:27.
  Batch 9,440  of  31,660.    Elapsed: 0:53:41.
  Batch 9,480  of  31,660.    Elapsed: 0:53:54.
  Batch 9,520  of  31,660.    Elapsed: 0:54:08.
  Batch 9,560  of  31,660.    Elapsed: 0:54:22.
  Batch 9,600  of  31,660.    Elapsed: 0:54:35.
  Batch 9,640  of  31,660.    Elapsed: 0

  Batch 15,560  of  31,660.    Elapsed: 1:28:48.
  Batch 15,600  of  31,660.    Elapsed: 1:29:02.
  Batch 15,640  of  31,660.    Elapsed: 1:29:15.
  Batch 15,680  of  31,660.    Elapsed: 1:29:29.
  Batch 15,720  of  31,660.    Elapsed: 1:29:43.
  Batch 15,760  of  31,660.    Elapsed: 1:29:56.
  Batch 15,800  of  31,660.    Elapsed: 1:30:10.
  Batch 15,840  of  31,660.    Elapsed: 1:30:23.
  Batch 15,880  of  31,660.    Elapsed: 1:30:37.
  Batch 15,920  of  31,660.    Elapsed: 1:30:50.
  Batch 15,960  of  31,660.    Elapsed: 1:31:04.
  Batch 16,000  of  31,660.    Elapsed: 1:31:17.
  Batch 16,040  of  31,660.    Elapsed: 1:31:31.
  Batch 16,080  of  31,660.    Elapsed: 1:31:44.
  Batch 16,120  of  31,660.    Elapsed: 1:31:58.
  Batch 16,160  of  31,660.    Elapsed: 1:32:11.
  Batch 16,200  of  31,660.    Elapsed: 1:32:25.
  Batch 16,240  of  31,660.    Elapsed: 1:32:39.
  Batch 16,280  of  31,660.    Elapsed: 1:32:52.
  Batch 16,320  of  31,660.    Elapsed: 1:33:06.
  Batch 16,360  of  

  Batch 22,280  of  31,660.    Elapsed: 2:07:23.
  Batch 22,320  of  31,660.    Elapsed: 2:07:38.
  Batch 22,360  of  31,660.    Elapsed: 2:07:52.
  Batch 22,400  of  31,660.    Elapsed: 2:08:06.
  Batch 22,440  of  31,660.    Elapsed: 2:08:20.
  Batch 22,480  of  31,660.    Elapsed: 2:08:34.
  Batch 22,520  of  31,660.    Elapsed: 2:08:48.
  Batch 22,560  of  31,660.    Elapsed: 2:09:02.
  Batch 22,600  of  31,660.    Elapsed: 2:09:16.
  Batch 22,640  of  31,660.    Elapsed: 2:09:30.
  Batch 22,680  of  31,660.    Elapsed: 2:09:45.
  Batch 22,720  of  31,660.    Elapsed: 2:09:59.
  Batch 22,760  of  31,660.    Elapsed: 2:10:13.
  Batch 22,800  of  31,660.    Elapsed: 2:10:27.
  Batch 22,840  of  31,660.    Elapsed: 2:10:41.
  Batch 22,880  of  31,660.    Elapsed: 2:10:55.
  Batch 22,920  of  31,660.    Elapsed: 2:11:09.
  Batch 22,960  of  31,660.    Elapsed: 2:11:23.
  Batch 23,000  of  31,660.    Elapsed: 2:11:38.
  Batch 23,040  of  31,660.    Elapsed: 2:11:51.
  Batch 23,080  of  

  Batch 29,000  of  31,660.    Elapsed: 2:45:28.
  Batch 29,040  of  31,660.    Elapsed: 2:45:41.
  Batch 29,080  of  31,660.    Elapsed: 2:45:55.
  Batch 29,120  of  31,660.    Elapsed: 2:46:08.
  Batch 29,160  of  31,660.    Elapsed: 2:46:21.
  Batch 29,200  of  31,660.    Elapsed: 2:46:35.
  Batch 29,240  of  31,660.    Elapsed: 2:46:48.
  Batch 29,280  of  31,660.    Elapsed: 2:47:01.
  Batch 29,320  of  31,660.    Elapsed: 2:47:14.
  Batch 29,360  of  31,660.    Elapsed: 2:47:28.
  Batch 29,400  of  31,660.    Elapsed: 2:47:41.
  Batch 29,440  of  31,660.    Elapsed: 2:47:55.
  Batch 29,480  of  31,660.    Elapsed: 2:48:08.
  Batch 29,520  of  31,660.    Elapsed: 2:48:21.
  Batch 29,560  of  31,660.    Elapsed: 2:48:35.
  Batch 29,600  of  31,660.    Elapsed: 2:48:48.
  Batch 29,640  of  31,660.    Elapsed: 2:49:01.
  Batch 29,680  of  31,660.    Elapsed: 2:49:15.
  Batch 29,720  of  31,660.    Elapsed: 2:49:28.
  Batch 29,760  of  31,660.    Elapsed: 2:49:42.
  Batch 29,800  of  

In [34]:
import pandas as pd

# Load the dataset into a pandas dataframe.
df = pd.read_csv('test.csv', delimiter="\t")

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))

# Create sentence and label lists
sentences = df.text.values
#labels = df.label.values

# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in tqdm(sentences):
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                   )
    
    input_ids.append(encoded_sent)

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, 
                          dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in tqdm(input_ids):
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
#prediction_labels = torch.tensor(labels)

# Set the batch size.  
batch_size = 32  

# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


Number of test sentences: 283,333



100%|████████████████████████████████████████████████████████████████████████| 283333/283333 [01:29<00:00, 3162.57it/s]
100%|███████████████████████████████████████████████████████████████████████| 283333/283333 [00:07<00:00, 39339.55it/s]


In [36]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions=[]

# Predict 
for batch in tqdm(prediction_dataloader):
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  #label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  #true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 283,333 test sentences...


100%|██████████████████████████████████████████████████████████████████████████████| 8855/8855 [15:24<00:00,  9.58it/s]

    DONE.





In [40]:
from scipy.special import softmax
answers = []
for i in tqdm(range(len(predictions))):
    for j in range(len(predictions[i])):
        answers.append(softmax(predictions[i][j]))
        
final_answers = []
for i in range(len(answers)):
    final_answers.append(np.argmax(answers[i]))

100%|█████████████████████████████████████████████████████████████████████████████| 8855/8855 [00:11<00:00, 787.29it/s]


In [41]:
# Confusion Matrix 
from sklearn.metrics import confusion_matrix
y_true = df.label.values

confusion_matrix(y_true, final_answers)


array([[110127,  31540],
       [ 31698, 109968]], dtype=int64)

In [42]:
from sklearn.metrics import accuracy_score
accuracy_score(y_true, final_answers)

0.7768067962432897

In [43]:
from sklearn.metrics import f1_score
f1_score(y_true, final_answers)

0.7766814749941733

In [48]:
from os import path
save_path = path.join("/saved_model/")
torch.save(model.state_dict(), save_path)

OSError: [Errno 22] Invalid argument: 'saved_model/'

In [47]:
model2 = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
model2.load_state_dict(torch.load(save_path))
model2

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

FileNotFoundError: [Errno 2] No such file or directory: '/saved_model'

In [None]:
sample = pd.read_csv('test.csv', delimiter="\t")

In [None]:
sample['label'] = final_answers

In [None]:
sample.to_csv('samplesubmission.csv',index = False)
sample.head()