In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

/kaggle/input/efreiparisdeeplearning2019/dataset_train.csv
/kaggle/input/efreiparisdeeplearning2019/example_submission_test.csv
/kaggle/input/efreiparisdeeplearning2019/dataset_test_no_labels.csv


# BERT

## install transformers library

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 2.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 35.3MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l- \ | / done
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.38-cp36-none-any.whl size=884629 sha256=3cab7463637cf8b81a78239c23739c7dc3ac60fb74613380cf86414cb8a5136d
  Stored in directory: /root/.cache/pip/wheels/6d/ec/1a/21b8912e35e02741306f35f66c785f3afe94de754a0eaf1422
Successfully built sacremoses
Installing collected packages: sacremoses, transformers
Suc

## imports and activate GPU

In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Choose the compute device once; later cells move tensors to `device`.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

Using TensorFlow backend.


There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


## Load Data

In [4]:
# Training set: tab-separated file whose first column is used as the row index.
TRAIN_CSV_PATH = "/kaggle/input/efreiparisdeeplearning2019/dataset_train.csv"
data = pd.read_csv(TRAIN_CSV_PATH, sep='\t', index_col=0)
data.head()

Unnamed: 0_level_0,sentence_1,sentence_2,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,neutral
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,entailment
2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,entailment
3,How do you know? All this is their information...,This information belongs to them.,entailment
4,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,neutral


## Data preprocessing for bert

In [5]:
# Work on a copy so that the columns added below do not modify the raw `data` frame.
data_bert = data.copy()

In [6]:
# Build a single "[CLS] s1 [SEP] s2 [SEP]" string per row.
# NOTE(review): the `sentence` column is not read again in the visible part of the
# notebook (tokenization below runs on the raw `data` frame), and re-running this cell
# prepends/appends the markers a second time.
def _wrap_first_sentence(text):
    return "[CLS] " + text.strip() + " [SEP] "


def _wrap_second_sentence(text):
    return text.strip() + " [SEP]"


data_bert['sentence_1'] = data_bert.sentence_1.map(_wrap_first_sentence)
data_bert['sentence_2'] = data_bert.sentence_2.map(_wrap_second_sentence)
data_bert['sentence'] = data_bert.sentence_1 + data_bert.sentence_2

data_bert.head()

Unnamed: 0_level_0,sentence_1,sentence_2,label,sentence
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,[CLS] Conceptually cream skimming has two basi...,Product and geography are what make cream skim...,neutral,[CLS] Conceptually cream skimming has two basi...
1,[CLS] you know during the season and i guess a...,You lose the things to the following level if ...,entailment,[CLS] you know during the season and i guess a...
2,[CLS] One of our number will carry out your in...,A member of my team will execute your orders w...,entailment,[CLS] One of our number will carry out your in...
3,[CLS] How do you know? All this is their infor...,This information belongs to them. [SEP],entailment,[CLS] How do you know? All this is their infor...
4,[CLS] yeah i tell you what though if you go pr...,The tennis shoes have a range of prices. [SEP],neutral,[CLS] yeah i tell you what though if you go pr...


In [7]:
# Load the pretrained tokenizer that matches the 'bert-base-uncased' checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [8]:
# Show the raw `sentence_1` text of the row labelled 0.
data.sentence_1[0]

'Conceptually cream skimming has two basic dimensions - product and geography.'

In [9]:
# Sanity check: encode the first sentence pair with special tokens added.
# NOTE(review): `inputs_ids` (with an extra "s") is not read again in the visible notebook.
inputs_ids = tokenizer.encode(data.sentence_1[0], text_pair=data.sentence_2[0], add_special_tokens=True)

In [10]:
# Takes about 10 minutes (see the progress line below).
# Each sentence pair becomes one list of token ids, special tokens included.
input_ids = [
    tokenizer.encode(first_sentence, text_pair=second_sentence, add_special_tokens=True)
    for first_sentence, second_sentence in tqdm(zip(data.sentence_1, data.sentence_2))
]
    

392662it [09:49, 665.74it/s]


In [11]:
# Keep the (still unpadded) token-id lists next to their rows.
data_bert['input_ids']=input_ids

In [12]:
# Token-type ids: 0 up to and including the first occurrence of id 102 (the [SEP] id
# in the bert-base-uncased vocabulary), 1 for every token after it.
def _token_type_ids(token_ids):
    first_segment_len = token_ids.index(102) + 1
    return [0] * first_segment_len + [1] * (len(token_ids) - first_segment_len)


segment_ids = data_bert.input_ids.apply(_token_type_ids)

In [13]:
# Average number of token ids per encoded pair; used to judge the MAX_LEN chosen below.
token_counts = [len(token_ids) for token_ids in data_bert.input_ids]
print( "mean length of sentences :", sum(token_counts)/len(token_counts))

mean length of sentences : 39.9166407750177


In [14]:
MAX_LEN = 128


def _pad_to_max_len(sequences):
    """Right-pad with 0 (or right-truncate) every id list to exactly MAX_LEN entries."""
    # NOTE(review): truncating="post" cuts the tail of sequences longer than MAX_LEN,
    # which would also remove their closing separator token - confirm this is acceptable.
    return pad_sequences(np.array(sequences), maxlen=MAX_LEN, dtype="long", value=0,
                         truncating="post", padding="post")


# Pad input tokens
input_ids = _pad_to_max_len(data_bert.input_ids)
# Pad segment ids
segment_ids = _pad_to_max_len(segment_ids)

In [15]:
# Attention masks: 1 where the padded row holds a token (id > 0), 0 where it holds padding.
attention_masks = [[int(token_id > 0) for token_id in row] for row in input_ids]

In [16]:
# Fixed label -> integer mapping (0: entailment, 1: neutral, 2: contradiction).
label_names = ["entailment", "neutral", "contradiction"]
classes = {name: code for code, name in enumerate(label_names)}
data_bert["label_encoded"] = data_bert.label.map(classes)

In [17]:
# Integer class ids as a NumPy array, in the same row order as `input_ids`.
labels = np.array(data_bert["label_encoded"])

## Training and validation split

In [18]:
# Split every per-example array in ONE call: train_test_split applies the same shuffled
# indices to all arrays it receives, so inputs, masks, segment ids and labels are
# guaranteed to stay aligned (instead of relying on three separate calls sharing a seed).
# 10% of the examples are held out for validation.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_segment_ids, validation_segment_ids,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, segment_ids, labels,
    random_state=44, test_size=0.1)

## Convert data to tensor

In [19]:
# Turn every train/validation array (or list of lists) into a torch tensor.
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

# The segment tensors get new names; the *_segment_ids arrays are left as they are.
train_segment, validation_segment = torch.tensor(train_segment_ids), torch.tensor(validation_segment_ids)

## Create batch / dataloader

In [20]:
# Batch size: the BERT authors recommend 16 or 32 for fine-tuning.
batch_size = 32

# Training loader: each batch is (input ids, attention mask, segment ids, label),
# drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_segment, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)

# Validation loader: same tuple layout, read in sequential order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_segment, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(dataset=validation_data, batch_size=batch_size, sampler=validation_sampler)


## Instantiate the BERT model & move it to the GPU

In [21]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Pretrained BERT model + single linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # 12-layer BERT model, uncased vocab
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)

# Move the model to the device selected earlier. Unlike an unconditional model.cuda(),
# this also works when the notebook fell back to the CPU.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Print model parameters

In [22]:
# All of the model's parameters as a list of (name, tensor) tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))


def _print_parameter_section(title, section):
    """Print a section title followed by one 'name  shape' line per parameter."""
    print(title)
    for name, tensor in section:
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))


_print_parameter_section('==== Embedding Layer ====\n', params[0:5])
_print_parameter_section('\n==== First Transformer ====\n', params[5:21])
_print_parameter_section('\n==== Output Layer ====\n', params[-4:])


The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

## Define optimizer

In [23]:
# AdamW: Adam with the weight-decay fix (presumably what the "W" stands for).
# Every model parameter is optimized with a learning rate of 1e-5 and eps of 1e-8.
optimizer = AdamW(model.parameters(),
                  lr = 1e-5, 
                  eps = 1e-8
                )

## Define epoch, step & scheduler

In [24]:
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training set; between 2 and 4 epochs is a good range.
epochs = 2

# Total number of optimizer steps: number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Learning-rate scheduler, stepped once per batch in the training loop, configured
# with no warmup steps and `total_steps` training steps.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

## Function to flatten the results and compute accuracy

In [25]:
import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of rows of `preds` whose arg-max equals the matching label.

    `preds` is a 2-D array of per-class scores (one row per example); `labels` is an
    array of integer class ids with one entry per row of `preds`.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    is_correct = predicted_classes == labels.ravel()
    return np.mean(is_correct)

## Function to format time (only for display)

In [26]:
import time
import datetime
def format_time(elapsed):
    '''
    Format a duration given in seconds as text.

    The value is rounded to the nearest whole second and rendered the way
    `datetime.timedelta` prints itself, i.e. h:mm:ss (with a leading
    "N day(s), " part once it reaches 24 hours).
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


## Train the model

In [27]:
import random

# Seed every random number generator in play so that runs are repeatable.
seed_val = 44
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, kept for plotting the learning curve.
loss_values = []
for epoch_i in range(0, epochs):

    # ---------------- Training ----------------
    print('\n======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Start time of this training epoch.
    t0 = time.time()

    # Running sum of the batch losses for this epoch.
    total_loss = 0

    # Training mode (the counterpart of model.eval() used for validation below).
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 100 batches. Step 0 is skipped because `loss`
        # (the loss of the previous batch) does not exist yet.
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.  Loss: {:}'.format(step, len(train_dataloader), elapsed, loss.item()))

        # Unpack the batch (same order as in the TensorDataset) and move it to the device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_segment_ids = batch[2].to(device)
        b_labels = batch[3].to(device)

        # Gradients accumulate in PyTorch, so clear them before each backward pass:
        # https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch
        model.zero_grad()

        # Forward pass. Labels are passed, so the first element of the returned tuple is the loss.
        outputs = model(b_input_ids,
                        token_type_ids=b_segment_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]

        # Accumulate the training loss over all of the batches.
        total_loss += loss.item()

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0 to help prevent "exploding gradients".
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then the learning rate.
        optimizer.step()
        scheduler.step()

    # Average loss over the training batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("\n  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ---------------- Validation ----------------

    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: the dropout layers behave differently during evaluation.
    model.eval()

    # Count correct predictions and examples over the whole validation set. Averaging
    # per-batch accuracies instead would give a smaller last batch the same weight as
    # a full one and bias the reported figure.
    eval_correct, eval_examples = 0, 0

    for batch in validation_dataloader:

        # Move the batch to the device and unpack it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_segment_ids, b_labels = batch

        # No gradients are needed for validation; skipping them saves memory and time.
        with torch.no_grad():
            # No labels are passed, so the first element of the output is the logits.
            # TODO (translated from the original French note): maybe token_type_ids
            # should be changed here?
            outputs = model(b_input_ids,
                            token_type_ids=b_segment_ids,
                            attention_mask=b_input_mask)

        # Logits: the output values before any activation such as softmax.
        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predicted_ids = np.argmax(logits, axis=1).flatten()
        eval_correct += int(np.sum(predicted_ids == label_ids.flatten()))
        eval_examples += label_ids.size

    # Accuracy over all validation examples (NaN if the validation set is empty).
    eval_accuracy = eval_correct / eval_examples if eval_examples else float("nan")
    print("  Accuracy: {0:.2f}".format(eval_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("\nTraining complete!")


Training...
  Batch   100  of  11,044.    Elapsed: 0:00:43.  Loss: 0.9145575165748596
  Batch   200  of  11,044.    Elapsed: 0:01:26.  Loss: 0.8829405903816223
  Batch   300  of  11,044.    Elapsed: 0:02:08.  Loss: 0.677834153175354
  Batch   400  of  11,044.    Elapsed: 0:02:50.  Loss: 0.6953713297843933
  Batch   500  of  11,044.    Elapsed: 0:03:32.  Loss: 0.8736481070518494
  Batch   600  of  11,044.    Elapsed: 0:04:14.  Loss: 0.5869234800338745
  Batch   700  of  11,044.    Elapsed: 0:04:57.  Loss: 0.605618417263031
  Batch   800  of  11,044.    Elapsed: 0:05:39.  Loss: 1.1697375774383545
  Batch   900  of  11,044.    Elapsed: 0:06:21.  Loss: 0.8868364095687866
  Batch 1,000  of  11,044.    Elapsed: 0:07:04.  Loss: 0.7421754002571106
  Batch 1,100  of  11,044.    Elapsed: 0:07:46.  Loss: 0.5806711912155151
  Batch 1,200  of  11,044.    Elapsed: 0:08:28.  Loss: 0.5876591801643372
  Batch 1,300  of  11,044.    Elapsed: 0:09:11.  Loss: 0.6066928505897522
  Batch 1,400  of  11,044. 

## Evaluate the model on new data

In [28]:
# Unlabelled test set (tab-separated, first column used as the row index); this is the
# data to predict on, not the validation split used during training.
df = pd.read_csv("/kaggle/input/efreiparisdeeplearning2019/dataset_test_no_labels.csv", delimiter='\t', index_col=0)


In [29]:
# Rows whose second sentence is missing. Missing values are read as NaN, and an
# elementwise `== None` comparison never matches them, so use isna() instead.
df[df['sentence_2'].isna()]

Unnamed: 0_level_0,sentence_1,sentence_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1


## data preprocessing

In [30]:
print(f'Number of test sentences: {df.shape[0]}\n')

# Encode every test sentence pair exactly like the training pairs.
input_ids = [
    tokenizer.encode(first_sentence, text_pair=second_sentence, add_special_tokens=True)
    for first_sentence, second_sentence in tqdm(zip(df.sentence_1, df.sentence_2))
]
    

0it [00:00, ?it/s]

Number of test sentences: 19647



19647it [00:28, 699.09it/s]


In [31]:
# Keep the (still unpadded) token-id lists next to their test rows.
df['input_ids']=input_ids

In [32]:
# Token-type ids for the test pairs: 0 through the first id 102 (the separator that
# closes the first sentence), 1 for the remaining tokens.
def _test_token_type_ids(ids):
    boundary = ids.index(102) + 1
    return [0] * boundary + [1] * (len(ids) - boundary)


segment_ids = df.input_ids.apply(_test_token_type_ids)

In [33]:
# Pad / truncate the test token ids on the right to MAX_LEN, as done for the training set.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, 
                          dtype="long", truncating="post", padding="post")

In [34]:
# Pad the segment ids on the right to MAX_LEN with zeros.
segment_ids = np.array(segment_ids)
segment_ids = pad_sequences(segment_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post",padding="post")

# Attention mask: 1 for real tokens (id > 0), 0 for padding. Built with integers so the
# prediction-time masks have the same type as the masks the model was trained with.
attention_masks = []
for seq in input_ids:
    seq_mask = [int(i > 0) for i in seq]
    attention_masks.append(seq_mask)


In [35]:
# Tensors for the test set; there are no labels for this data.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_segment = torch.tensor(segment_ids)
prediction_labels = None

batch_size = 32

# Sequential sampling keeps the predictions in the same order as the rows of `df`.
# Each batch is (input ids, attention mask, segment ids).
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_segment)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

## Label prediction

In [36]:
print(f"Predicting labels for {len(prediction_inputs)} test sentences...")
model.eval()

# One array of logits per batch, in dataloader order.
predictions = []
with torch.no_grad():
    for batch in prediction_dataloader:
        b_input_ids, b_input_mask, b_segment_ids = (t.to(device) for t in batch)
        outputs = model(b_input_ids, token_type_ids=b_segment_ids,
                        attention_mask=b_input_mask)
        # No labels were passed, so the first output element holds the logits.
        predictions.append(outputs[0].detach().cpu().numpy())

print('\tDONE.')

# One row of logits per test example, then the index of the largest logit per row.
per_example_logits = [row for batch_logits in predictions for row in batch_logits]
flat_predictions = np.argmax(per_example_logits, axis=1).flatten()

Predicting labels for 19647 test sentences...
	DONE.


## Flat prediction for human understanding

In [37]:
# Merge the per-batch logits into one row per example, then keep the winning class
# index (0, 1 or 2) of each row. (The prediction cell above already computes the same value.)
stacked_logits = np.asarray([row for batch_logits in predictions for row in batch_logits])
flat_predictions = stacked_logits.argmax(axis=1).flatten()

In [38]:
# Display the predicted class indices.
flat_predictions

array([0, 0, 0, ..., 1, 2, 1])

## Accuracy

In [39]:
#(flat_true_labels == flat_predictions).sum()/len(flat_true_labels)

## Report final result in csv

In [40]:
# Reverse lookup: integer class id -> label name.
inv_classes = {class_id: label_name for label_name, class_id in classes.items()}

In [41]:
# Display the id -> label mapping.
inv_classes

{0: 'entailment', 1: 'neutral', 2: 'contradiction'}

In [42]:
# One row per prediction; reset_index() turns the positional 0..n-1 index into an
# "index" column and column 0 is renamed to "label".
# NOTE(review): the ids written here are positions, not the index read from the test
# file into `df` - confirm that this is what the submission format expects.
final_result = pd.DataFrame(flat_predictions).reset_index().rename(columns={0:"label"})

In [43]:
# Replace the class ids by their label names. The names are derived from
# `flat_predictions` rather than from the column itself, so the cell can be re-run:
# mapping the already-converted strings through the int-keyed dict a second time
# would turn every label into NaN.
final_result["label"] = pd.Series(flat_predictions, index=final_result.index).map(inv_classes)

In [44]:
# Write the submission file to the working directory, without the DataFrame's own index.
final_result.to_csv("submission.csv", index=False)

In [45]:
# Preview the first ten submission rows.
final_result.head(10)

Unnamed: 0,index,label
0,0,entailment
1,1,entailment
2,2,entailment
3,3,neutral
4,4,entailment
5,5,contradiction
6,6,entailment
7,7,contradiction
8,8,neutral
9,9,contradiction


# ALL IN ONE CLASS

In [46]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification,BertTokenizer, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import pandas as pd
import numpy as np
import time
import datetime
import random


class bertTwoSentenceBertClassification:

    def __init__(self, dataframe):

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
            print('There are %d GPU(s) available.' % torch.cuda.device_count())
            print('We will use the GPU:', torch.cuda.get_device_name(0))
        else:
            print('No GPU available, using the CPU instead.')
            self.device = torch.device("cpu")

        self.rawdata = dataframe
        assert "label" in self.rawdata.columns.to_list() and "sentence_1" in self.rawdata.columns.to_list() and \
               "sentence_2" in self.rawdata.columns.to_list(), \
            "Dataframe should contain columns ['sentence_1','sentence_2','label']"
        self.tokenizer = None
        self.input_ids, self.segment_ids, self.attention_mask = None, None, None
        self.labels, self.classes = None, None
        self.train_inputs, self.validation_inputs = None, None
        self.train_labels, self.validation_labels = None, None
        self.train_masks, self.validation_masks = None, None
        self.train_segment, self.validation_segment = None, None
        self.train_dataloader, self.validation_dataloader = None, None
        self.model, self.optimizer = None, None
        self.epochs, self.total_steps, self.scheduler = None, None, None
        self.flat_predictions = None

        self.preprocessData()
        self.encode_class()
        self.train_test_tensor()
        self.create_batch()
        self.instanciate_bert()
        self.train_model()

    def preprocessData(self, model='bert-base-uncased', do_lower_case=True):

        def get_input_ids(tokenizer):
            input_ids = list()
            print(f"Converting {len(self.rawdata.sentence_1)} to bert ids...")
            for sent1, sent2 in tqdm(zip(self.rawdata.sentence_1, self.rawdata.sentence_2)):
                input_ids.append(tokenizer.encode(sent1, text_pair=sent2, add_special_tokens=True))
            return input_ids

        def get_padding(input_ids, segment_ids, max_len=128):
            # Pad input tokens
            input_ids = np.array(input_ids)
            input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", value=0, truncating="post",
                                      padding="post")
            # Pad segment id
            segment_ids = np.array(segment_ids)
            segment_ids = pad_sequences(segment_ids, maxlen=max_len, dtype="long", value=0, truncating="post",
                                        padding="post")
            return input_ids, segment_ids

        def get_attention_mask(input_ids):
            attention_masks = []
            # Create a mask : 1 if token else 0
            for seq in input_ids:
                seq_mask = [int(i > 0) for i in seq]
                attention_masks.append(seq_mask)
            return attention_masks

        self.tokenizer = BertTokenizer.from_pretrained(model, do_lower_case=do_lower_case)
        self.input_ids = get_input_ids(self.tokenizer)
        self.rawdata['input_ids'] = self.input_ids
        self.rawdata['segment_ids'] = self.rawdata.input_ids.apply(
            lambda x: [0] * (x.index(102) + 1) + [1] * (len(x) - (x.index(102) + 1)))
        self.input_ids, self.segment_ids = get_padding(self.rawdata.input_ids, self.rawdata.segment_ids)
        self.attention_mask = get_attention_mask(self.input_ids)

    def encode_class(self):
        classes = self.rawdata.label.unique().tolist()
        self.classes = dict(zip(classes, range(len(classes))))
        self.rawdata["label_encoded"] = self.rawdata.label.map(self.classes)
        self.labels = np.array(self.rawdata["label_encoded"])

    def train_test_tensor(self, random_state=44, split_size=0.1):
        train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(self.input_ids, self.labels,
                                                                                            random_state=random_state,
                                                                                            test_size=split_size)
        train_masks, validation_masks, _, _ = train_test_split(self.attention_mask, self.labels,
                                                               random_state=random_state, test_size=split_size)
        train_segment_ids, validation_segment_ids, _, _ = train_test_split(self.segment_ids, self.labels,
                                                                           random_state=random_state,
                                                                           test_size=split_size)
        self.train_inputs = torch.tensor(train_inputs) # dtype=torch.long if running on windows
        self.validation_inputs = torch.tensor(validation_inputs)

        self.train_labels = torch.tensor(train_labels)
        self.validation_labels = torch.tensor(validation_labels)

        self.train_masks = torch.tensor(train_masks)
        self.validation_masks = torch.tensor(validation_masks)

        self.train_segment = torch.tensor(train_segment_ids)
        self.validation_segment = torch.tensor(validation_segment_ids)

    def create_batch(self, batch_size=32):
        # For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
        self.batch_size = batch_size

        # DataLoader training
        train_data = TensorDataset(self.train_inputs, self.train_masks, self.train_segment, self.train_labels)
        train_sampler = RandomSampler(train_data)
        self.train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

        # DataLoader validation
        validation_data = TensorDataset(self.validation_inputs, self.validation_masks, self.validation_segment,
                                        self.validation_labels)
        validation_sampler = SequentialSampler(validation_data)
        self.validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    def instanciate_bert(self, model_name="bert-base-uncased", num_labels=3, output_attentions=False,
                         output_hidden_states=False, learning_rate=1e-5, epsilon=1e-8, epoch=2):

        self.model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        try:
            self.model.cuda()
        except:
            print("No cuda GPU available")

        self.optimizer = AdamW(self.model.parameters(),
                               lr=learning_rate,
                               eps=epsilon
                               )

        self.epochs = 2
        self.total_steps = len(self.train_dataloader) * self.epochs
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=0,  # Default value in run_glue.py
                                                         num_training_steps=self.total_steps)

    @staticmethod
    def flat_accuracy(predictions, labels):
        flat_predictions = np.argmax(predictions, axis=1).flatten()
        flat_labels = labels.flatten()
        return np.sum(flat_predictions == flat_labels) / len(flat_labels)

    @staticmethod
    def format_time(elapsed):
        """
        Takes a time in seconds and returns a string hh:mm:ss
        """
        elapsed_rounded = int(round(elapsed))
        return str(datetime.timedelta(seconds=elapsed_rounded))

    def train_model(self, seed_val=44):
        random.seed(seed_val)
        np.random.seed(seed_val)
        torch.manual_seed(seed_val)
        torch.cuda.manual_seed_all(seed_val)

        loss_values = []

        for epoch_i in range(0, self.epochs):

            #  Training
            print(f'\n--------------------- Epoch {epoch_i + 1} / {self.epochs} ---------------------')
            print('Training...')
            t0 = time.time()
            total_loss = 0
            self.model.train()

            for step, batch in enumerate(self.train_dataloader):
                if step % 100 == 0 and not step == 0:
                    elapsed = self.format_time(time.time() - t0)
                    # Report progress
                    print(
                        f"Batch {step}  of  {len(self.train_dataloader)}.    Elapsed: {elapsed}.    Loss: {loss.item()}.")

                batch = tuple(t.to(self.device) for t in batch)
                b_input_ids, b_input_mask, b_segment_ids, b_labels = batch

                self.model.zero_grad()

                # Perform a forward pass (evaluate the model on this training batch).
                # return loss
                outputs = self.model(b_input_ids,
                                     token_type_ids=b_segment_ids,
                                     attention_mask=b_input_mask,
                                     labels=b_labels)

                # all model return tuple -> tuple[0] is loss
                loss = outputs[0]
                total_loss += loss.item()

                # Perform a backward pass to calculate the gradients.
                loss.backward()

                # Clip the norm of the gradients to 1.0.
                # This is to help prevent the "exploding gradients" problem.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                # Update parameters and take a step using the computed gradient.
                self.optimizer.step()
                # Update the learning rate.
                self.scheduler.step()

            # Calculate the average loss over the training data.
            avg_train_loss = total_loss / len(self.train_dataloader)

            # Store the loss value for plotting the learning curve.
            loss_values.append(avg_train_loss)

            print(f"\n\tAverage training loss: {avg_train_loss}")
            print(f"\tTraining epoch took: {self.format_time(time.time() - t0)}")

            # ----------------Validation-----------------

            print("Running Validation...")
            t0 = time.time()
            self.model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0

            # Evaluate data for one epoch
            for batch in self.validation_dataloader:
                batch = tuple(t.to(self.device) for t in batch)
                b_input_ids, b_input_mask, b_segment_ids, b_labels = batch

                # Telling the model not to compute or store gradients, saving memory and speeding up validation
                with torch.no_grad():
                    # Forward pass, calculate logit predictions.
                    # This will return the logits rather than the loss because we have
                    # not provided labels.
                    outputs = self.model(b_input_ids,
                                         token_type_ids=b_segment_ids,
                                         attention_mask=b_input_mask)

                # Get the "logits" output by the model. The "logits" are the output
                # values prior to applying an activation function like softmax.
                logits = outputs[0]

                # Move logits and labels to CPU
                logits = logits.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()

                tmp_eval_accuracy = self.flat_accuracy(logits, label_ids)
                eval_accuracy += tmp_eval_accuracy
                nb_eval_steps += 1

            print(f"\tAccuracy: {eval_accuracy / nb_eval_steps}")
            print(f"\tValidation took: {self.format_time(time.time() - t0)}")

        print("\nTraining complete!")

    def predict(self, df, max_len=128, batch_size=32):
        """Predict a class index for every sentence pair in ``df``.

        Each (sentence_1, sentence_2) pair is encoded with ``self.tokenizer``,
        padded/truncated to ``max_len`` and passed through ``self.model`` in
        evaluation mode. The arg-max class index of every pair is stored in
        ``self.flat_predictions``; nothing is returned.

        Args:
            df: DataFrame with ``sentence_1`` and ``sentence_2`` columns. It is
                only read, never modified.
            max_len: Length every token sequence is padded/truncated to.
            batch_size: Number of pairs sent through the model at once.

        Raises:
            AssertionError: If one of the required columns is missing.
        """
        assert {"sentence_1", "sentence_2"}.issubset(df.columns), \
            "Dataframe should contain columns ['sentence_1','sentence_2']"

        print(f'Number of test sentences: {df.shape[0]}\n')

        if df.shape[0] == 0:
            # Padding and argmax both fail on empty input, so short-circuit.
            self.flat_predictions = np.array([], dtype=np.int64)
            return

        # Segment ids are derived from the *unpadded* token lists: positions up
        # to and including the first separator get 0, the rest get 1. The
        # previous code read a non-existent ``df.input_ids`` column and called
        # ``.index`` on already padded rows.
        sep_id = self.tokenizer.sep_token_id  # the original hard-coded 102
        input_ids = []
        segment_ids = []
        for sent1, sent2 in tqdm(zip(df.sentence_1, df.sentence_2), total=df.shape[0]):
            tokens = self.tokenizer.encode(sent1, text_pair=sent2, add_special_tokens=True)
            first_len = tokens.index(sep_id) + 1
            input_ids.append(tokens)
            segment_ids.append([0] * first_len + [1] * (len(tokens) - first_len))

        input_ids = pad_sequences(input_ids, maxlen=max_len,
                                  dtype="long", truncating="post", padding="post")
        segment_ids = pad_sequences(segment_ids, maxlen=max_len, dtype="long", value=0,
                                    truncating="post", padding="post")

        # 1.0 for real tokens, 0.0 for padding (padding id is 0).
        attention_masks = (input_ids > 0).astype("float32")

        prediction_inputs = torch.tensor(input_ids)
        prediction_masks = torch.tensor(attention_masks)
        prediction_segment = torch.tensor(segment_ids)

        prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_segment)
        # Sequential sampling keeps predictions aligned with the rows of ``df``.
        prediction_sampler = SequentialSampler(prediction_data)
        prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

        print(f"Predicting labels for {len(prediction_inputs)} test sentences...")
        self.model.eval()
        predictions = []
        for batch in prediction_dataloader:
            batch = tuple(t.to(self.device) for t in batch)
            b_input_ids, b_input_mask, b_segment_ids = batch

            # No labels are passed, so the first output holds the logits.
            with torch.no_grad():
                outputs = self.model(b_input_ids, token_type_ids=b_segment_ids,
                                     attention_mask=b_input_mask)

            predictions.append(outputs[0].detach().cpu().numpy())

        print('\tDONE.')

        # Stack the per-batch logits and keep the highest-scoring class per row.
        all_logits = np.concatenate(predictions, axis=0)
        self.flat_predictions = np.argmax(all_logits, axis=1).flatten()

    def export_prediction_csv(self, path="final_result.csv"):
        """Write the predictions stored by ``predict`` to a CSV file.

        The file has two columns: ``index`` (row position of the prediction)
        and ``label`` (class name obtained by inverting ``self.classes``).

        Args:
            path: Destination of the CSV. Defaults to ``"final_result.csv"``
                in the current directory, which was previously hard-coded.

        Raises:
            AttributeError: If ``predict`` has not stored any predictions yet.
        """
        if not hasattr(self, "flat_predictions"):
            raise AttributeError("No predictions to export: call predict() first.")

        # self.classes maps name -> id; invert it to translate ids back to names.
        inv_classes = {index: name for name, index in self.classes.items()}
        final_result = (
            pd.Series(self.flat_predictions, name="label")
            .map(inv_classes)
            .reset_index()
        )
        final_result.to_csv(path, index=False)


## Launch the class

In [47]:
# Example usage of the BERT wrapper, kept as a bare string so that none of it
# is executed; the cell only echoes the text back as its output.
"""bert = bertTwoSentenceBertClassification(data)
bert.predict(df)
bert.export_prediction_csv()"""

'bert = bertTwoSentenceBertClassification(data)\nbert.predict(df)\nbert.export_prediction_csv()'

# Personal Model

In [48]:
import numpy as np
import pandas as pd

# Load the tab-separated training set; its first column becomes the index.
data = pd.read_csv(
    "/kaggle/input/efreiparisdeeplearning2019/dataset_train.csv",
    sep='\t',
    index_col=0,
)

# Give every label name an integer id, following the order listed here.
classes = {
    name: idx
    for idx, name in enumerate(["entailment", "neutral", "contradiction"])
}
data["class"] = data["label"].map(classes)

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import LSTM, Bidirectional, GRU, Conv1D
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, AveragePooling1D, MaxPooling1D
from keras.layers import Dropout, Reshape
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split

# Gensim-format GloVe vectors. The file has to exist at this path: the recorded
# build_matrix call further down stopped with FileNotFoundError because it did not.
EMBEDDING_FILE = '../input/gensim-embeddings-dataset/glove.840B.300d.gensim'
NUM_MODELS = 1
BATCH_SIZE = 512
DROPOUT = 0.3
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 6
# Padded lengths of sentence_1 and sentence_2 respectively.
MAX_LEN_1 = 150
MAX_LEN_2 = 70
# Characters the Keras tokenizer strips. The backslash in front of '×' is
# spelled '\\' because '\×' is an invalid escape sequence; the resulting
# string value is unchanged.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—'

In [49]:
def build_matrix(word_index, path):
    """Build an embedding matrix aligned with a word -> id mapping.

    Args:
        word_index: Mapping from word to its integer id.
        path: Location of word vectors saved in gensim's ``KeyedVectors`` format.

    Returns:
        Array of shape ``(len(word_index) + 1, vector_size)``. Row ``i`` holds
        the vector of the word with id ``i``, looked up verbatim first and then
        lower-cased; rows that are never assigned stay all-zero.
    """
    embedding_index = KeyedVectors.load(path, mmap='r')
    # Take the width from the loaded vectors instead of assuming 300 dimensions.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_index.vector_size))
    for word, i in word_index.items():
        # Prefer the exact spelling, fall back to the lower-cased form.
        for candidate in (word, word.lower()):
            if candidate in embedding_index:
                embedding_matrix[i] = embedding_index[candidate]
                break
    return embedding_matrix

In [50]:
# Takes 25s
# Case-preserving tokenizer whose vocabulary is fitted on both sentence columns.
tokenizer = text.Tokenizer(lower=False, filters=CHARS_TO_REMOVE)
tokenizer.fit_on_texts(
    np.concatenate((data["sentence_1"].values, data["sentence_2"].values))
)

In [51]:
# Takes 25s
# Turn every sentence of each column into its list of word ids.
sent_1, sent_2 = (
    tokenizer.texts_to_sequences(data[column].values)
    for column in ("sentence_1", "sentence_2")
)

In [52]:
# Bring each sentence to its fixed length, using pad_sequences' default
# padding and truncation side.
sent_1, sent_2 = (
    sequence.pad_sequences(sent_1, maxlen=MAX_LEN_1),
    sequence.pad_sequences(sent_2, maxlen=MAX_LEN_2),
)

In [53]:
# Put the two padded sentences side by side: one row per sentence pair.
sent_final = np.hstack((sent_1, sent_2))
# The per-sentence arrays are not needed once they are merged.
del sent_1
del sent_2

In [54]:
# Requires the embeddings file at EMBEDDING_FILE; the recorded run stopped here
# with FileNotFoundError, which is why later cells report `embedding_matrix`
# as undefined.
embedding_matrix = build_matrix(word_index=tokenizer.word_index, path=EMBEDDING_FILE)

FileNotFoundError: [Errno 2] No such file or directory: '../input/gensim-embeddings-dataset/glove.840B.300d.gensim'

In [55]:
# One-hot encode the integer class ids over the three classes.
one_hot_labels = to_categorical(y=data["class"].values, num_classes=3)

In [56]:
# Hold out 10% of the pairs for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sent_final,
    one_hot_labels,
    random_state=24,
    test_size=0.1,
)

## Model 1

In [57]:
# Variable-length sequence of word ids.
words = Input(shape=(None,))
# Frozen pretrained embeddings taken from embedding_matrix.
x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
x = SpatialDropout1D(DROPOUT)(x)
# Three stacked bidirectional LSTMs, each returning the full sequence.
x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)
x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)
x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)

# Max- and average-pool over time, then concatenate: 2 * (2 * LSTM_UNITS) features.
hidden = concatenate([
    GlobalMaxPooling1D()(x),
    GlobalAveragePooling1D()(x),
])
# Two residual dense blocks; `add` works because the pooled width equals
# DENSE_HIDDEN_UNITS (4 * LSTM_UNITS).
hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
# Softmax rather than sigmoid: the three classes are mutually exclusive and the
# model is trained with categorical cross-entropy on one-hot targets.
result = Dense(3, activation='softmax')(hidden)

model = Model(inputs=words, outputs=result)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

NameError: name 'embedding_matrix' is not defined

## Model 2

In [58]:
# Variable-length sequence of word ids.
words = Input(shape=(None,))
# Frozen pretrained embeddings taken from embedding_matrix.
x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
x = SpatialDropout1D(DROPOUT)(x)

# Two stacked bidirectional GRUs, each returning the full sequence.
x = Bidirectional(GRU(LSTM_UNITS, return_sequences=True, stateful=False))(x)
x = Bidirectional(GRU(LSTM_UNITS, return_sequences=True, stateful=False))(x)

# Average- and max-pool over time, then concatenate: 4 * LSTM_UNITS features.
hidden = concatenate([
    GlobalAveragePooling1D()(x),
    GlobalMaxPooling1D()(x),
])

# Treat the pooled feature vector as a 1-channel sequence for the Conv1D stack.
# Reshape's target shape excludes the batch axis, so BATCH_SIZE does not belong
# here; the previous (BATCH_SIZE, -1) only worked because 4 * LSTM_UNITS happens
# to equal BATCH_SIZE.
hidden = Reshape((-1, 1))(hidden)

# Conv block 1: 32 filters.
hidden = Conv1D(32, 3, strides=1)(hidden)
hidden = Dropout(DROPOUT)(hidden)
hidden = Conv1D(32, 3, strides=1)(hidden)
hidden = Dropout(DROPOUT)(hidden)
hidden = MaxPooling1D()(hidden)

# Conv block 2: 64 filters.
hidden = Conv1D(64, 3, strides=1)(hidden)
hidden = Dropout(DROPOUT)(hidden)
hidden = Conv1D(64, 3, strides=1)(hidden)
hidden = Dropout(DROPOUT)(hidden)
hidden = MaxPooling1D()(hidden)

# Conv block 3: 128 filters, followed by average pooling.
hidden = Conv1D(128, 3, strides=1)(hidden)
hidden = Dropout(DROPOUT)(hidden)
hidden = Conv1D(128, 3, strides=1)(hidden)
hidden = Dropout(DROPOUT)(hidden)
hidden = AveragePooling1D()(hidden)

# Collapse the remaining positions into one vector per sample.
hidden = GlobalAveragePooling1D()(hidden)

#hidden = Dense(64, activation='relu')(hidden)
#hidden = Dense(32, activation='relu')(hidden)
result = Dense(3, activation='softmax')(hidden)

model = Model(inputs=words, outputs=result)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

NameError: name 'embedding_matrix' is not defined

In [59]:
# Train whichever Keras model the preceding cell compiled. In the recorded run
# both model cells had failed, so `model` still pointed at the BERT object and
# this call raised AttributeError.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

AttributeError: 'BertForSequenceClassification' object has no attribute 'fit'

In [60]:
# Score the held-out split. It is named `X_test` (capital X) by
# train_test_split above; the lowercase `x_test` used before is never defined.
model.predict(X_test, batch_size=2048)

AttributeError: 'BertForSequenceClassification' object has no attribute 'predict'

from keras.preprocessing.text import Tokenizer
# NOTE(review): this looks like unfinished scratch work — confirm it is still
# needed before relying on it.
# max_words is passed to Tokenizer as num_words; max_len to pad_sequences as maxlen.
max_words = 20
max_len = 150
tok = Tokenizer(num_words=max_words)
# NOTE(review): X_train comes from train_test_split(sent_final, ...), so it
# presumably already holds padded word ids rather than raw text — verify that
# fitting a text tokenizer on it is intended.
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
# sequences_matrix is not referenced again in the visible part of the notebook.
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

from keras.utils import to_categorical
# One-hot encode the integer class ids over the three classes.
one_hot_labels = to_categorical(y=data["class"].values, num_classes=3)

from sklearn.model_selection import train_test_split
# Hold out 30% of the pairs this time; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sent_final,
    one_hot_labels,
    random_state=24,
    test_size=0.3,
)

from sklearn.base import BaseEstimator, ClassifierMixin
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import LSTM, GRU, Embedding
from keras.layers.normalization import BatchNormalization
import keras.optimizers as opt

# Settings of the GRU baseline: batch size, epoch count and number of classes.
batch_size, n_epochs, n_labels = 1024, 2, 3
# Width of one input row (second axis of the training matrix).
n_features = X_train.shape[1]

# Small GRU baseline: trainable embeddings -> GRU -> dropout -> softmax.
model = Sequential([
    # input_dim must cover every word id the tokenizer can emit (ids start at 1,
    # 0 is padding). The previous value of 60 was far smaller than the
    # vocabulary, so most ids fell outside the embedding table.
    Embedding(len(tokenizer.word_index) + 1, output_dim=256),
    GRU(128),
    Dropout(0.3),
    Dense(n_labels),
    Activation('softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the baseline, validating on the held-out split after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=n_epochs,
    batch_size=batch_size,
)

# Class scores for the held-out pairs.
predictions = model.predict(X_test, batch_size=batch_size)

# Boolean mask: True where the predicted class equals the true class.
test = y_test.argmax(axis=1) == predictions.argmax(axis=1)

# Fraction of held-out pairs classified correctly.
np.count_nonzero(test) / len(predictions)