In [3]:
# Checkpoint identifier kept as a notebook-level constant.
# NOTE(review): no visible cell reads this name, and the value is not spelled like the
# Hugging Face id 'bert-base-uncased' -- confirm before using it to load a model.
PRE_TRAINED_MODEL_NAME = "bert_based_uncased"

In [4]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from time import time

In [5]:
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers

from tokenizers import BertWordPieceTokenizer

In [6]:
# Read the preprocessed comments table; the path is relative to the working directory.
train = pd.read_csv(
    filepath_or_buffer='comments_preprocessed_final.csv',
)

In [5]:
# Pull the text and target columns out of `train` and expose them under the
# column names ('review', 'label') that the later cells read.
X_train, y_train = train.comment_text, train.target
df = pd.DataFrame({'review': X_train, 'label': y_train})

In [7]:
# Positional row boundaries of the train / validation / test partitions.
TRAIN_END = 1_058_949
VAL_END = 1_411_933

# Contiguous, non-overlapping row slices of `df`, in file order (no shuffling here).
df_train = df.iloc[:TRAIN_END]
df_val = df.iloc[TRAIN_END:VAL_END]
df_test = df.iloc[VAL_END:]

In [8]:
'''
Title: Deep Learning For NLP: Zero To Transformers & BERTNegative
Author: Tanul Singh
Date: 2020
Availability: https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert
'''
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=242):
    """
    Encode texts into a fixed-width matrix of token ids for BERT input.

    Args:
        texts: sliceable sequence whose slices expose ``.tolist()`` (the notebook
            passes a pandas Series of strings).
        tokenizer: object providing ``enable_truncation``, ``enable_padding`` and
            ``encode_batch``; each encoding must expose an ``ids`` list.
        chunk_size: number of texts handed to ``encode_batch`` per call.
        maxlen: truncation length, and the width of every returned row.

    Returns:
        np.ndarray built from one list of ``maxlen`` token ids per input text.

    Note:
        ``tqdm`` is looked up when the function is called, so it has to be imported
        before the first call.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding()
    # enable_padding() is called with its defaults; 0 is the library's default pad id.
    pad_id = 0
    all_ids = []

    for start in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[start:start + chunk_size].tolist()
        for enc in tokenizer.encode_batch(text_chunk):
            ids = list(enc.ids)
            # The tokenizer only pads up to the longest text of the current chunk, so
            # chunks made of short texts would otherwise yield narrower rows and a
            # ragged array. Right-pad every row to the full width instead.
            ids.extend([pad_id] * (maxlen - len(ids)))
            all_ids.append(ids)

    return np.array(all_ids)

In [9]:
'''
Title: Deep Learning For NLP: Zero To Transformers & BERTNegative
Author: Tanul Singh
Date: 2020
Availability: https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert
'''
# Pick the tf.distribute strategy that matches the hardware visible to this kernel.
try:
    # The resolver is built without arguments: it relies on the TPU_NAME environment
    # variable, which is always set on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError here is treated as "no TPU available".
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [22]:
# Training configuration
MAX_LEN = 242     # sequence length passed to fast_encode and build_model
BATCH_SIZE = 32   # batch size of every tf.data pipeline below
EPOCHS = 1

# Lets tf.data choose the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE

In [11]:
'''
Title: Deep Learning For NLP: Zero To Transformers & BERTNegative
Author: Tanul Singh
Date: 2020
Availability: https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert
'''
# Load the pretrained DistilBERT tokenizer. The checkpoint named here has to be the
# same one the transformer weights are loaded from, otherwise the token ids produced
# below will not line up with the model's embedding table.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-cased')
# Save the loaded tokenizer locally; the reload below expects this to have left a
# vocab.txt in the current directory.
tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
# (lowercase=False keeps the original casing of the input text).
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
# Bare expression so the notebook shows the tokenizer's repr as the cell output.
fast_tokenizer

Tokenizer(vocabulary_size=28996, model=BertWordPiece, add_special_tokens=True, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

In [12]:
'''
Title: Deep Learning For NLP: Zero To Transformers & BERTNegative
Author: Tanul Singh
Date: 2020
Availability: https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert
'''
from tqdm import tqdm
# Encode the review text of each split, in train / validation / test order, into
# token-id arrays of width MAX_LEN. Reviews are cast to str before tokenizing.
x_train, x_valid, x_test = (
    fast_encode(split.review.astype(str), fast_tokenizer, maxlen=MAX_LEN)
    for split in (df_train, df_val, df_test)
)

# Labels stay as the pandas Series taken from the same splits.
y_train, y_valid, y_test = df_train.label, df_val.label, df_test.label

100%|██████████| 4137/4137 [01:15<00:00, 55.15it/s]
100%|██████████| 1379/1379 [00:23<00:00, 57.98it/s]
100%|██████████| 1379/1379 [00:23<00:00, 59.02it/s]


In [23]:
'''
Title: Deep Learning For NLP: Zero To Transformers & BERTNegative
Author: Tanul Singh
Date: 2020
Availability: https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert
'''
# Training pipeline: endless stream of (ids, label) pairs, shuffled through a
# 2048-element buffer, batched, and prefetched.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: single pass in the original order, cached after batching.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline: single pass in the original order, batched only.
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(BATCH_SIZE)

In [14]:
'''
Title: Deep Learning For NLP: Zero To Transformers & BERTNegative
Author: Tanul Singh
Date: 2020
Availability: https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert
'''
def build_model(transformer, max_len=242, learning_rate=1e-5):
    """
    Build and compile a binary classifier on top of a transformer encoder.

    The model takes a batch of token-id sequences, runs them through
    ``transformer``, keeps the hidden state at position 0 of the first output and
    feeds it to a single sigmoid unit.

    Args:
        transformer: callable applied to the int32 input tensor; element ``[0]`` of
            its result is indexed as ``[:, 0, :]``.
        max_len: length of the input token-id sequences.
        learning_rate: learning rate handed to the Adam optimizer.

    Returns:
        The Keras ``Model``, compiled with binary cross-entropy loss and an
        accuracy metric. No training happens here.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Position 0 of the sequence is used as the summary vector for classification.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated and
    # rejected by newer Keras releases.
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [15]:
'''
Title: Deep Learning For NLP: Zero To Transformers & BERTNegative
Author: Tanul Singh
Date: 2020
Availability: https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert
'''
%%time
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-multilingual-cased')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)

CPU times: user 7.29 s, sys: 1.09 s, total: 8.39 s
Wall time: 7.54 s


In [27]:
'''
Title: Deep Learning For NLP: Zero To Transformers & BERTNegative
Author: Tanul Singh
Date: 2020
Availability: https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert
'''
# Number of full batches in the encoded training split; because the training
# pipeline is endless, this is what delimits an epoch.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    # train_dataset already repeats indefinitely, so no extra .repeat() is needed.
    train_dataset,
    steps_per_epoch=n_steps,
    # Take the epoch count from the configuration cell instead of a literal.
    epochs=EPOCHS,
    verbose=1,
)

Train for 11030 steps


In [None]:
'''
Title: Deep Learning For NLP: Zero To Transformers & BERTNegative
Author: Tanul Singh
Date: 2020
Availability: https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert
'''
# This cell was run on Microsoft Azure and its predictions were exported to CSV;
# however, the notebook itself had a saving issue.
#
# model.predict returns one column per output unit (the model ends in Dense(1)), so
# the result is flattened to one score per row. `assign` builds a new frame rather
# than writing a column into a slice of `df`, which avoids SettingWithCopyWarning.
df_test = df_test.assign(y_pred=model.predict(test_dataset, verbose=1).ravel())
df_test.to_csv('actual_prediction64v2.csv', index=False)

 2389/11031 [=====>........................] - ETA: 7:18:38