In [None]:
import pandas as pd
import numpy as np
import math
import re
from bs4 import BeautifulSoup
import random

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

In [None]:
# Load the competition training set; later cells read its `text` and `target` columns.
data=pd.read_csv('../input/nlp-getting-started/train.csv')

In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
def clean_tweet(tweet):
    """Normalise a raw tweet into plain text ready for tokenisation.

    Steps, in order: strip HTML markup/entities, drop @mentions, drop
    http(s) URLs, replace every character that is not an ASCII letter or
    one of ``. ! ?`` with a space, and finally collapse runs of spaces.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text. Leading/trailing single spaces may remain.
    """
    tweet = BeautifulSoup(tweet, 'lxml').get_text()
    # Handles may contain underscores; without `_` in the class the tail of
    # "@foo_bar" would survive and later show up as the stray word "bar".
    tweet = re.sub(r'@[A-Za-z0-9_]+', ' ', tweet)
    # Consume the URL up to the next whitespace so that characters such as
    # '-', '_', '?', '=' do not cut the match short and leave URL fragments.
    tweet = re.sub(r'https?://\S+', ' ', tweet)
    # Keep letters and sentence punctuation only.
    tweet = re.sub(r'[^a-zA-Z.!?]', ' ', tweet)
    tweet = re.sub(r' +', ' ', tweet)
    return tweet
    

In [None]:
# Clean every tweet; the function is passed directly, no wrapper lambda needed.
data_clean = data['text'].apply(clean_tweet)

In [None]:
# Labels as a plain array, in the same row order as the tweets.
data_labels = data['target'].values

In [None]:
# Keep a handle on the tokenizer class; it is instantiated once the vocabulary is known.
fulltokenizer=bert.bert_tokenization.FullTokenizer
# Load the BERT module from TF Hub. This module-level instance is only used
# to read the vocabulary file and the lower-casing flag in the next cell.
bert_layer=hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1')

In [None]:
# The hub module carries its own casing flag and vocabulary asset; read both
# from it so the tokenizer matches the loaded checkpoint.
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
tokenizer = fulltokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [None]:
def encode_sentence(sent):
    """Tokenise `sent` and wrap the tokens in BERT's [CLS] ... [SEP] markers.

    Uses the module-level `tokenizer`.
    """
    wordpieces = tokenizer.tokenize(sent)
    return ['[CLS]', *wordpieces, '[SEP]']

In [None]:
# Token lists (with [CLS]/[SEP]) for every cleaned tweet, in row order.
data_inputs = list(map(encode_sentence, data_clean))

In [None]:
def get_ids(token):
    """Map a sequence of tokens to vocabulary ids via the module-level `tokenizer`."""
    vocab_ids = tokenizer.convert_tokens_to_ids(token)
    return vocab_ids

def get_masks(token):
    """Build the BERT attention mask for a token sequence.

    Args:
        token: Sequence of token strings.

    Returns:
        Integer numpy array of the same length: 1 for real tokens, 0 for
        the padding token ``'[PAD]'``.
    """
    # BERT's padding token is spelled '[PAD]' (with brackets). Comparing
    # against 'PAD' never matched real padding. Sequences produced by
    # `encode_sentence` contain no padding, so they still yield all ones.
    # A comprehension is used so an empty sequence yields an empty int array.
    return np.array([tok != '[PAD]' for tok in token], dtype='int')

def get_segments(token):
    """Return BERT segment ids (0/1) for a token sequence.

    The id starts at 0 and flips after every '[SEP]'; the separator itself
    keeps the id of the segment it closes.
    """
    segment_ids = []
    seps_seen = 0
    for piece in token:
        # Parity of the separators seen so far is the current segment.
        segment_ids.append(seps_seen % 2)
        if piece == '[SEP]':
            seps_seen += 1
    return segment_ids

In [None]:
# Pair every tokenised tweet with its label and its token count.
data_with_len = [[sent, label, len(sent)]
                 for sent, label in zip(data_inputs, data_labels)]
# Shuffle first so that, after the stable sort by length, tweets of equal
# length end up in random relative order.
random.shuffle(data_with_len)
data_with_len.sort(key=lambda row: row[2])
# Build ([ids, mask, segments], label) examples, keeping only sequences
# longer than 7 tokens.
sorted_all = [([get_ids(tokens), get_masks(tokens), get_segments(tokens)],
               label)
              for tokens, label, n_tokens in data_with_len
              if n_tokens > 7]

In [None]:
# A list is iterable, so a callable that returns it can act as the
# generator for the dataset.
all_dataset = tf.data.Dataset.from_generator(
    lambda: sorted_all,
    output_types=(tf.int32, tf.int32),
)

In [None]:
BATCH_SIZE = 32
# Pad each batch to its own longest sequence: the three input rows
# (ids, mask, segments) are padded with 0 along the token axis; labels are scalars.
all_batched = all_dataset.padded_batch(
    batch_size=BATCH_SIZE,
    padded_shapes=((3, None), ()),
    padding_values=(0, 0),
)

In [None]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10  # hold out one tenth of the batches
# tf.data transformations return a NEW dataset; calling .shuffle() without
# keeping the result is a no-op, which left the held-out split made of the
# first (i.e. shortest) batches only. Keep the shuffled dataset under its own
# name so this cell stays idempotent.
# reshuffle_each_iteration=False (plus a fixed seed) makes every pass over
# the dataset see the same order, so take()/skip() yield disjoint, stable
# test/train splits instead of leaking test batches into training.
all_shuffled = all_batched.shuffle(NB_BATCHES,
                                   seed=42,
                                   reshuffle_each_iteration=False)
test_dataset = all_shuffled.take(NB_BATCHES_TEST)
train_dataset = all_shuffled.skip(NB_BATCHES_TEST)

In [None]:
class DCNNBERTEmbedding(tf.keras.Model):
    """Convolutional text classifier on top of frozen BERT embeddings.

    The BERT output is fed to three parallel 1-D convolutions (window sizes
    2, 3 and 4), each followed by global max pooling. The pooled features
    are concatenated and passed through a dense layer, dropout and a final
    classification layer.

    Args:
        nb_filters: Number of filters in each convolution.
        FFN_units: Width of the hidden dense layer.
        nb_classes: Number of target classes. With 2 classes a single
            sigmoid unit is used, otherwise a softmax over `nb_classes`.
        dropout_rate: Dropout rate applied after the hidden dense layer.
        name: Keras model name.
    """

    def __init__(self,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 name="dcnn"):
        super().__init__(name=name)

        # Loaded with trainable=False: only the layers below are trained.
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=False)

        # "valid" padding means the input must be at least `kernel_size`
        # tokens long (so at least 4 tokens for the widest filter).
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def embed_with_bert(self, all_tokens):
        """Run BERT on a stacked input and return its second output.

        Args:
            all_tokens: Tensor indexed as ``[batch, row, token]`` where rows
                0, 1 and 2 are passed to BERT in that order (the callers in
                this notebook stack ids, mask and segment ids).

        Returns:
            The second element returned by the hub layer
            (presumably the per-token sequence output - TODO confirm
            against the hub module's documentation).
        """
        _, embs = self.bert_layer([all_tokens[:, 0, :],
                                   all_tokens[:, 1, :],
                                   all_tokens[:, 2, :]])
        return embs

    def call(self, inputs, training=None):
        """Forward pass.

        Args:
            inputs: Stacked BERT inputs, see `embed_with_bert`.
            training: Forwarded to the dropout layer. Defaults to None so
                the model can also be called without the flag.

        Returns:
            Output of the final dense layer.
        """
        x = self.embed_with_bert(inputs)

        x_1 = self.pool(self.bigram(x))
        x_2 = self.pool(self.trigram(x))
        x_3 = self.pool(self.fourgram(x))

        merged = tf.concat([x_1, x_2, x_3], axis=-1)  # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        return self.last_dense(merged)

In [None]:
# Model hyper-parameters passed to DCNNBERTEmbedding.
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

# NOTE(review): BATCH_SIZE was already set (to the same value) before the
# dataset was batched; changing it here does not re-batch the data.
BATCH_SIZE = 32
NB_EPOCHS = 3

In [None]:
# Instantiate the classifier with the hyper-parameters chosen above.
Dcnn = DCNNBERTEmbedding(
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [None]:
# Binary problems use binary cross-entropy / accuracy; anything else uses
# the sparse categorical loss and metric. Same optimizer either way.
Dcnn.compile(
    loss=("binary_crossentropy" if NB_CLASSES == 2
          else "sparse_categorical_crossentropy"),
    optimizer="adam",
    metrics=["accuracy" if NB_CLASSES == 2
             else "sparse_categorical_accuracy"],
)

In [None]:
checkpoint_path = "/kaggle/working/"

# Track the model in a checkpoint and keep only the most recent save.
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint_path,
    max_to_keep=1,
)

# Resume from an existing checkpoint in that directory, if there is one.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that writes a checkpoint at the end of every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Save through the module-level `ckpt_manager` and report the directory."""
        ckpt_manager.save()
        message = "Checkpoint saved at {}.".format(checkpoint_path)
        print(message)

In [None]:
# Train, validating on the held-out batches and checkpointing after each epoch.
Dcnn.fit(
    train_dataset,
    epochs=NB_EPOCHS,
    validation_data=test_dataset,
    callbacks=[MyCustomCallback()],
)

In [None]:
def get_prediction(sentence, verbose=True):
    """Classify a single raw sentence with the trained binary model.

    The sentence is cleaned, tokenised and converted to the stacked
    (ids, mask, segments) input the model expects, then scored.

    Args:
        sentence: Raw text to classify.
        verbose: When True (default) print the raw model output and the
            predicted class, as before.

    Returns:
        0 or 1: the predicted class, thresholding the model output at 0.5.
    """
    sentence = clean_tweet(sentence)
    tokens = encode_sentence(sentence)

    input_ids = get_ids(tokens)
    input_mask = get_masks(tokens)
    segment_ids = get_segments(tokens)

    inputs = tf.stack(
        [tf.cast(input_ids, dtype=tf.int32),
         tf.cast(input_mask, dtype=tf.int32),
         tf.cast(segment_ids, dtype=tf.int32)],
        axis=0)

    # The model's widest convolution has kernel_size=4 with "valid" padding,
    # so shorter inputs (e.g. a tweet that cleans down to nothing, leaving
    # only [CLS]/[SEP]) would make it fail. Right-pad all three rows with 0,
    # the same value used to pad the training batches.
    min_tokens = 4
    missing = min_tokens - len(tokens)
    if missing > 0:
        inputs = tf.pad(inputs, [[0, 0], [0, missing]])

    inputs = tf.expand_dims(inputs, 0)  # simulates a batch

    output = Dcnn(inputs, training=False)

    # Threshold at 0.5. The previous floor(output * 2) produced 2 when the
    # sigmoid saturated at exactly 1.0, i.e. an invalid label.
    probability = float(tf.squeeze(output))
    sentiment = int(probability >= 0.5)

    if verbose:
        if sentiment == 0:
            print("Output of the model: {}\nPredicted sentiment: negative".format(
                output))
        else:
            print("Output of the model: {}\nPredicted sentiment: positive".format(
                output))
    return sentiment

In [None]:
# Load the competition test set; its `text` column is scored below.
test=pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
# One prediction per test tweet, in row order.
preds = [get_prediction(text) for text in test['text']]

In [None]:
# Load the sample submission to reuse its layout for the output file.
sub=pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
# Preview the submission template before filling it in.
sub.head()

In [None]:
# Overwrite the template's target column with the model predictions.
# NOTE(review): `preds` follows the row order of test.csv; this assumes
# sample_submission.csv lists the same rows in the same order - confirm.
sub['target']=preds

In [None]:
# Check the filled-in submission frame.
sub.head()

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv('submission.csv',index=False)