In [None]:
import pandas as pd
import numpy as np
import math
import re
import os
import random

In [None]:
# Load the training and validation splits. Later cells read the 'eng' column
# (text that gets tokenized) and the 'chapter' column (classification label).
train=pd.read_csv('../input/questions-chapter-classification/train.csv')
val=pd.read_csv('../input/questions-chapter-classification/val.csv')

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

In [None]:
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

from nltk.corpus import stopwords
# NLTK's English stop-word list; stopword_remove() filters tokens against it.
stop_words = stopwords.words('english')

import string
# Also treat every single ASCII punctuation character as a stop word.
stop_words.extend(list(string.punctuation))

from nltk.tokenize import word_tokenize

In [None]:
def stopword_remove(sent):
    """Drop stop-word tokens from ``sent`` and re-join the rest with spaces.

    The sentence is split with ``word_tokenize``; every token that appears in
    the module-level ``stop_words`` list (exact, case-sensitive match) is
    discarded.
    """
    kept_tokens = []
    for word in word_tokenize(sent):
        if word in stop_words:
            continue
        kept_tokens.append(word)
    return ' '.join(kept_tokens)

In [None]:
# Filter stop words out of the text column of both splits.
# Note: the 'eng' column is overwritten in place with the filtered text.
train['eng'] = train['eng'].apply(stopword_remove)
val['eng'] = val['eng'].apply(stopword_remove)

In [None]:
# Alias the tokenizer class from the `bert` package.
FullTokenizer = bert.bert_tokenization.FullTokenizer
# Load the BERT layer from TF-Hub with trainable=False. In this cell it is only
# used to read the vocabulary file path and the lower-casing flag from the
# resolved hub object.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Build the tokenizer from the hub model's own vocabulary and casing setting.
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
def encode_sentence(eng):
    """Tokenize ``eng`` and wrap the tokens in BERT's boundary markers.

    Returns a list of token strings: "[CLS]", then the output of the
    module-level ``tokenizer.tokenize(eng)``, then "[SEP]".
    """
    word_pieces = tokenizer.tokenize(eng)
    return ["[CLS]", *word_pieces, "[SEP]"]

In [None]:
def get_ids(tokens):
    """Map token strings to vocabulary ids using the module-level tokenizer."""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

def get_mask(tokens):
    """Return an int array holding 0 for every "[PAD]" token and 1 otherwise."""
    is_real_token = np.char.not_equal(tokens, "[PAD]")
    return is_real_token.astype(int)

def get_segments(tokens):
    """Assign a 0/1 segment id to every token.

    The id starts at 0 and flips after each "[SEP]"; the "[SEP]" token itself
    keeps the id of the segment it closes. Every token that follows a
    separator (whatever it is) gets the flipped id.
    """
    segment_ids = []
    active_segment = 0
    for token in tokens:
        segment_ids.append(active_segment)
        # Toggle between 0 and 1 once a separator has been recorded.
        active_segment = 1 - active_segment if token == "[SEP]" else active_segment
    return segment_ids

In [None]:
def padding(encoded_sentence, length=512):
    """Pad or truncate a token list to exactly ``length`` tokens.

    Args:
        encoded_sentence: list of token strings (e.g. from encode_sentence).
        length: target number of tokens; expected to be >= 1. Defaults to 512.

    Returns:
        A new list of ``length`` tokens. Inputs that are too short are
        right-padded with "[PAD]"; inputs that are too long keep their first
        ``length - 1`` tokens and get a closing "[SEP]" appended.
    """
    length_now = len(encoded_sentence)
    if length_now > length:
        # Truncate relative to `length`. The cut-off used to be hard-coded to
        # 511, which produced wrong-sized output for any non-default `length`.
        return encoded_sentence[:length - 1] + ['[SEP]']
    return encoded_sentence + ['[PAD]'] * (length - length_now)

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One-hot encode the 'chapter' labels. The encoder is fitted on the training
# labels only and then reused for the validation labels, so both matrices share
# the same column layout. `.toarray()` converts the encoder output to a dense array.
# Note: despite its name, y_test holds the labels of the validation split (val.csv).
encoder = OneHotEncoder()
y_train = encoder.fit_transform(train[['chapter']]).toarray()
y_test = encoder.transform(val[['chapter']]).toarray()

# Tokenize each sentence, then pad/truncate the token list to padding()'s default length.
encoded_train = [padding(tokens) for tokens in map(encode_sentence, train['eng'])]
encoded_test = [padding(tokens) for tokens in map(encode_sentence, val['eng'])]

In [None]:
from tqdm import tqdm


def build_bert_inputs(encoded_sentences):
    """Convert padded token lists into per-example BERT input tensors.

    Args:
        encoded_sentences: list of padded token lists (see padding()).

    Returns:
        A list with one int32 tensor per sentence. Each tensor stacks three
        rows along axis 0, in this order: token ids (get_ids), padding mask
        (get_mask) and segment ids (get_segments).
    """
    features = []
    for tokens in tqdm(encoded_sentences, total=len(encoded_sentences)):
        features.append(
            tf.stack(
                [tf.cast(get_ids(tokens), dtype=tf.int32),
                 tf.cast(get_mask(tokens), dtype=tf.int32),
                 tf.cast(get_segments(tokens), dtype=tf.int32)],
                axis=0
            )
        )
    return features


# Both splits go through the same conversion (previously two copy-pasted loops).
x_train = build_bert_inputs(encoded_train)
x_test = build_bert_inputs(encoded_test)

In [None]:
# Merge each list of per-example tensors into one batch tensor, with the
# examples laid out along a new leading axis.
x_train = tf.stack(x_train, axis=0)
x_test = tf.stack(x_test, axis=0)

In [None]:
class DCNNBERTEmbedding(tf.keras.Model):
    """Text classifier built on top of a non-trainable BERT hub layer.

    The BERT output feeds three Conv1D branches (kernel sizes 2, 3 and 4, each
    followed by global max pooling) and one LSTM. The four branch outputs are
    concatenated and passed through a dense layer, dropout and a final
    classification layer.
    """

    def __init__(self,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=202,
                 dropout_rate=0.1,
                 name="dcnn"):
        """Create the sub-layers.

        Args:
            nb_filters: number of filters in each Conv1D branch; also used as
                the number of LSTM units.
            FFN_units: width of the hidden dense layer.
            nb_classes: number of target classes. A value of 2 yields a single
                sigmoid unit; any other value yields a softmax over
                ``nb_classes`` units.
            dropout_rate: rate of the dropout applied after the hidden dense layer.
            name: name passed to the Keras model constructor.
        """
        super(DCNNBERTEmbedding, self).__init__(name=name)

        # NOTE(review): keep the creation order of the sub-layers below stable;
        # order-based weight loading of existing checkpoints may depend on it.
        # Confirm before reordering.
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=False)

        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        # A single pooling layer instance is shared by the three conv branches.
        self.pool = layers.GlobalMaxPool1D()

        self.lstm = layers.LSTM(nb_filters)

        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def embed_with_bert(self, all_tokens):
        """Run the BERT layer on a stacked input tensor.

        Args:
            all_tokens: tensor indexed as ``[batch, row, position]``; rows 0, 1
                and 2 are passed to BERT as its three inputs, in that order.
                In this notebook they hold token ids, mask and segment ids.

        Returns:
            The second output of the BERT layer; the first one is discarded.
            NOTE(review): presumably the per-token sequence output — confirm
            against the TF-Hub model page.
        """
        _, embs = self.bert_layer([all_tokens[:, 0, :],
                                   all_tokens[:, 1, :],
                                   all_tokens[:, 2, :]])
        return embs

    def call(self, inputs, training=None):
        """Forward pass.

        Args:
            inputs: stacked BERT inputs, see ``embed_with_bert``.
            training: forwarded to the dropout layer. Defaults to ``None`` so
                the model can also be called without an explicit flag.

        Returns:
            The output of the final dense layer (sigmoid or softmax
            activations, depending on ``nb_classes``).
        """
        x = self.embed_with_bert(inputs)

        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        x_4 = self.lstm(x)

        merged = tf.concat([x_1, x_2, x_3, x_4], axis=-1) # (batch_size, 4 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)
        return output

In [None]:
# Hyper-parameters for the model and the training run.
# Filters per Conv1D branch (the model also uses this as the LSTM size).
NB_FILTERS = 100
# Width of the hidden dense layer.
FFN_UNITS = 256
# Size of the output layer.
# NOTE(review): presumably has to equal the number of one-hot columns in
# y_train — confirm against the data.
NB_CLASSES = 202

DROPOUT_RATE = 0.2

# Mini-batch size and number of epochs for the full training run.
BATCH_SIZE = 32
NB_EPOCHS = 5

In [None]:
# Instantiate the classifier with the hyper-parameters configured above.
model = DCNNBERTEmbedding(nb_filters=NB_FILTERS,
                         FFN_units=FFN_UNITS,
                         nb_classes=NB_CLASSES,
                         dropout_rate=DROPOUT_RATE)

In [None]:
# Categorical cross-entropy matches the one-hot encoded labels (y_train / y_test)
# and the softmax output layer; accuracy is tracked as the metric.
model.compile(loss="categorical_crossentropy",
             optimizer="adam",
             metrics=["accuracy"])

In [None]:
# Run a single epoch on the first 10 training examples only.
# NOTE(review): presumably a warm-up so the subclassed model's variables exist
# before load_weights() is called in the next cell — confirm.
model.fit(x_train[0:10], y_train[0:10], epochs=1)

In [None]:
# Restore weights from a previously saved checkpoint file.
model.load_weights('../input/bert-model/model.h5')

In [None]:
# Evaluate the restored model (loss and accuracy, as set in compile) on
# x_test / y_test, which were built from the validation split.
model.evaluate(x_test, y_test)

In [None]:
# Train on the full training set, validating on the val split after every epoch.
# BATCH_SIZE is passed explicitly so the value from the configuration cell is
# actually honoured; it used to be defined but never handed to fit().
model.fit(x_train,
          y_train,
          batch_size=BATCH_SIZE,
          epochs=NB_EPOCHS,
          validation_data=(x_test, y_test))

In [None]:
# Persist the current model weights to model.h5 in the working directory.
model.save_weights('model.h5')