In [None]:
import pandas as pd
import numpy as np
import math
import re
import os
import random

In [None]:
# Load the training split from the relative ../input directory.
train=pd.read_csv('../input/questions-chapter-classification/train.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

In [None]:
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

In [None]:
# Drop the question-id column. Rebinding the name (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run after the
# column has already been removed.
train = train.drop(columns=['q_id'], errors='ignore')

In [None]:
from nltk.corpus import stopwords
# NLTK's English stop-word list; a later cell extends it with punctuation.
stop_words = stopwords.words('english')

In [None]:
# Re-index the raw class labels 6..12 to 0..6. Values outside the mapping
# become NaN, so this cell must only be run once per load of `train`.
label_to_index = {raw_label: raw_label - 6 for raw_label in range(6, 13)}
train['class'] = train['class'].map(label_to_index)

In [None]:
import string
# Treat every ASCII punctuation character as a stop word too (appended in place).
stop_words += list(string.punctuation)

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
def stopword_remove(sent):
    """Return `sent` with stop words removed.

    The text is split with NLTK's `word_tokenize`; every token that appears
    in the module-level `stop_words` list is discarded and the remaining
    tokens are joined back together with single spaces.
    """
    kept_tokens = filter(lambda word: word not in stop_words,
                         word_tokenize(sent))
    return ' '.join(kept_tokens)

In [None]:
# Strip stop words and punctuation tokens from the question text.
train['eng'] = train['eng'].apply(stopword_remove)

In [None]:
# Apply the same stop-word removal to the chapter text.
train['chapter'] = train['chapter'].apply(stopword_remove)

In [None]:
# Build a WordPiece tokenizer from the assets shipped with the TF-Hub BERT
# module: the vocabulary file path and the lower-casing flag are both read
# from the loaded layer so the tokenizer matches the encoder.
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False,
)
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [None]:
def encode_sentence(eng,chapter):
    """Build the token sequence for a (question, chapter) pair.

    The layout follows BERT's sentence-pair input format:
    ``[CLS] <eng tokens> [SEP] <chapter tokens> [SEP]``.

    Args:
        eng: question text, tokenized with the module-level `tokenizer`.
        chapter: chapter text, tokenized the same way.

    Returns:
        A list of token strings.
    """
    eng_tokens = tokenizer.tokenize(eng)
    chapter_tokens = tokenizer.tokenize(chapter)
    # The closing [SEP] terminates the second segment; the pre-trained encoder
    # expects both segments of a pair to end with it.
    return ["[CLS]"] + eng_tokens + ["[SEP]"] + chapter_tokens + ["[SEP]"]

In [None]:
# Token sequence for every training row, pairing question and chapter text.
data_inputs = list(map(encode_sentence, train['eng'], train['chapter']))

In [None]:
def get_ids(tokens):
    """Convert token strings to vocabulary ids with the module-level `tokenizer`."""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

def get_mask(tokens):
    """Return an integer array holding 0 at every "[PAD]" token and 1 elsewhere."""
    is_padding = np.char.equal(tokens, "[PAD]")
    return np.logical_not(is_padding).astype(int)

def get_segments(tokens):
    """Assign a segment id (0 or 1) to every token.

    The id starts at 0 and flips after each "[SEP]" token; the "[SEP]" itself
    keeps the id of the segment it closes.
    """
    segment_ids = []
    active_segment = 0
    for token in tokens:
        segment_ids.append(active_segment)
        if token == "[SEP]":
            # Toggle between 0 and 1 for the tokens that follow.
            active_segment ^= 1
    return segment_ids

In [None]:
# Re-indexed class labels as a NumPy array, aligned with `data_inputs` by position.
data_labels = train['class'].to_numpy()

In [None]:
# Attach the label and the token count to every sequence.
data_with_len = [[tokens, label, len(tokens)]
                 for tokens, label in zip(data_inputs, data_labels)]

# Shuffle first, then sort by length: the sort is stable, so sequences of
# equal length stay in random order while similar lengths end up adjacent.
random.shuffle(data_with_len)
data_with_len.sort(key=lambda entry: entry[-1])

# Keep only sequences with more than 3 and fewer than 512 tokens, and turn
# each one into ([ids, mask, segment ids], label).
sorted_all = []
for tokens, label, length in data_with_len:
    if not (3 < length < 512):
        continue
    features = [get_ids(tokens), get_mask(tokens), get_segments(tokens)]
    sorted_all.append((features, label))

In [None]:
# A list is iterable, so a callable that returns it can act as the generator
# feeding the dataset.
def _iter_examples():
    return sorted_all

all_dataset = tf.data.Dataset.from_generator(
    _iter_examples,
    output_types=(tf.int32, tf.int32),
)

In [None]:
# Pull the first (features, label) element to sanity-check the dataset.
next(iter(all_dataset))

In [None]:
BATCH_SIZE = 32
# Features are 3 rows (ids, mask, segment ids) of variable length, labels are
# scalars. Both are padded with 0, so padded positions get mask value 0.
all_batched = all_dataset.padded_batch(BATCH_SIZE,
                                       padded_shapes=((3, None), ()),
                                       padding_values=(0, 0))

In [None]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10  # hold out roughly 10% of the batches

# Dataset.shuffle returns a NEW dataset, so its result has to be kept.
# Without it the batches stay sorted by length and the held-out split would
# contain only the shortest sequences.
# reshuffle_each_iteration=False (plus a fixed seed) freezes the shuffled
# order, which keeps take() and skip() disjoint on every pass over the data.
SHUFFLE_SEED = 42
shuffled_batches = all_batched.shuffle(NB_BATCHES,
                                       seed=SHUFFLE_SEED,
                                       reshuffle_each_iteration=False)
test_dataset = shuffled_batches.take(NB_BATCHES_TEST)
train_dataset = shuffled_batches.skip(NB_BATCHES_TEST)

In [None]:
class DCNNBERTEmbedding(tf.keras.Model):
    """CNN text classifier running on top of a frozen TF-Hub BERT layer.

    Three parallel ``Conv1D`` branches (kernel sizes 2, 3 and 4) scan the BERT
    output. Each branch is reduced with global max pooling, and the
    concatenated features pass through a dense layer, dropout and the final
    classification layer.

    Args:
        nb_filters: number of filters in each convolution branch.
        FFN_units: width of the hidden dense layer.
        nb_classes: number of target classes; exactly 2 selects a single
            sigmoid unit, any other value a softmax over ``nb_classes`` units.
        dropout_rate: rate handed to the dropout layer.
        name: Keras model name.
    """

    def __init__(self,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=7,
                 dropout_rate=0.1,
                 name="dcnn"):
        super().__init__(name=name)

        # Pre-trained encoder, loaded with trainable=False.
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=False)

        # The three n-gram branches differ only in their kernel size.
        conv_options = dict(filters=nb_filters,
                            padding="valid",
                            activation="relu")
        self.bigram = layers.Conv1D(kernel_size=2, **conv_options)
        self.trigram = layers.Conv1D(kernel_size=3, **conv_options)
        self.fourgram = layers.Conv1D(kernel_size=4, **conv_options)

        # One pooling layer is shared by all branches.
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)

        is_binary = nb_classes == 2
        self.last_dense = layers.Dense(
            units=1 if is_binary else nb_classes,
            activation="sigmoid" if is_binary else "softmax")

    def embed_with_bert(self, all_tokens):
        """Run the BERT layer on a batch and return its second output.

        Args:
            all_tokens: tensor indexed as [batch, row, position]; rows 0, 1
                and 2 are handed to the BERT layer in that order.
        """
        first_row = all_tokens[:, 0, :]
        second_row = all_tokens[:, 1, :]
        third_row = all_tokens[:, 2, :]
        # Presumably the per-token sequence output (the Conv1D branches need a
        # sequence axis) - TODO confirm against the hub module's documentation.
        _, embs = self.bert_layer([first_row, second_row, third_row])
        return embs

    def call(self, inputs, training):
        """Forward pass: BERT output -> n-gram convolutions -> dense head."""
        embedded = self.embed_with_bert(inputs)

        # Convolve and max-pool each n-gram branch, in bigram/trigram/fourgram order.
        pooled_branches = [self.pool(branch(embedded))
                           for branch in (self.bigram, self.trigram, self.fourgram)]

        features = tf.concat(pooled_branches, axis=-1)  # (batch_size, 3 * nb_filters)
        hidden = self.dense_1(features)
        hidden = self.dropout(hidden, training=training)
        return self.last_dense(hidden)

In [None]:
# Hyperparameters for the CNN head built on top of BERT.
NB_FILTERS = 100  # filters per convolution branch
FFN_UNITS = 256  # width of the hidden dense layer
NB_CLASSES = 7  # matches the 0..6 labels produced by the class mapping

DROPOUT_RATE = 0.2

# Same value as the BATCH_SIZE assigned in the batching cell; the datasets
# passed to fit() are already batched there.
BATCH_SIZE = 32
NB_EPOCHS = 2

In [None]:
# Instantiate the classifier with the hyperparameters defined above.
Dcnn = DCNNBERTEmbedding(nb_filters=NB_FILTERS,
                         FFN_units=FFN_UNITS,
                         nb_classes=NB_CLASSES,
                         dropout_rate=DROPOUT_RATE)

In [None]:
# The loss has to match the output layer: one sigmoid unit for two classes,
# otherwise a softmax trained against integer labels.
loss_name = ("binary_crossentropy" if NB_CLASSES == 2
             else "sparse_categorical_crossentropy")
Dcnn.compile(loss=loss_name,
             optimizer="adam",
             metrics=["accuracy"])

In [None]:
# Train on the training batches; the held-out batches serve as validation data.
Dcnn.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=NB_EPOCHS,
)

In [None]:
# Evaluate on the held-out batches (the same dataset used as validation_data
# during fit) and print the returned loss and metric values.
results = Dcnn.evaluate(test_dataset)
print(results)

In [None]:
# Load the separate validation split from the relative ../input directory.
val=pd.read_csv('../input/questions-chapter-classification/val.csv')

In [None]:
# Re-index the raw class labels 6..12 to 0..6, exactly as done for the
# training frame. Values outside the mapping become NaN, so run this only
# once per load of `val`.
label_to_index = {raw_label: raw_label - 6 for raw_label in range(6, 13)}
val['class'] = val['class'].map(label_to_index)

In [None]:
# Preview the first ten rows of the validation frame.
val.head(10)

In [None]:
def get_prediction(eng,chapter):
    """Predict the class index for one (question, chapter) pair.

    The texts go through the same preprocessing as the training data
    (`stopword_remove`, then `encode_sentence`), are packed into a batch of
    one, and are scored with the module-level `Dcnn` model in inference mode.

    Returns:
        The argmax over the last axis of the model output for this example.
    """
    tokens = encode_sentence(stopword_remove(eng), stopword_remove(chapter))

    # Rows in the order the model slices them: ids, mask, segment ids.
    feature_rows = [tf.cast(row, dtype=tf.int32)
                    for row in (get_ids(tokens),
                                get_mask(tokens),
                                get_segments(tokens))]
    single_batch = tf.expand_dims(tf.stack(feature_rows, axis=0), 0)  # simulates a batch

    scores = Dcnn(single_batch, training=False)
    best_class = tf.math.argmax(scores, axis=-1)

    return tf.keras.backend.get_value(best_class)[0]

In [None]:
# Predict one validation row at a time, keeping the frame's row order.
pred = [get_prediction(eng, chapter)
        for eng, chapter in zip(val['eng'], val['chapter'])]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
# Overall accuracy on the validation split, with arguments in (y_true, y_pred) order.
print(accuracy_score(val['class'], pred))

In [None]:
# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision and recall for every class, so the ground-truth
# labels must come first.
print(classification_report(val['class'], pred))