In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to install and a pre-trained BERT checkpoint to download
!pip install bert-for-tf2
!pip install h5py
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import tensorflow as tf
import keras
from keras.models import model_from_json
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight as cw

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found there.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
os.makedirs("./model", exist_ok=True)
!mv uncased_L-12_H-768_A-12/ ./model


In [None]:
# CSV inputs located in the "sarcasm-preprocessed" directory under ../input.
# Going by the file names, TRAINING_PATH is the training split and TESTING_PATH the test split.
TRAINING_PATH = "../input/sarcasm-preprocessed/sarcastic_preprocessed_train.csv"
TESTING_PATH = "../input/sarcasm-preprocessed/sarcastic_preprocessed_test.csv"

In [None]:
class SentimentAnalysisData:
    """Tokenise text/label data frames into fixed-length BERT input-id matrices.

    Every row's text is tokenised with the supplied tokenizer, wrapped in
    ``[CLS]`` ... ``[SEP]`` and converted to vocabulary ids.  After construction:

    * ``train_x`` / ``test_x`` are 2-D arrays with ``max_seq_len`` columns,
      zero-padded on the right;
    * ``train_y`` / ``test_y`` hold the values of the ``label`` column;
    * ``train_sample_weights`` holds the values of the ``sample_weight`` column;
    * ``max_seq_len`` is the longest encoded row, capped by the ``max_seq_len``
      constructor argument.

    Rows that cannot be read or tokenised are skipped (best effort), and the
    number of skipped rows is printed instead of being silently ignored.
    """
    DATA_COLUMN = "text"
    LABEL_COLUMN = "label"
    SAMPLE_WEIGHT = "sample_weight"

    def __init__(self, train, test, tokenizer: "FullTokenizer", max_seq_len=192):
        """Encode and pad both data frames.

        Args:
            train: frame with ``text``, ``label`` and ``sample_weight`` columns.
            test: frame with ``text`` and ``label`` columns.
            tokenizer: object exposing ``tokenize(text)`` and
                ``convert_tokens_to_ids(tokens)``.
            max_seq_len: upper bound for the padded sequence length.
        """
        self.tokenizer = tokenizer
        # Grows while rows are encoded; capped further down.
        self.max_seq_len = 0

        self.train_x, self.train_y, self.train_sample_weights = self._prepare_train(train)
        self.test_x, self.test_y = self._prepare_test(test)

        print("sample_weights", self.train_sample_weights.shape)
        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

    def _encode(self, text):
        """Return the ids of ``[CLS] <tokens of text> [SEP]`` and track the longest row."""
        tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        self.max_seq_len = max(self.max_seq_len, len(token_ids))
        return token_ids

    def _prepare(self, df, columns):
        """Encode the text column of ``df`` and collect the values of ``columns``.

        Returns:
            ``(token_id_lists, arrays)``: a plain list with one id list per kept
            row, and one array per requested column, aligned with that list.
            The id lists stay a Python list on purpose: they have different
            lengths, and turning them into an array either fails (ragged input)
            or, when all lengths happen to match, yields a 2-D array.
        """
        token_id_lists = []
        collected = [[] for _ in columns]
        skipped, first_error = 0, None
        for _, row in tqdm(df.iterrows()):
            try:
                row_values = [row[column] for column in columns]
                token_ids = self._encode(row[SentimentAnalysisData.DATA_COLUMN])
            except Exception as error:
                # Best effort: drop the row (e.g. missing text), but keep count.
                skipped += 1
                if first_error is None:
                    first_error = error
                continue
            token_id_lists.append(token_ids)
            for bucket, value in zip(collected, row_values):
                bucket.append(value)
        if skipped:
            print("skipped", skipped, "rows that could not be encoded; first error:", repr(first_error))
        return token_id_lists, [np.array(bucket) for bucket in collected]

    def _prepare_test(self, df):
        """Return ``(token_id_lists, labels)`` for a frame with text and label columns."""
        x, (y,) = self._prepare(df, [SentimentAnalysisData.LABEL_COLUMN])
        return x, y

    def _prepare_train(self, df):
        """Return ``(token_id_lists, labels, sample_weights)`` for the training frame."""
        x, (y, z) = self._prepare(
            df, [SentimentAnalysisData.LABEL_COLUMN, SentimentAnalysisData.SAMPLE_WEIGHT]
        )
        return x, y, z

    def _pad(self, ids):
        """Return ``ids`` as a 2-D array whose rows all have ``self.max_seq_len`` entries.

        Rows that fit are kept whole and right-padded with 0.  Longer rows are
        cut to ``self.max_seq_len`` while keeping their final id, which is the
        ``[SEP]`` id for rows produced by ``_encode``.
        """
        width = self.max_seq_len
        rows = []
        for input_ids in ids:
            # list() also accepts ndarray rows, for which `+` would be element-wise addition.
            input_ids = list(input_ids)
            if len(input_ids) > width:
                input_ids = (input_ids[:width - 1] + input_ids[-1:]) if width > 0 else []
            rows.append(np.array(input_ids + [0] * (width - len(input_ids))))
        return np.array(rows)

In [None]:
# Locations of the pre-trained BERT checkpoint files inside ./model.
BERT_MODEL_NAME = "uncased_L-12_H-768_A-12"
bert_ckpt_dir = os.path.join("./model/", BERT_MODEL_NAME)
BERT_CKPT_FILE = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
VOCAB_PATH = os.path.join(bert_ckpt_dir, "vocab.txt")
# Build the tokenizer from VOCAB_PATH rather than from a second hard-coded copy of the
# same path, so changing BERT_MODEL_NAME cannot leave the tokenizer on a stale vocabulary.
tokenizer = FullTokenizer(vocab_file=VOCAB_PATH)

In [None]:
    
# # detect and init the TPU
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
# tf.config.experimental_connect_to_cluster(tpu)
# tf.tpu.experimental.initialize_tpu_system(tpu)

# # instantiate a distribution strategy
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)


In [None]:

def create_model(max_seq_len, num_classes, bert_ckpt_file = BERT_CKPT_FILE):
    """Build a classifier on top of a frozen, pre-trained BERT layer.

    The BERT layer is configured from the module-level ``bert_config_file``.
    The vector at sequence position 0 feeds a dropout / tanh-dense / dropout
    head that ends in a softmax over ``num_classes``.

    Args:
        max_seq_len: number of token ids per input row.
        num_classes: width of the final softmax layer.
        bert_ckpt_file: checkpoint from which the stock BERT weights are loaded.

    Returns:
        The built ``keras.Model``; its BERT layer is marked non-trainable.
    """
    with tf.io.gfile.GFile(bert_config_file, "r") as config_reader:
        stock_config = StockBertConfig.from_json_string(config_reader.read())
    bert_layer = BertModelLayer.from_params(
        map_stock_config_to_params(stock_config), name="bert"
    )

    token_input = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    sequence_output = bert_layer(token_input)
    print("bert shape", sequence_output.shape)

    # Keep only position 0 of every sequence (the [CLS] slot when rows are built
    # as [CLS] ... [SEP]).
    head = keras.layers.Lambda(lambda seq: seq[:, 0, :])(sequence_output)
    head = keras.layers.Dropout(0.5)(head)
    head = keras.layers.Dense(units=1024, activation="tanh")(head)
    head = keras.layers.Dropout(0.2)(head)
    class_probabilities = keras.layers.Dense(units=num_classes, activation="softmax")(head)

    classifier = keras.Model(inputs=token_input, outputs=class_probabilities)
    # Freeze BERT so that only the head is trained.
    bert_layer.trainable = False
    classifier.build(input_shape=(None, max_seq_len))
    load_stock_weights(bert_layer, bert_ckpt_file)

    return classifier

In [None]:
# Checkpoint callback: writes weights only, and only when the monitored 'val_acc'
# value is the highest seen so far (mode='max').
checkpoint_filepath = '/tmp/checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# with tpu_strategy.scope():
MAX_SEQ_LEN = 100
# Width of the classifier's output layer (number of label classes).
NUM_CLASSES = 5
tokenizer = FullTokenizer(vocab_file=VOCAB_PATH)

train_val = pd.read_csv(TRAINING_PATH)
# Read the held-out file: loading TRAINING_PATH here as well made the reported
# "test" accuracy a second training-set accuracy.
# NOTE(review): assumes the test CSV has the same "text" and "label" columns - confirm.
test = pd.read_csv(TESTING_PATH)
data = SentimentAnalysisData(train_val, test, tokenizer, max_seq_len=MAX_SEQ_LEN)

model = create_model(data.max_seq_len, NUM_CLASSES)
model.summary()
model.compile(
    optimizer=keras.optimizers.Adam(1e-5),
    # create_model ends in a softmax layer, so the loss receives probabilities,
    # not logits; from_logits=True would apply softmax a second time.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)


# Model weights are saved at the end of every epoch, if it's the best seen
# so far.
history = model.fit(
    x=data.train_x,
    y=data.train_y,
    validation_split=0.2,
    batch_size=32,
    sample_weight=data.train_sample_weights,
    shuffle=True,
    epochs=12,
    verbose=1,
    callbacks=[model_checkpoint_callback]
)

# The model weights (that are considered the best) are loaded into the model.
model.load_weights(checkpoint_filepath)

_, test_acc = model.evaluate(data.test_x, data.test_y)
_, train_acc = model.evaluate(data.train_x, data.train_y)
# Report both numbers; train_acc used to be computed and then dropped.
print("Train Accuracy:" + str(train_acc))
print("Test Accuracy:" + str(test_acc))

In [None]:
# Persist the trained model as two files: the architecture as JSON, the weights as HDF5.
json_file = model.to_json()
json_file_path = "model_json.json"
with open(json_file_path, "w") as saved_model:
    saved_model.write(json_file)
# serialize weights to HDF5
# h5_file was never defined, so save_weights raised NameError; give it an explicit path.
h5_file = "model_weights.h5"
model.save_weights(h5_file)