In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

!pip install -q tf-nightly
!pip install -q tf-models-nightly

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks

In [None]:
# Locations of the competition CSV files, relative to the notebook's working directory.
train, test = (
    "../input/nlp-getting-started/train.csv",
    "../input/nlp-getting-started/test.csv",
)

In [None]:
# Keep only the two columns the rest of the notebook uses. Selecting them by
# name (rather than by position) fails loudly if the CSV layout ever changes.
data = pd.read_csv(train)[["text", "target"]]
data.shape

# BERT Tokenizer

In [None]:
# Cloud Storage folder that holds the checkpoint files used throughout the notebook.
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12"
# List the folder contents; the last expression is shown as the cell output.
bert_folder_listing = tf.io.gfile.listdir(gs_folder_bert)
bert_folder_listing

In [None]:
# Build the tokenizer from the vocab.txt stored in the checkpoint folder,
# with do_lower_case enabled.
vocab_path = os.path.join(gs_folder_bert, "vocab.txt")
tokenizer = bert.tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=True)

In [None]:
# Report how many entries the tokenizer vocabulary has.
vocab_entry_count = len(tokenizer.vocab)
print("Vocab size:", vocab_entry_count)
# Look up the ids of the two special tokens; shown as the cell output.
special_token_ids = tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]'])
special_token_ids

In [None]:
def encode_sentence(s, tokenizer):
    """Tokenize ``s`` and return its token ids followed by the ``[SEP]`` id.

    Args:
        s: Text to encode.
        tokenizer: Object exposing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        Whatever ``tokenizer.convert_tokens_to_ids`` returns for the tokens of
        ``s`` plus a trailing ``'[SEP]'`` token.
    """
    tokens = list(tokenizer.tokenize(s))
    tokens.append('[SEP]')
    return tokenizer.convert_tokens_to_ids(tokens)

# The tokenizer is passed explicitly so this definition has the same
# two-argument signature as the `encode_sentence` that `bert_encode` calls.
# With a one-argument version here, re-running this cell later would silently
# replace the helper and make `bert_encode` fail with a TypeError.
sentence = tf.ragged.constant([
    encode_sentence(s, tokenizer) for s in data['text']])

In [None]:
# Report the shape of the ragged batch as a plain Python list.
sentence_shape = sentence.shape.as_list()
print("Sentence shape:", sentence_shape)

In [None]:
# Prepend one [CLS] id to every encoded sentence, then plot the densified id matrix.
cls_ids = tokenizer.convert_tokens_to_ids(['[CLS]'])
cls = [cls_ids] * sentence.shape[0]
input_word_ids = tf.concat([cls, sentence], axis=-1)
_ = plt.pcolormesh(input_word_ids.to_tensor())



In [None]:
# Ones with the same ragged layout as the word ids, converted to a dense tensor.
ragged_ones = tf.ones_like(input_word_ids)
input_mask = ragged_ones.to_tensor()

plt.pcolormesh(input_mask)

# Encoder and classifier

In [None]:
def encode_sentence(s, tokenizer):
    """Return the token ids of ``s`` with the ``[SEP]`` token appended.

    ``tokenizer`` must provide ``tokenize`` and ``convert_tokens_to_ids``.
    """
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

def bert_encode(data, tokenizer):
    """Encode the ``"text"`` entries of ``data`` into the three BERT input tensors.

    Args:
        data: Object indexable by ``"text"`` (e.g. a DataFrame) holding the sentences.
        tokenizer: Tokenizer forwarded to ``encode_sentence``; also used to
            look up the ``[CLS]`` id.

    Returns:
        dict with dense tensors under the keys ``'input_word_ids'``,
        ``'input_mask'`` and ``'input_type_ids'``.
    """
    # Ragged batch: one row of token ids (ending in [SEP]) per sentence.
    sentence = tf.ragged.constant([
        encode_sentence(s, tokenizer)
        for s in np.array(data["text"])])

    # Prepend the [CLS] id to every row.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence.shape[0]
    input_word_ids = tf.concat([cls, sentence], axis=-1)

    # Ones wherever there is a real token; to_tensor() densifies the ragged rows.
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    # Every position ([CLS] and sentence tokens alike) gets type id 0.
    type_cls = tf.zeros_like(cls)
    type_s = tf.zeros_like(sentence)
    input_type_ids = tf.concat(
        [type_cls, type_s], axis=-1).to_tensor()

    return {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids}

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 train/validation/test split is
# reproducible across runs, then slice the shuffled frame by position.
shuffled = data.sample(frac=1, random_state=42)
train_end, val_end = int(.6 * len(data)), int(.8 * len(data))
d_train = shuffled.iloc[:train_end]
d_val = shuffled.iloc[train_end:val_end]
d_test = shuffled.iloc[val_end:]

In [None]:
# Show the shape of each of the three splits.
for split_label, split_frame in (("train ", d_train), ("val ", d_val), ("test ", d_test)):
    print(split_label, split_frame.shape)

In [None]:
# Encode every split into BERT inputs ...
data_train, data_validation, data_test = (
    bert_encode(split, tokenizer) for split in (d_train, d_val, d_test))

# ... and keep each split's 'target' column as its labels.
data_train_labels, data_validation_labels, data_test_labels = (
    split['target'] for split in (d_train, d_val, d_test))

In [None]:
# Print the shape of every encoded training input, then of the label vector.
for key, value in data_train.items():
    print(f'{key:15s} shape: {value.shape}')

# The label now names the variable actually printed; it previously read
# "glue_train_labels", a name that does not exist in this notebook.
print(f'data_train_labels shape: {data_train_labels.shape}')

# Model

In [None]:

# Encoder hyper-parameters, turned into a BertConfig object below.
config_dict = dict(
    attention_probs_dropout_prob=0.1,
    hidden_act='gelu',
    hidden_dropout_prob=0.1,
    hidden_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=2,
    vocab_size=30522,
)
bert_config = bert.configs.BertConfig.from_dict(config_dict)

# Echo the settings as the cell output.
config_dict

In [None]:
# Build the classifier and the encoder it wraps from the config, with num_labels=1.
classifier_and_encoder = bert.bert_models.classifier_model(bert_config, num_labels=1)
bert_classifier, bert_encoder = classifier_and_encoder

In [None]:
# Smoke test: run the classifier on the first 10 training examples.
data_batch = {}
for key, val in data_train.items():
    data_batch[key] = val[:10]

sample_output = bert_classifier(data_batch, training=True)
sample_output.numpy()

In [None]:
# Restore the encoder weights from the checkpoint folder and assert that the
# restore fully matched (assert_consumed raises otherwise).
checkpoint = tf.train.Checkpoint(model=bert_encoder)
checkpoint_path = os.path.join(gs_folder_bert, 'bert_model.ckpt')
restore_status = checkpoint.restore(checkpoint_path)
restore_status.assert_consumed()

In [None]:
# Set up epochs and steps
epochs = 3
batch_size = 32
eval_batch_size = 32

train_data_size = len(data_train_labels)
# Keras `fit` also trains on the final partial batch, so one epoch takes
# ceil(N / batch_size) optimizer steps. Rounding down would make the
# learning-rate schedule end a few steps before training does.
steps_per_epoch = int(np.ceil(train_data_size / batch_size))
num_train_steps = steps_per_epoch * epochs
# Warm-up length: 10% of (epochs * examples / batch_size), rounded down.
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(
    2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

In [None]:
# Binary cross-entropy configured with from_logits=True, tracked with binary accuracy.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True,name='binary_crossentropy')
metrics = [tf.keras.metrics.BinaryAccuracy('binary_accuracy', dtype=tf.float32)]
bert_classifier.compile(optimizer=optimizer, loss=loss, metrics=metrics)


In [None]:
# Fine-tune on the training split, evaluating on the validation split.
# Uses the shared `batch_size` setting (not a second hard-coded 32) so the
# number of steps actually run stays in sync with the learning-rate schedule
# that was computed from the same value.
bert_classifier.fit(
    data_train, data_train_labels,
    validation_data=(data_validation, data_validation_labels),
    batch_size=batch_size,
    epochs=epochs)

# Evaluation and Accuracy

In [None]:
# Run the fine-tuned classifier over the held-out test split via Keras predict().
y_pred = bert_classifier.predict(data_test)

In [None]:
# Display the predictions returned by predict() for the held-out split.
y_pred

In [None]:
# Call the model directly on the whole held-out split in one go, with training=False.
# NOTE(review): this is a single un-batched forward pass — confirm it fits in memory.
result = bert_classifier(data_test, training=False)


In [None]:
 # Display the outputs of the direct model call on the held-out split.
 result

In [None]:
# Threshold each model output at 0: positive values become class 1, the rest class 0.
check = [1 if logit > 0 else 0 for logit in result]

In [None]:
from sklearn.metrics import classification_report
# scikit-learn expects (y_true, y_pred): ground-truth labels first, predictions
# second. Passing them the other way round swaps precision with recall (and
# reports the support of the predictions instead of the true classes).
print(classification_report(data_test_labels, check))

In [None]:
# Export the fine-tuned classifier to a local directory.
export_dir = './saved_model'
tf.saved_model.save(bert_classifier, export_dir)

In [None]:
# Reload the exported model from the same local directory.
export_dir = './saved_model'
clf = tf.saved_model.load(export_dir=export_dir)

In [None]:
# Shell out to list the contents of the export's assets/ sub-directory.
!ls saved_model/assets/

# Submission

In [None]:
# Load the competition test CSV; its rows are the ones scored in the submission.
sub = pd.read_csv(test)

In [None]:
# Preview the first rows of the submission input.
sub.head()

In [None]:
# Encode the submission texts with the same bert_encode helper used for the training splits.
data_sub = bert_encode(sub, tokenizer)

In [None]:
# Print the shape of each encoded submission tensor.
for key in data_sub:
    value = data_sub[key]
    print(f'{key:15s} shape: {value.shape}')


In [None]:
# Feed the three encoded inputs to the reloaded model as a list (word ids,
# mask, type ids — in that order) with training=False.
submission_inputs = [
    data_sub[name]
    for name in ('input_word_ids', 'input_mask', 'input_type_ids')]
sub_val = clf(submission_inputs, training=False)


In [None]:
# Display the reloaded model's outputs for the submission rows.
sub_val

In [None]:
# Same thresholding as for the held-out split: positive output -> 1, otherwise 0.
target = [1 if logit > 0 else 0 for logit in sub_val]

In [None]:
# Preview only the first few predicted labels instead of dumping the whole list
# into the notebook output.
target[:10]


In [None]:
# The competition's submission format uses the lower-case column names `id` and
# `target` (as in sample_submission.csv); a capitalised 'Target' header does not match it.
my_submission = pd.DataFrame({'id': sub.id, 'target': target})
my_submission.to_csv('submission.csv', index=False)