In [None]:
# !git clone --depth 1 -b v2.5.0 https://github.com/tensorflow/models.git
# !pip install -Uqr models/official/requirements.txt
# !pip install lime

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_hub as hub
import sys
sys.path.append('models')

from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization

from lime import lime_text
from lime.lime_text import LimeTextExplainer

In [None]:
# Report the runtime environment: library versions, eager execution and GPU visibility.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
environment_report = [
    ("TF Version: ", tf.__version__),
    ("Eager mode: ", tf.executing_eagerly()),
    ("Hub version: ", hub.__version__),
    ("GPU is", "available" if gpu_devices else "NOT AVAILABLE"),
]
for heading, detail in environment_report:
    print(heading, detail)

In [None]:
# --- Configuration ---------------------------------------------------------
TARGET = "target"                      # name of the label column in the training CSV
SEED = 42                              # shared seed: split, shuffle, LIME and global RNGs
class_names = ['Normal', 'disaster']   # display names for label 0 and label 1 (used by LIME)

TRAIN_PATH = "../input/nlp-getting-started/train.csv"
TEST_PATH = "../input/nlp-getting-started/test.csv"

# Seed the global NumPy / TensorFlow generators so weight initialisation and
# dropout are repeatable between runs (some GPU kernels may still be
# non-deterministic).
np.random.seed(SEED)
tf.random.set_seed(SEED)

# --- Load data ---------------------------------------------------------------
df = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)
df_original = df.copy()   # untouched copy, kept for comparison after preprocessing
df.head()

# Analysis

In [None]:
# Histogram of the label column to check class balance.
df[TARGET].plot.hist(title='Target distribution');

In [None]:
# Share of each class among all training rows.
df[TARGET].value_counts().div(len(df))

In [None]:
# Histogram of tweet lengths measured in characters.
df['text'].map(len).plot.hist(title='length distribution');

In [None]:
# (rows, columns) of the training frame.
df.shape

# Preprocessing

In [None]:
# (pattern, replacement) pairs, applied in order. The specific contractions
# come first so that e.g. "can't" becomes "cannot" before the generic n't
# rule can turn it into "ca not". Both sides are raw strings: the group
# reference \g<1> is not a valid escape in a normal string literal.
# Note: the 's rule also rewrites possessives ("Obama's" -> "Obama is").
replacement_patterns = [
    (r"won't", r'will not'),
    (r"can't", r'cannot'),
    (r"i'm", r'i am'),
    (r"ain't", r'is not'),
    (r"(\w+)'ll", r'\g<1> will'),
    (r"(\w+)n't", r'\g<1> not'),
    (r"(\w+)'ve", r'\g<1> have'),
    (r"(\w+)'s", r'\g<1> is'),
    (r"(\w+)'re", r'\g<1> are'),
    (r"(\w+)'d", r'\g<1> would'),
]

class RegexpReplacer(object):
    """Applies a list of regex substitutions to a text, in order.

    Args:
        patterns: iterable of (regex, replacement) pairs. A regex may be a
            string or an already compiled pattern; compiled patterns are
            used as given.
        flags: `re` flags used when compiling string patterns. Defaults to
            `re.IGNORECASE` so capitalised contractions ("Can't", "I'm",
            "Won't") are expanded like their lower-case forms. Pass `0` for
            case-sensitive matching.
    """
    def __init__(self, patterns=replacement_patterns, flags=re.IGNORECASE):
        self.patterns = [
            (re.compile(regex, flags) if isinstance(regex, (str, bytes)) else regex, repl)
            for (regex, repl) in patterns
        ]

    def replace(self, text):
        """Return `text` after applying every substitution in sequence."""
        s = text

        for (pattern, repl) in self.patterns:
            s = pattern.sub(repl, s)

        return s

class SpellingReplacer(object):
    """ Replaces misspelled words with a likely suggestion based on shortest
    edit distance

    Args:
        dict_name: language tag passed to `enchant.Dict` when no dictionary
            is supplied.
        max_dist: largest edit distance at which the top suggestion is
            accepted as a replacement.
        spell_dict: optional object exposing `check(word)` and
            `suggest(word)`; when given it is used instead of building an
            enchant dictionary.
    """
    def __init__(self, dict_name='en', max_dist=2, spell_dict=None):
        # NOTE(review): `enchant` (pyenchant) is not imported anywhere in the
        # visible notebook; add it to the imports cell before relying on the
        # default dictionary, or pass `spell_dict` explicitly.
        self.spell_dict = enchant.Dict(dict_name) if spell_dict is None else spell_dict
        self.max_dist = max_dist

    @staticmethod
    def _edit_distance(a, b):
        """Levenshtein distance between `a` and `b` (unit cost per edit)."""
        previous = list(range(len(b) + 1))
        for i, ch_a in enumerate(a, start=1):
            current = [i]
            for j, ch_b in enumerate(b, start=1):
                cost = 0 if ch_a == ch_b else 1
                current.append(min(previous[j] + 1,          # deletion
                                   current[j - 1] + 1,       # insertion
                                   previous[j - 1] + cost))  # substitution
            previous = current
        return previous[-1]

    def replace(self, word):
        """Return `word`, or the top suggestion if it is within `max_dist` edits."""
        # Nothing to check for an empty token.
        if not word:
            return word

        if self.spell_dict.check(word):
            return word

        suggestions = self.spell_dict.suggest(word)

        if suggestions and self._edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word

# Build the replacer once; creating it inside clean_tweet would rebuild the
# whole pattern list for every row.
_contraction_replacer = RegexpReplacer()

def clean_tweet(text):
    """Return `text` with English contractions expanded (e.g. "can't" -> "cannot")."""
    # URL stripping is currently switched off:
    # text = re.sub(r'http\S+', ' ', text)
    return _contraction_replacer.replace(text)

# Clean both frames so the model sees identically preprocessed text at
# training time and at prediction time.
df['text'] = df['text'].apply(clean_tweet)
df_test['text'] = df_test['text'].apply(clean_tweet)

In [None]:
# Class counts taken from the untouched copy of the training data.
df_original[TARGET].value_counts()

In [None]:
# max_class_count = max(df[TARGET].value_counts())
# min_class_count = min(df[TARGET].value_counts())
# diff_class_count = max_class_count - min_class_count
# df_disaster = df[df[TARGET] == 1]
# df_sample = df_disaster.sample(diff_class_count, random_state=SEED)

# df = df.append(df_sample)

# df[TARGET].value_counts()

In [None]:
# Hold out 10% of the labelled tweets for validation; stratifying on the
# target keeps the class ratio the same in both parts.
df_train, df_val = train_test_split(
    df,
    test_size=0.1,
    stratify=df[TARGET].values,
    random_state=SEED,
)
df_train.shape, df_val.shape, df_test.shape

In [None]:
with tf.device("/cpu:0"):
  # (text, label) pairs for the labelled splits, bare text for the test set.
  from_slices = tf.data.Dataset.from_tensor_slices
  data_train = from_slices((df_train['text'], df_train[TARGET]))
  data_val = from_slices((df_val['text'], df_val[TARGET]))
  data_test = from_slices(df_test['text'])

In [None]:
# Uncased BERT encoder from TF Hub (the L-24 / H-1024 / A-16 variant, per the
# handle name), loaded as a trainable layer so it is fine-tuned end to end.
# Smaller alternative: "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"
model_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/2"
bert_layer = hub.KerasLayer(handle=model_url, trainable=True)

In [None]:
# Read the vocabulary file and casing flag stored with the hub module, so the
# tokenizer is built from the same assets as the encoder.
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [None]:
# Count tokens with the full tokenizer (basic tokenisation + WordPiece).
# Calling wordpiece_tokenizer directly skips lower-casing and punctuation
# splitting, so its counts do not match the sequences fed to the model and
# would mislead the choice of max_seq_length.
# The model input also carries the special [CLS] and [SEP] tokens on top of
# these counts.
text_length = df['text'].apply(lambda x: len(tokenizer.tokenize(x)))

plt.figure(figsize=(10, 8))
# NOTE: distplot is deprecated in seaborn >= 0.11; histplot(..., kde=True) is its successor.
sns.distplot(text_length)
print(f'max lenth of text: {max(text_length)}')
print(f'avg lenth of text: {(text_length.mean())}');

In [None]:
# Settings shared by the feature conversion, the tf.data pipelines and the model.
label_list = [0, 1] # Label categories
max_seq_length = 40 # maximum length of (token) input sequences
batch_size = 32 # examples per batch in the tf.data pipelines

In [None]:
def to_feature(text, label, label_list=label_list, max_seq_length=max_seq_length, tokenizer=tokenizer):
  """Convert one labelled example (eager tensors) into BERT input features.

  Meant to be called through tf.py_function, hence the .numpy() calls.
  Returns (input_ids, input_mask, segment_ids, label_id).
  """
  single_example = classifier_data_lib.InputExample(
      guid=None,
      text_a=text.numpy(),
      text_b=None,
      label=label.numpy(),
  )
  encoded = classifier_data_lib.convert_single_example(
      0, single_example, label_list, max_seq_length, tokenizer)

  return (encoded.input_ids, encoded.input_mask, encoded.segment_ids, encoded.label_id)

In [None]:
def map_feature(text, label):
  """tf.data wrapper around to_feature: returns (feature dict, label_id)."""
  word_ids, mask, type_ids, label_id = tf.py_function(
      to_feature,
      inp=[text, label],
      Tout=[tf.int32] * 4,
  )

  # Set the static shapes explicitly on the py_function outputs.
  sequence_shape = [max_seq_length]
  word_ids.set_shape(sequence_shape)
  mask.set_shape(sequence_shape)
  type_ids.set_shape(sequence_shape)
  label_id.set_shape([])

  features = {
      'input_word_ids': word_ids,
      'input_mask': mask,
      'input_type_ids': type_ids,
  }
  return (features, label_id)

In [None]:
with tf.device("/cpu:0"):
  # Tokenisation runs in Python via tf.py_function, so it is expensive.
  # cache() right after map() keeps the encoded examples in memory, so they
  # are computed once instead of on every epoch / evaluate / predict pass.
  # NOTE: this cell rebinds data_train / data_val; re-run the cell that
  # creates the raw datasets before re-running this one.
  data_train = (data_train.map(map_feature, num_parallel_calls=tf.data.experimental.AUTOTUNE)
                          .cache()
                          .shuffle(1000, seed=SEED)
                          .batch(batch_size, drop_remainder=False)
                          .prefetch(tf.data.experimental.AUTOTUNE))

  data_val = (data_val.map(map_feature, num_parallel_calls=tf.data.experimental.AUTOTUNE)
                          .cache()
                          .batch(batch_size, drop_remainder=False)
                          .prefetch(tf.data.experimental.AUTOTUNE))

  # The test pipeline is built further down with map_feature_test (no labels).

In [None]:
def to_feature_test(text, label_list=label_list, max_seq_length=max_seq_length, tokenizer=tokenizer):
  """Convert one unlabelled example (eager tensor) into BERT input features.

  No label and no label list are passed to the converter; `label_list` is
  accepted but not used. Returns (input_ids, input_mask, segment_ids).
  """
  unlabelled_example = classifier_data_lib.InputExample(
      guid=None,
      text_a=text.numpy(),
      text_b=None,
      label=None,
  )
  encoded = classifier_data_lib.convert_single_example(
      0, unlabelled_example, None, max_seq_length, tokenizer)
  return (encoded.input_ids, encoded.input_mask, encoded.segment_ids)

def map_feature_test(text):
  """tf.data wrapper around to_feature_test: returns the feature dict only."""
  word_ids, mask, type_ids = tf.py_function(
      to_feature_test,
      inp=[text],
      Tout=[tf.int32] * 3,
  )
  # Set the static shapes explicitly on the py_function outputs.
  sequence_shape = [max_seq_length]
  word_ids.set_shape(sequence_shape)
  mask.set_shape(sequence_shape)
  type_ids.set_shape(sequence_shape)
  return {
      'input_word_ids': word_ids,
      'input_mask': mask,
      'input_type_ids': type_ids,
  }
with tf.device("/cpu:0"):
  # Encode, batch and prefetch the unlabelled test tweets. No shuffling and
  # no dropped remainder, so every test row gets a prediction.
  autotune = tf.data.experimental.AUTOTUNE
  data_test = data_test.map(map_feature_test, num_parallel_calls=autotune)
  data_test = data_test.batch(batch_size, drop_remainder=False)
  data_test = data_test.prefetch(autotune)

# Modeling

In [None]:
def create_model(max_seq_length):
  """Build the classifier: BERT encoder followed by one sigmoid unit.

  Takes the three int32 BERT inputs (each of length `max_seq_length`) as a
  dict and outputs a single sigmoid value per example.
  """
  def int_input(name):
    # One int32 input of fixed sequence length.
    return tf.keras.layers.Input(shape=(max_seq_length, ), dtype=tf.int32, name=name)

  input_word_ids = int_input("input_word_ids")
  input_mask = int_input("input_mask")
  input_type_ids = int_input("input_type_ids")

  # The encoder returns (pooled_output, sequence_output); only the pooled
  # output feeds the classification head.
  pooled_output, _ = bert_layer([input_word_ids, input_mask, input_type_ids])

  probability = tf.keras.layers.Dense(1, activation="sigmoid", name="finale_output")(pooled_output)

  model_inputs = {
      'input_word_ids': input_word_ids,
      'input_mask': input_mask,
      'input_type_ids': input_type_ids,
  }
  return tf.keras.Model(inputs=model_inputs, outputs=probability)

In [None]:
# Accuracy of the two constant predictors on the validation split.
positive_rate = df_val['target'].mean()
print(f"If we only predict ones: {positive_rate}\nIf we only predict zeros: {1 - positive_rate}")

In [None]:
# Build the network, then attach optimiser, loss and metrics.
model = create_model(max_seq_length)

adam = tf.keras.optimizers.Adam(learning_rate=2e-5)
bce_loss = tf.keras.losses.BinaryCrossentropy()
tracked_metrics = ["accuracy", tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]

model.compile(optimizer=adam, loss=bce_loss, metrics=tracked_metrics)
model.summary()

In [None]:
%%time

from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint('bert_model.h5', monitor='val_accuracy', save_best_only=True)

callbacks = [checkpoint]

epochs = 5
history = model.fit(data_train,
                    validation_data=data_val,
                    epochs=epochs, callbacks=callbacks,
                    # verbose=1
                    )

In [None]:
# Restore the checkpoint written by the ModelCheckpoint callback during
# training, then score it on the validation set.
model.load_weights('bert_model.h5')
model.evaluate(data_val, verbose=1)

In [None]:
# Sigmoid outputs for the validation set, rounded to hard 0/1 labels.
val_probabilities = model.predict(data_val)
preds = val_probabilities.round().astype(int)

In [None]:
from sklearn.metrics import f1_score

# F1 of the hard predictions against the validation labels.
f1_score(y_true=df_val[TARGET], y_pred=preds)

In [None]:
# Attach the predictions to a copy of the validation frame, then keep the
# rows where prediction and label disagree.
df_val_with_preds = df_val.assign(preds=preds).reset_index()
is_wrong = df_val_with_preds['target'] != df_val_with_preds['preds']
df_wrong_preds = df_val_with_preds[is_wrong].reset_index()
df_wrong_preds

In [None]:
def new_predict(X, batch_size=batch_size):
  """Return class probabilities for raw texts, shaped (n_samples, 2).

  Used as the classifier function for LIME. Column 0 is 1 - p and column 1
  is p, where p is the model's sigmoid output.

  Args:
    X: sequence of raw text strings.
    batch_size: examples per forward pass. Batching by more than one avoids
      running the model once per LIME sample.
  """
  test_data = tf.data.Dataset.from_tensor_slices((X))
  test_data = test_data.map(map_feature_test).batch(batch_size)
  pred = model.predict(test_data)
  return np.hstack([1 - pred, pred])

In [None]:
# LIME text explainer; class_names are the display names for the two
# probability columns returned by new_predict.
exp = LimeTextExplainer(class_names=class_names, random_state=SEED)

In [None]:
# Explain the idx-th misclassified validation tweet.
idx = 0
explained = exp.explain_instance(df_wrong_preds.iloc[idx]['text'], new_predict, num_features=5, top_labels=1, num_samples=100)
# `text` is an on/off flag for rendering the explained tweet with highlighted
# words, not the text to display. Pass True instead of a row from another frame.
explained.show_in_notebook(text=True)

In [None]:
# Per-epoch curves recorded by model.fit.
train_loss = history.history['loss']
train_acc = history.history['accuracy']
val_loss = history.history['val_loss']
val_acc = history.history['val_accuracy']

# Two panels side by side: loss on the left, accuracy on the right.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(20, 8))

ax_loss.plot(train_loss, label='train loss')
ax_loss.plot(val_loss, label='val loss')
ax_loss.set_title('Loss')
ax_loss.legend()

ax_acc.plot(train_acc, label='train accuracy')
ax_acc.plot(val_acc, label='val accuracy')
ax_acc.set_title('Accuracy')
ax_acc.legend()

plt.show();

# Submission

In [None]:
# Predict the test set, round to hard labels and write them into the
# sample-submission template.
test_pred = model.predict(data_test)
test_labels = test_pred.round().astype(int)
submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv").assign(target=test_labels)
submission.to_csv('sub.csv', index=False)

In [None]:
# Display the submission frame as a final visual check.
submission