In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import re
import math
import datetime
from wordcloud import WordCloud, STOPWORDS
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test CSVs from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Number of missing values per column in the training frame.
train_df.isna().sum()

In [None]:
# Number of missing values per column in the test frame.
test_df.isna().sum()

In [None]:
# Replace missing keywords with a placeholder in both frames so the column
# can later be concatenated with the tweet text.
for frame in (train_df, test_df):
    frame['keyword'] = frame['keyword'].fillna('DUMMY_VALUE')
# Re-check the per-column missing counts of the training frame.
train_df.isna().sum()

In [None]:
# Frequency of each keyword value (including the 'DUMMY_VALUE' placeholder filled in above).
train_df['keyword'].value_counts()

Next we will merge the keyword and text features into a single combined feature and use it for detecting disaster tweets.

In [None]:
# Combined feature: keyword immediately followed by the tweet text.
# NOTE(review): the two strings are joined with no separator, so the keyword
# is fused to the first word of the tweet (e.g. "<keyword><first word> ...").
train_df['final_text'] = train_df['keyword'].str.cat(train_df['text'])
test_df['final_text'] = test_df['keyword'].str.cat(test_df['text'])

Next we will delete the id, keyword and location features. The location feature has a large number of missing values and does not serve the purpose of detecting disaster tweets.

In [None]:
# Columns that are not used after building the combined text feature.
cols_del = ['id', 'keyword', 'location']
train_df = train_df.drop(columns=cols_del)
test_df = test_df.drop(columns=cols_del)

In [None]:
# Confirm the remaining columns of the training frame after the drop.
train_df.head()

In [None]:
# Confirm the remaining columns of the test frame after the drop.
test_df.head()

### Class Distribution

In [None]:
# Bar chart of how many training rows fall into each target class.
ax = sns.countplot(x='target', data=train_df)
ax.set_xlabel('Class Names')
ax.set_ylabel('Count')
ax.set_title('Distribution of classes in the training dataset')
plt.show()

In [None]:
# NOTE: despite its name, `word_count` holds the number of *characters* in
# `final_text` (string length), not the number of words.
train_df['word_count'] = train_df['final_text'].str.len()
train_df.head()

In [None]:
def cleaned_text(text):
    """Normalise a raw tweet string.

    Steps, applied in order: strip URLs, ``pic.twitter`` links and
    @-mentions; drop ``#`` and ``|`` characters; collapse a few known
    elongated words; collapse repeated ``?``/``.``/``!``; decode the HTML
    entities ``&amp;``, ``&gt;`` and ``&lt;``; remove ``RT``; squeeze runs of
    whitespace to a single space; lower-case the result.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The cleaned, lower-cased string.
    """
    # Raw strings keep `\S` / `\s` as regex escapes instead of (invalid)
    # Python string escapes.
    clean = re.sub(r"http\S+", "", text)
    # The dot is escaped so only the literal "pic.twitter" prefix matches.
    clean = re.sub(r"pic\.twitter\S+", "", clean)
    clean = re.sub(r"@\S+", "", clean)
    clean = re.sub('#', '', clean)
    clean = re.sub('goooooooaaaaaal', 'goal', clean)
    clean = re.sub('SOOOO', 'SO', clean)
    clean = re.sub('LOOOOOOL', 'LOL', clean)
    clean = re.sub('Cooool', 'cool', clean)
    # An unescaped '|' is an empty alternation that matches the empty string,
    # so it never removed anything; escape it to strip literal pipes.
    clean = re.sub(r'\|', '', clean)
    clean = re.sub(r'\?{2,}', '? ', clean)
    clean = re.sub(r'\.{2,}', '. ', clean)
    clean = re.sub(r'\!{2,}', '! ', clean)
    clean = re.sub('&amp;', '&', clean)
    clean = re.sub('Comin', 'Coming', clean)
    clean = re.sub('&gt;', '> ', clean)
    clean = re.sub('&lt;', '< ', clean)
    # NOTE(review): this removes a colon together with whichever character
    # precedes it (e.g. "note: x" -> "not x"); kept as-is, confirm intent.
    clean = re.sub(r'.:', '', clean)
    clean = re.sub('baaaack', 'back', clean)
    # NOTE(review): removes "RT" anywhere, including inside longer
    # upper-case words; kept as-is, confirm intent.
    clean = re.sub('RT', '', clean)
    clean = re.sub(r'\s{2,}', ' ', clean)
    return clean.lower()
# Clean the combined text of every row in both frames.
train_df['cleaned_text'] = train_df['final_text'].map(cleaned_text)
test_df['cleaned_text'] = test_df['final_text'].map(cleaned_text)

In [None]:
# Inspect the new `cleaned_text` column next to the original text.
train_df.head()

segregating disaster and normal tweets for plotting

In [None]:
# Split the training rows by class for plotting. `.copy()` makes each subset
# an independent frame: a later cell assigns to their `cleaned_text` column,
# which on a plain boolean-mask slice triggers pandas' chained-assignment
# (SettingWithCopy) warning.
train_disaster = train_df[train_df['target'] == 1].copy()
train_normal = train_df[train_df['target'] == 0].copy()

### Distribution of Tweet Length (in Characters) for Disaster and Normal Tweets

In [None]:

# Side-by-side histograms of the `word_count` column for each class.
fig, ax = plt.subplots(1, 2)

disaster = train_disaster.word_count.to_list()
normal = train_normal.word_count.to_list()

ax[0].hist(disaster, bins=50, alpha = 0.5, color = 'r')
ax[1].hist(normal, bins=50, alpha = 0.5, color = 'g')

# Label each panel so the figure can be read on its own; previously the two
# classes could only be told apart by remembering the colour assignment.
ax[0].set_title('Disaster tweets')
ax[1].set_title('Normal tweets')
for panel in ax:
    panel.set_xlabel('word_count')
    panel.set_ylabel('Number of tweets')
fig.tight_layout()

plt.show()

### Observation:

- Disaster tweets are comparatively shorter in length as compared to normal tweets in general

In [None]:
# Summary statistics of the `word_count` column for disaster tweets.
train_disaster.word_count.describe()

In [None]:
# Show the cleaned text of the first disaster tweet whose `word_count` is 163.
# NOTE(review): 163 is hard-coded — presumably the maximum reported by
# describe() above; confirm, since `.iloc[0]` raises if no row matches.
train_disaster[train_disaster['word_count'] == 163]['cleaned_text'].iloc[0]

In [None]:
# Punctuation and symbols that are replaced by a single space.
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]

# One translation table handles every character in a single pass per column.
# This replaces 32 separate `str.replace` calls per frame, and no longer
# depends on the `regex` default of `Series.str.replace`, which differs
# between pandas versions and matters for characters such as "(", "*", "?".
punct_table = str.maketrans({char: ' ' for char in spec_chars})

train_df['cleaned_text'] = train_df['cleaned_text'].str.translate(punct_table)
test_df['cleaned_text'] = test_df['cleaned_text'].str.translate(punct_table)

# `assign` returns new frames, so the per-class subsets are updated without
# writing into a slice of `train_df` (avoids the chained-assignment warning).
train_disaster = train_disaster.assign(
    cleaned_text=train_disaster['cleaned_text'].str.translate(punct_table))
train_normal = train_normal.assign(
    cleaned_text=train_normal['cleaned_text'].str.translate(punct_table))
    

In [None]:
# Same lookup as before the punctuation strip, to compare the cleaned text.
# NOTE(review): 163 is a hard-coded `word_count` value; `.iloc[0]` raises if no row matches.
train_df[train_df['word_count'] == 163]['cleaned_text'].iloc[0]

### Word cloud Disaster Tweets

In [None]:
# Build a word cloud from all cleaned disaster tweets joined into one string.
stopwords = set(STOPWORDS)
disaster_corpus = ' '.join(train_disaster['cleaned_text'])
wordcloud = WordCloud(
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
)
wordcloud = wordcloud.generate(disaster_corpus)

In [None]:
# Render the word cloud on one explicitly sized figure. The earlier
# `plt.figure(1)` call opened a second, empty figure that was shown as a
# blank output, and `print(wordcloud)` only printed the object's repr.
fig = plt.figure(figsize=(14,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### Word Cloud Normal Tweets

In [None]:
# Build a word cloud from all cleaned normal tweets joined into one string.
wordcloud = WordCloud(background_color ='white', 
                    stopwords = stopwords, 
                    min_font_size = 10).generate(' '.join(train_normal['cleaned_text']))

# Render on one explicitly sized figure. The earlier `plt.figure(1)` call
# opened a second, empty figure, and `print(wordcloud)` only printed the
# object's repr.
fig = plt.figure(figsize=(14,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### Observations

- From both the Word clouds we can easily see the difference between disaster and normal tweets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data for evaluation. A fixed `random_state`
# makes the split (and every metric computed from it) reproducible across
# Restart & Run All; without it each run evaluates on a different subset.
train, test = train_test_split(train_df, test_size=0.2, random_state=42)

### Bert Modelling

#### About BERT

BERT is an acronym of **Bidirectional Encoder Representations from Transformers**. The term bidirectional means that the context of a word is given by both the words that follow it and by the words preceding it. This technique makes this algorithm hard to train but very effective. Exploring the surrounding text around words is computationally expensive but allows a deeper understanding of words and sentences.

![bert](https://pytorch.org/tutorials/_images/bert.png)

Unidirectional context-oriented algorithms already exist. A neural network can be trained to predict which word will follow a sequence of given words, once trained on a huge dataset of sentences. However, predicting that word from both the previous and following words is not an easy task. 

The only way to do so effectively is to mask some words in a sentence and predict them too, e.g., the sentence **the quick brown fox jumps over the lazy dog** might be masked as **the X brown fox jumps over the Y dog** with label (**X = quick, Y = lazy**) to become a labelled record in a training set of sentences. One can easily derive a training set from a bundle of unsupervised texts by simply masking 15% of words (as BERT does), and training the neural network to deduce the missing words from the remaining ones.

Notice that BERT is truly a deep learning algorithm, while context-free algorithms such as word2vec, based on shallow neural networks, may not be. 

However, as such, BERT’s training is very expensive, due to its transformer aspect. Training on a huge body of text – for example, all English-language Wikipedia pages – is a Herculean effort that requires decidedly nontrivial computational power.

Whatever the task, it is not necessary to pre-train the BERT model, but only to fine-tune a pre-trained model on the specific dataset that relates to the problem we want to use BERT to study. We will try to use such a pre-trained model to perform our simple classification task: more exciting use cases may be found on the GitHub page of the project mentioned above, as well as elsewhere on the Web.

First, we choose the pre-trained model: in the BERT GitHub repository there are several choices available, we will use `uncased_L-12_H-768_A-12`.

The pre-trained model can be downloaded from the repository and extracted into a local folder. This folder will contain the following files:

- **bert_config.json**
- **bert_model.ckpt.data-00000-of-00001**
- **bert_model.ckpt.index**
- **vocab.txt**

The first file contains all the configuration necessary to build a network layer to use this BERT model, while the last file (**vocab.txt**) is needed to properly tokenize our texts. The largest file contains the model, which may be loaded from the BERT library using the methods demonstrated below.

To remain focused on the model, we assume that our code is run inside a working directory into which the pre-trained model has been downloaded and extracted. This is necessary before running the following programs:

Before setting up the model, our dataset is tokenized according to the format expected by the BERT layers; this can be done via the **FullTokenizer** class from the BERT package. 

Next, the tokenizer is fed each sentence in our dataset. The tokenizer result, which is a list of strings, is enclosed between **[CLS]** and **[SEP]**, as required by the BERT algorithm implementation.

The output of our model will simply be a probability between 0 and 1 for each of the two classes.

In [None]:
# Install bert-for-tf2 (BERT for TensorFlow 2); without it the notebook raises an error.
!pip install bert-for-tf2

In [None]:
import tensorflow as tf
from tensorflow import keras
from pylab import rcParams
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
from sklearn.metrics import confusion_matrix, classification_report

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Global seaborn theme for every figure drawn after this cell.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Custom colour cycle (hex RGB) that replaces the 'muted' palette set above.
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Default figure size (width, height) in inches.
rcParams['figure.figsize'] = 12, 8

# Seed shared by NumPy and TensorFlow below.
RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [None]:
# Download the pre-trained `uncased_L-12_H-768_A-12` BERT archive.
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

In [None]:
# Extract the downloaded archive into the current directory.
!unzip uncased_L-12_H-768_A-12.zip

In [None]:
# Move the extracted checkpoint folder under ./model.
# NOTE(review): the `mv` is not idempotent — re-running this cell after the
# folder has already been moved makes `mv` report an error.
os.makedirs("model", exist_ok=True)
!mv uncased_L-12_H-768_A-12/ model
bert_model_name="uncased_L-12_H-768_A-12"

# Paths used later: the checkpoint directory, the checkpoint prefix passed to
# the weight loader, and the JSON config read when building the BERT layer.
bert_ckpt_dir = os.path.join("model/", bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")

In [None]:
class DisasterDetectionData:
  """Tokenise and pad the train/test frames into fixed-length id arrays.

  After construction the instance exposes:
    train_x, test_x: 2-D integer arrays of token ids, one row per tweet,
      each row exactly `max_seq_len` long (zero-padded or truncated).
    train_y, test_y: 1-D arrays holding, for each row, the position of its
      label inside `classes`.
    max_seq_len: the sequence length actually used — the longest tokenised
      tweet seen, capped by the `max_seq_len` constructor argument.
  """
  # Column holding the text to tokenise.
  DATA_COLUMN = "cleaned_text"
  # Column holding the class label.
  LABEL_COLUMN = "target"

  def __init__(self, train, test, tokenizer: "FullTokenizer", classes, max_seq_len=192):
    """Tokenise both frames, then pad/truncate them to a common length.

    Args:
      train: DataFrame with the DATA_COLUMN and LABEL_COLUMN columns.
      test: DataFrame with the same two columns.
      tokenizer: object providing `tokenize(text)` and
        `convert_tokens_to_ids(tokens)`.
      classes: list of label values; a label's index in this list is its
        encoded target.
      max_seq_len: upper bound on the sequence length.
    """
    self.tokenizer = tokenizer
    # Updated by _prepare to the longest tokenised sequence it sees.
    self.max_seq_len = 0
    self.classes = classes

    (self.train_x, self.train_y), (self.test_x, self.test_y) = map(self._prepare, [train, test])

    print("max seq_len", self.max_seq_len)
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

  def _prepare(self, df):
    """Convert every row of `df` into ([CLS] ids [SEP], label index).

    Returns:
      (x, y) where x is a list of variable-length token-id lists and y is a
      NumPy array of label indices. x stays a plain list because the rows
      are ragged; building an ndarray from ragged lists raises on recent
      NumPy versions.
    """
    x, y = [], []

    texts = df[DisasterDetectionData.DATA_COLUMN]
    labels = df[DisasterDetectionData.LABEL_COLUMN]
    # Iterating the two columns directly avoids building a Series per row
    # as `iterrows` does.
    for text, label in tqdm(zip(texts, labels), total=len(df)):
      tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
      self.max_seq_len = max(self.max_seq_len, len(token_ids))
      x.append(token_ids)
      y.append(self.classes.index(label))

    return x, np.array(y)

  def _pad(self, ids):
    """Return a 2-D array where each sequence is exactly `max_seq_len` long.

    Shorter sequences are right-padded with 0. Longer sequences are cut to
    `max_seq_len` while keeping their final token, so the closing [SEP]
    that `_prepare` appended survives truncation. (The previous
    `max_seq_len - 2` cut was applied to sequences that already contained
    [CLS]/[SEP]; it dropped [SEP] and two real tokens even from sequences
    that fitted.)
    """
    x = []
    for input_ids in ids:
      input_ids = list(input_ids)
      if len(input_ids) > self.max_seq_len:
        input_ids = input_ids[:self.max_seq_len - 1] + input_ids[-1:]
      input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
      x.append(np.array(input_ids))
    return np.array(x)

In [None]:
# Build the tokenizer from the checkpoint's vocab.txt and try it on a sample sentence.
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
tokenizer.tokenize("I can't wait to visit Bulgaria again!")

In [None]:
# Map the sample sentence's tokens to their vocabulary ids.
tokens = tokenizer.tokenize("I can't wait to visit Bulgaria again!")
tokenizer.convert_tokens_to_ids(tokens)

In [None]:
from tqdm import tqdm

def create_model(max_seq_len, bert_ckpt_file):
  """Build the BERT-based classifier and load the pre-trained BERT weights.

  The model takes a batch of token-id sequences of length `max_seq_len`,
  runs them through a BERT layer, takes the output at position 0, and feeds
  it through Dropout -> Dense(512, tanh) -> Dropout -> Dense(128, tanh) ->
  Dropout -> Dense(len(classes), softmax).

  Reads two module-level names: `bert_config_file` (path of the BERT JSON
  config) and `classes` (its length sets the number of output units).

  Args:
    max_seq_len: length of each input token-id sequence.
    bert_ckpt_file: checkpoint path passed to `load_stock_weights`.

  Returns:
    The built (uncompiled) `keras.Model`; its outputs are softmax
    probabilities, not logits.
  """
  with tf.io.gfile.GFile(bert_config_file, "r") as reader:
    bc = StockBertConfig.from_json_string(reader.read())
  bert_params = map_stock_config_to_params(bc)
  bert_params.adapter_size = None
  # Named `bert_layer` so the imported `bert` module is not shadowed.
  bert_layer = BertModelLayer.from_params(bert_params, name="bert")

  input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
  bert_output = bert_layer(input_ids)

  print("bert shape", bert_output.shape)

  # Keep only the output at sequence position 0.
  cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
  cls_out = keras.layers.Dropout(0.5)(cls_out)
  hidden = keras.layers.Dense(units=512, activation="tanh")(cls_out)
  hidden = keras.layers.Dropout(0.5)(hidden)
  # Chain from the 512-unit branch. This layer was previously fed `cls_out`,
  # which left Dense(512) and its dropout disconnected from the output.
  hidden = keras.layers.Dense(units=128, activation="tanh")(hidden)
  hidden = keras.layers.Dropout(0.3)(hidden)
  probabilities = keras.layers.Dense(units=len(classes), activation="softmax")(hidden)

  model = keras.Model(inputs=input_ids, outputs=probabilities)
  model.build(input_shape=(None, max_seq_len))

  # Weights are loaded after the model is built.
  load_stock_weights(bert_layer, bert_ckpt_file)

  return model

In [None]:
# Sort the label values so the label -> index mapping is deterministic.
# `unique()` returns values in order of first appearance, so the mapping
# otherwise depended on the row order produced by the train/test split.
classes = sorted(train.target.unique().tolist())

data = DisasterDetectionData(train, test, tokenizer, classes, max_seq_len=128)

In [None]:
# Shape of the padded training token-id array.
data.train_x.shape

In [None]:
# Token ids of the first training example after padding.
data.train_x[0]

In [None]:
# Build the classifier for the sequence length chosen by the dataset, then list its layers.
model = create_model(data.max_seq_len, bert_ckpt_file)
model.summary()

In [None]:
# The model's final Dense layer already applies softmax, so the loss must
# treat its inputs as probabilities. With `from_logits=True` the loss
# applied a second softmax to the outputs, distorting the loss and gradients.
model.compile(
  optimizer=keras.optimizers.Adam(1e-5),
  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping


    
# Halve the learning rate after one epoch without val_loss improvement, and
# stop training after two such epochs.
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, verbose=1)
early_stop = EarlyStopping(monitor='val_loss', patience=2)
callbacks = [lr_on_plateau, early_stop]

# Fine-tune for up to 5 epochs, holding out 10% of the training rows for validation.
history = model.fit(
  data.train_x,
  data.train_y,
  batch_size=16,
  epochs=5,
  shuffle=True,
  validation_split=0.1,
  callbacks=callbacks,
)

In [None]:
from matplotlib.ticker import MaxNLocator
from matplotlib import rc

# Training vs. validation loss per epoch, with integer epoch ticks.
ax = plt.figure().gca()
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
plt.ylabel('Loss')
plt.xlabel('Epoch')
# The second curve is `val_loss` (the validation split of fit), not the
# held-out test set, so label it accordingly.
plt.legend(['train', 'validation'])
plt.title('Loss over training epochs')
plt.show();

In [None]:
# Training vs. validation accuracy per epoch, with integer epoch ticks.
ax = plt.figure().gca()
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

ax.plot(history.history['acc'])
ax.plot(history.history['val_acc'])
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# The second curve is `val_acc` (the validation split of fit), not the
# held-out test set, so label it accordingly.
plt.legend(['train', 'validation'])
plt.title('Accuracy over training epochs')
plt.show();

In [None]:
# evaluate() returns [loss, acc]; only the accuracy is reported here.
train_loss, train_acc = model.evaluate(data.train_x, data.train_y)
test_loss, test_acc = model.evaluate(data.test_x, data.test_y)

print(f"train acc {train_acc}")
print(f"test acc {test_acc}")

In [None]:
# Predicted class index = position of the highest score in each output row.
test_scores = model.predict(data.test_x)
y_pred = np.argmax(test_scores, axis=-1)
print(classification_report(data.test_y, y_pred))

In [None]:
# Confusion matrix on the held-out split, drawn as an annotated heatmap.
cm = confusion_matrix(data.test_y, y_pred)
df_cm = pd.DataFrame(cm, index=classes, columns=classes)
hmap = sns.heatmap(df_cm, annot=True, fmt="d")
# Re-apply the tick labels with the desired rotation on each axis.
for tick_axis, angle in ((hmap.yaxis, 0), (hmap.xaxis, 30)):
    tick_axis.set_ticklabels(tick_axis.get_ticklabels(), rotation=angle, ha='right')
plt.ylabel('True label')
plt.xlabel('Predicted label');

### Testing model on Random sentences

In [None]:
sentences = [
  "Just happened a terrible car crash",
    "Heard about #earthquake is different cities, stay safe everyone.",
    "No I don't like cold!",
    "@RosieGray Now in all sincerety do you think the UN would move to Israel if there was a fraction of a chance of being annihilated?"
  ]

# NOTE(review): the model was trained on the `cleaned_text` column, but these
# sentences are tokenised raw (mentions, '#', punctuation intact) — confirm
# whether they should go through the same cleaning first.
max_len = data.max_seq_len
pred_token_ids = []
for sentence in sentences:
    tokens = ["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Truncate over-long inputs (keeping the closing [SEP]) so every row has
    # exactly `max_len` ids; previously a longer sentence produced a ragged
    # array that the model cannot accept.
    if len(token_ids) > max_len:
        token_ids = token_ids[:max_len - 1] + token_ids[-1:]
    pred_token_ids.append(token_ids + [0] * (max_len - len(token_ids)))
pred_token_ids = np.array(pred_token_ids)

predictions = model.predict(pred_token_ids).argmax(axis=-1)

# `classes[label]` maps the predicted index back to the original target value.
for text, label in zip(sentences, predictions):
    target = "Disaster Tweet" if classes[label]==1 else "Normal Tweet"
    print("text:", text, "\nClass:", target)
    print()
        
  

As we can see the model correctly predicted disaster tweets in first two sentences and Normal in last two sentences.