The problem
--------------------------------
Twitter has become an important communication channel in times of emergency. The ubiquitousness of smartphones enables people to announce an emergency they’re observing in real-time. Because of this, more agencies are interested in programmatically monitoring Twitter (e.g. disaster relief organizations and news agencies).

But, it’s not always clear whether a person’s words are actually announcing a disaster.

The objective
Predicting whether a given tweet is about a real disaster or not. If so, predict a 1. If not, predict a 0.

# Loading Libraries and Data

In [None]:
!pip install tensorflow-text

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

import nltk
import spacy
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import matplotlib.pyplot as plt
import seaborn as sns

**LOADING THE DATA**

In [None]:
# Load the labelled training tweets from the notebook's relative ../input directory.
df = pd.read_csv('../input/nlp-getting-started/train.csv')

In [None]:
# Preview the first rows of the freshly loaded training frame.
df.head()

# Cleaning the data

Filling NaN values

In [None]:
# Column dtypes and non-null counts, to see which columns have missing values.
df.info()

In [None]:
def fillna(df, column, fillwith):
  """Replace missing values in one column of ``df`` and return the frame.

  The column is overwritten on the passed-in DataFrame itself, so the
  returned object is the same frame the caller supplied.

  Args:
    df: DataFrame to modify.
    column: Name of the column whose missing values are replaced.
    fillwith: Value written wherever the column is missing.

  Returns:
    The same DataFrame, with ``column`` filled.
  """
  filled_column = df[column].fillna(fillwith)
  df[column] = filled_column
  return df

In [None]:
# Replace missing keyword/location values with the placeholder 'a' so the
# string concatenation that builds full_text further down never meets NaN.
# NOTE(review): 'a' is presumably chosen because the later stop-word pass
# removes it again — confirm.
for i in ['keyword','location']:
  df = fillna(df,i,'a')

In [None]:
# Re-check non-null counts after filling keyword and location.
df.info()

In [None]:
# Preview the frame after the missing values were filled.
df.head()

In [None]:
# Merge keyword, location and tweet body into a single space-separated
# field, then drop the source columns and the id in one call.
df['full_text'] = df['keyword'] + ' ' + df['location'] + ' ' + df['text']
df.drop(columns=['keyword', 'location', 'text', 'id'], inplace=True)

In [None]:
# Preview the frame now that only the combined full_text field and the remaining columns are left.
df.head()

Removing punctuation and stopwords.

In [None]:
import string


# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call, since it never changes.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_punct(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is deleted;
    all other characters (letters, digits, whitespace, non-ASCII symbols)
    are kept unchanged.

    Args:
        text: Input string.

    Returns:
        A new string without punctuation characters.
    """
    return text.translate(_PUNCT_TABLE)
# Strip punctuation from every combined tweet string.
df["full_text"] = df.full_text.map(lambda x: remove_punct(x))

from nltk.corpus import stopwords

# English stop words held in a set for fast membership tests.
stop = set(stopwords.words("english"))


def remove_stopwords(text):
    """Lower-case ``text`` and drop every word that appears in ``stop``.

    The text is split on whitespace; surviving words are returned in their
    original order, lower-cased and joined by single spaces.
    """
    kept = []
    for word in text.split():
        lowered = word.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)


df["full_text"] = df["full_text"].map(remove_stopwords)

In [None]:
# Preview the text after punctuation and stop-word removal.
df.head()

Visualising Named Entity

---
Visualising these can help us understand which features in the data the model might rely upon the most.


In [None]:
# Split the frame by label: target 1 = disaster tweets, target 0 = the rest.
df_dis = df.loc[df['target'] == 1]
df_no_dis = df.loc[df['target'] == 0]

In [None]:
# Load the spaCy pipeline whose entity annotations are rendered below.
nlp = spacy.load('en_core_web_sm')
def namedRandom(df, n_samples=5):
  """Render the named entities of randomly chosen rows of ``df.full_text``.

  Rows are drawn with replacement, so the same tweet can be shown twice.

  Args:
    df: DataFrame with a ``full_text`` column of strings.
    n_samples: How many rows to draw and render (default 5).

  Raises:
    ValueError: If ``df`` is empty (there is no row to draw).
  """
  # randint's upper bound is exclusive: passing len(df) makes every row
  # eligible, whereas len(df)-1 would never pick the last row and would
  # fail on a one-row frame.
  positions = np.random.randint(0, len(df), size=n_samples)
  for index in positions:
    text = df.full_text.iloc[index]
    doc = nlp(text)
    spacy.displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Entity rendering for a random sample of disaster tweets.
namedRandom(df_dis)

In [None]:
# Entity rendering for a random sample of non-disaster tweets.
namedRandom(df_no_dis)

**LOOKING AT THE WORD CLOUD**

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
def cloudRandom(df, type):
  """Draw a word cloud built from five randomly chosen rows of ``df.full_text``.

  Rows are drawn with replacement. The figure is created on the current
  matplotlib state and titled with ``type``.

  Args:
    df: DataFrame with a ``full_text`` column of strings.
    type: Label inserted into the plot title (e.g. 'disaster').

  Raises:
    ValueError: If ``df`` is empty (there is no row to draw).
  """
  # randint's upper bound is exclusive: len(df) keeps the last row eligible
  # and also works for a one-row frame.
  positions = np.random.randint(0, len(df), size=5)
  # Join with a space so the last word of one tweet and the first word of
  # the next are not fused into a single bogus token.
  text = ' '.join(df.full_text.iloc[index] for index in positions)
  wc = WordCloud(background_color="white",
               max_words=350,
               width=1000,
               height=600,
               random_state=1).generate(text)

  plt.figure(figsize=(15,15))
  plt.imshow(wc)
  plt.axis("off")
  plt.title('Word Cloud for '+ (type) +' tweets')

In [None]:
# Word cloud from a random sample of disaster tweets.
cloudRandom(df_dis, 'disaster')

In [None]:
# Word cloud from a random sample of non-disaster tweets.
cloudRandom(df_no_dis, 'non - disaster')

Lemmatizing and Stemming the text

In [None]:
def getLemmText(text):
 """Tokenize ``text``, lemmatize each token and re-join them with spaces."""
 lemmatizer = WordNetLemmatizer()
 lemmas = []
 for word in word_tokenize(text):
  lemmas.append(lemmatizer.lemmatize(word))
 return ' '.join(lemmas)
# Lemmatize every row of the combined text column.
df['full_text'] = [getLemmText(row_text) for row_text in df['full_text']]

def getStemmText(text):
 """Tokenize ``text``, Porter-stem each token and re-join them with spaces."""
 stemmer = PorterStemmer()
 stems = []
 for word in word_tokenize(text):
  stems.append(stemmer.stem(word))
 return ' '.join(stems)
# Stem every row of the combined text column.
df['full_text'] = [getStemmText(row_text) for row_text in df['full_text']]

# Exploratory data analysis after cleaning

In [None]:
# Figure size (width, height) passed to plt.subplots for the plots below.
fig_dims = (10, 8)

In [None]:
# Count the tweets per target value to check whether the classes are balanced.
classes = df['target']
fig, ax = plt.subplots(figsize=fig_dims)
sns.countplot(x=classes, ax=ax, palette='Oranges')
ax.set_title('Class Distribution')
ax.set_xlabel('0: No disaster tweet           1: Disaster tweet')
ax.set_ylabel('Count of tweets')
plt.show()

Looks well balanced

In [None]:
# Drop stand-alone '@' and '#' tokens from the tweets.
# NOTE(review): remove_punct already deletes every string.punctuation
# character, which includes both symbols, so this pass may have nothing
# left to remove by the time it runs — confirm.
def removetags(text):
  """Tokenize ``text`` and re-join it without tokens equal to '@' or '#'."""
  unwanted = ('@', '#')
  kept = []
  for token in word_tokenize(text):
    if token in unwanted:
      continue
    kept.append(token)
  return ' '.join(kept)
df['full_text'] = [removetags(row_text) for row_text in df['full_text']]

Checking whether big numbers occur often enough in the tweets to be treated as a feature.

In [None]:
# Print the whitespace-separated tokens of the row labelled 3, one per line,
# to eyeball what numeric tokens look like after cleaning.
for i in df.full_text[3].split():
  print(i)

In [None]:
# Collect the tweets that mention at least one big number.
def bigNumCount(df, threshold=10000):
  """Return the texts in ``df.full_text`` that contain a big integer token.

  A text qualifies when at least one whitespace-separated token is numeric
  and its integer value is strictly greater than ``threshold``. Each
  qualifying text is returned once, in the order it appears.

  Args:
    df: Object exposing a ``full_text`` iterable of strings.
    threshold: Values must exceed this to count as "big" (default 10000).

  Returns:
    List of the qualifying text strings.
  """
  matches = []
  for text in df.full_text:
    for token in text.split():
      if not token.isnumeric():
        continue
      try:
        value = int(token)
      except ValueError:
        # str.isnumeric() also accepts characters such as '½' or '²' that
        # int() cannot parse; skip those instead of hiding every error.
        continue
      if value > threshold:
        matches.append(text)
        break
  return matches

In [None]:
# Big-number analysis on the entire dataset: collect every tweet that
# contains at least one big number (see bigNumCount).
df_entire_BigNum = bigNumCount(df)
# If there are more than 300 bigNum tweets we'll consider big numbers as features.
# The printed figure is the number of such tweets, not of individual numbers.
print('There are '+ str(len(df_entire_BigNum)) +' big Numbers in the entire dataset.')

In [None]:
# Big-number analysis restricted to disaster tweets.
# df_dis is rebuilt here because df has been transformed since the subset
# was first created.
df_dis = df[df['target']==1]
df_dis_BigNum = bigNumCount(df_dis)
# If there are more than 100 bigNum tweets we'll consider big numbers as features.
# The printed figure is the number of such tweets, not of individual numbers.
print('There are '+ str(len(df_dis_BigNum)) +' big Numbers in the disaster dataset.')

In [None]:
# Replace big numbers with a single placeholder token.
def replaceNumbers(text, threshold=10000):
  """Replace every big integer token in ``text`` with ``'bignumber'``.

  A token is replaced only when it is numeric and its integer value is
  strictly greater than ``threshold`` — the same definition bigNumCount
  uses. All other tokens (words, small numbers, alphanumeric strings) are
  kept, so the placeholder really marks big numbers instead of every
  non-alphabetic token.

  Args:
    text: Input string; it is split with ``word_tokenize``.
    threshold: Values must exceed this to be replaced (default 10000).

  Returns:
    The tokens joined by single spaces, with big numbers replaced.
  """
  def _is_big_number(token):
    # str.isnumeric() also accepts characters int() cannot parse ('½', '²').
    if not token.isnumeric():
      return False
    try:
      return int(token) > threshold
    except ValueError:
      return False

  tokens = word_tokenize(text)
  tokens = ['bignumber' if _is_big_number(word) else word for word in tokens]
  return ' '.join(tokens)
print('Example:')
print('text : 13000 one guy')
# Only the sample text goes through the function; the label is added afterwards.
print('result : ' + replaceNumbers('13000 one guy'))
df['full_text'] = list(map(replaceNumbers,df['full_text']))

In [None]:
## Code to remove numbers If you decide otherwise.
# def removeNumbers(text):
#   tokens = word_tokenize(text)
#   tokens = [word for word in tokens if word.isalpha()]
#   return ' '.join(tokens)

# df['full_text'] = list(map(removeNumbers,df['full_text']))

In [None]:
# Final look at the cleaned text before it is handed to the model.
df.head()

# Training The model using BERT

In [None]:
# Target column (1 = disaster, 0 = not); used as labels and for stratifying the split.
labels = df.target

In [None]:
# Hold out 30% of the tweets, stratified on the label; random_state fixes
# the shuffle so the split is the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(df.full_text.values, labels, 
 random_state=42, test_size=0.3, shuffle=True, stratify = labels)

Choosing BERT model for training

In [None]:
# Encoder variant to use; must be a key of both lookup tables below.
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'

# Names of all "small BERT" variants: 6 L values x 4 (H, A) pairs, generated
# in the same order in which they would be listed by hand.
_SMALL_BERT_NAMES = [
    f'small_bert/bert_en_uncased_L-{depth}_H-{hidden}_A-{heads}'
    for depth in (2, 4, 6, 8, 10, 12)
    for hidden, heads in ((128, 2), (256, 4), (512, 8), (768, 12))
]

# Preprocessing handle shared by every entry that uses the uncased English vocabulary.
_UNCASED_PREPROCESS = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

# Model name -> TF Hub handle of the encoder.
map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    **{name: f'https://tfhub.dev/tensorflow/{name}/1' for name in _SMALL_BERT_NAMES},
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

# Model name -> TF Hub handle of the matching text-preprocessing model.
map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12': _UNCASED_PREPROCESS,
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    **dict.fromkeys(_SMALL_BERT_NAMES, _UNCASED_PREPROCESS),
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small': _UNCASED_PREPROCESS,
    'electra_base': _UNCASED_PREPROCESS,
    'experts_pubmed': _UNCASED_PREPROCESS,
    'experts_wiki_books': _UNCASED_PREPROCESS,
    'talking-heads_base': _UNCASED_PREPROCESS,
}

# Resolve the two handles for the selected model name.
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

In [None]:
# Wrap the selected TF Hub handles as Keras layers.
# NOTE(review): no `trainable=True` is passed to the encoder, so its weights
# presumably stay frozen and only the classification head is trained — confirm.
bert_preprocess = hub.KerasLayer(tfhub_handle_preprocess)
bert_encoder = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
# Bert layers: raw strings -> preprocessing model -> encoder outputs.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head, declared first and then applied to the encoder's
# 'pooled_output': dropout followed by a single sigmoid unit.
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
l = output_layer(dropout_layer(outputs['pooled_output']))

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [None]:
# with tpu_strategy.scope(): creating the model in the TPUStrategy scope means we will train the model on the TPU
# model = create_model()

In [None]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked so it can be plotted after training.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train for 10 epochs. The held-out split is passed as validation data, so
# the evaluation further down is on data already monitored during training.
model.fit(xtrain, ytrain, epochs=10, validation_data=(xtest, ytest))

# Result Evaluation

In [None]:
def graph_plots(history, string):
 """Plot a training metric and its validation counterpart per epoch.

 Args:
  history: Object whose ``history`` attribute maps metric names to
   per-epoch value lists.
  string: Metric name; the validation series is read from ``'val_' + string``.
 """
 val_key = 'val_'+string
 for key in (string, val_key):
  plt.plot(history.history[key])
 plt.xlabel('Epochs')
 plt.ylabel(string)
 plt.legend([string, val_key])
 plt.show()
 
# Accuracy and loss curves, training vs. validation.
# model.history is presumably populated by the preceding model.fit call.
graph_plots(model.history, 'accuracy')
graph_plots(model.history, 'loss')

In [None]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
# Raw sigmoid scores for the held-out split.
predicted = model.predict(xtest)

In [None]:
# Inspect the distinct score values the model produced.
np.unique(predicted)

In [None]:
# Turn each score into a hard label: 1 when strictly above 0.5, else 0.
results = [1 if score > 0.5 else 0 for score in predicted]

In [None]:
# Ground-truth labels as an array.
real = np.array(ytest)
# NOTE: `predicted` is rebound from raw scores to the hard 0/1 labels here;
# cells re-run after this point see labels, not scores.
predicted = results

In [None]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(real,predicted))

# Submission

In [None]:
# Load the tweets to be scored for the submission and preview them.
subtest = pd.read_csv('../input/nlp-getting-started/test.csv')
subtest.head()

Data Pre-processing on Test data

In [None]:
# Apply to the submission data the same cleaning steps, in the same order,
# that were applied to the training frame.

# 1. Fill missing keyword/location values with the placeholder 'a'.
for column in ['keyword', 'location']:
  subtest = fillna(subtest, column, 'a')

# 2. Merge keyword, location and tweet body into one field; drop the rest.
subtest['full_text'] = subtest['keyword'] + ' ' + subtest['location'] + ' ' + subtest['text']
subtest.drop(columns=['keyword', 'location', 'text', 'id'], inplace=True)

# 3. Punctuation and stop-word removal.
subtest['full_text'] = subtest['full_text'].map(remove_punct)
subtest['full_text'] = subtest['full_text'].map(remove_stopwords)

# 4. Lemmatize, then stem.
subtest['full_text'] = [getLemmText(row_text) for row_text in subtest['full_text']]
subtest['full_text'] = [getStemmText(row_text) for row_text in subtest['full_text']]

# 5. Tag-symbol removal and number replacement.
subtest['full_text'] = [removetags(row_text) for row_text in subtest['full_text']]
subtest['full_text'] = [replaceNumbers(row_text) for row_text in subtest['full_text']]

In [None]:
# Preview the cleaned submission text.
subtest.head()

In [None]:
# The model was trained on a 1-D array of strings (df.full_text.values), so
# feed it the text column in the same form rather than the whole DataFrame.
test_pred = model.predict(subtest['full_text'].values)
submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
# The single-unit output layer yields one score per row in an (n, 1) array;
# flatten it to 1-D and apply the same "> 0.5" rule used in the evaluation.
submission['target'] = (test_pred.ravel() > 0.5).astype(int)
submission.to_csv('sub.csv', index=False)