# Introduction
The challenge in this kernel is to build a model to predict which tweets are disaster tweets and which ones are not given a train set of tweets with their labels. The model will be evaluated with the given test set.

## 1.0 Data exploration
Load and explore data

In [None]:
#load libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
# text/file processing libraries
import string
import re
import sys
from nltk.corpus import stopwords
from itertools import chain
# warnings
import string
import warnings
warnings.filterwarnings('ignore')

The dataset for this kernel is the Kaggle "Real or Not? NLP with Disaster Tweets" competition data.

In [None]:
# load the train and test data sets
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
print('Number of Training Samples = {}'.format(train_df.shape[0]))
print('Number of Test Samples = {}\n'.format(test_df.shape[0]))
print('Training X Shape = {}'.format(train_df.shape))
print('Training y Shape = {}\n'.format(train_df['target'].shape[0]))
print('Test X Shape = {}'.format(test_df.shape))

print('Test y Shape = {}\n'.format(test_df.shape[0]))
print('Index of Train Set:\n', train_df.columns)
print('Index of Test Set:\n', test_df.columns)

In [None]:
# class distribution of train set
# Pass the column by keyword: seaborn >= 0.12 treats the first positional
# argument as `data` rather than `x`, and older releases warn about it.
pl = sb.countplot(x=train_df['target'])

Non disaster tweets represented with 0 are more than disaster tweets represented with 1

####  Sample data
Explore sample disaster and non disaster tweet

In [None]:
# display sample train data
train_df.sample(5)

In [None]:
# sample train disaster tweet
train_df.loc[1241]['text']

The keywords 'buildings on fire' in the above tweet are in the text. Though, the keywords do not appear in that order in the text.

In [None]:
# sample train non disaster tweet
train_df.loc[2301]['text']

The keyword 'demolish' in the above tweet may literally mean disaster but reading the text indicates no disaster.

An analysis of the sample disaster and non-disaster tweets indicates that the text column is the most important one: it contains the keywords, and the context in which they appear determines whether a tweet describes a disaster or not.

#### Mislabelled tweets
This idea was adapted from [NLP with Disaster Tweets - EDA, Cleaning and BERT](https://www.kaggle.com/gunesevitan/nlp-with-disaster-tweets-eda-cleaning-and-bert)

"There are **18** unique tweets in training set which are labeled differently in their duplicates. Those tweets are probably labeled by different people and they interpreted the meaning differently because some of them are not very clear. Tweets with two unique `target` values are relabeled since they can affect the training score."

In [None]:
train_df.groupby(['text']).nunique().sort_values(by='target', ascending=False)[0:18]

In [None]:
# Count the distinct labels attached to each unique tweet text. A text with
# more than one distinct `target` was labelled inconsistently across its
# duplicates. Only the `target` column is needed for this check.
df_mislabeled = train_df.groupby('text')['target'].nunique().sort_values(ascending=False)
df_mislabeled = df_mislabeled[df_mislabeled > 1]
df_mislabeled_all = df_mislabeled.index.tolist()
# No text cleaning has run at this point, so report what is actually counted.
print(f'Number of duplicated tweets with conflicting labels: {len(df_mislabeled_all)}')

In [None]:
# Work on a copy of the labels so the original `target` column stays untouched.
train_df['target_relabeled'] = train_df['target'].copy()

# Conflicting tweets that are relabelled as disasters (1); every other
# conflicting tweet is relabelled as non-disaster (0).
target_1_list = [
    "CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring",
    ".POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4",
    "Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE",
    "RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG",
    "Caution: breathing may be hazardous to your health.",
]

# Boolean masks replace the per-text loop: first reset every conflicting
# tweet to 0, then raise the hand-picked ones to 1.
has_conflict = train_df['text'].isin(df_mislabeled_all)
is_disaster = train_df['text'].isin(target_1_list)
train_df.loc[has_conflict, 'target_relabeled'] = 0
train_df.loc[has_conflict & is_disaster, 'target_relabeled'] = 1

# Rows whose label actually changed.
filter_mislabel = train_df['target'].ne(train_df['target_relabeled'])
print(f'Number of relabeled: {len(train_df[filter_mislabel])}')
train_df[filter_mislabel][:12]

#### Exploring text
Explore text column for data cleaning

In [None]:
train_df['text'].sample(20).tolist()

The text needs to be cleaned of content that may not help in predicting disaster versus non-disaster tweets, such as punctuation, contractions, stop words, short words, URLs, HTML tags, emojis, mentions, hashtags, and misspellings.

## 2.0 Data cleaning
Cleaning text means converting it to a list of words or tokens, different cleaning task will be performed on the dataset.

#### Remove html links and entity references

In [None]:
def html_references(tweets):
    """Strip URLs and common HTML entities from a tweet and lower-case it.

    Parameters
    ----------
    tweets : str
        Raw text of a single tweet.

    Returns
    -------
    str
        The tweet with http(s) URLs and the entities ``&amp;``, ``&gt;`` and
        ``&lt;`` removed, lower-cased, with whitespace runs collapsed to
        single spaces.
    """
    # remove url - references to websites
    url_remove = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    texts = re.sub(url_remove, '', tweets)
    # remove common html entity references; every entity is matched together
    # with its trailing ';' so that '&lt;' no longer leaves a stray ';' behind
    entities_remove = r'&amp;|&gt;|&lt;'
    texts = re.sub(entities_remove, "", texts)
    # split on whitespace, lower-case each word and re-join with single spaces
    return " ".join(word.lower() for word in texts.split())
# Build the `clean_text` column on both splits from the raw `text` column,
# handing the cleaner to apply() directly rather than through a lambda.
train_df['clean_text'] = train_df['text'].apply(html_references)
test_df['clean_text'] = test_df['text'].apply(html_references)

#### Remove apostrophes/contractions

In [None]:
def decontraction(tweet):
    """Expand English contractions in a tweet.

    Rules are applied in order as regular-expression substitutions: specific
    contractions first, then general suffix rules. Longer forms are listed
    before the shorter forms they contain (e.g. ``won't've`` before
    ``won't``) so the longer rule gets a chance to match.

    Parameters
    ----------
    tweet : str
        Tweet text. Patterns are lower-case, so the text is expected to be
        lower-cased already.

    Returns
    -------
    str
        The tweet with contractions expanded. Every replacement starts with
        a space, so the result may contain consecutive spaces.
    """
    rules = (
        # specific
        (r"won\'t've", " will not have"),
        (r"won\'t", " will not"),
        (r"can\'t've", " can not have"),
        (r"can\'t", " can not"),
        (r"don\'t", " do not"),
        (r"ma\'am", " madam"),
        (r"let\'s", " let us"),
        (r"ain\'t", " am not"),
        # "sha'n't" was previously written as "sha\n't", which matches a
        # newline character instead of the "'n" it was meant to cover
        (r"sha\'n\'t", " shall not"),
        (r"shan\'t", " shall not"),
        (r"o\'clock", " of the clock"),
        (r"y\'all", " you all"),
        # general
        (r"n\'t've", " not have"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d've", " would have"),
        (r"\'d", " would"),
        (r"\'ll've", " will have"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rules:
        tweet = re.sub(pattern, replacement, tweet)
    return tweet
# Expand contractions in the cleaned text of both splits.
train_df['clean_text'] = train_df['clean_text'].apply(decontraction)
test_df['clean_text'] = test_df['clean_text'].apply(decontraction)

#### Remove punctuations and unprintable characters
Remove punctuations will remove the characters specified by string.punctuation while the inverse of string.printable will remove non ascii characters.

In [None]:
# print puntuation characters
string.punctuation

In [None]:
# print printable characters
string.printable

The punctuation characters will be removed, together with any non-English or Unicode characters that are not in string.printable.

In [None]:
def filter_punctuations_etc(tweets):
    """Remove ASCII punctuation and blank out non-printable characters.

    The tweet is split on whitespace and each token is processed on its own:
    characters in ``string.punctuation`` are deleted, then every character
    outside ``string.printable`` is replaced by a single space. The tokens
    are finally re-joined with single spaces.

    Parameters
    ----------
    tweets : str
        Text of a single tweet.

    Returns
    -------
    str
        The filtered tweet.
    """
    punctuation_class = re.compile('[%s]' % re.escape(string.punctuation))
    unprintable_class = re.compile('[^%s]' % re.escape(string.printable))
    cleaned = []
    for token in tweets.split():
        token = punctuation_class.sub('', token)
        # anything outside string.printable becomes a space, not an empty string
        cleaned.append(unprintable_class.sub(' ', token))
    return " ".join(cleaned)
# Strip punctuation and non-printable characters from both splits.
train_df['clean_text'] = train_df['clean_text'].apply(filter_punctuations_etc)
test_df['clean_text'] = test_df['clean_text'].apply(filter_punctuations_etc)

#### Separate alphanumeric characters

In [None]:
def separate_alphanumeric(tweets):
    """Split a tweet into alternating runs of letters and runs of digits.

    Everything that is neither a letter nor a digit (whitespace, underscores,
    other symbols) acts as a separator and is dropped.

    Parameters
    ----------
    tweets : str
        Text of a single tweet.

    Returns
    -------
    str
        The letter runs and digit runs joined by single spaces.
    """
    # [^\W\d_]+ is a run of letters, \d+ is a run of digits
    letters_or_digits = re.compile(r"[^\W\d_]+|\d+")
    return " ".join(found.group(0) for found in letters_or_digits.finditer(tweets))
# Separate letter runs from digit runs in both splits.
train_df['clean_text'] = train_df['clean_text'].apply(separate_alphanumeric)
test_df['clean_text'] = test_df['clean_text'].apply(separate_alphanumeric)

#### Change repetitive characters
Collapse runs of repeated characters, e.g. "goooooooaaaal" becomes "gooaal", so the spell checker can try to correct the word. English words do not contain more than 2 consecutive identical letters.

In [None]:
def cont_rep_char(text):
    """Replacement callback for ``re.sub``: keep at most two characters of a run.

    Parameters
    ----------
    text : re.Match
        Match object covering a run of one repeated character.

    Returns
    -------
    str
        The first two characters of the matched run (or the whole match when
        it is shorter). A string is always returned; there is no implicit
        ``None`` path.
    """
    return text.group(0)[:2]
def unique_char(rep, tweets):
    """Shorten runs of repeated letters in a tweet.

    Parameters
    ----------
    rep : callable
        Replacement callback that receives each match, e.g. ``cont_rep_char``.
    tweets : str
        Text of a single tweet.

    Returns
    -------
    str
        The tweet with every run of two or more identical letters replaced
        by ``rep(match)``.
    """
    # [^\W\d_] matches letters only, so digit runs such as "1000" (and
    # underscores) are left intact instead of being collapsed to "100"
    substitute = re.sub(r'([^\W\d_])\1+', rep, tweets)
    return substitute
# Collapse repeated characters in both splits (values are cast to str first).
train_df['clean_text'] = [unique_char(cont_rep_char, tweet) for tweet in train_df['clean_text'].astype('str')]
test_df['clean_text'] = [unique_char(cont_rep_char, tweet) for tweet in test_df['clean_text'].astype('str')]

#### Spell checking
Check spellings and make corrections where possible; this is a computationally expensive step.

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    """Replace words unknown to the spell checker with its best correction.

    Uses the module-level ``spell`` checker instance.

    Parameters
    ----------
    text : str
        Whitespace-separated text of a single tweet.

    Returns
    -------
    str
        The text with unknown words replaced, joined by single spaces. A
        word is kept unchanged when the checker offers no correction.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() may return None when it finds no candidate (newer
            # pyspellchecker releases); fall back to the original word so the
            # join below never receives None
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
#train_df['clean_text'] = train_df['clean_text'].apply(lambda x : correct_spellings(x))
#test_df['clean_text'] = test_df['clean_text'].apply(lambda x : correct_spellings(x))

#### Split attached words
Split attached words especially common with twitter hashtags like "caraccidentlawyer" into "car", "accident", and "lawyer". Some desirable words may be split, but the gain may be more than the loss.

In [None]:
!pip install wordninja

In [None]:
import wordninja # !pip install wordninja
def split_attached_words(tweet):
    """Split run-together words with ``wordninja`` and re-join them with spaces.

    Parameters
    ----------
    tweet : str
        Text of a single tweet.

    Returns
    -------
    str
        The pieces returned by ``wordninja.split`` joined by single spaces.
    """
    return " ".join(wordninja.split(tweet))
# Split attached words in both splits.
train_df['clean_text'] = train_df['clean_text'].apply(split_attached_words)
test_df['clean_text'] = test_df['clean_text'].apply(split_attached_words)

#### Remove stopwords and short words
Stopwords are common words that rarely carry keyword information. Stopwords and single-letter words will be removed to reduce the vocabulary size and the sparsity of a bag-of-words model.

In [None]:
def stopwords_shortwords(tweet, stop_words=None):
    """Remove stop words and single-letter words from a tweet.

    Parameters
    ----------
    tweet : str
        Whitespace-separated text of a single tweet.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = set(stopwords.words( 'english' ))
    kept = []
    for word in tweet.split():
        # filter out stop words
        if word in stop_words:
            continue
        # Filter out single-letter words. The alphabetic test is applied to
        # the token being filtered, so non-alphabetic tokens such as single
        # digits are always kept. Previously the whole list was re-filtered
        # once per token, and short digits were dropped only when some other
        # alphabetic token happened to be present.
        if word.isalpha() and len(word) <= 1:
            continue
        kept.append(word)
    return " ".join(kept)
# Drop stop words and short words from both splits.
train_df['clean_text'] = train_df['clean_text'].apply(stopwords_shortwords)
test_df['clean_text'] = test_df['clean_text'].apply(stopwords_shortwords)

## 3.0 GloVe Sentiment Analysis
Prepare data for a word-embedding sentiment analysis model. A word embedding is a context-based dense vector representation of text. There are 2 approaches to using word embeddings: the 1st is to learn the embedding for a specific task (it can then be reused in another project); the 2nd is to use pretrained word embeddings such as Word2Vec, GloVe, BERT, etc.

#### Pre-processing
Prepare data for processing

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 7.5% of the labelled tweets for validation (fixed seed); both
# parts get a fresh 0..n-1 index.
train_df2, validate_df = (
    part.reset_index(drop=True)
    for part in train_test_split(train_df, test_size=0.075, random_state=0)
)

Separating texts and targets for modeling

In [None]:
# train and test sets
all_df=pd.concat([train_df,test_df])
X_all = all_df['clean_text']
# training set
X_train = train_df2['clean_text']
y_train = train_df2['target_relabeled'].astype(int)
# validation set
X_validate= validate_df['clean_text']
y_validate = validate_df['target_relabeled'].astype(int)
# test set
X_test = test_df['clean_text']

#### Tokenization

Tokenizing the text

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# create a tokenizer for encoding texts as digits
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on the given texts.

    Parameters
    ----------
    lines : iterable of str
        Texts to fit the tokenizer on.

    Returns
    -------
    Tokenizer
        The tokenizer after ``fit_on_texts(lines)`` has been called on it.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
# create the tokenizer - mapping data to integer values
tokenizer = create_tokenizer(X_all)
word_index=tokenizer.word_index
max_words = len(word_index) + 1
print( 'unique words are : %d' % max_words)

In [None]:
# longest tweet, measured in whitespace-separated tokens
max_length = max(len(tweet.split()) for tweet in X_all)
print( ' Maximum length: %d ' % max_length)

Transform preprocessed text into padded sequences of word ids to get a feature matrix

In [None]:
# integer encode and pad tweets
from tensorflow.keras.preprocessing.sequence import pad_sequences
def encode_data(tokenizer, max_length, data):
    """Integer-encode texts and pad them to a common length.

    Parameters
    ----------
    tokenizer : Tokenizer
        Fitted tokenizer providing ``texts_to_sequences``.
    max_length : int
        Length passed to ``pad_sequences`` as ``maxlen``.
    data : iterable of str
        Texts to encode.

    Returns
    -------
    The result of ``pad_sequences`` with ``padding='post'`` (padding is added
    at the end of each sequence).
    """
    sequences = tokenizer.texts_to_sequences(data)
    return pad_sequences(sequences, maxlen=max_length, padding= 'post' )
Xtrain = encode_data(tokenizer, max_length, X_train)
Xvalidate = encode_data(tokenizer, max_length, X_validate)
Xtest = encode_data(tokenizer, max_length, X_test)

In [None]:
sb.heatmap(Xtrain==0, vmin=0, cbar=False)
plt.show()

#### GloVe Embedding
Create an embedding matrix with GloVe

In [None]:
# parsing the GloVe word-embeddings file
import os
glove_dir = '../input/glove6b'
embeddings_index = {}
# `with` closes the file even if parsing raises. The encoding is given
# explicitly so decoding does not depend on the platform default.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # each line is "<word> <coef_1> ... <coef_n>"
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# preparing the GloVe word-embeddings matrix
# Row i holds the GloVe vector of the word with tokenizer index i; words
# without a GloVe vector keep an all-zero row.
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

####  GloVe Embedding and LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
import  tensorflow.keras.optimizers as optimizers
from tensorflow.keras.layers import LSTM, Bidirectional
# Embedding -> two stacked bidirectional LSTMs -> dense classifier head
model = Sequential([
    Embedding(max_words, 100, input_length=max_length),
    # the first LSTM returns the full sequence so the second one can consume it
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    # densely connected classifier
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# summarize
model.summary()

In [None]:
# load pretrained word embeddings into the Embedding layer
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = True

In [None]:
# compile
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
model.compile(loss= 'binary_crossentropy',  optimizer=optimizers.Adam(learning_rate=.0001), metrics=[ 'accuracy' ])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
callbacks = [
    EarlyStopping(patience=3, verbose=1),
    ReduceLROnPlateau(factor=0.25, patience=2, min_lr=0.00001, verbose=1),
    ModelCheckpoint('model_lstm.h5', verbose=1, save_best_only=True, save_weights_only=True)
]

In [None]:
# fit network
model.fit(Xtrain, y_train, epochs=10, callbacks=callbacks, validation_data=(Xvalidate,y_validate))

In [None]:
sample_submission=pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
y_pre=model.predict(Xtest)
# Round the sigmoid outputs to 0/1 and flatten to 1-D. reshape(-1) adapts to
# however many test rows there are instead of hard-coding 3263.
# NOTE(review): predictions come from the weights left after fit(); the best
# checkpoint written to 'model_lstm.h5' is never reloaded. Confirm this is intended.
y_pre=np.round(y_pre).astype(int).reshape(-1)
sub=pd.DataFrame({'id':sample_submission['id'].values.tolist(),'target':y_pre})
sub.to_csv('tweet5a.csv',index=False)

## 4.0 BERT Sentiment Analysis
BERT (Bidirectional Encoder Representations from Transformers) is a language model that can recognize context and semantics in a sentence. It takes input text transformed into 3 vectors: ids, masks and segments. It can be used in a variety of NLP tasks, including sentiment analysis, and is known to perform well with minimal or no text preprocessing. This is because the tokenizer needs to capture the context of each sentence, which can be lost in a text cleaning exercise. This kernel will process text for BERT without text cleaning.

In [None]:
# We will use the official tokenization script created by the Google team
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tokenization
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
import  tensorflow.keras.optimizers as optimizers

#### Preprocessing

A pretrained BERT model can be used in two ways: with the BERT layers frozen during training, which preserves the pretrained parameters, or (the preferred method) with the BERT layers fine-tuned on the new data. Running BERT for the first time can be a hassle; insight on the implementation was gained from [disaster nlp: keras bert using tfhub](https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub)

In [None]:
# create BERT embedding layer
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# create BERT tokenizer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

Define function to encode input as 3 matrices of tokens or ids, masks and segments

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Encode texts into fixed-length token-id, mask and segment-id arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` markers, converted to ids and right-padded with 0
    up to ``max_len``.

    Parameters
    ----------
    texts : iterable of str
        Texts to encode.
    tokenizer : object
        Must provide ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.
    max_len : int, default 512
        Length of every encoded row.

    Returns
    -------
    tuple of np.ndarray
        ``(token_ids, masks, segment_ids)``. ``masks`` is 1 for real tokens
        and 0 for padding; ``segment_ids`` is all zeros.
    """
    token_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        # leave two positions free for the [CLS] and [SEP] markers
        word_pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        input_sequence = ["[CLS]"] + word_pieces + ["[SEP]"]
        n_real = len(input_sequence)
        n_pad = max_len - n_real

        ids = tokenizer.convert_tokens_to_ids(input_sequence)
        ids += [0] * n_pad

        token_rows.append(ids)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
# encoding input data
train_input = bert_encode(train_df.text.values, tokenizer, max_len=128)
test_input = bert_encode(test_df.text.values, tokenizer, max_len=128)
train_labels = train_df.target_relabeled.values

#### Modeling

In [None]:
# define create model function
def build_model(bert_layer, max_len=512):
    """Build and compile a binary classifier on top of a BERT layer.

    Parameters
    ----------
    bert_layer : callable Keras layer
        Called with ``[input_word_ids, input_mask, segment_ids]``; its second
        output is used as the per-token sequence output.
    max_len : int, default 512
        Length of each of the three integer input sequences.

    Returns
    -------
    Model
        Keras model compiled with Adam (learning rate 2e-5), binary
        cross-entropy loss and the accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # use the representation at sequence position 0 as the classifier input
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras releases no longer accept
    model.compile(optimizers.Adam(learning_rate=2e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# create model
model = build_model(bert_layer, max_len=128)
model.summary()

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
callbacks2 = [
    ModelCheckpoint('model_bert.h5', monitor='val_loss', save_best_only=True)
]

In [None]:
model.fit(train_input, train_labels, validation_split=0.075, epochs=3, 
          callbacks=callbacks2, batch_size=32)

In [None]:
sample_submission=pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
y_pre2=model.predict(test_input)
# Round the sigmoid outputs to 0/1 and flatten to 1-D. reshape(-1) adapts to
# however many test rows there are instead of hard-coding 3263.
y_pre2=np.round(y_pre2).astype(int).reshape(-1)
sub2=pd.DataFrame({'id':sample_submission['id'].values.tolist(),'target':y_pre2})
sub2.to_csv('tweet5b.csv',index=False)

## If you like this kernel, please upvote, corrections are welcome.