## Imports

In [None]:
# Data handling
import pandas as pd
import numpy as np

# Show full cell contents (whole tweets) instead of truncating long strings.
pd.set_option('display.max_colwidth', None)

import re

from wordcloud import WordCloud
import contractions

# Plotting defaults shared by every figure in the notebook.
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
plt.rcParams['font.size'] = 15

# NLP tooling; the module-level lemmatizer is reused by preprocess_text.
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

# English stop words held as a set; preprocess_text tests tokens against it.
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries used below.
import warnings
warnings.filterwarnings("ignore")

## Data Load

In [None]:
# Load the training tweets and preview the first 20 rows.
# NOTE(review): the relative path resolves against the kernel's working directory.
df_train = pd.read_csv('../Datasets/disaster_tweet/train.csv')
df_train.head(20)

In [None]:
# Preview the last 20 rows as well.
df_train.tail(20)

### Observation

1. Mixed case
2. Contractions
3. Hashtags and mentions
4. Incorrect spellings
5. Punctuation
6. Websites and URLs

## Functions

In [None]:
# One space-separated string holding every tweet, used for quick substring probes.
all_text = ' '.join(df_train['text'].tolist())

def check_texts(check_item, all_text):
    ''' This method reports whether check_item occurs anywhere in all_text'''
    is_present = check_item in all_text
    return is_present

In [None]:
# Probe the corpus for openers of common HTML tags.
for html_marker in ('<a', '<div', '<p'):
    print(check_texts(html_marker, all_text))

In [None]:
# Probe the corpus for the substring '#x'
# (presumably hexadecimal character references such as &#x27; — confirm).
print(check_texts('#x', all_text))

In [None]:
# Probe the corpus for two emoticons and one ordinary word.
for probe in (':)', '<3', 'heard'):
    print(check_texts(probe, all_text))

In [None]:
def remove_urls(text):
    ''' This method takes in text to remove urls and website links, if any.

    A link starts with ``www.``, ``http://`` or ``https://`` and extends over
    the URL characters that follow it. The matched span is deleted; the
    surrounding whitespace is left untouched.
    '''
    # The dot after "www" is escaped on purpose: a bare '.' matches any
    # character, which let ordinary text such as "awwww so" be eaten as a link.
    url_pattern = r'(www\.|http[s]?://)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(url_pattern, '', text)

def remove_html_entities(text):
    ''' This method removes html tags and html character entities.

    Tags (``<...>``) and entities (``&name;``, ``&#123;``, ``&#x1F;``) are
    deleted outright; entities are not decoded into the character they stand for.
    '''
    html_entities = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    # This runs before the text is lower-cased, so match case-insensitively;
    # otherwise upper-case hex digits (&#x1F600;) and names (&AMP;) slip through.
    return re.sub(html_entities, '', text, flags=re.IGNORECASE)

def convert_lower_case(text):
    ''' This method returns a lower-cased copy of the text'''
    lowered = text.lower()
    return lowered

def detect_news(text):
    ''' This method appends an extra ' news' token to any text that already contains the substring 'news' '''
    mentions_news = 'news' in text
    return text + ' news' if mentions_news else text

def remove_social_media_tags(text):
    ''' This method removes @ and # tags.

    An @mention is removed together with its handle; for a hashtag only the
    '#' sign is removed, so the tag word itself stays in the text.
    '''
    # Handles may contain underscores and capitals, so match the full handle
    # alphabet rather than lower-case alphanumerics only; otherwise
    # '@john_doe' leaves '_doe' behind and '@BBCNews' is not matched at all.
    tag_pattern = r'@[A-Za-z0-9_]+|#'
    return re.sub(tag_pattern, '', text)

# Count it before I remove them altogether
def count_punctuations(text):
    ''' This method counts the punctuation characters in text.

    Counted characters: . ? " ' ` , - ! : ; ( ) [ ] / and the curly quotes.
    Every occurrence counts individually, so '...' contributes 3.
    '''
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences such as '\,' and '\(' which newer Python versions warn about.
    # The matched character set is the same as before.
    punctuation_pattern = r'''[.?"'`,\-!:;()\[\]/“”]'''
    return len(re.findall(punctuation_pattern, text))


def preprocess_text(x):
    ''' This method normalises one tweet into a space-separated string of lemmas.

    Steps, in order: expand contractions, drop every character that is not a
    letter, digit or whitespace, lower-case, split on whitespace, discard
    stop words and lemmatize what remains.
    '''
    # Expand contractions while the apostrophes are still in the text; once
    # "don't" has been reduced to "dont" the expansion can no longer be relied on.
    expanded = contractions.fix(x)
    cleaned_text = re.sub(r'[^a-zA-Z\d\s]+', '', expanded).lower()
    # split() with no argument breaks on any run of whitespace, so tokens
    # reach the stop-word test without stray tabs or newlines, and the words
    # produced by an expansion ("do", "not") are filtered one by one.
    return " ".join(wnl.lemmatize(each_word)
                    for each_word in cleaned_text.split()
                    if each_word not in STOPWORDS)

In [None]:
# PorterStemmer is not among the names imported in the notebook's import cell,
# so bring it into scope here; without this the cell stops with a NameError
# on a fresh kernel.
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()

# The cleaning steps are order-dependent: URLs and HTML are removed first,
# the text is lower-cased before the 'news' check and tag removal, and
# punctuation is counted before preprocess_text strips it.
# NOTE(review): 'text' is overwritten in place, so running this cell a second
# time re-applies every step to already-cleaned text.
df_train['text'] = df_train['text'].apply(remove_urls)
df_train['text'] = df_train['text'].apply(remove_html_entities)
df_train['text'] = df_train['text'].apply(convert_lower_case)
df_train['text'] = df_train['text'].apply(detect_news)
df_train['text'] = df_train['text'].apply(remove_social_media_tags)
df_train['punctuation_count'] = df_train['text'].apply(count_punctuations)
df_train['text'] = df_train['text'].apply(preprocess_text)

# Token list per tweet and its length, used by the tweet-length analysis.
df_train['text_tokenized'] = df_train['text'].apply(word_tokenize)
df_train['words_per_tweet'] = df_train['text_tokenized'].apply(len)

In [None]:
# Inspect the frame after cleaning and feature extraction.
df_train

## Tweet Length Analysis

In [None]:
# Distribution of tweet length (in tokens), split by target class.
fig, ax = plt.subplots()
sns.histplot(data=df_train, x='words_per_tweet', hue='target', kde=True, ax=ax)
plt.show()

## Punctuation Analysis

In [None]:
# Number of tweets per target class, one bar per distinct punctuation count.
sns.countplot(x='target', hue='punctuation_count', data=df_train)
# Replace the automatic legend with an empty one.
plt.legend([])
plt.show()

## Tweet Text Analysis using WordCloud

In [None]:
# Concatenate the cleaned text of every tweet labelled 1 into one string.
is_real_disaster = df_train['target'] == 1
real_disaster_tweets = ' '.join(df_train.loc[is_real_disaster, 'text'])

In [None]:
# Preview only: the joined corpus is one very long string, and echoing all
# of it floods the cell output.
real_disaster_tweets[:1000]

In [None]:
# Concatenate the cleaned text of every tweet labelled 0 into one string.
is_not_disaster = df_train['target'] == 0
non_real_disaster_tweets = ' '.join(df_train.loc[is_not_disaster, 'text'])

In [None]:
# Word cloud of the 100 most prominent words in tweets labelled 1.
real_cloud_options = dict(background_color="black",
                          max_words=100,
                          width=1000,
                          height=600,
                          random_state=1)
wc = WordCloud(**real_cloud_options).generate(real_disaster_tweets)

plt.figure(figsize=(15, 15))
plt.imshow(wc)
plt.axis("off")
plt.title("Wordcloud of Tweets about Real Disasters")
plt.show()

In [None]:
# Word cloud of the 100 most prominent words in tweets labelled 0.
non_real_cloud_options = dict(background_color="black",
                              max_words=100,
                              width=1000,
                              height=600,
                              font_step=1,
                              random_state=1)
wc = WordCloud(**non_real_cloud_options).generate(non_real_disaster_tweets)

plt.figure(figsize=(15, 15))
plt.imshow(wc)
plt.axis("off")
plt.title("Wordcloud of Tweets NOT about Real Disasters")
plt.show()

## TF-IDF

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Hold out part of the cleaned tweets for evaluation; random_state fixes the shuffle.
target = df_train['target'].values
X_train, X_test, y_train, y_test = train_test_split(df_train['text'], target, random_state=0)

In [None]:
# Float min_df/max_df are document-frequency proportions; 0. and 1. keep every term.
tf_idf = TfidfVectorizer(min_df=0.,
                         max_df=1.,
                         use_idf=True)

In [None]:
# Learn vocabulary and IDF weights from the training split only.
# fit() returns the vectorizer itself, so tfidf_vector refers to the same object as tf_idf.
tfidf_vector = tf_idf.fit(X_train)

In [None]:
# Peek at ten (term, column index) pairs from the learned vocabulary.
list(tfidf_vector.vocabulary_.items())[:10]

In [None]:
# Vectorize the training tweets with the fitted vocabulary.
tf_idf_train = tf_idf.transform(X_train)

In [None]:
# Vectorize the held-out tweets with the vocabulary learned from the training split.
tf_idf_test = tf_idf.transform(X_test)

In [None]:
# Baseline classifier with default hyperparameters on the TF-IDF features.
clf = LogisticRegression()
clf.fit(tf_idf_train, y_train)

In [None]:
# Predicted class labels for the held-out tweets.
y_pred_log_reg = clf.predict(tf_idf_test)

In [None]:
# Share of held-out tweets classified correctly.
accuracy_score(y_test, y_pred_log_reg)

In [None]:
# (precision, recall, F1) on the held-out tweets, displayed as one tuple.
precision_score(y_test, y_pred_log_reg), recall_score(y_test, y_pred_log_reg), f1_score(y_test, y_pred_log_reg)

**Future Work**

1. Testing different algorithms
2. Test different TF-IDF settings, e.g. set `use_idf=False` so that only term frequency is considered
3. Hyperparameter Tuning

## GloVe and Neural Nets

The Embedding layer learns an embedding for every word in the training dataset. It is defined as the first hidden layer of a network.

It is a flexible layer that can be used in a variety of ways, such as:
* It can be used alone to learn a word embedding that can be saved and used in another model later.
* It can be used as part of a deep learning model where the embedding is learned along with the model itself.
* It can be used to load a pre-trained word embedding model, a type of transfer learning.

Resources - 

1. https://keras.io/api/layers/core_layers/embedding/

2. https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

### Using Keras API's Embedding Layer

In [None]:
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from keras.layers import Flatten

In [None]:
import os

# The GloVe vectors live outside the repository. Set the GLOVE_FILE environment
# variable to point at your own copy; the fallback is the original author's
# local path, kept so existing setups continue to work.
glove_file = os.environ.get(
    'GLOVE_FILE',
    r'C:\Users\nroy0\Documents\Resources\glove.6B\glove.6B.300d.txt')
# Convert the GloVe text file to word2vec text format in a temporary file.
w2v_file = get_tmpfile("glove_w2v.txt")
glove2word2vec(glove_file, w2v_file)

In [None]:
# Load the converted vectors as a gensim KeyedVectors lookup.
w2v_model = KeyedVectors.load_word2vec_format(w2v_file)

In [None]:
# Every key known to the pre-trained model.
vocab = w2v_model.key_to_index.keys()

# Look up the vector of every key and display the resulting array.
glove_embedding_matrix = w2v_model[vocab]
glove_embedding_matrix

In [None]:
# Same split arguments (including random_state=0) as the TF-IDF section,
# repeated so this section can be run from here.
target = df_train['target'].values

X_train, X_test, y_train, y_test = train_test_split(df_train['text'], target, random_state=0)

In [None]:
# Length every token sequence is padded or truncated to; the Embedding layers
# defined further down read this value as their input_length.
# NOTE(review): tweets are presumably far shorter than 1000 tokens, so a much
# smaller value would likely do — confirm against the data before changing it.
max_length = 1000

# Vocabulary size and sequence length are separate settings. The tokenizer is
# left uncapped (num_words=None) so the indices it emits cover the same
# vocabulary as tokenizer.word_index, from which the embedding matrix is built;
# capping it at max_length sent every word beyond the 999 most frequent to the
# OOV index.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=None)
tokenizer.fit_on_texts(X_train)

sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

# Zero-pad after the tokens ('post') up to max_length.
padded_train = pad_sequences(sequences_train, padding='post', maxlen=max_length)
padded_test = pad_sequences(sequences_test, padding='post', maxlen=max_length)

In [None]:
# One 300-d row per tokenizer index (+1 because index 0 is never assigned to a
# word); rows of words without a pre-trained vector stay all-zero and the words
# are collected in keys_not_present.
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, 300))
keys_not_present = []
for token, row in tokenizer.word_index.items():
    if token in w2v_model.key_to_index:
        embedding_matrix[row] = w2v_model.get_vector(token)
    else:
        keys_not_present.append(token)

In [None]:
# Words from the training vocabulary that have no pre-trained vector.
print(keys_not_present)

In [None]:
# Expect (vocab_size, 300).
embedding_matrix.shape

In [None]:
def get_model():
    ''' This method builds the baseline network: frozen pre-trained embeddings,
    flattened and fed into a single sigmoid output unit. The model is returned
    uncompiled.'''
    frozen_embeddings = Embedding(vocab_size,
                                  300,
                                  weights=[embedding_matrix],
                                  input_length=max_length,
                                  trainable=False)
    return Sequential([
        frozen_embeddings,
        Flatten(),
        Dense(1, activation='sigmoid'),
    ])

In [None]:
# Stop once val_loss has not improved for 15 epochs and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss",
                                               patience=15,
                                               verbose=1,
                                               mode="min",
                                               restore_best_weights=True)
# Write the model to disk only when it is the best seen so far.
best_checkpoint = keras.callbacks.ModelCheckpoint(filepath='model.hdf5',
                                                  verbose=1,
                                                  save_best_only=True)
callbacks = [early_stopping, best_checkpoint]

In [None]:
model = get_model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# Binary cross-entropy for the single sigmoid output; precision and recall are
# tracked in addition to the loss.
model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [None]:
# Run tf.functions eagerly; this works around the error discussed in the
# Stack Overflow thread linked below.
tf.config.run_functions_eagerly(True) 
## Stackoverflow - https://stackoverflow.com/questions/58352326/running-the-tensorflow-2-0-code-gives-valueerror-tf-function-decorated-functio
# NOTE(review): the test split is passed as validation data, so early stopping
# and checkpoint selection see the same data the model is scored on afterwards;
# consider carving a separate validation split out of the training data.
history = model.fit(padded_train, 
                    y_train, 
                    epochs=50, 
                    validation_data=(padded_test, y_test), 
                    callbacks=callbacks)

In [None]:
# Reload the best checkpoint and score it on the held-out tweets,
# thresholding the sigmoid output at 0.5.
model = keras.models.load_model('model.hdf5')
y_pred = (model.predict(padded_test) > 0.5).astype("int32")
for label, scorer in (('Accuracy: ', accuracy_score),
                      ('Precision: ', precision_score),
                      ('Recall: ', recall_score),
                      ('F1 Score: ', f1_score)):
    print(label, scorer(y_test, y_pred))

### Using Embedding Layer + LSTM

In [None]:
def get_lstm_model():
    ''' This method builds the recurrent network: frozen pre-trained embeddings,
    a 100-unit LSTM, dropout of 0.3, a flatten step and a single sigmoid output
    unit. The model is returned uncompiled.'''
    frozen_embeddings = Embedding(vocab_size,
                                  300,
                                  weights=[embedding_matrix],
                                  input_length=max_length,
                                  trainable=False)
    return Sequential([
        frozen_embeddings,
        LSTM(100),
        Dropout(0.3),
        Flatten(),
        Dense(1, activation='sigmoid'),
    ])

In [None]:
# Early stopping on val_loss with the library's default patience, restoring the best weights.
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss",
                                               verbose=1,
                                               mode="min",
                                               restore_best_weights=True)
# Write the model to disk only when it is the best seen so far.
best_checkpoint = keras.callbacks.ModelCheckpoint(filepath='lstm_model.hdf5',
                                                  verbose=1,
                                                  save_best_only=True)
callbacks = [early_stopping, best_checkpoint]

In [None]:
model = get_lstm_model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# Binary cross-entropy for the single sigmoid output; precision and recall are
# tracked in addition to the loss.
model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [None]:
# Run tf.functions eagerly; this works around the error discussed in the
# Stack Overflow thread linked below.
tf.config.run_functions_eagerly(True)
## Stackoverflow - https://stackoverflow.com/questions/58352326/running-the-tensorflow-2-0-code-gives-valueerror-tf-function-decorated-functio
# NOTE(review): the test split is passed as validation data, so early stopping
# and checkpoint selection see the same data the model is scored on afterwards;
# consider carving a separate validation split out of the training data.
history = model.fit(padded_train, 
                    y_train, 
                    epochs=2, 
                    validation_data=(padded_test, y_test), 
                    callbacks=callbacks)

In [None]:
# Reload the best LSTM checkpoint and score it on the held-out tweets,
# thresholding the sigmoid output at 0.5.
model = keras.models.load_model('lstm_model.hdf5')
y_pred = (model.predict(padded_test) > 0.5).astype("int32")
for label, scorer in (('Accuracy: ', accuracy_score),
                      ('Precision: ', precision_score),
                      ('Recall: ', recall_score),
                      ('F1 Score: ', f1_score)):
    print(label, scorer(y_test, y_pred))