<a href="https://www.kaggle.com/sdysch/nlp-disaster-tweets?scriptVersionId=88748764" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# TODO
* Look at location and keyword data (if not NaN)
* Hashtags in tweets:
    * Try to extract location info with `geotext`, for example:
```
from geotext import GeoText
places = GeoText("London is a great city")
places.cities
```
* LSTM
* cross validation

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

In [None]:
# Load the labelled competition tweets and list the available columns.
df_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
print(df_train.columns)

In [None]:
# Preview the first few rows of the raw training data.
print(df_train.head())

In [None]:
# Preprocessing step: lower-case every tweet.
# The 'text' column is overwritten in place, so the original casing is lost.
df_train['text'] = df_train['text'].str.lower()

In [None]:
# remove URLS
import re

def remove_urls(text):
    """Return ``text`` with every URL stripped out.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Surrounding whitespace is left
    untouched, so removing a URL from the middle of a sentence leaves the
    spaces on both sides in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Strip URLs from every tweet (overwrites the 'text' column), then show the result.
df_train['text'] = df_train['text'].apply(remove_urls)
print(df_train['text'])

In [None]:
# removing stop words
from nltk.corpus import stopwords
def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split on single spaces and every token that appears in the
    stop-word collection is dropped; the remaining tokens are re-joined with
    single spaces. Matching is exact and case-sensitive, so the text should
    already be lower-cased when the default (lower-case) list is used.

    Parameters
    ----------
    text : str
        The text to filter.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used;
        this requires ``stopwords`` (``nltk.corpus.stopwords``) to be in scope.

    Returns
    -------
    str
        The filtered text.
    """
    if stop_words is None:
        # Load the NLTK list once and keep it on the function object: the
        # function is applied to every row, and reloading the corpus per call
        # is needlessly slow.
        cached = getattr(remove_stopwords, '_english_stop_words', None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            remove_stopwords._english_stop_words = cached
        stop_words = cached
    elif not isinstance(stop_words, (set, frozenset)):
        # Hash-based membership tests instead of scanning a list per token.
        stop_words = frozenset(stop_words)

    # split(' ') (not split()) keeps empty tokens from repeated spaces.
    return ' '.join(w for w in text.split(' ') if w not in stop_words)

# Drop English stop words from every tweet (overwrites the 'text' column), then show the result.
df_train['text'] = df_train['text'].apply(remove_stopwords)
print(df_train['text'])

In [None]:
# removing punctuation
def remove_punctuation(text):
    """Return ``text`` with a fixed set of punctuation marks deleted.

    The characters removed are: full stop, comma, exclamation mark, question
    mark, double quote, single quote, colon and semicolon. Every other
    character (including ``@`` and ``#``) is kept.
    """
    for mark in ('.', ',', '!', '?', '"', "'", ':', ';'):
        text = text.replace(mark, '')
    # remove @? Might want to strip twitter usernames later
    return text
# Remove punctuation from every tweet (overwrites the 'text' column), then show the result.
df_train['text'] = df_train['text'].apply(remove_punctuation)
print(df_train['text'])

In [None]:
# Number of rows in the training frame.
print(len(df_train))

In [None]:
# Inspect the 'location' values, skipping rows where it is missing.
print(df_train['location'].dropna())

In [None]:
# split train into train + testing set, for model validation
# for final submissions, all the training data will be used to fit the model
X = df_train.drop(['target'], axis=1)
y = df_train['target']

from sklearn.model_selection import train_test_split
# test_size is the fraction that is *held out*: 0.2 keeps 80% of the rows for
# fitting. A value of 0.8 would leave only 20% of the data to train on.
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# check for class imbalance issues
# For the training split and then the testing split, print the share of
# positive (1) labels followed by the share of negative (0) labels.
for labels in (y_train, y_test):
    n_rows = len(labels)
    for cls in (1, 0):
        print(len(labels[labels == cls]) / n_rows)

* A similar class imbalance is present in both the training and testing splits, so it is likely the same in the full sample
* Might want to consider class reweighting in model training

In [None]:
# Inspect the training features after the split.
print(X_train)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Turn the cleaned tweets into fixed-length integer sequences.
# max_length is the padded sequence length; n_words is passed to the
# Tokenizer as num_words and later reused as the Embedding input dimension.
max_length = 40
n_words = 100000
tokenizer = Tokenizer(oov_token='<OOV>', num_words=n_words)
# The vocabulary is fitted on the training split only; the '<OOV>' token is
# configured for words outside that vocabulary.
tokenizer.fit_on_texts(X_train['text'])
word_index = tokenizer.word_index

# Encode and pad the training tweets (padding is added at the end: 'post').
training_sequences = tokenizer.texts_to_sequences(X_train['text'])
training_padded = pad_sequences(training_sequences, padding='post', maxlen=max_length)

# Encode the held-out tweets with the same tokenizer and padding settings.
testing_sequences = tokenizer.texts_to_sequences(X_test['text'])
testing_padded    = pad_sequences(testing_sequences, padding='post', maxlen=max_length)
print(training_padded.shape)
print(testing_padded.shape)

In [None]:
# Largest token id present in the padded training sequences.
print(training_padded.max())

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, GlobalAveragePooling1D, Dropout
# Baseline classifier: embed each token, average the embeddings over the
# sequence, then a small dense head ending in a single sigmoid unit.
model = Sequential([
    Embedding(n_words, 50, input_length=training_padded.shape[1]),
    GlobalAveragePooling1D(),
    #Dropout(0.25),
    Dense(30, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary target, so binary cross-entropy loss; accuracy is tracked as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping on validation loss is configured here but currently disabled:
# `callbacks` is left empty below, so `es` is never passed to fit().
es = EarlyStopping(monitor='val_loss', min_delta=0.00001, patience=20, verbose=1, restore_best_weights=True, mode='auto')
#callbacks = [es]
callbacks = []
    
# model is *really* prone to overfitting, only a few epochs needed
epochs = 4
batch_size = 32

# Train the baseline model; validation_split=0.2 reserves a fifth of the
# training sequences for the per-epoch validation metrics stored in `history`.
history = model.fit(training_padded,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=callbacks,
          validation_split=0.2,
          shuffle=True)

In [None]:
import matplotlib.pyplot as plt
# Training curves from the most recent fit: loss on the left panel,
# accuracy on the right, each with its validation counterpart.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

for panel, (train_key, val_key) in zip(ax, (('loss', 'val_loss'),
                                            ('accuracy', 'val_accuracy'))):
    panel.plot(history.history[train_key], label='Train')
    panel.plot(history.history[val_key], label='Validation')
    panel.set_xlabel('epochs')
    panel.set_ylabel(train_key)
    panel.legend(loc='best')

plt.show()

In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
import seaborn as sns

# Threshold the sigmoid outputs at 0.5 to get hard class predictions
# for the held-out split.
y_pred = model.predict(testing_padded) > 0.5

# Confusion matrix (normalize='true') followed by the headline metrics.
cm = confusion_matrix(y_test, y_pred, normalize='true')
print(cm)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')

# LSTM model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, GlobalAveragePooling1D, Dropout, Bidirectional
# Recurrent classifier: token embeddings feed two stacked LSTM layers (the
# first returns the full sequence so the second can consume it), followed by
# dropout and a small dense head ending in a single sigmoid unit.
rnn_model = Sequential([
    Embedding(n_words, 50),
    #Bidirectional(LSTM(50, activation='relu', return_sequences=False)),
    LSTM(50, activation='relu', return_sequences=True),
    LSTM(20, activation='relu', return_sequences=False),
    Dropout(0.2),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Same loss, optimiser and metric as the baseline model.
rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

rnn_model.summary()

In [None]:
# model is *really* prone to overfitting, only a few epochs needed
epochs = 5
batch_size = 32

# Train the LSTM model on the same padded sequences as the baseline.
# This reuses the `callbacks` list defined in the baseline training cell and
# rebinds `history`, so the baseline's training curves are no longer available.
history = rnn_model.fit(training_padded,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=callbacks,
          validation_split=0.2,
          shuffle=True)

In [None]:
import matplotlib.pyplot as plt
# Training curves from the most recent fit: loss on the left panel,
# accuracy on the right, each with its validation counterpart.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

for panel, (train_key, val_key) in zip(ax, (('loss', 'val_loss'),
                                            ('accuracy', 'val_accuracy'))):
    panel.plot(history.history[train_key], label='Train')
    panel.plot(history.history[val_key], label='Validation')
    panel.set_xlabel('epochs')
    panel.set_ylabel(train_key)
    panel.legend(loc='best')

plt.show()

In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
import seaborn as sns

# Threshold the sigmoid outputs at 0.5 to get hard class predictions
# for the held-out split.
y_pred = rnn_model.predict(testing_padded) > 0.5

# Confusion matrix (normalize='true') followed by the headline metrics.
cm = confusion_matrix(y_test, y_pred, normalize='true')
print(cm)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')

In [None]:
# Submission code, disabled by wrapping it in a string literal (nothing below executes).
# NOTE(review): as written it predicts on `testing_padded`, which is built from the
# hold-out part of train.csv, not from the competition test set. The predictions
# would presumably not line up with the rows of sample_submission.csv. Confirm and
# switch to the real test data before enabling.
"""
y_pred = model.predict(testing_padded)
y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0
y_pred

sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
sub['target'] = y_pred.round(0).astype('int')
sub.to_csv('submission.csv', index=False)"""