# **Natural Language Processing with Disaster Tweets**
*Predict which Tweets are about real disasters and which ones are not*

### Competition Description
Twitter has become an important communication channel in times of emergency.
The ubiquitousness of smartphones enables people to announce an emergency they’re observing in real-time. Because of this, more agencies are interested in programmatically monitoring Twitter (e.g., disaster relief organizations and news agencies).

But, it’s not always clear whether a person’s words are actually announcing a disaster.  

In [None]:
#Data
import pandas as pd
import re
import numpy as np
import string

#visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tqdm

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,plot_confusion_matrix,plot_precision_recall_curve,plot_roc_curve,roc_curve,auc
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# TF-Keras
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM, GRU,SimpleRNN, Bidirectional
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim


In [None]:
# Enable on-demand GPU memory growth on every visible GPU to avoid OOM errors.
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

In [None]:
# Load the competition CSVs: `train` carries the `target` column used below;
# `test` is the set for which predictions must be produced.
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
train.head(5)

In [None]:
test.head(5)

# For each ID in the test set, you must predict 1 if the tweet is describing a real disaster, and 0 otherwise.

In [None]:
train.info()

In [None]:
train.describe().transpose()

In [None]:
train.isna().sum()

In [None]:
# Keep only the tweet text and the label; the other three columns are not used by any model below.
train = train.drop(columns=['id', 'keyword', 'location'])

In [None]:
# Drop ROWS that still contain a missing value. The previous axis=1 dropped
# whole COLUMNS instead, so a single missing tweet would have removed the
# entire `text` column.
train.dropna(axis=0, inplace=True)

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(x='target',data=train)

In [None]:
def clean_text(text):
    '''Normalise a tweet before tokenisation.

    The input is coerced to ``str`` and lower-cased, then the following are
    removed in order: text in square brackets, links, HTML-like tags, ASCII
    punctuation, and any word containing a digit. Line breaks are turned into
    single spaces so the words on either side are not fused together.

    Args:
        text: Raw tweet; non-string values are converted with ``str()``.

    Returns:
        The cleaned, lower-cased string. Repeated spaces may remain where
        tokens were removed.
    '''
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being treated
    # as invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)                # [bracketed] text
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # links
    text = re.sub(r'<.*?>+', '', text)                 # HTML-like tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', ' ')                     # keep words apart across line breaks
    text = re.sub(r'\w*\d\w*', '', text)               # words containing digits
    return text

In [None]:
# Clean every tweet in place; the function is passed directly, no wrapper lambda needed.
train['text'] = train['text'].apply(clean_text)

In [None]:
train['text'][10]

In [None]:
# English stop words and stemmer used by removing_stop_word().
# Stored as a set: the only use is a membership test per token, which is
# O(1) on a set versus a linear scan on the original list.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

In [None]:
def removing_stop_word(text, stem=False):
    '''Drop stop words from whitespace-separated text, optionally stemming.

    Tokens found in the module-level ``stop_words`` are discarded. When
    ``stem`` is true, each remaining token is passed through the module-level
    ``stemmer``. The kept tokens are re-joined with single spaces.
    '''
    kept = (word for word in text.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

In [None]:
# Strip stop words from every cleaned tweet (stemming stays off, the default).
train['text'] = train['text'].apply(removing_stop_word)

In [None]:
train['text'][10]

In [None]:
cv = CountVectorizer(stop_words='english')

In [None]:
# Most frequent words in tweets labelled 0 (not a real disaster).
top_n = 20  # one value drives both the heading and the slice so they cannot disagree
matrix = cv.fit_transform(train[train['target']==0]['text'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when this scikit-learn version provides it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
freqs = zip(feature_names, matrix.sum(axis=0).tolist()[0])
# sort from largest to smallest
print(f"Top {top_n} words used in non-disaster tweets (target = 0).")
print(sorted(freqs, key=lambda x: -x[1])[:top_n])

In [None]:
# Most frequent words in tweets labelled 1 (real disaster).
top_n = 20  # one value drives both the heading and the slice so they cannot disagree
matrix = cv.fit_transform(train[train['target']==1]['text'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when this scikit-learn version provides it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
freqs = zip(feature_names, matrix.sum(axis=0).tolist()[0])
# sort from largest to smallest
print(f"Top {top_n} words used in disaster tweets (target = 1).")
print(sorted(freqs, key=lambda x: -x[1])[:top_n])

In [None]:
# Largest number of words in any cleaned tweet; this guides the padding length chosen later.
train['text'].str.split().str.len().max()

In [None]:
# Hold out 30% of the labelled tweets for evaluation; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(train, random_state=42, test_size=0.3)
print("TRAIN size:", df_train.shape[0])
print("TEST size:", df_test.shape[0])

In [None]:
def roc_auc(predictions,target):
    '''
    Return the area under the ROC curve.

    `target` (true labels) and `predictions` (scores) are passed to
    `roc_curve`; the false/true positive rates it returns are then
    integrated with `auc`.
    '''
    false_pos_rate, true_pos_rate, _ = roc_curve(target, predictions)
    return auc(false_pos_rate, true_pos_rate)

In [None]:
scores_model = []

# Model-Building
## First, I will use a simple text-classification technique: Multinomial Naive Bayes.

## --> Naive Bayes 

In [None]:
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Features are the cleaned tweet text; labels are the binary `target` column.
X_train, y_train = df_train['text'], df_train['target']
X_test, y_test = df_test['text'], df_test['target']

In [None]:
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_tfidf,y_train)

In [None]:
# Evaluate Naive Bayes on the held-out split: classification report plus confusion matrix.
preds = nb.predict(X_test_tfidf)
print(classification_report(y_test,preds))
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version, or migrate to
# ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(nb,X_test_tfidf,y_test)

In [None]:
# Hard 0/1 predictions, kept under the name `scores` because later cells
# (the accuracy_score call) read it.
scores = nb.predict(X_test_tfidf)
# ROC AUC needs a ranking score rather than thresholded labels, so score the
# curve with the predicted probability of class 1.
nb_proba = nb.predict_proba(X_test_tfidf)[:, 1]
# AUC is a fraction in [0, 1]; the old "%.2f%%" format printed it with a
# misleading percent sign.
print("Auc: %.4f" % roc_auc(nb_proba, y_test))

In [None]:
# Record the AUC computed from class-1 probabilities rather than hard labels,
# so it is comparable with the neural models, which are scored on their
# sigmoid outputs.
scores_model.append({'Model': 'Multinomial Naive Bayes','AUC_Score': roc_auc(nb.predict_proba(X_test_tfidf)[:, 1],y_test)})

In [None]:
plot_roc_curve(nb,X_test_tfidf,y_test)

In [None]:
accuracy_score(y_test,scores)

# Deep Learning

## --->Simple RNN 

In [None]:
### Vocabulary size
voc_size=5000

In [None]:
# Keras tokenizer with no cap on vocabulary size (num_words=None).
token = Tokenizer(num_words=None)
max_len = 35

# Fit on the training split only, so the vocabulary is not built from the
# held-out tweets. This matches the tokenizer fitted for the Word2Vec models
# further down, which also sees only df_train. With no oov_token configured,
# held-out words missing from the vocabulary are skipped by texts_to_sequences.
token.fit_on_texts(list(X_train))
X_train_seq = token.texts_to_sequences(X_train)
X_valid_seq = token.texts_to_sequences(X_test)

#zero pad the sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_valid_pad = pad_sequences(X_valid_seq, maxlen=max_len)

# word -> integer index mapping; its size sets the Embedding input dimension
word_index = token.word_index

In [None]:
# SimpleRNN baseline: embeddings learned from scratch (no pretrained vectors),
# one recurrent layer and a single sigmoid output unit.
SRNN_model = Sequential([
    Embedding(len(word_index) + 1, 300, input_length=max_len),
    SimpleRNN(100),
    Dense(1, activation='sigmoid'),
])
SRNN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

SRNN_model.summary()

In [None]:
SRNN_model.fit(X_train_pad, y_train, epochs=20, batch_size=64)

In [None]:
# Sigmoid outputs of the SimpleRNN for the held-out tweets.
scores = SRNN_model.predict(X_valid_pad)
# AUC is a fraction in [0, 1]; the old "%.2f%%" format printed it with a
# misleading percent sign.
print("Auc: %.4f" % roc_auc(scores, y_test))

In [None]:
scores_model.append({'Model': 'Simple RNN','AUC_Score': roc_auc(scores,y_test)})

## Word2Vec
### First, I will create an embedding matrix using Word2Vec

In [None]:
documents = [t.split() for t in df_train.text]
documents[:3]

In [None]:
w2v_model = gensim.models.Word2Vec(vector_size=300,window=7, min_count=10, workers=8)

In [None]:
w2v_model.build_vocab(documents)

In [None]:
words = w2v_model.wv.key_to_index.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

In [None]:
w2v_model.train(documents, total_examples=len(documents), epochs=32)

In [None]:
w2v_model.wv.most_similar('fire')

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

In [None]:
# Convert both splits to integer sequences, then pad/truncate each to 30 tokens.
train_sequences = tokenizer.texts_to_sequences(df_train.text)
test_sequences = tokenizer.texts_to_sequences(df_test.text)
x_train = pad_sequences(train_sequences, maxlen=30)
x_test = pad_sequences(test_sequences, maxlen=30)

In [None]:
# Fit the label encoder on the training targets, then encode both splits and
# reshape each result into a single-column array.
encoder = LabelEncoder()
encoder.fit(df_train.target.tolist())

y_train = encoder.transform(df_train.target.tolist()).reshape(-1, 1)
y_test = encoder.transform(df_test.target.tolist()).reshape(-1, 1)

print("y_train",y_train.shape)
print("y_test",y_test.shape)

In [None]:
# Row i holds the Word2Vec vector of the word whose tokenizer index is i.
# Words the Word2Vec model does not know keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
wv = w2v_model.wv
for term, row in tokenizer.word_index.items():
    if term not in wv:
        continue
    embedding_matrix[row] = wv[term]
print(embedding_matrix.shape)

## --> LSTM 

In [None]:
# LSTM classifier on top of the frozen (trainable=False) Word2Vec embedding matrix.
LSTM_model = Sequential([
    Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=30, trainable=False),
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

LSTM_model.summary()

In [None]:
LSTM_model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [None]:
# Training callbacks shared by the LSTM / GRU / bidirectional fits:
#  - halve-on-plateau style LR reduction driven by validation loss;
#  - early stopping driven by validation accuracy.
# The models are compiled with metrics=['accuracy'], so Keras logs the
# validation metric as 'val_accuracy' (the same key the history plots read).
# The previous monitor, 'val_acc', is never logged, so early stopping could
# not trigger. mode='max' states explicitly that higher accuracy is better.
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_accuracy', mode='max', min_delta=1e-4, patience=5)]

In [None]:
history = LSTM_model.fit(x_train, y_train,
                    batch_size=1024,
                    epochs=20,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

In [None]:
# evaluate() returns [loss, accuracy] for the compiled loss and metric.
score = LSTM_model.evaluate(x_test, y_test, batch_size=1024)
test_loss, test_accuracy = score[0], score[1]
print()
print("ACCURACY:",test_accuracy)
print("LOSS:",test_loss)

In [None]:
# Per-epoch training/validation curves recorded by the last fit() call.
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']

epochs = range(len(acc))

# (title, training series, training label, validation series, validation label)
panels = [
    ('Training and validation accuracy', acc, 'Training acc', val_acc, 'Validation acc'),
    ('Training and validation loss', loss, 'Training loss', val_loss, 'Validation loss'),
]
for panel_no, (title, train_curve, train_label, val_curve, val_label) in enumerate(panels):
    if panel_no > 0:
        plt.figure()  # every chart after the first gets its own figure
    plt.plot(epochs, train_curve, 'b', label=train_label)
    plt.plot(epochs, val_curve, 'r', label=val_label)
    plt.title(title)
    plt.legend()

plt.show()

In [None]:
# Sigmoid outputs of the LSTM for the held-out tweets.
scores = LSTM_model.predict(x_test)
# AUC is a fraction in [0, 1]; the old "%.2f%%" format printed it with a
# misleading percent sign.
print("Auc: %.4f" % roc_auc(scores, y_test))

In [None]:
scores_model.append({'Model': 'LSTM','AUC_Score': roc_auc(scores,y_test)})

## --> GRU 

In [None]:
# GRU classifier on top of the frozen (trainable=False) Word2Vec embedding matrix.
GRU_model = Sequential([
    Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=30, trainable=False),
    Dropout(0.3),
    GRU(300),
    Dense(1, activation='sigmoid'),
])

GRU_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

GRU_model.summary()

In [None]:
# NOTE(review): GRU_model is already compiled with these same settings in the
# cell that builds it, so this second compile looks redundant.
GRU_model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [None]:
history = GRU_model.fit(x_train, y_train,
                    batch_size=1024,
                    epochs=20,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

In [None]:
# Per-epoch training/validation curves recorded by the last fit() call.
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']

epochs = range(len(acc))

# (title, training series, training label, validation series, validation label)
panels = [
    ('Training and validation accuracy', acc, 'Training acc', val_acc, 'Validation acc'),
    ('Training and validation loss', loss, 'Training loss', val_loss, 'Validation loss'),
]
for panel_no, (title, train_curve, train_label, val_curve, val_label) in enumerate(panels):
    if panel_no > 0:
        plt.figure()  # every chart after the first gets its own figure
    plt.plot(epochs, train_curve, 'b', label=train_label)
    plt.plot(epochs, val_curve, 'r', label=val_label)
    plt.title(title)
    plt.legend()

plt.show()

In [None]:
# Sigmoid outputs of the GRU for the held-out tweets.
scores = GRU_model.predict(x_test)
# AUC is a fraction in [0, 1]; the old "%.2f%%" format printed it with a
# misleading percent sign.
print("Auc: %.4f" % roc_auc(scores, y_test))

In [None]:
scores_model.append({'Model': 'GRU','AUC_Score': roc_auc(scores,y_test)})

## --> Bidirectional RNNs

In [None]:
# Bidirectional LSTM classifier on top of the frozen (trainable=False) Word2Vec embedding matrix.
BRNN_model = Sequential([
    Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=30, trainable=False),
    Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)),
    Dense(1, activation='sigmoid'),
])
BRNN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


BRNN_model.summary()

In [None]:
# NOTE(review): BRNN_model is already compiled with these same settings in the
# cell that builds it, so this second compile looks redundant.
BRNN_model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [None]:
history = BRNN_model.fit(x_train, y_train,
                    batch_size=1024,
                    epochs=20,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

In [None]:
# Per-epoch training/validation curves recorded by the last fit() call.
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']

epochs = range(len(acc))

# (title, training series, training label, validation series, validation label)
panels = [
    ('Training and validation accuracy', acc, 'Training acc', val_acc, 'Validation acc'),
    ('Training and validation loss', loss, 'Training loss', val_loss, 'Validation loss'),
]
for panel_no, (title, train_curve, train_label, val_curve, val_label) in enumerate(panels):
    if panel_no > 0:
        plt.figure()  # every chart after the first gets its own figure
    plt.plot(epochs, train_curve, 'b', label=train_label)
    plt.plot(epochs, val_curve, 'r', label=val_label)
    plt.title(title)
    plt.legend()

plt.show()

In [None]:
# Sigmoid outputs of the bidirectional LSTM for the held-out tweets.
scores = BRNN_model.predict(x_test)
# AUC is a fraction in [0, 1]; the old "%.2f%%" format printed it with a
# misleading percent sign.
print("Auc: %.4f" % roc_auc(scores, y_test))

In [None]:
scores_model.append({'Model': 'Bidirectional RNN','AUC_Score': roc_auc(scores,y_test)})

In [None]:
# Summary table of every model recorded in scores_model (Naive Bayes and the
# neural models), sorted by AUC with the best first and shaded by value.
results = pd.DataFrame(scores_model).sort_values(by='AUC_Score',ascending=False)
results.style.background_gradient(cmap='Blues')

**So, here the LSTM has a high AUC score. These are not very strong models, but this is just beginner-level work in NLP.**
References:
* 1)https://www.kaggle.com/code/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert#BERT-and-Its-Implementation-on-this-Competition
* 2)https://www.kaggle.com/code/paoloripamonti/twitter-sentiment-analysis

In [None]:
def predict(text):
    '''Classify a single raw tweet with the trained LSTM model.

    The text goes through the same cleaning and stop-word removal that was
    applied to the training tweets, is tokenised and padded to 30 tokens, and
    is then scored by ``LSTM_model``.

    Args:
        text: Raw tweet text.

    Returns:
        dict with ``"label"`` -- ``'POSITIVE'`` when the score is above 0.5
        (predicted real disaster), otherwise ``'NEGATIVE'`` -- and
        ``"score"``, the model's sigmoid output as a float.
    '''
    # Mirror the training-time preprocessing; the model never saw raw,
    # uncleaned text during training.
    cleaned = removing_stop_word(clean_text(text))
    # Tokenize text
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=30)
    # Predict
    score = LSTM_model.predict(padded)[0][0]
    # Decode the label. A plain two-way branch always binds `label`; the old
    # `if <= 0.5 / elif >= 0.5` pair left it unbound for a NaN score.
    label = 'POSITIVE' if score > 0.5 else 'NEGATIVE'
    return {"label": label, "score": float(score)}

In [None]:
predict("there was a fire in woods")