Recently I have been learning about RNNs (Recurrent Neural Networks) and NLP (Natural Language Processing) through Andrew Ng's excellent "Sequence Models" course on Coursera ([link](https://www.coursera.org/learn/nlp-sequence-models)). I wanted to have a go at implementing a language model using this knowledge and TensorFlow v2.

I picked the "Real or Not? NLP with Disaster Tweets" ([link](https://www.kaggle.com/c/nlp-getting-started/overview)) getting-started competition for its straightforward task (label tweets as either reporting a disaster or not reporting a disaster) and the size of the dataset (large enough to contain enough information for the model, but not so large that it requires a lot of processing).

First things first then, let's load the libraries.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
from keras. utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Flatten
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import *
from keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
# Show the full text of every cell. `None` disables truncation; the old `-1`
# sentinel was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

## Load Data

Now I'll load the training dataset. 

In [None]:
# Read only the tweet text and its label from the competition training file.
train_csv_path = '/kaggle/input/nlp-getting-started/train.csv'
train_columns = {'text': str, 'target': np.int64}
df_train = pd.read_csv(
    train_csv_path,
    usecols=list(train_columns),
    dtype=train_columns,
)

# Number of training rows (shown as the cell output).
len(df_train)

In [None]:
# Peek at the first rows of the training data.
df_train.head()

In [None]:
# Read the tweet id (kept as a string) and text from the competition test file.
test_csv_path = '/kaggle/input/nlp-getting-started/test.csv'
test_columns = {'text': str, 'id': str}
df_test = pd.read_csv(
    test_csv_path,
    usecols=list(test_columns),
    dtype=test_columns,
)
df_test.head()

## Mislabelled examples
There are a number of examples in the training dataset that are mislabelled. The keyword can be used to find these.

Thanks to Dmitri Kalyaev, whose notebook showed me how to do this: https://www.kaggle.com/dmitri9149/transformer-svm-semantically-identical-tweets

In [None]:
# Row labels of a group of tweets whose labels are about to be corrected;
# display them for inspection first.
indices = [4415, 4400, 4399,4403,4397,4396, 4394,4414, 4393,4392,4404,4407,4420,4412,4408,4391,4405]
df_train.loc[indices]

In [None]:
# Relabel the rows currently held in `indices` as not-disaster (target 0).
df_train.loc[indices, 'target'] = 0

In [None]:
# Next group of rows to correct; display them for inspection.
indices = [6840,6834,6837,6841,6816,6828,6831]
df_train.loc[indices]

In [None]:
# Relabel the rows currently held in `indices` as not-disaster (target 0).
df_train.loc[indices, 'target'] = 0

In [None]:
# Next group of rows to correct; display them for inspection.
indices = [601,576,584,608,606,603,592,604,591, 587]
df_train.loc[indices]

In [None]:
# Relabel the rows currently held in `indices` as disaster (target 1).
df_train.loc[indices, 'target'] = 1

In [None]:
# Next group of rows to correct; display them for inspection.
# NOTE(review): 3136 and 3133 sit far from the 39xx range of every other
# entry in this list -- confirm they are intended and not typos.
indices = [3913,3914,3936,3921,3941,3937,3938,3136,3133,3930,3933,3924,3917]
df_train.loc[indices]

In [None]:
# Relabel the rows currently held in `indices` as not-disaster (target 0).
df_train.loc[indices, 'target'] = 0

In [None]:
# Next group of rows to correct; display them for inspection.
indices = [246,270,266,259,253,251,250,271]
df_train.loc[indices]

In [None]:
# Relabel the rows currently held in `indices` as not-disaster (target 0).
df_train.loc[indices, 'target'] = 0

In [None]:
# Next group of rows to correct; display them for inspection.
indices = [6119,6122,6123,6131,6160,6166,6167,6172,6212,6221,6230,6091,6108]
df_train.loc[indices]

In [None]:
# Relabel the rows currently held in `indices` as not-disaster (target 0).
df_train.loc[indices, 'target'] = 0

In [None]:
# Last group of rows to correct; display them for inspection.
indices = [7435,7460,7464,7466,7469,7475,7489,7495,7500,7525,7552,7572,7591,7599]
df_train.loc[indices]

In [None]:
# Relabel the rows currently held in `indices` as not-disaster (target 0).
df_train.loc[indices, 'target'] = 0

In [None]:
# (rows, columns) of the training frame after the label corrections.
df_train.shape

In [None]:
# Class balance of the corrected labels. The column is passed as `x=`
# because recent seaborn releases treat the first positional argument as
# `data`, not as the variable to count.
sns.countplot(x=df_train['target'])
plt.title('Counts of Target')
plt.show()

In [None]:
# Word count of each tweet (number of whitespace-separated tokens).
df_train['len_text'] = df_train['text'].str.split().apply(len)

In [None]:
# Check the newly added `len_text` column.
df_train.head()

In [None]:
# Side-by-side histograms of tweet word counts, one panel per class.
is_disaster = df_train['target'] == 1
is_not_disaster = df_train['target'] == 0
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

ax1.hist(df_train.loc[is_disaster, 'len_text'], color='red')
ax1.set_title('Disaster Tweet')
ax2.hist(df_train.loc[is_not_disaster, 'len_text'])
ax2.set_title('Not Disaster Tweet')
plt.show()

In [None]:
# Preprocessing



In [None]:
ps = PorterStemmer()

# Built once instead of once per token: the original called
# `stopwords.words('english')` inside the comprehension and scanned the
# resulting list for every word of every tweet.
_STOP_WORDS = frozenset(stopwords.words('english'))

# Patterns are compiled once because the function is applied row by row.
_URL_RE = re.compile(r'https?://\S+|www\.\S+|http?://\S+')
# `[^>]*` keeps each match inside a single tag. The previous greedy `<.*>`
# ran from the first '<' to the last '>' and so also deleted any ordinary
# text sitting between two tags.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_EMOJI_RE = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+"
)
_NON_LETTER_RE = re.compile('[^a-zA-Z]')


def preprocess_data(data):
    """Clean one tweet and return it as a space-joined string of stemmed tokens.

    Steps, in order: replace URLs, HTML tags, emoji and finally every
    character outside a-z/A-Z with a space; lower-case; split on whitespace;
    drop English stop words; Porter-stem what is left.

    Args:
        data: Raw tweet text.

    Returns:
        The cleaned text; an empty string if no token survives.
    """
    review = _URL_RE.sub(' ', data)
    review = _HTML_TAG_RE.sub(' ', review)
    review = _EMOJI_RE.sub(' ', review)
    review = _NON_LETTER_RE.sub(' ', review)
    tokens = review.lower().split()
    # Every token is purely alphabetic after the letters-only filter above,
    # so the original extra `isalpha()` check could never reject anything.
    return ' '.join(ps.stem(token) for token in tokens if token not in _STOP_WORDS)

In [None]:
# Store the cleaned version of every training tweet in a new column.
df_train['new_text']= df_train['text'].apply(preprocess_data)

In [None]:
# Apply the same cleaning to the test tweets.
df_test['new_text'] = df_test['text'].apply(preprocess_data)

In [None]:
# Now we have cleaned our data 

In [None]:
# Let's create word clouds of the common words in Disaster and Not Disaster texts

In [None]:
# Word cloud built from every cleaned tweet labelled as a disaster.
disaster_corpus = ' '.join(df_train.loc[df_train['target'] == 1, 'new_text'])
wc = WordCloud(background_color='white')
wc.generate(disaster_corpus)

plt.imshow(wc, interpolation='bilinear')
plt.title('Real Disaster')
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from every cleaned tweet labelled as not a disaster.
not_disaster_corpus = ' '.join(df_train.loc[df_train['target'] == 0, 'new_text'])
wc1 = WordCloud(background_color='white')
wc1.generate(not_disaster_corpus)

plt.imshow(wc1, interpolation='bilinear')
plt.title('Not Disaster')
plt.axis('off')
plt.show()

In [None]:
# Let's analyse the top 50 words of the Disaster and Not Disaster texts

In [None]:
# Cleaned tweet text, split by class.
disaster_tweet = df_train.loc[df_train['target'] == 1, 'new_text']
notdisaster_tweet = df_train.loc[df_train['target'] == 0, 'new_text']

In [None]:
# Flatten all disaster tweets into one token stream, then keep the 50 most
# frequent tokens with their counts.
series_disaster = pd.Series(' '.join(disaster_tweet).split())
series_disaster_top = series_disaster.value_counts().head(50)

In [None]:
# Bar chart of the 50 most frequent tokens in disaster tweets.
series_disaster_top.plot.bar(figsize=(20, 20))
plt.title('Disaster Tweet')
plt.show()

In [None]:
# Same token-frequency ranking for the not-disaster tweets: flatten to one
# token stream, then keep the 50 most frequent tokens.
not_disaster_tokens = pd.Series(' '.join(notdisaster_tweet).split())
series_not_disaster_top = not_disaster_tokens.value_counts().head(50)

In [None]:
# Bar chart of the 50 most frequent tokens in not-disaster tweets.
series_not_disaster_top.plot.bar(figsize=(20, 20))
plt.title('Not Disaster')
plt.show()

In [None]:
# Tokens that appear in the top-50 list of both classes.
common_words = set(series_disaster_top.index) & set(series_not_disaster_top.index)

In [None]:
def text_cleaning(data):
    """Return `data` without the whitespace-separated tokens found in `common_words`.

    The surviving tokens are re-joined with single spaces.
    """
    return ' '.join(token for token in data.split() if token not in common_words)

In [None]:
# Strip the tokens shared by both classes' top-50 lists from the cleaned
# text of the training and test sets alike.
df_train['new_text'] = df_train['new_text'].apply(text_cleaning)
df_test['new_text'] = df_test['new_text'].apply(text_cleaning)

In [None]:
# Convert the label column to integer category codes, then one-hot encode it.
df_train['target'] = df_train['target'].astype('category')
df_train['target'] = df_train['target'].cat.codes
# NOTE(review): the column is wrapped in a one-element list, so the encoded
# array appears to carry an extra leading axis of length 1 -- the later cells
# index it with `[0]` before slicing rows. Dropping the list wrapper would
# require changing those cells at the same time.
df_train_target = to_categorical([df_train['target']])

In [None]:
# Build the vocabulary from the cleaned training tweets, then encode each
# tweet as an integer sequence padded/truncated to a fixed length of 120.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['new_text'])
train_sequences = tokenizer.texts_to_sequences(df_train['new_text'])
df_train_text = pad_sequences(train_sequences, maxlen=120)

In [None]:
# Encode the test tweets with the tokenizer that was already fitted on the
# training text, so both sets share one word index. The previous version
# created a second tokenizer (`tokenizer1`) that was never used and re-fitted
# the training tokenizer on the same training text, which added nothing to
# the vocabulary; both steps are dropped.
test_sequences = tokenizer.texts_to_sequences(df_test['new_text'])
df_test_text = pad_sequences(test_sequences, maxlen=120)

In [None]:
# Hold out everything after the first 6500 rows for validation.
df_train_text_train = df_train_text[:6500]
df_train_text_test = df_train_text[6500:]
# `df_train_target` was built from a one-element list, so its first axis has
# length 1 and the rows live on the second axis. Index into it before slicing:
# slicing the outer axis directly returned the whole array for the train
# split and an empty array for the validation split.
df_train_target_train = df_train_target[0][:6500]
df_train_target_test = df_train_target[0][6500:]

In [None]:
# Training-split targets: take element 0 of the encoded array's first axis,
# then keep the first 6500 rows.
df_train_target_train = df_train_target[0][:6500]


In [None]:
# Validation-split targets: same indexing, keeping the rows from 6500 onwards.
df_train_target_test = df_train_target[0][6500:]

In [None]:
# Start building the deep learning model: an embedding layer that turns each
# of the 120 token ids of a padded tweet into a 120-dimensional vector.
# The vocabulary size is the tokenizer's word count plus one
# (presumably to leave room for the padding id 0 -- confirm).
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=120, input_length=120))

In [None]:
# Classifier on top of the embedding: dropout -> LSTM -> dropout -> dense,
# ending in a 2-unit output (one score per class).
model.add(Dropout(0.35))
model.add(LSTM(120))
model.add(Dropout(0.35))
model.add(Dense(32, activation='relu'))

model.add(Dense(2, activation='sigmoid'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# recent Keras releases no longer accept.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

In [None]:
# Stop once val_loss has failed to improve for 5 epochs and roll back to the
# best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1,
                           mode='min', restore_best_weights=True)
# Cut the learning rate tenfold after 2 stalled epochs. This patience has to
# be shorter than early stopping's: with both set to 5 the reduction would
# normally fire on the very epoch training is stopped, so the lower rate was
# never actually trained with.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2,
                              verbose=1, mode='min')

In [None]:
# Train for up to 30 epochs in batches of 64, validating on the held-out
# rows after each epoch; the callbacks may stop training earlier or lower
# the learning rate.
history = model.fit(
    x=df_train_text_train,
    y=df_train_target_train,
    validation_data=(df_train_text_test, df_train_target_test),
    epochs=30,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
)

In [None]:
# Model output for every test tweet (two scores per tweet, one per class);
# shown as the cell output.
pred = model.predict(df_test_text)
pred

In [None]:
# Predicted class = index of the larger score in each row. One vectorised
# call replaces the per-row Python loop and yields the same labels.
pred = np.argmax(pred, axis=1)

In [None]:
# Attach the predicted class to the test frame and display it.
df_test['target'] = pred
df_test

In [None]:
# Tabulate the metrics recorded in the training history and display them.
df = pd.DataFrame(history.history)
df

In [None]:
# Plot the training and validation curves, first for accuracy and then for
# loss, each in its own figure.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
# We see that stopping at epoch 4 is best for preventing overfitting

In [None]:
# Evaluate on the held-out rows; element 1 of the result is the accuracy
# metric requested at compile time (element 0 is the loss).
validation_scores = model.evaluate(df_train_text_test, df_train_target_test)
print('The accuracy of our model is {}'.format(validation_scores[1]))

In [None]:
# Model output for the rows used for training; shown as the cell output.
train_pred = model.predict(df_train_text_train)
train_pred

In [None]:
# Predicted class = index of the larger score in each row. One vectorised
# call replaces the per-row Python loop and yields the same labels.
train_pred = np.argmax(train_pred, axis=1)


In [None]:
# Confusion matrix on the training split: true labels of the first 6500 rows
# against the model's predictions for those rows.
confusion_matrix(df_train['target'][:6500], train_pred)

In [None]:
# Keep only the id and predicted target, write them to Submission.csv
# without the index column, and display the result.
submission = df_test[['id','target']]
submission.to_csv("Submission.csv",index=False)
submission