## Introduction

In this notebook, we develop a machine learning model that identifies whether a tweet is about a real disaster or not. We use a Bidirectional LSTM to build this model.

## Dataset

- `id` - a unique identifier for each tweet
- `text` - the text of the tweet
- `location` - the location the tweet was sent from (may be blank)
- `keyword` - a particular keyword from the tweet (may be blank)
- `target` - in train.csv only, this denotes whether a tweet is about a real disaster or not
        Real = 1
        Fake = 0

## Contents

- Introduction
- Dataset
- Importing Libraries
- Reading Dataset
- Data Pre-Processing
- Visualization
- Building Model
- Training Model
- Testing Data Pre-Processing
- Submission File
- Conclusion

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import keras
import nltk

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer

import re
import string
from tqdm import tqdm

from wordcloud import WordCloud

In [None]:
df = pd.read_csv('../input/nlp-getting-started/train.csv', index_col = 'id')

In [None]:
df.sample(5)

In [None]:
df.info()

## Data Pre-Processing

In [None]:
df.isnull().sum()

In [None]:
# Only the tweet text and the label are kept; location and keyword are discarded.
df = df.drop(columns=['location', 'keyword'])

In [None]:
# Class balance of the label column. The Series is passed through the `x=`
# keyword: in current seaborn releases the only positional parameter of
# countplot is `data`, so a positional Series is not read as the x variable.
sns.countplot(x=df['target']);

In [None]:
df.shape

In [None]:
# English stop-word list from NLTK; used further down by create_corpus and the word clouds.
stop_words = stopwords.words('english')
# Porter stemmer instance. NOTE(review): `ps` is not referenced anywhere else in the visible notebook.
ps = PorterStemmer()

Removing URLs

In [None]:
def remove_URL(text):
    """Return ``text`` with ``http(s)://...`` and ``www....`` links deleted."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
# Strip links from every tweet.
df['text'] = df['text'].apply(remove_URL)

Removing HTML tags

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag (shortest match) deleted."""
    return re.sub(r'<.*?>', r'', text)

In [None]:
# Strip HTML tags from every tweet.
df['text'] = df['text'].apply(remove_html)

Removing Emojis

In [None]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return ``text`` with every character in the listed code-point ranges deleted."""
    code_point_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        # NOTE(review): U+24C2..U+1F251 is a very wide span; besides emoji it
        # also covers non-emoji blocks such as CJK ideographs.
        u"\U000024C2-\U0001F251"
    )
    return re.sub("[" + code_point_ranges + "]+", r'', text, flags=re.UNICODE)

In [None]:
# Strip emoji from every tweet.
df['text'] = df['text'].apply(remove_emoji)

Removing Punctuation

In [None]:
def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    return "".join(ch for ch in text if ch not in string.punctuation)

In [None]:
# Strip ASCII punctuation from every tweet.
df['text'] = df['text'].apply(remove_punct)

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    """Return ``text`` with words the spell checker does not know replaced by its suggestion.

    Args:
        text: whitespace-separated string to correct.

    Returns:
        The words of ``text`` re-joined with single spaces, where each word
        reported by ``spell.unknown`` is swapped for ``spell.correction(word)``.
        A word for which no correction is available is kept unchanged.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() can return None when it has no candidate; fall back
            # to the original word so that " ".join() never receives None.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [None]:
# df['text']=df['text'].apply(lambda x : correct_spellings(x))

In [None]:
def create_corpus(df):
    """Tokenise every tweet in ``df['text']`` into lower-cased, alphabetic, non-stop-word tokens.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        A list with one entry per tweet; each entry is the list of kept tokens.
    """
    stop_set = set(stop_words)  # set membership instead of scanning the list for every token
    corpus = []
    for tweet in tqdm(df['text']):
        words = []
        for word in word_tokenize(tweet):
            lowered = word.lower()
            # The stop-word test uses the lower-cased token: checking the raw
            # token let capitalised stop words such as "The" slip through.
            if word.isalpha() and lowered not in stop_set:
                words.append(lowered)
        corpus.append(words)
    return corpus
        

In [None]:
corpus=create_corpus(df)

In [None]:
def preprocess(text):
    """Normalise a tweet: lower-case it and strip links, mentions, non-ASCII and most punctuation.

    Args:
        text: the tweet as a string.

    Returns:
        The cleaned tweet. ``.``, ``!`` and ``?`` are kept (runs collapsed to a
        single character); whitespace runs are collapsed to single spaces.
    """
    text = text.lower()
    # Remove hyperlinks. Only the URL token itself (\S+) is removed; the former
    # '.*' pattern also deleted everything that followed the link on the line.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # HTML entities. str.replace() is a literal replace, so the optional ';'
    # in '&amp;?' needs re.sub to match; spaces keep neighbouring words apart.
    text = re.sub(r'&amp;?', ' and ', text)
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    # Remove mentions.
    text = re.sub(r"@\w+", '', text)
    # Remove non-ASCII characters.
    text = text.encode("ascii", errors="ignore").decode()
    # Remove some punctuation (except . ! ?). The '-' is escaped: unescaped,
    # ',-/' is a character range that also contains '.'.
    text = re.sub(r'[:"#$%&*+,\-/;<=>@\\^_`{|}~]+', '', text)
    # Collapse repeated sentence punctuation to a single character.
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\.+', '.', text)
    # Drop apostrophes and parentheses.
    text = re.sub(r"['()]", '', text)

    text = " ".join(text.split())
    return text

# Run the combined cleaner over every tweet, then discard rows whose text became empty.
df['text'] = df['text'].apply(preprocess)
df = df.loc[df['text'].ne('')]

In [None]:
df['target'].value_counts()

## Visualization

Word Cloud for Real Disaster Tweets

In [None]:
# Word cloud built from the text of all tweets labelled 1.
plt.figure(figsize=(20,20))
wordCloud = WordCloud(max_words = 1000 , width = 1600 , height = 800 , stopwords = stop_words).generate(" ".join(df[df["target"] == 1]["text"]))
plt.imshow(wordCloud, interpolation = 'bilinear')
plt.title('Most frequent words in real disaster tweets (target = 1)')
plt.axis('off')  # the pixel coordinates of the rendered image carry no information
plt.show()

Word Cloud for Fake Disaster Tweets

In [None]:
# Word cloud built from the text of all tweets labelled 0.
plt.figure(figsize=(20,20))
wordCloud = WordCloud(max_words = 1000 , width = 1600 , height = 800 , stopwords = stop_words).generate(" ".join(df[df["target"] == 0]["text"]))
plt.imshow(wordCloud, interpolation = 'bilinear')
plt.title('Most frequent words in fake disaster tweets (target = 0)')
plt.axis('off')  # the pixel coordinates of the rendered image carry no information
plt.show()

In [None]:
X = df['text']
y = df['target']

In [None]:
# Settings for tokenisation and padding. vocab_size and embedding_dim are also
# passed to the Embedding layer; max_len, trunc_type and pad_type are reused
# when the test sequences are padded.
vocab_size = 40000
embedding_dim = 200
trunc_type = 'post'
pad_type = 'post'
max_len = 80

# Fit the tokenizer on the training tweets and turn each tweet into a sequence.
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Pad / truncate every training sequence to max_len.
X_paded = pad_sequences(X_sequences, truncating=trunc_type, padding=pad_type, maxlen=max_len)

In [None]:
# Bidirectional-LSTM classifier with a single sigmoid output, declared as one layer list.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    Bidirectional(LSTM(64)),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
history = model.fit(X_paded, y, batch_size = 32, validation_split = 0.1, epochs = 8)

## Testing

In [None]:
df_test = pd.read_csv("../input/nlp-getting-started/test.csv")

# Clean the test tweets with the same steps, in the same order, that were
# applied to the training text. Running preprocess() alone gave the model
# differently cleaned input at prediction time than at training time.
for clean_step in (remove_URL, remove_html, remove_emoji, remove_punct, preprocess):
    df_test['text'] = df_test['text'].apply(clean_step)

df_test.head()

In [None]:
test_id = df_test['id']

In [None]:
# Keep only the text column for prediction.
df_test = df_test.drop(columns=['id', 'location', 'keyword'])

In [None]:
df_test.head()

In [None]:
df_test.info()

In [None]:
X_test = df_test['text']

In [None]:
test_sequences = tokenizer.texts_to_sequences(X_test)
padded_test = pad_sequences(test_sequences,maxlen = max_len, truncating = trunc_type, padding=pad_type) 

In [None]:
# Sequential.predict_classes() is no longer available in current Keras
# releases. Thresholding the sigmoid output of predict() at 0.5 gives the
# class labels, one single-element row per tweet.
pred = (model.predict(padded_test) > 0.5).astype("int32")

pred

## Submission

In [None]:
# Take the first element of every prediction row to get a flat list of labels.
sub = [row[0] for row in pred]

In [None]:
submission = pd.DataFrame({'id':test_id, 'target':sub})
submission.shape

In [None]:
submission.head()

In [None]:
submission.to_csv('submission.csv',index=False)