## Aim
#### The aim of this notebook is to build an LSTM model that classifies tweets as disaster or non-disaster

#### So, let's start by importing the required libraries

In [None]:

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=Warning)
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

#### Let's import the Dataset

In [None]:
# Read the training CSV into `df` and the test CSV into `test`, in that order.
df, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

In [None]:
df.head()

In [None]:
df.shape, test.shape

#### Plotting the number of disaster and non-disaster tweets

In [None]:
# Name the column explicitly: newer seaborn releases treat the first positional
# argument as `data` rather than `x`, so a bare Series is no longer guaranteed
# to be counted per class. The keyword form works on old and new versions.
sns.countplot(x="target", data=df);

In [None]:
df["target"].value_counts(normalize = True) #normalized value counts

#### Creating a function for plotting a histogram (for length of tweets)

In [None]:
def len_plot(data, name):
  """Plot a histogram of the number of whitespace-separated words per text.

  Args:
    data: Iterable of strings.
    name: Title to put on the plot.
  """
  word_counts = list(map(lambda sent: len(sent.split()), data))
  plt.hist(word_counts)
  plt.title(name)

In [None]:
len_plot(df[df["target"]==0]["text"], "Not Disaster") #passing non disaster tweets

In [None]:
len_plot(df[df["target"]==1]["text"], "Disaster") #passing disaster tweets

#### Separating input and output features

In [None]:
X = df["text"]  # independent variable: the tweet text column
y = np.array(df["target"])  # dependent variable: the target column as a NumPy array

#### Function for finding the number of unique words in our dataset

In [None]:
def unique_words(text):
  """Collect the distinct lower-cased words that occur in ``text``.

  Words are obtained by splitting each string on whitespace; no punctuation
  is stripped.

  Args:
    text: Iterable of strings (one entry per tweet).

  Returns:
    list[str]: The lower-cased words in order of first appearance, each
    listed once.
  """
  # A dict gives constant-time membership checks while preserving insertion
  # order; testing membership against a growing list made this quadratic.
  seen = {}
  for sent in tqdm(text):
    for word in sent.split():
      seen.setdefault(word.lower(), None)
  return list(seen)
un_words = unique_words(X)

In [None]:
print("Total number of unique words :",len(un_words))

In [None]:
un_words[:50]

#### As it is a Twitter dataset, it contains many words starting with "@" and "#". Let's find these words

#### Words starting with "#"

In [None]:
SYMBOL1 = "#"
# Keep only the vocabulary entries whose first character is SYMBOL1.
words_sym1 = list(filter(lambda token: token.startswith(SYMBOL1), un_words))
len(words_sym1)

In [None]:
words_sym1[:50]

#### Words starting with "@"

In [None]:
SYMBOL2 = "@"
# Keep only the vocabulary entries whose first character is SYMBOL2.
words_sym2 = list(filter(lambda token: token.startswith(SYMBOL2), un_words))
len(words_sym2)

In [None]:
words_sym2[:50]

Since many of the words starting with "@" do not contribute to our model's accuracy, we need to remove them.

#### Function for removing URLs

In [None]:
def remove_urls(text):
    """Return ``text`` with every ``http://``, ``https://`` or ``www.`` URL deleted.

    A URL is taken to run up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

#### Lemmatizer

In [None]:
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

#### Preprocessing function

In [None]:
def preprocessing(text):
  """Clean a collection of tweets for tokenisation.

  Each tweet is lower-cased, stripped of URLs and "@" mentions, reduced to
  letters, digits, apostrophes and whitespace, split into words, filtered
  against the NLTK English stop-word list and lemmatized.

  Args:
    text: Iterable of raw tweet strings.

  Returns:
    list[str]: One cleaned, space-joined string per input tweet, in the
    same order.
  """
  # Build the stop-word set once; it is identical for every word, so
  # rebuilding it inside the comprehension was wasted work.
  stop_words = set(stopwords.words("english"))

  tweets = []
  for tweet in tqdm(text):
    tweet = tweet.lower() # converting to lower case
    tweet = remove_urls(tweet) # url removing
    tweet = re.sub(r'@\w+', '', tweet).strip() # removing the words that start with "@"
    # Keep every whitespace character (\s), not just " ": deleting newlines
    # and tabs would glue the neighbouring words into one token.
    tweet = re.sub(r"[^a-zA-Z0-9\s']", "", tweet) # removing unwanted symbols
    # lemmatization and stop-word removal
    kept = [wl.lemmatize(word) for word in tweet.split() if word not in stop_words]
    tweets.append(" ".join(kept))
  return tweets

tweets = preprocessing(X)

In [None]:
tweets[:50]

### LSTM

In [None]:
# importing libraries
import tensorflow as tf
tf.__version__

from tensorflow.keras.layers import (Embedding,
                                     LSTM,
                                     Dense,
                                     Dropout,
                                     GlobalMaxPool1D,
                                     BatchNormalization)
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

#### Encoding each tweet as a sequence of hashed integer word indices

In [None]:
from tensorflow.keras.preprocessing.text import hashing_trick

VOC_SIZE = 30000
# `one_hot` hashes words with Python's built-in `hash`, which is salted per
# process, so the same word gets a different index after a kernel restart.
# `hashing_trick` with the stable "md5" hash produces the same kind of
# integer sequences, reproducibly across runs.
onehot_repr = [hashing_trick(words, VOC_SIZE, hash_function="md5") for words in tweets]
onehot_repr[100:110]

In [None]:
# Number of encoded tokens in each tweet
word_length = [len(encoded) for encoded in onehot_repr]

len(word_length)

In [None]:
word_length[1100:1150]

In [None]:
# Histogram of tweet lengths (tokens per tweet) against number of tweets
_, length_ax = plt.subplots()
length_ax.hist(word_length)
length_ax.set_xlabel("Length of Words")
length_ax.set_ylabel("Nos")
plt.show()

In [None]:
max(word_length) # lenth of the longest tweet

#### Padding the Sequence

In [None]:
SENT_LENGTH = 15
# Bring every encoded tweet to SENT_LENGTH entries; shorter ones are padded at the end.
embedded_docs = pad_sequences(
    sequences=onehot_repr,
    maxlen=SENT_LENGTH,
    padding="post",
)
embedded_docs

#### Function for model creation

In [None]:
def create_model():
  """Build the uncompiled tweet classifier.

  Layer stack, in order: Embedding -> LSTM(100, returning the full
  sequence) -> GlobalMaxPool1D -> BatchNormalization -> Dropout(0.5) ->
  Dense(10, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

  Reads the notebook-level constants VOC_SIZE and SENT_LENGTH.

  Returns:
    The assembled ``Sequential`` model.
  """
  embedding_dim = 32  # size of each learned word vector
  layer_stack = [
      Embedding(VOC_SIZE, embedding_dim, input_length=SENT_LENGTH),
      LSTM(100, return_sequences=True),
      GlobalMaxPool1D(),
      BatchNormalization(),
      Dropout(0.5),
      Dense(10, activation="relu"),
      Dropout(0.2),
      Dense(1, activation="sigmoid"),
  ]
  return Sequential(layer_stack)


In [None]:
model = create_model()
# Binary classification set-up: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

#### Training the Model

In [None]:
# Train on the full padded training set; `history` keeps the per-epoch metrics for plotting.
history = model.fit(
    x=embedded_docs,
    y=y,
    epochs=6,
    batch_size=32,
)

#### Plotting the graph of model accuracy and loss

In [None]:
# Side-by-side training curves: accuracy on the left, loss on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for axis, metric, label in ((ax1, "accuracy", "Accuracy"), (ax2, "loss", "Loss")):
    axis.plot(history.history[metric])
    axis.set_title(label)
    axis.set_xlabel("Epochs")
    axis.set_ylabel(label)
plt.show()