In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Objective
The aim is to develop a model for predicting whether a tweet is about a real disaster or not.

# Rationale

Inspired by the research article cited below:

Aryan Karnati, Shashank Reddy Boyapally, D. S. K. Natural Language Processing with Disaster tweets using Bidirectional LSTM. J. XI’AN Univ. Archit. Technol. 13, 1–6 (2021).

In [None]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import re
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import  train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import  pad_sequences
from tensorflow.keras.models import Model,Sequential
from nltk.stem import PorterStemmer
from keras.layers.embeddings import Embedding
from keras import Input
from keras.layers import Dense, Dropout, Embedding, LSTM, Flatten, Bidirectional,BatchNormalization
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Model
from keras.utils import plot_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from sklearn.model_selection import  train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report


# Data loading

In [None]:
# load train data
# Read the training CSV; the path is relative to the notebook's working directory.
train_data = pd.read_csv("../input/nlp-getting-started/train.csv")
# Preview the first rows as the cell output.
train_data.head()

In [None]:
# load test data
# Read the test CSV; the path is relative to the notebook's working directory.
test_data = pd.read_csv("../input/nlp-getting-started/test.csv")
# Preview the first rows as the cell output.
test_data.head()

In [None]:
# Show the dtype pandas inferred for each column of the test frame.
test_data.dtypes

# Data cleansing

In [None]:
# Checking if any duplicate records are present in both test and train data.
# NOTE(review): duplicated() without `subset` compares every column, so a row is only
# flagged when all of its values repeat. If `id` is unique per row (TODO confirm), this
# can never flag a repeated tweet; duplicated(subset=['text']) would be needed for that.
test_duplicate = test_data[test_data.duplicated()]
train_duplicate = train_data[train_data.duplicated()]

# Report both counts explicitly: a bare expression in the middle of a cell is not
# displayed, so previously only the train result was ever visible.
print('Duplicate rows -> train:', len(train_duplicate), '| test:', len(test_duplicate))
train_duplicate

No duplicate rows were found in either the train or the test dataset, so we proceed with further cleansing of the data.

In [None]:
# Drop the `location` column from both frames; no later cell in this notebook reads it.
train_data = train_data.drop(columns=['location'])
test_data = test_data.drop(columns=['location'])

In [None]:
# Patterns are compiled once at definition time instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)


def cleanhtml(raw_html):
    """Strip HTML tags and URLs from a piece of text.

    The cell that applies this function states that it removes both URLs and
    HTML tags, so links are removed here as well as markup.

    Parameters
    ----------
    raw_html : str
        Text that may contain ``<...>`` tags and ``http(s)://`` or ``www.`` links.

    Returns
    -------
    str
        The text with every tag and link removed. Surrounding whitespace is left
        as-is; later cleaning steps normalise it.
    """
    # Tags go first so a link inside an attribute (href="http://...") disappears
    # with its tag instead of the URL pattern swallowing the closing `">`.
    without_tags = _HTML_TAG_RE.sub('', raw_html)
    return _URL_RE.sub('', without_tags)

In [None]:
# Build the `clean_text` column by passing every raw tweet through cleanhtml
for frame in (train_data, test_data):
    frame['clean_text'] = frame['text'].apply(cleanhtml)



In [None]:
def cleansing_data(text):
    """Keep only letters, spaces, newlines and dots, lower-case, and collapse whitespace.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        Lower-cased text in which every character other than ``a-z``, ``A-Z``,
        space, newline and ``.`` has been removed, and runs of whitespace are
        collapsed to single spaces with no leading or trailing space.
    """
    # Raw string: '\.' in a plain literal is an invalid escape sequence, and inside
    # a character class the dot is already literal, so no backslash is needed.
    kept = re.sub(r'[^a-zA-Z \n.]', '', text)
    # split() with no argument splits on any whitespace run and drops empty pieces.
    return ' '.join(kept.lower().split())

In [None]:
# Remove special characters and normalise case/whitespace in `clean_text`
for frame in (test_data, train_data):
    frame['clean_text'] = frame['clean_text'].apply(cleansing_data)



In [None]:
# Filled on first use so the stop-word list is loaded once, not once per word.
_STOP_WORDS_CACHE = None


def filter_text(text, stemmer=None, stop_words=None):
    """Drop stop words from ``text`` and stem the remaining tokens.

    Parameters
    ----------
    text : str
        Whitespace-separated text.
    stemmer : object, optional
        Any object with a ``stem(word)`` method. Defaults to the notebook-level
        ``ps`` stemmer, which must be defined before the first call.
    stop_words : collection of str, optional
        Words to discard. Defaults to ``stopwords.words('english')``, converted
        to a frozenset and cached after the first call.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    global _STOP_WORDS_CACHE
    if stop_words is None:
        if _STOP_WORDS_CACHE is None:
            # A set gives O(1) membership tests; the original rebuilt the list and
            # scanned it linearly for every single word.
            _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
        stop_words = _STOP_WORDS_CACHE
    if stemmer is None:
        stemmer = ps
    return ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)

In [None]:
# Remove stop words and stem the remaining tokens (filter_text reads the global `ps`)
ps = PorterStemmer()

for frame in (train_data, test_data):
    frame['filtered_text'] = frame['clean_text'].apply(filter_text)


In [None]:
# Inputs are the filtered tweets; the label is the `target` column of the train frame.
x_train, x_test = train_data['filtered_text'], test_data['filtered_text']
y_train = train_data['target']

In [None]:
# Index range, padded sequence length and embedding width used by the cells below.
max_size, sent_len, embed_vector_len = 20000, 100, 100

# Fit the tokenizer on the training text only, then index both splits with it.
tokenizer = Tokenizer(num_words=max_size)
tokenizer.fit_on_texts(x_train)
words_to_index = tokenizer.word_index

seq_xtrain = tokenizer.texts_to_sequences(x_train)
seq_xtest = tokenizer.texts_to_sequences(x_test)


# One-Hot encoding on datasets

In [None]:
# Imported in this cell because it is only needed here.
from tensorflow.keras.preprocessing.text import hashing_trick

# one_hot() hashes words with Python's built-in hash(), which is salted per process,
# so the word -> index mapping changes on every kernel restart. hashing_trick with
# the md5 hash yields the same indices on every run. Each text becomes a list of
# integer indices in [1, max_size).
onehot_train = [hashing_trick(words, max_size, hash_function='md5') for words in x_train]
onehot_test = [hashing_trick(words, max_size, hash_function='md5') for words in x_test]

# Embedding representation 

Word embeddings are vector representations of words. They expose linear substructures between words while turning the text into a numeric form that the model can process. Usually, word embeddings are the weights of a hidden layer of the neural network, taken after the model has converged on its cost function.

In [None]:
# Pad the encoded training tweets at the front so each row holds `sent_len` indices
embedded_train = pad_sequences(onehot_train, maxlen=sent_len, padding='pre')
print(embedded_train)

In [None]:
# Pad the encoded test tweets at the front so each row holds `sent_len` indices
embedded_test = pad_sequences(onehot_test, maxlen=sent_len, padding='pre')
print(embedded_test)

In [None]:
# Materialise the padded sequences and labels as NumPy arrays.
# Note: `x_test`, `x_train` and `y_train` are rebound here and no longer refer to the text Series.
x = np.array(embedded_train)
y = np.array(train_data['target'])
x_test = np.array(embedded_test)

# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=45,
    shuffle=True,
)

x_test.shape

The **Global Vectors (GloVe)** algorithm is used to obtain the vector representation of words.
**Word2Vec** relates the target words to their respective context without carrying any additional information.
In comparison with Word2Vec, GloVe builds word embeddings in such a way that a combination of word vectors relates directly to the probability of those words’ co-occurrence in the corpus.

In [None]:
def read_glove_vector(glove_vec):
    """Load a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace; the first field is used as the
    word and the remaining fields are parsed as float64 vector components.

    Parameters
    ----------
    glove_vec : str
        Path to the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each word to a 1-D ``numpy.ndarray`` of dtype float64. If a word
        appears on more than one line, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    word_to_vec_map = {}
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        for line in f:
            w_line = line.split()
            if not w_line:
                # Blank line (e.g. a trailing newline): nothing to parse, and
                # indexing w_line[0] would raise IndexError.
                continue
            word_to_vec_map[w_line[0]] = np.array(w_line[1:], dtype=np.float64)
    return word_to_vec_map

In [None]:
# Parse the GloVe vectors file into a {word: vector} dictionary.
word_to_vec_map = read_glove_vector('../input/glove6b100dtxt/glove.6B.100d.txt')

# Length, in characters, of the longest raw tweet in the training set.
tweet_max_length = max(len(tweet) for tweet in train_data['text'])


# Model creation

In [None]:
# Embedding -> bidirectional LSTM -> batch norm -> small dense head with a sigmoid
# output for binary classification. The Embedding layer receives no pretrained
# weights here, so its vectors are learned from scratch during training.
model = Sequential()
model.add(Embedding(max_size, embed_vector_len, input_length=sent_len))
model.add(Bidirectional(LSTM(30, dropout=0.2)))
model.add(BatchNormalization())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
model.summary()

In [None]:
# Render the layer graph, including tensor shapes, to model.png
plot_model(model, to_file='model.png', show_shapes=True)

# Model training

In [None]:
# Fit on the training split for 10 epochs, scoring on the held-out validation split each epoch.
model_train = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_valid, y_valid),
)

# Model prediction and evaluation

In [None]:
# Sequential.predict_classes() was removed from tf.keras; for a single sigmoid output
# the documented replacement is to threshold the predicted probability at 0.5.
# ravel() turns the (n, 1) output into the 1-D label vector the metrics expect.
y_valid_pred = (model.predict(x_valid) > 0.5).astype('int32').ravel()

print(confusion_matrix(y_valid, y_valid_pred))
print('Accuracy Score :', accuracy_score(y_valid, y_valid_pred))
print('Report : ')
print(classification_report(y_valid, y_valid_pred))

In [None]:
#predicting values
# Sequential.predict_classes() was removed from tf.keras; threshold the sigmoid
# probabilities at 0.5 instead and flatten to one label per test row.
y_pred = (model.predict(x_test) > 0.5).astype('int32').flatten()

submission = pd.DataFrame({
    "id": test_data['id'],
    "target": y_pred,
})

# Write next to the notebook: the previous absolute Windows path pointed at one user's
# machine, does not exist in other environments, and exposed a local user name.
submission.to_csv("submission_tweet.csv", index=False)
submission.head(12)

