In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip install text-hammer
!pip install BeautifulSoup4

In [None]:
# import text_hammer as th
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras import layers, Input
from tensorflow.keras.optimizers import Adam, SGD
from keras.losses import BinaryCrossentropy
from keras.layers import Dense, LSTM, Dropout, Flatten, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

from sklearn.model_selection import train_test_split

from  matplotlib import pyplot as plt
import matplotlib.image as mpimg
import random

import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
%matplotlib inline

In [None]:
# Loading pretrained glove word embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [None]:
# Only the stop-word corpus is used by this notebook; calling nltk.download()
# without an argument opens the interactive downloader and blocks "Run All".
nltk.download("stopwords")

In [None]:
# Loading the word embeddings
def read_glove_vecs(path_to_glove_file="./glove.6B.50d.txt"):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector
    components.

    Args:
        path_to_glove_file: Path of the GloVe text file. Defaults to the
            50-dimensional file extracted into the working directory.

    Returns:
        A ``(word_to_index, index_to_word, embeddings_index)`` tuple:
        ``word_to_index`` maps token -> integer id, ``index_to_word`` is the
        inverse mapping and ``embeddings_index`` maps token -> float32 vector.
        Ids are assigned in file order starting at 1, so id 0 stays free for
        the padding / unknown-word value written by ``sentences_to_indices``
        (starting at 0 would make every padded slot look like the first word
        of the file).
    """
    embeddings_index = {}
    # Be explicit about the encoding so decoding does not depend on the locale.
    with open(path_to_glove_file, encoding="utf-8") as f:
        for line in f:
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                # Blank or vector-less line: nothing to load.
                continue
            word, coefs = parts
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

    word_to_index = {}
    index_to_word = {}
    for i, word in enumerate(embeddings_index, start=1):
        word_to_index[word] = i
        index_to_word[i] = word

    print("Found %s word vectors." % len(embeddings_index))
    return word_to_index, index_to_word, embeddings_index

In [None]:
# NOTE: despite its name, `embeddings_matrix` receives the third value returned
# by read_glove_vecs, which is a dict mapping word -> vector, not a 2-D array.
word_to_index, index_to_word, embeddings_matrix = read_glove_vecs()

In [None]:
# Locations of the competition CSV files.
TRAIN_PATH = "../input/nlp-getting-started/train.csv"
TEST_PATH = "../input/nlp-getting-started/test.csv"
SAMPLE_PATH = "../input/nlp-getting-started/sample_submission.csv"
# Number of token slots per tweet: passed as `max_len` to sentences_to_indices
# and as the input shape of the model.
MAX_LENGTH = 30

In [None]:
# Read only the columns used below: id/text/target for training, id/text for
# the competition test set and id/target for the submission template.
train_df = pd.read_csv(TRAIN_PATH, usecols=["id", "text", "target"])
test_df = pd.read_csv(TEST_PATH, usecols=["id", "text"])
sample_df = pd.read_csv(SAMPLE_PATH, usecols=["id", "target"])
print(train_df.head())
print(test_df.head())

In [None]:
# def clean_text_data(clean_df, colname):
#     clean_df[colname] = clean_df[colname].progress_apply(lambda x: str(x).lower())
# #     clean_df[colname] = clean_df[colname].progress_apply(lambda x: th.cont_expt(x))
# #     clean_df[colname] = clean_df[colname].progress_apply(lambda x: th.remove_emails(x))
#     clean_df[colname] = clean_df[colname].progress_apply(lambda x: th.remove_html_tags(x))
# #     clean_df[colname] = clean_df[colname].progress_apply(lambda x: th.remove_stopwords(x))
#     clean_df[colname] = clean_df[colname].progress_apply(lambda x: th.remove_special_chars(x))
#     clean_df[colname] = clean_df[colname].progress_apply(lambda x: th.remove_accented_chars(x))
# #     clean_df[colname] = clean_df[colname].progress_apply(lambda x: th.make_base(x))
#     return clean_df

In [None]:
def review_to_words(raw_review_df, colname):
    """Return a copy of ``raw_review_df`` whose ``colname`` text is cleaned.

    Every entry of the column goes through the same steps:
      1. strip HTML markup,
      2. replace every non-letter character with a space,
      3. lower-case and split into words,
      4. drop English stop words,
      5. join the remaining words with single spaces.

    Args:
        raw_review_df: DataFrame holding the raw text.
        colname: Name of the text column to clean.

    Returns:
        A new DataFrame with the cleaned column. The input frame is not
        modified, so the calling cell can be re-run safely, and the function
        works for any index (the previous row-by-row ``df[col][i] = ...``
        relied on chained assignment and on a 0..n-1 index).
    """
    # Build the stop-word set once instead of once per row.
    stops = set(stopwords.words("english"))

    def _clean(raw_text):
        """Apply the five cleaning steps to a single string."""
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        text = BeautifulSoup(raw_text, "html.parser").get_text()
        letters_only = re.sub("[^a-zA-Z]", " ", text)
        kept = [w for w in letters_only.lower().split() if w not in stops]
        return " ".join(kept)

    cleaned_df = raw_review_df.copy()
    cleaned_df[colname] = cleaned_df[colname].apply(_clean)
    return cleaned_df

In [None]:
# Clean the "text" column of the training tweets and preview the first rows.
train_clean_df = review_to_words(train_df, "text")
train_clean_df.head()

In [None]:
train_clean_df[train_clean_df["target"] == 1]

In [None]:
train_clean_df[train_clean_df["target"] == 0]

In [None]:
# Longest cleaned training tweet, measured in whitespace-separated words.
longest_tweet = max(len(tweet.split()) for tweet in train_clean_df.text)
print("max len of tweets", longest_tweet)

In [None]:
# Clean the "text" column of the competition test tweets and preview the first rows.
test_clean_df = review_to_words(test_df, "text")
test_clean_df.head()

In [None]:
# Hold out 30% of the labelled tweets for evaluation. Note that X_test / y_test
# are a slice of train.csv, not the competition test set.
# A fixed random_state keeps the split (and every metric derived from it)
# identical across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    np.array(train_clean_df["text"]),
    np.array(train_clean_df["target"]),
    test_size=0.3,
    shuffle=True,
    random_state=42,
)
print("X_train shape: " + str(X_train.shape))
print("X_test shape: " + str(X_test.shape))
print("y_train shape: " + str(y_train.shape))
print("y_test shape: " + str(y_test.shape))

In [None]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert sentences into a zero-padded matrix of word indices.

    Args:
        X: Sequence of sentence strings (NumPy array or list).
        word_to_index: Mapping from lower-case word to integer index.
        max_len: Number of columns of the result. Sentences with more words
            are truncated; shorter ones are right-padded with 0.

    Returns:
        ``np.int64`` array of shape ``(len(X), max_len)``. Words that are not
        in ``word_to_index`` are encoded as 0.
    """
    X_indices = np.zeros((len(X), max_len), dtype=np.int64)
    for i, sentence in enumerate(X):
        # Truncate first: writing past column max_len - 1 raises IndexError,
        # which the previous bare `except` re-triggered inside its own handler.
        sentence_words = sentence.lower().split()[:max_len]
        for j, w in enumerate(sentence_words):
            # Explicit lookup with a default instead of swallowing every exception.
            X_indices[i, j] = word_to_index.get(w, 0)
    return X_indices

In [None]:
# Encode both splits with the GloVe vocabulary, MAX_LENGTH ids per tweet,
# then print the first five encoded rows of each.
X_train_ind, X_test_ind = (
    sentences_to_indices(texts, word_to_index, max_len=MAX_LENGTH)
    for texts in (X_train, X_test)
)
print(X_train_ind[:5, :])
print(X_test_ind[:5, :])

In [None]:
X_train_ind.shape

In [None]:
# X_train_ind = sequence.pad_sequences(X_train_ind, maxlen = MAX_LENGTH, dtype="object", padding="post", truncating="post")
# X_test_ind = sequence.pad_sequences(X_test_ind, maxlen = MAX_LENGTH, dtype="object")

In [None]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Build a frozen Keras ``Embedding`` layer initialised from word vectors.

    Args:
        word_to_vec_map: Mapping word -> 1-D vector. Must contain the key
            ``"the"``, whose vector length sets the embedding dimension.
        word_to_index: Mapping word -> row index in the weight matrix.

    Returns:
        A non-trainable ``Embedding`` layer whose weight matrix has
        ``len(word_to_index) + 1`` rows; rows that no word maps to stay zero.
    """
    n_rows = len(word_to_index) + 1
    vec_dim = word_to_vec_map["the"].shape[0]

    # Start from zeros and copy each word's vector into its own row.
    weights = np.zeros((n_rows, vec_dim))
    for token in word_to_index:
        weights[word_to_index[token], :] = word_to_vec_map[token]

    frozen_layer = Embedding(n_rows, vec_dim, trainable=False)
    # The layer must be built before weights can be assigned to it.
    frozen_layer.build((None,))
    frozen_layer.set_weights([weights])
    return frozen_layer

In [None]:
def create_model(input_shape, word_to_vec_map, word_to_index):
    """Assemble the tweet classifier as a Keras ``Sequential`` model.

    Layer stack, in order: input -> pretrained embedding -> bidirectional
    LSTM(128, returning sequences) -> dropout 0.6 -> bidirectional LSTM(80)
    -> dropout 0.6 -> one sigmoid unit.

    Args:
        input_shape: Shape forwarded to ``Input``.
        word_to_vec_map: Mapping word -> vector, forwarded to
            ``pretrained_embedding_layer``.
        word_to_index: Mapping word -> index, forwarded to
            ``pretrained_embedding_layer``.

    Returns:
        The model, not yet compiled.
    """
    model = Sequential()
    stack = [
        Input(shape=input_shape),
        pretrained_embedding_layer(word_to_vec_map, word_to_index),
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.6),
        Bidirectional(LSTM(80)),
        Dropout(0.6),
        Dense(1, "sigmoid"),
    ]
    for layer in stack:
        model.add(layer)
    return model

In [None]:
# MAX_LENGTH becomes the model's input shape; `embeddings_matrix` is the
# word -> vector dict used to fill the embedding weights.
model = create_model(MAX_LENGTH, embeddings_matrix, word_to_index)

In [None]:
model.summary()

In [None]:
optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, name="Adam")
# The model's last layer is Dense(1, "sigmoid"), so the loss receives
# probabilities, not logits. from_logits=True would push them through a second
# sigmoid inside the loss and distort both the loss value and the gradients.
loss_fn = BinaryCrossentropy(from_logits=False)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
# Wrap the encoded training tweets in an int64 tensor and show its shape.
X_train_tensor = tf.convert_to_tensor(X_train_ind, dtype=np.int64)
X_train_tensor.shape

# **Note: the model object keeps its learned weights between calls to `fit`, so re-running the training cell continues from the previous run and starts at a higher accuracy. Re-create and re-compile the model to train from scratch.**

In [None]:
# Number of training epochs; also the length of the x-axis of the training-curve plots.
EPOCH = 20

In [None]:
# Train on the training split only; no validation data is passed to fit here.
history = model.fit(x=X_train_tensor, y=y_train, epochs=EPOCH)

In [None]:
# Wrap the encoded hold-out tweets in an int64 tensor and show its shape.
X_test_tensor = tf.convert_to_tensor(X_test_ind, dtype=np.int64)
X_test_tensor.shape

In [None]:
# Per-epoch training metrics recorded by model.fit.
training_log = history.history
loss, acc = training_log["loss"], training_log["accuracy"]

In [None]:
# Training loss per epoch.
epoch = np.arange(EPOCH)
plt.plot(epoch, loss)
# plt.plot(epoch, val_loss)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss')
# Only the training curve is drawn (the validation line is commented out),
# so the legend must not advertise a 'val' series.
plt.legend(['train']);

In [None]:
# Training accuracy per epoch.
epoch = np.arange(EPOCH)
plt.plot(epoch, acc)
plt.title('Training Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs');

In [None]:
# Score the model on the hold-out split: evaluate returns [loss, accuracy].
eval_score = model.evaluate(X_test_tensor, y_test)
test_loss, test_accuracy = eval_score[0], eval_score[1]
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

In [None]:
# Persist the trained model under this name in the current working directory.
model.save("DisasterTweetv10-BS4-re-nltk")

In [None]:
# Creating a zip of the model folder 
!tar -zcvf DisasterTweetv9.tar.gz /kaggle/working/DisasterTweetv6

In [None]:
# Creating submision file from here
test_clean_df.head()

In [None]:
# Encode the cleaned competition test tweets with the same vocabulary and
# sequence length that were used for the training data.
x_pred = sentences_to_indices(np.array(test_clean_df["text"]), word_to_index, max_len = MAX_LENGTH)
x_pred.shape

In [None]:
# Output of the model's final sigmoid unit for every competition test tweet.
predicted = model.predict(x_pred)
predicted

In [None]:
# Turn the sigmoid outputs into hard labels: 1 above 0.5, otherwise 0.
above_threshold = predicted > 0.5
y_predicted = np.where(above_threshold, 1, 0)
y_predicted

In [None]:
# Lay the labels out as a single row and take that row, giving a 1-D vector.
as_single_row = np.reshape(y_predicted, (1, len(y_predicted)))
y_predicted = as_single_row[0]
y_predicted

In [None]:
sample_df.head()

In [None]:
# Fill the submission template with the test ids and the predicted labels.
sample_df["id"] = test_clean_df["id"]
sample_df["target"] = y_predicted
# A bare expression in the middle of a cell is not rendered, so the preview is
# shown with display(); the shape stays the cell's final output.
display(sample_df.head())
sample_df.shape

In [None]:
# Write the submission file without the DataFrame index column.
sample_df.to_csv("submission3.csv", index=False)