In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file under the Kaggle input directory,
# then print them one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

In [None]:
# https://www.kaggle.com/c/nlp-getting-started : NLP Disaster Tweets
# Name the two CSV locations first, then load each into its own frame.
train_csv_path = "/kaggle/input/nlp-getting-started/train.csv"
test_csv_path = "/kaggle/input/nlp-getting-started/test.csv"
df = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
# (rows, columns) of the training frame and of the test frame.
df.shape , df_test.shape

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Summary of the test frame's columns.
df_test.info()

In [None]:
# Class balance of the training labels: 1 = Disaster, 0 = No Disaster.
for label in (1, 0):
    print((df["target"] == label).sum())

In [None]:
# Preprocessing
import re
import string

def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` link removed.

    A link runs from its prefix up to the next whitespace character; the
    surrounding whitespace is left untouched.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translate table that maps a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)

# Display the characters that remove_punct strips.
string.punctuation

In [None]:
# Sanity-check URL removal on the first training tweet that contains a link.
# The pattern is the same one remove_URL applies and has no capturing group,
# so findall() returns the complete URLs instead of only a captured fragment.
pattern = re.compile(r"https?://\S+|www\.\S+")
for t in df.text:
    matches = pattern.findall(t)
    if matches:
        print(t)                   # tweet as loaded
        print(matches)             # every URL found in it
        print(pattern.sub("", t))  # tweet after the URLs are removed
        break

In [None]:
# Sanity-check URL removal on the first test tweet that contains a link.
# The pattern is the same one remove_URL applies and has no capturing group,
# so findall() returns the complete URLs instead of only a captured fragment.
pattern = re.compile(r"https?://\S+|www\.\S+")
for t in df_test.text:
    matches = pattern.findall(t)
    if matches:
        print(t)                   # tweet as loaded
        print(matches)             # every URL found in it
        print(pattern.sub("", t))  # tweet after the URLs are removed
        break

In [None]:
# Strip links first, then punctuation, from every training tweet.
df["text"] = df["text"].map(remove_URL).map(remove_punct)
df["text"]

In [None]:
# Strip links first, then punctuation, from every test tweet.
df_test["text"] = df_test["text"].map(remove_URL).map(remove_punct)
df_test["text"]

In [None]:
# remove stopwords
# pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them
# as the result of a search query.
# The English list from NLTK is stored as a set; remove_stopwords tests each word against it.
stop = set(stopwords.words("english"))

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text, stop_words=None):
    """Lowercase ``text`` and drop every whitespace-separated token that is a stop word.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    stop_words : collection of str, optional
        Lowercase words to drop. When omitted, the notebook-level ``stop`` set
        is used, which is the original behaviour.

    Returns
    -------
    str
        The surviving tokens, lowercased and joined by single spaces.
    """
    if stop_words is None:
        # Fall back to the notebook-level stop-word set defined before this function.
        stop_words = stop
    # Lowercase each token once, then filter on the lowercased form.
    lowered = (word.lower() for word in text.split())
    return " ".join(word for word in lowered if word not in stop_words)

In [None]:
# Display the stop-word set.
stop

In [None]:
# Lowercase the training tweets and drop their stop words.
df["text"] = df["text"].apply(remove_stopwords)

In [None]:
# Lowercase the test tweets and drop their stop words.
df_test["text"] = df_test["text"].apply(remove_stopwords)

In [None]:
# Test tweets after URL, punctuation and stop-word removal.
df_test.text

In [None]:
# Training tweets after URL, punctuation and stop-word removal.
df.text

In [None]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    """Tally how often each whitespace-separated token occurs across ``text_col``.

    ``text_col`` is a column of strings exposing ``.values``; the result is a
    ``Counter`` mapping token -> number of occurrences.
    """
    return Counter(
        token
        for entry in text_col.values
        for token in entry.split()
    )


# Word frequencies over every row of df (the train/validation split happens later).
counter = counter_word(df.text)

In [None]:
# Number of distinct words counted.
len(counter)

In [None]:
# Peek at the 20 most frequent words instead of echoing the whole Counter,
# whose repr holds one entry per distinct word and floods the cell output.
counter.most_common(20)

In [None]:
# The five most frequent words with their counts.
counter.most_common(5)

In [None]:
# Vocabulary size; passed later to the Tokenizer (num_words) and to the Embedding layer.
num_unique_words = len(counter)
num_unique_words

In [None]:
# Split dataset into training and validation set.
# The first 90% of rows go to training and the remaining rows to validation
# (positional split in file order, no shuffling).
train_size = int(len(df) * 0.9)
train_df, val_df = df.iloc[:train_size], df.iloc[train_size:]

# Pull text and labels out of each part as NumPy arrays.
train_sentences, train_labels = train_df["text"].to_numpy(), train_df["target"].to_numpy()
val_sentences, val_labels = val_df["text"].to_numpy(), val_df["target"].to_numpy()

In [None]:
# Test tweets as a NumPy array, mirroring the train/validation arrays.
test_sentences = df_test.text.to_numpy()

In [None]:
# Summary of the training frame's columns.
df.info()

In [None]:
# Sizes of the training and validation sentence arrays.
train_sentences.shape, val_sentences.shape

In [None]:
# Size of the test sentence array.
test_sentences.shape

In [None]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
# num_words is set to the unique-word count computed from df.
tokenizer = Tokenizer(num_words=num_unique_words)
# The vocabulary is learned from the training sentences only, so validation
# and test text never influence the word -> id mapping.
tokenizer.fit_on_texts(train_sentences) # fit only to training

In [None]:
# each word has unique index
# word_index maps word -> integer id; it is inverted later to decode sequences.
word_index = tokenizer.word_index

In [None]:
# Show only the first 20 (word, id) pairs; the full mapping has one entry per
# vocabulary word and would flood the cell output.
list(word_index.items())[:20]

In [None]:
# Encode the training and validation sentences as lists of word ids.
train_sequences, val_sequences = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (train_sentences, val_sentences)
)

In [None]:
# Encode the test tweets with the tokenizer fitted on the training sentences.
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [None]:
# Training examples 10-14: cleaned text first, then their id sequences.
print(train_sentences[10:15], train_sequences[10:15], sep="\n")

In [None]:
# Test examples 10-14: cleaned text first, then their id sequences.
print(test_sentences[10:15], test_sequences[10:15], sep="\n")

In [None]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 20

# One shared set of padding options: pad and truncate at the end of each
# sequence so every row comes out exactly max_length ids long.
pad_options = dict(maxlen=max_length, padding="post", truncating="post")
train_padded = pad_sequences(train_sequences, **pad_options)
val_padded = pad_sequences(val_sequences, **pad_options)
train_padded.shape, val_padded.shape

In [None]:
# Pad/truncate the test sequences with the same settings used for train and validation.
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")

In [None]:
# Padded id vector of training example 10.
train_padded[10]

In [None]:
# Padded id vector of test example 10.
test_padded[10]

In [None]:
# Training example 10 at each stage: cleaned text, word ids, padded ids.
for stage in (train_sentences, train_sequences, train_padded):
    print(stage[10])

In [None]:
# Test example 10 at each stage: cleaned text, word ids, padded ids.
for stage in (test_sentences, test_sequences, test_padded):
    print(stage[10])

In [None]:
# Check reversing the indices

# Invert word_index (word -> id) into an id -> word lookup.
reverse_word_index = {idx: word for word, idx in word_index.items()}

In [None]:
# Show only the first 20 (id, word) pairs; the full mapping has one entry per
# vocabulary word and would flood the cell output.
list(reverse_word_index.items())[:20]

In [None]:
def decode(sequence):
    """Turn a sequence of word ids back into a space-separated string.

    Ids are looked up in ``reverse_word_index``; an id with no entry is
    rendered as "?".
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in sequence)
    return " ".join(words)

In [None]:
# Round-trip check: ids of training example 10 and the words they decode to.
sample_ids = train_sequences[10]
decoded_text = decode(sample_ids)
print(sample_ids, decoded_text, sep="\n")

In [None]:
# Round-trip check: ids of test example 10 and the words they decode to.
sample_ids = test_sequences[10]
decoded_text = decode(sample_ids)
print(sample_ids, decoded_text, sep="\n")

In [None]:
# Create LSTM model
from tensorflow.keras import layers

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have 
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a 
# dense vector of floating point values (the length of the vector is a parameter you specify).

# Declare the whole stack in one Sequential([...]) call.
#  * Embedding takes an integer matrix of size (batch, input_length); every
#    word index fed in must be smaller than num_unique_words (the vocabulary
#    size). Its output shape is (None, input_length, 32), where `None` is the
#    batch dimension.
#  * LSTM with 64 units and dropout=0.1.
#  * Dense layer with a single sigmoid unit for the binary target.
model = keras.models.Sequential([
    layers.Embedding(num_unique_words, 32, input_length=max_length),
    layers.LSTM(64, dropout=0.1),
    layers.Dense(1, activation="sigmoid"),
])

model.summary()

In [None]:
# The model ends in a sigmoid, so it outputs probabilities rather than logits.
loss = keras.losses.BinaryCrossentropy(from_logits=False)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is not accepted by recent Keras releases.
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [None]:
# Disabled: the model is rebuilt and trained with early stopping in the next cell.
#model.fit(train_padded, train_labels, epochs=20, validation_data=(val_padded, val_labels), verbose=2)

In [None]:
# Rebuild the network so training starts from freshly initialised weights.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(num_unique_words, 32, input_length=max_length),
    tf.keras.layers.LSTM(64, dropout=0.1),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

batch_size = 50
max_epochs = 20

# Stop once val_loss has failed to improve for 3 consecutive epochs and roll
# the model back to the weights of its best epoch. Without
# restore_best_weights the model would keep the weights of the final epoch,
# i.e. several epochs past its best validation loss.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

model.fit(
    train_padded,
    train_labels,
    batch_size=batch_size,
    epochs=max_epochs,
    callbacks=[early_stopping],
    validation_data=(val_padded, val_labels),
    verbose=2,
)

In [None]:
# Threshold the model's outputs for the training rows at 0.5 and store the
# result as a plain list of 0/1 ints.
train_probabilities = model.predict(train_padded)
predictions = (train_probabilities > 0.5).astype(int).ravel().tolist()

In [None]:
# Training rows 10-19: cleaned text, true labels, predicted labels.
preview = slice(10, 20)
for column in (train_sentences, train_labels, predictions):
    print(column[preview])

In [None]:
# Threshold the model's outputs for the test rows at 0.5 and store the
# result as a plain list of 0/1 ints.
test_probabilities = model.predict(test_padded)
predictions = (test_probabilities > 0.5).astype(int).ravel().tolist()

In [None]:
# First test tweet and the label predicted for it.
print(test_sentences[0], predictions[0], sep="\n")

In [None]:
# Pair each test id with its predicted label.
submission_columns = {"id": df_test["id"], "target": predictions}
submission = pd.DataFrame(submission_columns)

In [None]:
# Write the submission file to the working directory, without the frame's index.
submission.to_csv(path_or_buf='submission.csv', index=False)