<h1>Natural Language Processing with Disaster Tweets</h1>

[https://www.kaggle.com/c/nlp-getting-started](https://www.kaggle.com/c/nlp-getting-started)

Author: Vaasudevan Srinivasan [(Portfolio)](https://vaasudevans.github.io) <br>
Created on: July 06, 2021 <br>




# Import all the Modules

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd
import numpy as np

# Show the installed TensorFlow version as this cell's output
tf.__version__

# Read and Visualize

In [None]:
# Load the labelled training tweets and shuffle every row (frac=1) with a
# fixed seed so the row order is the same on each run.
train_df = (
    pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
    .sample(frac=1, random_state=42)
)
train_df.head()

In [None]:
# Bar chart of how many rows carry each `target` value
train_df['target'].value_counts().plot(kind='bar');

In [None]:
# Hold out 10% of the tweets as a validation set; the split is seeded so it
# is identical on every run.
x_train, x_val, y_train, y_val = train_test_split(
    train_df['text'].to_numpy(),
    train_df['target'].to_numpy(),
    test_size=0.1,
    random_state=42,
)
print(x_train.shape, x_val.shape)

# Tokenization and Embedding

In [None]:
# Mean count of whitespace-separated words per training tweet
np.mean([len(words) for words in map(str.split, x_train)])

In [None]:
max_vocab_length = 10_000  # Upper bound passed as `max_tokens`
max_length = 15  # Roughly the average number of words per tweet

# Integer-encoding vectorizer configured with the limits above.
# NOTE(review): in the cells visible here the model consumes raw strings via
# the sentence encoder, so this vectorizer only feeds the vocabulary
# inspection below - confirm whether it is still needed.
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
    output_mode='int',
)

# Build the vocabulary from the training tweets only
text_vectorizer.adapt(x_train)

In [None]:
words_in_vocab = text_vectorizer.get_vocabulary()

# First line: the five leading (most common) vocabulary entries;
# second line: the five trailing (least common) entries.
print(words_in_vocab[:5], words_in_vocab[-5:], sep='\n')

# Transfer Learning

In [None]:
# Seed TensorFlow's global random generator so the Dense-layer weight
# initialisation and the shuffling done by fit() are repeatable across
# "Restart & Run All" runs (the data split is already seeded with 42).
# NOTE(review): some GPU kernels may still introduce small run-to-run noise.
tf.random.set_seed(42)

# Pre-trained Universal Sentence Encoder loaded from TF Hub. It takes scalar
# strings (input_shape=[], dtype=tf.string) and is kept frozen
# (trainable=False), so only the Dense head below is trained.
sentence_encoder = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4',
                                  input_shape=[],
                                  dtype=tf.string,
                                  trainable=False,
                                  name='USE')

# Frozen encoder followed by a small fully connected classifier that ends in
# a single sigmoid unit for the binary target.
model = tf.keras.Sequential([
    sentence_encoder,
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# `h` holds the Keras History object; the plotting cell reads `h.history`.
h = model.fit(x=x_train,
              y=y_train,
              validation_data=(x_val, y_val),
              epochs=10)

In [None]:
# Line plot of every metric recorded in the training history, one curve each
pd.DataFrame(data=h.history).plot(kind='line');

# Prediction and Submission

In [None]:
# Load the competition test set and pull the tweet text out as a NumPy array
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
x_test = test_df.loc[:, 'text'].to_numpy()
print(x_test.shape)

In [None]:
# Predicted probabilities for the test tweets
pred = model.predict(x_test)

# Round to hard 0/1 labels and drop only the trailing axis. A bare
# tf.squeeze() removes every size-1 dimension, so a single-row input would
# collapse to a 0-d array and break the slice printed below.
# NOTE(review): assumes predictions have shape (n, 1), as produced by a
# single-unit output layer.
pred = tf.squeeze(tf.round(pred), axis=-1).numpy()
print(pred.shape)
print(pred[:10])

In [None]:
# Store the rounded predictions as unsigned 8-bit integer labels
test_df['target'] = pred.astype(np.uint8)
test_df.head()

In [None]:
# Write only the `id` and `target` columns, without the DataFrame index
test_df.to_csv('submission.csv', columns=['id', 'target'], index=False)