In [None]:
import spacy
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

# Get the Data

In [None]:
# Read the train and test CSV files from the relative ../input directory.
ds_train, ds_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/nlp-getting-started/train.csv",
        "../input/nlp-getting-started/test.csv",
    )
)

In [None]:
# Preview the first five rows of the training frame.
ds_train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
ds_test.head(n=5)

In [None]:
# Largest len() over the `text` column of each split
# (a character count, assuming the column holds strings).
train_max_len = ds_train.text.map(len).max()
test_max_len = ds_test.text.map(len).max()

sequence_length = train_max_len
print('Train max length sentence', train_max_len)
print('Test max length sentence', test_max_len)

# Build Model

**Build the embedding matrix**

In [None]:
# Text -> integer token ids. The layer lower-cases and strips punctuation,
# and its vocabulary is learned from the training texts only.
train_texts = np.array(ds_train.text)

vectorization_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    output_mode='int',
    standardize='lower_and_strip_punctuation',
)
vectorization_layer.adapt(train_texts)

In [None]:
# Build one embedding row per vocabulary entry from spaCy's word vectors.
nlp = spacy.load('en_core_web_lg')

# Fetch the vocabulary once; position i in this list is the row index used
# for that token in the embedding matrix.
vocabulary = vectorization_layer.get_vocabulary()
vocab_size = len(vocabulary)

# `nlp.make_doc` only tokenizes, whereas `nlp(...)` also runs every pipeline
# component for each word. Per the spaCy docs, `Doc.vector` defaults to the
# average of the token vectors, so the resulting rows should be unchanged
# while the loop avoids the per-word pipeline cost.
embedding_dim = len(nlp.make_doc('and').vector)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for i, word in enumerate(vocabulary):
    embedding_matrix[i] = nlp.make_doc(word).vector

In [None]:
# Inspect the learned vocabulary: its first 20 entries and its total size.
vocab_preview = vectorization_layer.get_vocabulary()
print('Vocabulary sample', vocab_preview[:20])
print('Vocabulary length', len(vocab_preview))

In [None]:
# Raw text -> token ids -> embeddings (initialized from `embedding_matrix`)
# -> bidirectional LSTM -> mean over time steps -> sigmoid probability.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
)
recurrent_layer = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(units=16, return_sequences=True)
)
pooling_layer = tf.keras.layers.GlobalAveragePooling1D()
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')

model = tf.keras.Sequential([
    vectorization_layer,
    embedding_layer,
    recurrent_layer,
    pooling_layer,
    output_layer,
])

In [None]:
class F1_Score(tf.keras.metrics.Metric):
    """Streaming F1 score for binary predictions thresholded at 0.5.

    Precision and recall are accumulated by two nested Keras metrics. After
    every update the F1 value is recomputed from their running results as
    ``2 * p * r / (p + r + 1e-6)`` and stored in ``self.f1``.
    """

    def __init__(self, name='f1_score', **kwargs):
        """Create the metric.

        Args:
            name: Metric name; Keras uses it as the key in training logs.
            **kwargs: Forwarded to ``tf.keras.metrics.Metric``.
        """
        super().__init__(name=name, **kwargs)
        # Most recent F1 value; this is what result() returns.
        self.f1 = self.add_weight(name='f1', initializer='zeros')
        self.precision_fn = tf.metrics.Precision(thresholds=0.5)
        self.recall_fn = tf.metrics.Recall(thresholds=0.5)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate a batch and refresh the stored F1 value.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted scores, thresholded at 0.5 by the nested metrics.
            sample_weight: Optional per-sample weights, forwarded to the
                nested precision and recall metrics.
        """
        # Calling a metric object updates its state and returns the running
        # result. `sample_weight` is forwarded so weighted evaluation is
        # honoured instead of being silently dropped.
        p = self.precision_fn(y_true, y_pred, sample_weight=sample_weight)
        r = self.recall_fn(y_true, y_pred, sample_weight=sample_weight)
        # The 1e-6 term avoids division by zero when p + r == 0.
        self.f1.assign(2 * ((p * r) / (p + r + 1e-6)))

    def result(self):
        """Return the F1 value computed by the latest update."""
        return self.f1

    def reset_state(self):
        """Clear the nested precision/recall state and zero the stored F1."""
        for metric in (self.precision_fn, self.recall_fn):
            # Older TensorFlow releases only expose `reset_states`; prefer
            # the current `reset_state` name when the nested metric has it.
            reset = getattr(metric, 'reset_state', None) or metric.reset_states
            reset()
        self.f1.assign(0.0)

    # Older Keras training loops call the deprecated `reset_states` name;
    # keep it as an alias so the metric resets under either API.
    reset_states = reset_state

The competition evaluates submissions with the F1 score, so we track it during training as a more representative metric than accuracy.

In [None]:
# Nadam with a 0.01 learning rate; report accuracy and the custom F1 metric.
opt = tf.keras.optimizers.Nadam(learning_rate=0.01)
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy', F1_Score()],
)

In [None]:
# Features are the raw text strings (the model vectorizes them itself);
# labels come from the `target` column.
X = np.array(ds_train.text)
y = np.array(ds_train.target)
(X.shape, y.shape)

In [None]:
# Stop once validation F1 has not improved for 5 epochs and restore the
# weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_f1_score',
    mode='max',
    patience=5,
    restore_best_weights=True,
)

history = model.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=100,
    validation_split=0.1,
    callbacks=[early_stopping],
)

# Plot History

In [None]:
import matplotlib.pyplot as plt

**Loss**

In [None]:
# Training vs. validation loss per epoch.
for history_key in ('loss', 'val_loss'):
    plt.plot(history.history[history_key], label=history_key)
plt.legend()
plt.show()

**Accuracy**

In [None]:
# Training vs. validation accuracy per epoch.
for history_key, legend_label in (('accuracy', 'acc'), ('val_accuracy', 'val_acc')):
    plt.plot(history.history[history_key], label=legend_label)
plt.legend()
plt.show()

**F1 Score**

This is the most important metric for this competition, as it better reflects the public leaderboard score.

In [None]:
# Training vs. validation F1 score per epoch.
for history_key in ('f1_score', 'val_f1_score'):
    plt.plot(history.history[history_key], label=history_key)
plt.legend()
plt.show()

We can safely conclude that the model overfits as more epochs pass.

# Analyze validation predictions

# Make Predictions

In [None]:
# Model scores for the test texts, thresholded at 0.5 into boolean labels.
test_texts = np.array(ds_test.text)
prediction_scores = model.predict(test_texts)
prediction_classes = np.greater(prediction_scores, 0.5)

In [None]:
# One row per test id; the boolean predictions become 0/1 integers.
target_labels = prediction_classes.ravel().astype(int)
submission = pd.DataFrame({'id': ds_test.id, 'target': target_labels})

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Read the file back to check what was written.
pd.read_csv(filepath_or_buffer='submission.csv')