## Disaster Tweets Classification with Conv1D

In [None]:
import tensorflow as tf
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tensorflow import keras
from sklearn.model_selection import train_test_split
import nltk

In [None]:
class Config:
    """Hyperparameters and file paths shared by the notebook cells."""

    # Vocabulary cap used by the text vectorizer and the embedding table.
    vocab_size: int = 5000
    # Width of each embedding vector.
    embed_size: int = 100
    # Number of Conv1D filters.
    filters: int = 256
    # Conv1D kernel size, i.e. how many consecutive tokens one filter spans.
    num_words: int = 3
    # Mini-batch size and upper bound on training epochs.
    batch_size: int = 64
    epochs: int = 20
    # Intended sequence length, in tokens.
    maxlen: int = 100
    # Destination of the weights written by the checkpoint callback.
    model_path: str = "model.tf"


config = Config()

In [None]:
# Load the competition's training set, test set and submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
        "/kaggle/input/nlp-getting-started/sample_submission.csv",
    )
)

In [None]:
# Preview the first rows of the raw training data.
train.head()

In [None]:
# Preview the first rows of the raw test data.
test.head()

## Preprocessing

In [None]:
def _normalize_text(sentence):
    """Lowercase *sentence*, tokenize it with NLTK and re-join the tokens with single spaces."""
    tokens = nltk.word_tokenize(sentence.lower())
    return " ".join(tokens)


# Store the normalized text next to the raw tweet in both frames.
train["sentence_preprocessed"] = train["text"].apply(_normalize_text)
test["sentence_preprocessed"] = test["text"].apply(_normalize_text)

In [None]:
# Preview the training data again, now including the `sentence_preprocessed` column.
train.head()

## Text Vectorization

In [None]:
# Map each preprocessed sentence to a sequence of integer token ids.
# `output_sequence_length` pads/truncates every sentence to `config.maxlen`
# tokens, so the tensor shape no longer depends on the longest sentence in a
# batch and every sequence is at least as long as the Conv1D kernel.
vectorizor = keras.layers.TextVectorization(
    max_tokens=config.vocab_size,
    output_sequence_length=config.maxlen,
)
# The vocabulary is learned from both the train and the test text.
vectorizor.adapt(list(train["sentence_preprocessed"]) + list(test["sentence_preprocessed"]))

## Train/Validation Split

In [None]:
# Hold out 20% of the labelled tweets for validation; the fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    train["sentence_preprocessed"],
    train["target"],
    test_size=0.2,
    random_state=42,
)
# Show the shape of each of the four splits.
tuple(split.shape for split in (x_train, x_val, y_train, y_val))

## Modeling

In [None]:
# Reset Keras' global state before (re)building the model in this cell.
keras.backend.clear_session()
# Raw strings -> token ids -> embeddings -> 1D convolution -> max over time -> probability.
model = keras.Sequential([
    # One string per sample: TextVectorization expects rank-1 input or a last
    # dimension of exactly 1, so the shape is (1,) rather than (None,).
    keras.Input(shape=(1,), dtype="string"),
    vectorizor,
    # `input_length` is omitted: it is deprecated in recent Keras releases and
    # the sequence length is determined by the vectorizer's output.
    keras.layers.Embedding(config.vocab_size, config.embed_size),
    keras.layers.SpatialDropout1D(0.2),
    keras.layers.Conv1D(filters=config.filters, kernel_size=config.num_words, activation="relu"),
    keras.layers.GlobalMaxPooling1D(),
    keras.layers.Dense(1, activation="sigmoid"),
])

In [None]:
# Draw the layer graph, annotated with tensor shapes.
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Binary classification: cross-entropy loss, Adam optimizer, accuracy as the tracked metric.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Write weights to disk only when validation accuracy improves.
checkpoint = keras.callbacks.ModelCheckpoint(
    config.model_path,
    monitor="val_accuracy",
    save_best_only=True,
    save_weights_only=True,
)
# Stop once validation accuracy has gone 5 epochs without improving.
early_stop = keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=5)

histry = model.fit(
    x=x_train,
    y=y_train,
    batch_size=config.batch_size,
    epochs=config.epochs,
    validation_data=(x_val, y_val),
    callbacks=[checkpoint, early_stop],
)

In [None]:
# Restore the weights saved by the checkpoint callback (best validation accuracy).
model.load_weights(config.model_path)

## Submission

In [None]:
# The model ends in Dense(1), so `predict` returns an array of shape (n, 1);
# flatten it to 1-D so it maps cleanly onto a single DataFrame column.
probabilities = model.predict(test["sentence_preprocessed"]).ravel()
# Threshold the probabilities at 0.5 to obtain the 0/1 labels.
pred = (probabilities > 0.5).astype(int)
sample_submission["target"] = pred
sample_submission.to_csv("submission.csv", index=False)
sample_submission.head()