In [None]:

import numpy as np 
import pandas as pd 

import os
# List every file found under /kaggle/input so the dataset paths used
# later in the notebook can be checked by eye.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

* Import required libraries

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Read the two competition CSVs: `tst` from test.csv, `data` from train.csv.
tst = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
data = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")

# Show the column names of the training frame.
print(data.columns.values)

* Keywords

In [None]:
# The 15 most frequent values in the `keyword` column.
data['keyword'].value_counts().head(15)

* Location

In [None]:
# The 15 most frequent values in the `location` column.
data['location'].value_counts().head(15)

* Text

In [None]:
# Peek at the first five entries of the `text` column.
data['text'].head(5)

* parameters

In [None]:
# Preprocessing / model hyper-parameters shared by the cells below.
vocab_size = 10_000       # first argument (input dimension) of the Embedding layer
embedding_dim = 16        # second argument (output dimension) of the Embedding layer
max_length = 100          # `maxlen` passed to every pad_sequences call
trunc_type = 'post'       # `truncating` mode passed to pad_sequences
padding_type = 'post'     # `padding` mode passed to pad_sequences
oov_tok = "<OOV>"         # placeholder token for out-of-vocabulary words
training_size = 20_000    # NOTE(review): not referenced anywhere else in this notebook


* Feature selection and splitting the data into training and testing sets

In [None]:
# Model input is the `text` column; the label is the `target` column.
sentences, target = data['text'], data['target']

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    target,
    test_size=0.2,
    random_state=3,
)

* Data preparation and cleaning (encoding)

In [None]:
# Build the vocabulary from the training split only; words that appear only
# in the held-out data are mapped to the OOV token.
# The vocabulary size and OOV token come from the parameters cell (instead of
# repeating the literals here) so the tokenizer cannot drift out of sync with
# the Embedding layer, which is sized with the same `vocab_size`.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

text_index = tokenizer.word_index


def _encode(texts):
    """Convert texts to integer sequences and a fixed-width padded matrix.

    Uses the fitted module-level ``tokenizer`` and the notebook-wide
    ``max_length``, ``padding_type`` and ``trunc_type`` settings.

    Returns a ``(sequences, padded)`` pair: the raw output of
    ``texts_to_sequences`` and the result of ``pad_sequences`` on it.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


# Same encoding for the training split, the held-out split and test.csv.
data_sequences, data_padded = _encode(X_train)
test_sequences, test_padded = _encode(X_test)
tst_sequences, tst_padded = _encode(tst["text"])

print(test_padded[0])

Final Data preparation

In [None]:
# Training split: padded token ids and their labels.
training_data, training_labels = np.array(data_padded), np.array(y_train)

# Held-out split from train.csv, used for validation and evaluation.
eval_data, eval_labels = np.array(test_padded), np.array(y_test)

# Rows from test.csv. Note that `tst_labels` holds the `id` column, not
# class labels.
tst_data, tst_labels = tst_padded, np.array(tst["id"])
                

# Building model

* Defining layers

In [None]:
# Averaged-embedding classifier: embed each token id, average over the
# sequence axis, then a small dense head with a single sigmoid output.
#
# The input shape is declared with an explicit Input layer rather than
# Embedding's `input_length` argument: Keras 3 deprecates that argument and
# ignores it, which would leave the model unbuilt (no shapes or parameter
# counts in model.summary()) until the first call to fit.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_length,), dtype='int32'),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Model summary

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()


Early stopping

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Stop training once validation loss has gone 3 epochs without improving,
# and restore the weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)


Reduce learning rate

In [None]:
# Halve the learning rate when validation loss plateaus, never going below 1e-5.
# The patience here must be shorter than the EarlyStopping patience (3):
# with equal patience both callbacks normally trigger on the same epoch, so
# training stops before the reduced learning rate is ever used.
lrr = ReduceLROnPlateau(
    monitor='val_loss',
    patience=2,
    factor=0.5,
    min_lr=0.00001,
    verbose=1,
)

model fitting

In [None]:
# Upper bound on epochs; the early-stopping callback may end training sooner.
num_epochs = 30

hist = model.fit(
    training_data,
    training_labels,
    validation_data=(eval_data, eval_labels),
    epochs=num_epochs,
    callbacks=[early_stopping, lrr],
    verbose=2,
)

Evaluating model

In [None]:
# Loss and accuracy on the held-out split. This is the same data that was
# passed to fit() as validation_data, so it is not an independent test score.
test_loss, test_acc = model.evaluate(x=eval_data, y=eval_labels)

visualization

In [None]:
def plot_loss_and_accuracy(history):
    """Plot loss and accuracy curves per epoch, one figure for each.

    Parameters
    ----------
    history : dict-like
        Mapping of metric name to a list of per-epoch values, e.g. the
        ``history`` attribute of the object returned by ``model.fit``.
        The keys ``loss``/``val_loss`` and ``accuracy``/``val_accuracy`` are
        plotted; any of them that is missing (for instance the ``val_*``
        series when no validation data was supplied) is skipped rather than
        raising ``KeyError``. Other keys are ignored.

    Returns
    -------
    None
    """
    history_df = pd.DataFrame(history)
    panels = (
        ('Loss', ['loss', 'val_loss']),
        ('Accuracy', ['accuracy', 'val_accuracy']),
    )
    for label, wanted in panels:
        present = [name for name in wanted if name in history_df.columns]
        if not present:
            # Nothing recorded for this panel; do not draw an empty figure.
            continue
        # Title and axis labels let each figure stand on its own.
        history_df[present].plot(title=label, xlabel='epoch', ylabel=label.lower())

# `hist` is the object returned by model.fit above; its `.history` attribute
# is what the plotting helper turns into a DataFrame.
plot_loss_and_accuracy(hist.history)

* Making predictions

In [None]:
# Predict on the encoded test.csv rows, round the sigmoid outputs to the
# nearest integer and cast to int to get hard class labels.
predictions = np.around(model.predict(tst_data)).astype(int)



In [None]:
# Pair each test.csv id with its predicted label and write the submission.
# `predictions` is flattened explicitly so the column is built from a 1-D array.
output = pd.DataFrame({
    "id": tst["id"],
    "target": predictions.ravel(),
})
output.to_csv("my_submission.csv", index=False)