In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports


In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, GRU, SimpleRNN, Embedding, Flatten

# Load Data

In [None]:
# Read the three competition CSV files in one pass: labelled training rows,
# unlabelled test rows and the sample submission used as the output template.
csv_paths = (
    '/kaggle/input/nlp-getting-started/train.csv',
    '/kaggle/input/nlp-getting-started/test.csv',
    '/kaggle/input/nlp-getting-started/sample_submission.csv',
)
train, test, submission = map(pd.read_csv, csv_paths)
print(*(frame.shape for frame in (train, test, submission)))

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the first three rows of the sample submission.
submission.head(3)

# Preparation of data

In [None]:
# Model input is the 'text' column; the label to predict is the 'target' column.
train_text, y = train['text'], train['target']

In [None]:
# max_len: length passed to pad_sequences for every encoded text.
# max_words: vocabulary size passed to the Tokenizer and the Embedding layers.
max_len, max_words = 100, 10_000

In [None]:
# Tokenizer whose num_words limit is the max_words cap defined in the previous cell.
tokenizer = Tokenizer(num_words=max_words)

In [None]:
# Build the vocabulary from the training texts only (the test texts are not passed here).
tokenizer.fit_on_texts(train_text)
word_index = tokenizer.word_index
# Cell output: number of entries in the fitted word index.
len(word_index)

In [None]:
# Encode every training text as a sequence of word indices ...
sequences = tokenizer.texts_to_sequences(train_text)
# ... and bring all sequences to a common length of max_len.
X = pad_sequences(sequences, maxlen=max_len)
X.shape

In [None]:
# Encode the test texts with the already-fitted tokenizer, then pad them to
# the same max_len as the training matrix.
test_sequences = tokenizer.texts_to_sequences(test['text'])
test_data = pad_sequences(test_sequences, maxlen=max_len)
test_data.shape

In [None]:
# Copy the labels into a NumPy array laid out as a single column (one row per sample).
y = np.reshape(np.array(y), (-1, 1))
y.shape

In [None]:
# Hold out 15% of the labelled rows for the final evaluation done in train_model;
# the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

# Utils

In [None]:
def train_model(model, batch_size=32, epochs=8, validation_split=0.2):
    """Compile, fit and evaluate ``model`` on the notebook's data split.

    The function reads the module-level ``X_train``, ``y_train``, ``X_test``
    and ``y_test`` arrays, so the train/test split cell must run first.

    Args:
        model: Object exposing Keras-style ``compile``, ``fit`` and
            ``evaluate`` methods.
        batch_size: Forwarded to ``model.fit``.
        epochs: Forwarded to ``model.fit``.
        validation_split: Forwarded to ``model.fit``. The default of 0.2 is
            the value that used to be hard-coded.

    Returns:
        Whatever ``model.fit`` returns; ``visual_validation_and_accuracy``
        reads its ``history`` attribute.
    """
    # The plotting helper reads history.history['acc'] / ['val_acc'], so the
    # metric is requested under the alias 'acc'.
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
    )
    print('-' * 100)
    print('Test data')
    # Return value is not captured; the call is made for the hold-out
    # evaluation itself.
    model.evaluate(X_test, y_test)
    return history

In [None]:
def visual_validation_and_accuracy(history):
    """Draw training/validation accuracy and loss curves on a single figure.

    Args:
        history: Object whose ``history`` attribute maps the keys 'acc',
            'val_acc', 'loss' and 'val_loss' to per-epoch value sequences
            (the object returned by ``train_model``).
    """
    metrics = history.history
    # Look up every series before touching the figure, so a missing key
    # raises before anything is cleared or drawn.
    curves = [
        (metrics['acc'], 'r', 'Training acc'),
        (metrics['val_acc'], 'b', 'Validation acc'),
        (metrics['loss'], 'r:', 'Training loss'),
        (metrics['val_loss'], 'b:', 'Validation loss'),
    ]
    train_loss = curves[2][0]
    # Epochs are numbered from 1 on the x axis.
    epoch_axis = np.arange(1, len(train_loss) + 1)

    plt.clf()
    for values, line_style, legend_label in curves:
        plt.plot(epoch_axis, values, line_style, label=legend_label)

    plt.title('Validation and accuracy')
    plt.xlabel('Epochs')
    plt.legend()
    plt.show()

# Dense Model

In [None]:
# Baseline network: embed the padded token ids, flatten the embedded sequence
# and classify it with two small ReLU layers (each followed by dropout) and a
# single sigmoid output unit.
model = Sequential()
model.add(Embedding(max_words, 32, input_length=max_len))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(8, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Fit and evaluate the dense baseline with train_model's default settings.
history = train_model(model)

In [None]:
# Plot the accuracy and loss curves recorded for the dense baseline.
visual_validation_and_accuracy(history)

# SimpleRNN Model

In [None]:
# Two stacked SimpleRNN layers on top of the embedding. The first is built
# with return_sequences=True and feeds the second; a single sigmoid unit
# produces the prediction.
model = Sequential()
model.add(Embedding(max_words, 32, input_length=max_len))
model.add(SimpleRNN(32, return_sequences=True))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Fit and evaluate the SimpleRNN model with train_model's default settings.
history = train_model(model)

In [None]:
# Plot the accuracy and loss curves recorded for the SimpleRNN model.
visual_validation_and_accuracy(history)

# LSTM Model

In [None]:
# Two stacked LSTM layers on top of the embedding. The first is built with
# return_sequences=True and feeds the second; a single sigmoid unit produces
# the prediction.
model = Sequential()
model.add(Embedding(max_words, 32, input_length=max_len))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Fit and evaluate the LSTM model with train_model's default settings.
history = train_model(model)

In [None]:
# Plot the accuracy and loss curves recorded for the LSTM model.
visual_validation_and_accuracy(history)

# GRU Model

In [None]:
# Two stacked GRU layers on top of the embedding. The first is built with
# return_sequences=True and feeds the second; a single sigmoid unit produces
# the prediction.
model = Sequential()
model.add(Embedding(max_words, 32, input_length=max_len))
model.add(GRU(32, return_sequences=True))
model.add(GRU(32))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Fit and evaluate the GRU model with train_model's default settings.
history = train_model(model)

In [None]:
# Plot the accuracy and loss curves recorded for the GRU model.
visual_validation_and_accuracy(history)

# Predict Best Model

In [None]:
# Model used for the submission: the same embedding + two-layer LSTM layout as
# in the LSTM section, built again here as a new Sequential instance.
model = Sequential()
model.add(Embedding(max_words, 32, input_length=max_len))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Fit the submission model for 5 epochs instead of train_model's default of 8.
history = train_model(model, batch_size=32, epochs=5)

In [None]:
# Plot the accuracy and loss curves recorded for the submission model.
visual_validation_and_accuracy(history)

In [None]:
# Score the padded test sequences with the model trained above.
predict = model.predict(test_data)
predict.shape

In [None]:
# Convert the model outputs to integer 0/1 labels using a 0.5 cut-off.
predict = (predict >= 0.5).astype(int)

In [None]:
# Write the labels through explicit column indexing rather than attribute
# access, and flatten the prediction array first (presumably shaped (n, 1)
# because the model ends in Dense(1)) so the column receives a 1-D vector.
submission['target'] = predict.ravel()
submission.head(10)

In [None]:
# Save the submission as 'submission.csv' (relative path), omitting the index column.
submission.to_csv('submission.csv', index=False)