In [None]:
import numpy as np
import pandas as pd
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers import Bidirectional, GlobalAveragePooling1D
from keras.models import Sequential, Model

In [None]:
from pathlib import Path

# Directory holding the competition CSV files.
path = Path('../input/nlp-getting-started/')

# Read the training and test tables from that directory.
train, test = (pd.read_csv(path / csv_name) for csv_name in ('train.csv', 'test.csv'))

# Preview the first rows of the training table.
train.head()

In [None]:
# Drop the columns the rest of the notebook does not use.
# errors='ignore' keeps this cell re-runnable: once the columns are gone,
# running the cell again is a no-op instead of raising KeyError.
unused_columns = ['id', 'keyword', 'location']
train = train.drop(columns=unused_columns, errors='ignore')
test = test.drop(columns=unused_columns, errors='ignore')

In [None]:
# Count how often each target value occurs, plot the counts as a bar chart,
# and show the raw counts as the cell output.
target_counts = train['target'].value_counts()
target_counts.plot(kind='bar')
target_counts

In [None]:
# Labels for the training rows.
ytrain = train['target'].values

# Raw text for both splits; missing entries are replaced by the '_na_' placeholder.
train_sentences, test_sentences = (
    frame['text'].fillna('_na_').values for frame in (train, test)
)

In [None]:
# Hyper-parameters shared by the tokenizer, the embedding matrix and the model.
maxlen = 100          # length every token sequence is padded/truncated to
embedding_size = 50   # width of each word vector (the GloVe file used below is the 50d one)
vocab_size = 20_000   # `num_words` for the tokenizer and row count of the Embedding layer

In [None]:
# Fit the vocabulary on the training text only, then encode both splits.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(list(train_sentences))

# Keep the encoded sequences under their own names instead of overwriting
# `train_sentences` / `test_sentences`: the raw text stays available and the
# cell can be re-run without feeding integer sequences back into the tokenizer.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Bring every sequence to exactly `maxlen` tokens.
xtrain = pad_sequences(train_sequences, maxlen=maxlen)
xtest = pad_sequences(test_sequences, maxlen=maxlen)

In [None]:
def get_coefs(word, *arr):
    """Split one parsed embedding line into ``(word, vector)``.

    Args:
        word: The token (first field of the line).
        *arr: The remaining fields, i.e. the vector components.

    Returns:
        A tuple ``(word, vector)`` where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Build a word -> vector lookup from the GloVe text file (one "word v1 v2 ..." entry per line).
# The context manager closes the file once the dict is built, and the explicit
# encoding keeps parsing independent of the platform's default locale.
with open('../input/glove6b50d/glove.6B.50d.txt', encoding='utf-8') as glove_file:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in glove_file)

In [None]:
# Stack all pretrained vectors into one 2-D array to measure their overall
# mean and standard deviation (used below to initialise the embedding matrix).
# np.stack needs a real sequence; passing the dict view directly is deprecated
# since NumPy 1.16 and rejected by newer releases, hence the list().
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

In [None]:
word_index = tokenizer.word_index

# Keras word indices start at 1 (index 0 is reserved), so `len(word_index) + 1`
# rows are needed to address every word; never more than `vocab_size`.
nb_words = min(vocab_size, len(word_index) + 1)

# Always allocate `vocab_size` rows: the Embedding layer in the model cell is
# created with `vocab_size` rows and its initial weights must have that shape.
# The previous `min(vocab_size, len(word_index))` row count was one row short
# (IndexError on the last word) whenever the vocabulary had fewer than
# `vocab_size` entries. Rows without a pretrained vector keep their random
# initialisation drawn from the GloVe mean/std.
embedding_matrix = np.random.normal(emb_mean, emb_std, (vocab_size, embedding_size))

for word, i in word_index.items():
    if i >= nb_words:
        # Index falls outside the rows the tokenizer can emit.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Binary classifier: pretrained embeddings -> bidirectional LSTM -> average
# pooling over the sequence -> small dense head with a sigmoid output.
inp = Input(shape=(maxlen,))

layer_stack = [
    Embedding(vocab_size, embedding_size, weights=[embedding_matrix]),
    Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
    GlobalAveragePooling1D(),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(1, activation="sigmoid"),
]

# Thread the input tensor through the layers in order.
x = inp
for layer in layer_stack:
    x = layer(x)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the padded sequences, holding out 10% of the rows for validation.
# The trailing semicolon suppresses the returned object's repr in the cell output.
model.fit(
    x=xtrain,
    y=ytrain,
    batch_size=32,
    epochs=15,
    validation_split=0.1,
);

In [None]:
# Sigmoid outputs for the test set, rounded to the nearest integer to get 0/1 labels.
test_probabilities = model.predict(xtest)
ytest = np.round(test_probabilities).astype(int)

In [None]:
# Load the submission template, replace its `target` column with the
# predictions, and write the result next to the notebook.
submission_template = pd.read_csv(path / 'sample_submission.csv')
sample_submission = submission_template.assign(target=ytest)
sample_submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission that was just written.
sample_submission.head(n=5)