In [None]:
import numpy as np 
import pandas as pd 

import os
import time
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, Dropout, CuDNNLSTM, Bidirectional, GlobalMaxPool1D, InputLayer, SpatialDropout1D
from keras.models import Sequential

In [None]:
# Directory that holds the competition CSV files, relative to the notebook.
path = '../input'

# Full locations of the labelled training set and the unlabelled test set.
train_file = path + '/train.csv'
test_file = path + '/test.csv'

In [None]:
# Load both CSV files into DataFrames: training data first, then test data.
train_df, test_df = (pd.read_csv(csv_file) for csv_file in (train_file, test_file))

In [None]:
# Preview the first five training rows to eyeball the columns.
train_df.head(n=5)

In [None]:
# Character length of every training question. Computed once and reused,
# rather than re-running `apply(len)` over the whole column for each statistic.
question_lengths = train_df['question_text'].apply(len)

# Displayed as a (mean, median) tuple of question length in characters.
question_lengths.mean(), question_lengths.median()

In [None]:
# Hold-out validation split, currently disabled. Re-enable it together with the
# commented-out `X_val` lines in the later cells.
# X_train, X_val, y_train, y_val = train_test_split(
#     train_df['question_text'].values, train_df['target'].values, test_size=0.33, random_state=42)
# X_test = test_df['question_text'].values

# Use the full training set for fitting (no validation data is held out).
X_train = train_df['question_text'].values  # raw question strings
y_train = train_df['target'].values         # labels from the `target` column
X_test = test_df['question_text'].values    # raw test question strings

# Hyperparameters shared by the tokenizer, the padding step and the model.
max_features = 10000  # passed to Tokenizer(num_words=...) and used as the Embedding input size
embed_size = 100      # output dimension of the Embedding layer
maxlen = 100          # sequence length used by pad_sequences and the model input shape

In [None]:
# Fit the vocabulary on the training questions only; `num_words` limits how
# many words are used when texts are converted to index sequences.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))

# Replace the raw strings with lists of integer word indices.
# NOTE: this rebinds X_train / X_test, so the cell is not safe to re-run
# without first re-running the cell that loads the raw question strings.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
# X_val = tokenizer.texts_to_sequences(X_val)

In [None]:
# Pad or truncate every index sequence to exactly `maxlen` entries so that
# the batches have a fixed width matching the model's input shape.
X_train, X_test = (
    pad_sequences(sequences, maxlen=maxlen) for sequences in (X_train, X_test)
)
# X_val = pad_sequences(X_val, maxlen=maxlen)

In [None]:
# Binary text classifier: embedding -> BiLSTM -> max-pool over time -> dense head.
model = Sequential()

# Input is a fixed-length sequence of `maxlen` word indices.
model.add(InputLayer(input_shape=(maxlen,)))

# Embedding table with `max_features` rows of size `embed_size`; no
# pre-trained weights are supplied here.
model.add(Embedding(max_features, embed_size))
model.add(SpatialDropout1D(0.3))

# return_sequences=True keeps the per-timestep outputs so the pooling layer
# below can reduce over the whole sequence.
model.add(Bidirectional(CuDNNLSTM(64, return_sequences=True)))
model.add(GlobalMaxPool1D())

# Small fully connected head with a single sigmoid output for the binary target.
model.add(Dense(64, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Variant with validation monitoring (needs the disabled hold-out split):
# model.fit(X_train, y_train, batch_size=1024, epochs=3, validation_data=(X_val, y_val))

# Train on the full training set for three epochs, with no validation data.
model.fit(x=X_train, y=y_train, epochs=3, batch_size=1024)

In [None]:
# Sigmoid outputs for the padded test questions, predicted in batches of 1024.
test_pred = model.predict(x=[X_test], verbose=1, batch_size=1024)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels with a fixed 0.5 threshold.
binary_labels = (test_pred >= 0.5).astype(int)

# Submission frame: one row per test question id with its predicted label.
out_df = pd.DataFrame({"qid": test_df["qid"].values})
out_df['prediction'] = binary_labels

# Write without the index so the file holds only `qid` and `prediction`.
out_df.to_csv("submission.csv", index=False)