In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this cell (by clicking run or pressing Shift+Enter) lists the entries of the input directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Imports for the training cell, plus global seeding and warning suppression.
# NOTE(review): the code visible below calls text.Tokenizer and
# sequence.pad_sequences, so these two direct imports look redundant - confirm
# before removing.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Seed NumPy's global random number generator.
np.random.seed(42)

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
import warnings
# Silence every warning raised through the `warnings` module from here on.
warnings.filterwarnings('ignore')

# List the entries of the input directory.
print(os.listdir("../input"))

# ---------------------------------------------------------------------------
# Data loading, tokenisation and pre-trained embedding lookup.
# ---------------------------------------------------------------------------
EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
submission = pd.read_csv('../input/sample_submission.csv')

# Missing questions are replaced by the placeholder token "_na_".
X_train = train["question_text"].fillna("_na_").values
y_train = train["target"].values
X_test = test["question_text"].fillna("_na_").values

max_features = 40000  # vocabulary cap passed to the tokenizer
maxlen = 200          # every sequence is padded/truncated to this length
embed_size = 300      # width of each embedding vector

# The vocabulary is fitted on the train and test questions together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

# embedding setup
# Source https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
from tqdm import tqdm

# Map each word of the embedding file to its float32 coefficient vector.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open(EMBEDDING_FILE, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split(" ")
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (index 0 is reserved), so a vocabulary of
# N words needs N + 1 rows. Without the "+ 1" a vocabulary smaller than
# max_features makes the last word index fall outside the matrix.
nb_words = min(max_features, len(word_index) + 1)
print(len(word_index), max_features)
# Rows of words that have no pre-trained vector stay all-zero.
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        
class F1Evaluation(Callback):
    """Keras callback that prints the validation F1 score during training.

    At the end of every ``interval``-th epoch the current model predicts on
    the held-out data, the predictions are binarised at 0.5 and the F1 score
    against the held-out labels is printed.

    Args:
        validation_data: ``(X_val, y_val)`` pair. It is unpacked into exactly
            two values, so the empty default raises ``ValueError``; a pair
            must always be supplied.
        interval: evaluate when ``epoch % interval == 0`` (``epoch`` is the
            zero-based index Keras passes to ``on_epoch_end``).
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself. The previous
        # ``super(Callback, self)`` started the lookup *after* Callback and
        # therefore skipped its initialiser.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print the F1 score on the validation data.

        ``logs`` is accepted for compatibility with the Keras callback
        signature and is not used.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            y_pred = (y_pred > 0.5).astype(int)
            score = f1_score(self.y_val, y_pred)
            # Report the epoch one-based.
            print("\n F1 Score - epoch: %d - score: %.6f \n" % (epoch+1, score))
            
# Imports used by the model definition below.
from keras.models import Model
from keras.layers import Input, Dropout, LSTM, Activation, Dense, SpatialDropout1D, concatenate, BatchNormalization
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import GlobalMaxPooling1D, GlobalAveragePooling1D,MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
# Re-seed NumPy's global random number generator; this replaces the seed of 42
# that was set earlier in this cell.
np.random.seed(1)

def get_model():
    """Build and compile the parallel-convolution binary text classifier.

    Architecture: frozen pre-trained embedding -> spatial dropout -> five
    parallel ``Conv1D`` branches (kernel sizes 3 to 7, 256 filters each), each
    followed by batch normalisation and by global average and global max
    pooling -> concatenation -> dropout -> single sigmoid unit.

    Reads the module-level ``maxlen``, ``embed_size`` and ``embedding_matrix``.

    Returns:
        The model, compiled with binary cross-entropy, Adam and accuracy.
    """
    inp = Input(shape=(maxlen, ))
    # Size the embedding table from the weight matrix itself so that the layer
    # and the weights it is initialised with always have the same row count.
    x = Embedding(embedding_matrix.shape[0], embed_size,
                  weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(0.1)(x)

    pooled = []
    for kernel_size in (3, 4, 5, 6, 7):
        branch = Conv1D(filters=256, kernel_size=kernel_size, activation='relu')(x)
        # Keep the normalised tensor. The output of BatchNormalization used to
        # be discarded, which left the layer outside the model graph.
        branch = BatchNormalization()(branch)
        # Same order as before: average pool, then max pool, per branch.
        pooled.append(GlobalAveragePooling1D()(branch))
        pooled.append(GlobalMaxPooling1D()(branch))

    z = concatenate(pooled)
    z = Dropout(0.1)(z)

    outp = Dense(1, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

# ---------------------------------------------------------------------------
# Train the model, reload the best checkpoint and score it on the hold-out.
# ---------------------------------------------------------------------------
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from sklearn.metrics import confusion_matrix

model = get_model()
model.summary()

# Training hyper-parameters and checkpoint file name.
modelname = "submission_paralleleCNN-50seqlen.h5"
batch_size = 256
epochs = 2

# Hold out 5% of the training data for validation.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.95, random_state=233
)

# Keep only the weights of the epoch with the highest validation accuracy.
checkpoint = ModelCheckpoint(
    filepath=modelname,
    monitor='val_acc',
    save_best_only=True,
    mode='max',
)
F1_Score = F1Evaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[F1_Score, checkpoint],
    verbose=2,
)

# Report the (one-based) epoch that reached the best validation accuracy.
val_acc_history = hist.history['val_acc']
maxvalue = max(val_acc_history)
epoch = 1 + val_acc_history.index(maxvalue)
print(epoch, maxvalue)

# Reload the checkpointed model and predict on the hold-out set.
modelmax = load_model(modelname)
yprediction = modelmax.predict(X_val)

# Binarise predictions and labels at 0.5.
y_pred = (yprediction > 0.5).astype(int)
y_true = (y_val > 0.5).astype(int)

score = f1_score(y_val, y_pred)
print("F1_score: ", score)

#confusion matrix
import itertools
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Plot a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D confusion matrix array; rows are drawn on the y axis
            ("True label") and columns on the x axis ("Predicted label").
        classes: sequence of class labels used as tick labels on both axes.
        normalize: if True, every row is divided by its row sum before
            plotting and cell labels are shown with two decimals.
        title: title of the plot.
        cmap: matplotlib colormap for the heat map.
    """
    if normalize:
        # Normalise *before* drawing so that the heat map, the text-colour
        # threshold and the cell labels are all based on the same values.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Raw values are shown unchanged; normalised ratios are shortened.
        label = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    

# ---------------------------------------------------------------------------
# Pick the decision threshold with the best hold-out F1, then write the
# submission file.
# ---------------------------------------------------------------------------

# Score every candidate threshold from 0.10 to 0.50 in steps of 0.01.
F1_score_lst = {}
for candidate in np.round(np.arange(0.1, 0.501, 0.01), 2):
    score = f1_score(y_val, (yprediction > candidate).astype(int))
    print("F1 score at threshold {0} is {1}".format(candidate, score))
    F1_score_lst[candidate] = score

maxF1 = max(F1_score_lst.values())
print(maxF1)

# First threshold (in search order) that reaches the best F1 score.
thresh = max(F1_score_lst, key=F1_score_lst.get)
print(thresh)

# Binarise hold-out predictions and labels with the selected threshold.
y_pred_thresh = (yprediction > thresh).astype(int)
y_true_thresh = (y_val > thresh).astype(int)

# Confusion matrix on the hold-out set.
confusion_mtx = confusion_matrix(y_true_thresh, y_pred_thresh)
plot_confusion_matrix(confusion_mtx, classes=range(2))

# Predict on the test set and apply the same threshold.
y_pred_test = modelmax.predict([x_test], batch_size=1024, verbose=1)
y_pred_test_thresh = (y_pred_test > thresh).astype(int)

# One row per test question: its id and the binary prediction.
out_df = pd.DataFrame({"qid": test["qid"].values})
out_df['prediction'] = y_pred_test_thresh
out_df.to_csv("submission.csv", index=False)

print("submission done")

