In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
np.random.seed(42)
import pandas as pd
import string
import re

import gensim
from collections import Counter
import pickle

import tensorflow as tf
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from sklearn import metrics

from keras.models import Model
from keras.layers import Input, Dense, Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, LSTM,Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import CuDNNLSTM, CuDNNGRU
from keras.preprocessing import text, sequence

from keras.callbacks import Callback
from keras import optimizers
from keras.layers import Lambda
from keras.callbacks import *

import warnings
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords

import os
os.environ['OMP_NUM_THREADS'] = '4'

import gc
from keras import backend as K
from sklearn.model_selection import KFold

from unidecode import unidecode

import time

eng_stopwords = set(stopwords.words("english"))



In [None]:
# 1. preprocessing: load the competition data (paths are relative to the working directory).
train, test = pd.read_csv("../input/train.csv"), pd.read_csv("../input/test.csv")

# Print the shape of each frame as a quick sanity check.
for shape_label, frame in (("Train shape : ", train), ("Test shape : ", test)):
    print(shape_label, frame.shape)


In [None]:
# 1-a. Count, per raw question, the characters that are NOT in the allow-list
# below (ASCII letters, digits, space and a handful of punctuation marks).
special_character = re.compile(r'[A-Za-z0-9\.\-\?\!\,\#\@\% \'\/\"]',re.IGNORECASE)

def _count_special_chars(question):
    """Return how many characters of str(question) survive removal of the allow-listed ones."""
    return len(special_character.sub('', str(question)))

for frame in (train, test):
    frame['spl_chars'] = frame['question_text'].apply(_count_special_chars)

In [None]:
#train['max_long_word'] = train['question_text'].apply(lambda x: max([len(i) for i in x.split(' ')])/len(x.split(' ')))
#train.loc[train['max_long_word']<0.7]['target'].count()

In [None]:
#pd.set_option('display.max_colwidth', -1)
#train.head()

In [None]:
#train.loc[train.target==0]['spl_chars'].mean()

In [None]:
# 2. remove numbers
def clean_numbers(x):
    """Mask runs of digits in ``x`` with '#' characters.

    Substitutions run in order: runs of 5+ digits become '#####', then
    remaining groups of exactly 4, 3 and 2 digits become the same number of '#'.
    A lone digit is left untouched. Returns the masked string.
    """
    masking_rules = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    masked = x
    # Longest runs first, so shorter patterns only see what is left over.
    for pattern, mask in masking_rules:
        masked = re.sub(pattern, mask, masked)
    return masked

# Create the working `clean_text` column: the raw question with digit runs masked.
for frame in (train, test):
    frame['clean_text'] = frame['question_text'].apply(lambda raw: clean_numbers(str(raw)))


In [None]:
#train['clean_text']

In [None]:
#3.  remove non-ascii

# Every character outside this allow-list (ASCII letters, space and . - ? ! , # @ %)
# is deleted by clean_text.
special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)

def clean_text(x):
    """Transliterate ``x`` with unidecode, then strip every character the allow-list rejects."""
    return special_character_removal.sub('', unidecode(x))

# Second cleaning pass, applied on top of the digit-masked text.
for frame in (train, test):
    frame['clean_text'] = frame['clean_text'].apply(lambda masked: clean_text(str(masked)))


In [None]:
# Text arrays for the tokenizer; missing questions become the placeholder "something".
X_train, X_test = (frame['clean_text'].fillna("something").values for frame in (train, test))
# Binary labels for the training rows.
y_train = train.target.values


In [None]:
#X_train

In [None]:
def add_features(df):
    """Add length, casing and vocabulary statistics of ``clean_text`` to ``df``.

    Reads the ``clean_text`` and ``spl_chars`` columns and writes, in place,
    ``comment_text``, ``total_length``, ``capitals``, ``caps_vs_length``,
    ``num_words``, ``num_unique_words``, ``words_vs_unique`` and
    ``spl_chars_vs_len``.

    The ratio columns are plain divisions, so a row whose text is empty can
    hold NaN or +/-inf; cleaning those up is left to the caller.

    Returns the same (mutated) DataFrame.
    """
    df['comment_text'] = df['clean_text'].fillna('something').apply(lambda x:str(x))
    df['total_length'] = df['comment_text'].apply(len)
    df['capitals'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df['capitals']/df['total_length']
    # Raw string: '\S' inside a normal literal is an invalid escape sequence
    # (a warning today, scheduled to become an error).
    df['num_words'] = df.comment_text.str.count(r'\S+')
    df['num_unique_words'] = df['comment_text'].apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']
    df['spl_chars_vs_len'] = df['spl_chars']/df['total_length']
    return df

# add_features mutates and returns its argument; rebinding keeps both names pointing at the enriched frames.
train, test = add_features(train), add_features(test)



In [None]:
# The ratio features are divisions by the cleaned-text length / word count, so
# a question whose cleaned text is empty can produce +/-inf. Zero those out in
# BOTH frames: the test features are fed to the scaler together with the train
# features, and StandardScaler refuses non-finite input.
ratio_columns = ['caps_vs_length', 'words_vs_unique', 'spl_chars_vs_len']
for frame in (train, test):
    for column in ratio_columns:
        frame.loc[np.isinf(frame[column]), column] = 0

In [None]:
# Hand-crafted ratio features fed to the model next to the token sequence; NaN becomes 0.
feature_columns = ['caps_vs_length', 'words_vs_unique', 'spl_chars_vs_len']
features, test_features = (frame[feature_columns].fillna(0) for frame in (train, test))


In [None]:
#test[test.num_words>=50].count()

In [None]:
# Standardise the ratio features. The scaler statistics are estimated on the
# train and test rows stacked together, then applied to each set separately.
ss = StandardScaler().fit(np.vstack((features, test_features)))
features, test_features = ss.transform(features), ss.transform(test_features)


In [None]:
max_features = 180000   # upper bound on token ids kept by the tokenizer / rows of the embedding
maxlen = 50             # every question is padded or truncated to this many tokens

# The vocabulary is built from train and test text together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# Text -> lists of token ids -> fixed-length integer matrices.
X_train_sequence, X_test_sequence = (
    tokenizer.texts_to_sequences(corpus) for corpus in (X_train, X_test)
)
x_train, x_test = (
    sequence.pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (X_train_sequence, X_test_sequence)
)

# Size of the full vocabulary seen by the tokenizer.
print(len(tokenizer.word_index))


In [None]:
# Paths of the pre-trained 300-d embedding files.
# NOTE: the constant and variable names below do not match their contents:
# `embeddings_index_tw` is read from the FastText file and `embeddings_index_ft`
# from the GloVe file (see the "switching" comment further down).
EMBEDDING_FILE_FASTTEXT='../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
EMBEDDING_FILE_TWITTER='../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
EMBEDDING_FILE_PAR='../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'

def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

def _read_embedding_index(path, keep_leading_whitespace=False, **open_kwargs):
    """Read a text embedding file into a ``{word: float32 vector}`` dict.

    Each line is split on single spaces; the first field is the word and the
    rest are the vector components. With ``keep_leading_whitespace`` only
    trailing whitespace is stripped from a line, otherwise both ends are.
    ``open_kwargs`` are forwarded to ``open`` (e.g. ``errors='ignore'``).
    The file is closed as soon as the dict has been built.
    """
    with open(path, encoding='utf-8', **open_kwargs) as handle:
        if keep_leading_whitespace:
            return dict(get_coefs(*line.rstrip().rsplit(' ')) for line in handle)
        return dict(get_coefs(*line.strip().split(' ')) for line in handle)

# switching as glove has better support for this text
embeddings_index_tw = _read_embedding_index(EMBEDDING_FILE_FASTTEXT, keep_leading_whitespace=True)
embeddings_index_ft = _read_embedding_index(EMBEDDING_FILE_TWITTER)
embeddings_index_pa = _read_embedding_index(EMBEDDING_FILE_PAR, errors='ignore')

# The FastText vectors are loaded a second time through gensim; its word list
# is what the spell-checking helpers use as vocabulary.
spell_model = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE_FASTTEXT)


In [None]:
# This code is  based on: Spellchecker using Word2vec by CPMP
# https://www.kaggle.com/cpmpml/spell-checker-using-word2vec

words = spell_model.index2word

# Map every vocabulary word to its position in the model's word list
# (presumably most frequent first - confirm against the embedding file).
w_rank = dict(zip(words, range(len(words))))

WORDS = w_rank

# Use fast text as vocabulary
def words(text):
    """Return the lower-cased runs of word characters found in ``text``, in order.

    Note: defining this function rebinds the module-level name ``words``,
    which earlier held the vocabulary list.
    """
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

def P(word): 
    "Score of `word` used to rank spelling candidates (higher is better)."
    # The negated rank is used as a proxy for probability: a smaller rank
    # gives a larger score, and the best achievable score is 0 (rank 0).
    # A word missing from WORDS gets -len(WORDS), strictly below every known
    # word, so max(..., key=P) can never prefer an unknown word to a known one.
    return - WORDS.get(word, len(WORDS))

def correction(word): 
    "Return the candidate for `word` that has the highest P score."
    options = candidates(word)
    return max(options, key=P)

def candidates(word): 
    "Return the set {word} when `word` is in the vocabulary, otherwise the list [word]."
    in_vocabulary = known([word])
    if in_vocabulary:
        return in_vocabulary
    # The edit-distance fallbacks (known(edits1(word)) or known(edits2(word)))
    # are deliberately not used: auto-correcting this way took too much time.
    return [word]

def known(words): 
    "Return the set of entries of `words` that are keys of WORDS."
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    "Return the set of strings one edit (delete, transpose, replace, insert) away from `word`."
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    # Walk over every cut position, including the one past the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion of a letter at the cut.
        for letter in alphabet:
            variants.add(head + letter + tail)
        if not tail:
            continue
        # Deletion of the character right after the cut.
        variants.add(head + tail[1:])
        # Replacement of that character by each letter.
        for letter in alphabet:
            variants.add(head + letter + tail[1:])
        # Transposition of the two characters after the cut.
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants

def edits2(word): 
    "Return a lazy generator over every string two successive edits1 steps away from `word`."
    single_edits = edits1(word)
    return (double for single in single_edits for double in edits1(single))

def singlify(word):
    """Collapse every run of identical consecutive characters in ``word`` to a single character."""
    kept = []
    for position in range(len(word)):
        # Drop a character that merely repeats its left neighbour.
        if position and word[position] == word[position - 1]:
            continue
        kept.append(word[position])
    return "".join(kept)
    


In [None]:
#WORDS

In [None]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# One row per token id in [0, max_features), 601 columns per row
# (two 300-d embeddings plus one flag column).
# The row count has to be max_features rather than nb_words: the fill loop
# writes every id below max_features, and the Embedding layer in get_model is
# declared with max_features rows. With a vocabulary smaller than
# max_features, an nb_words-row matrix would be indexed past its end
# (Keras Tokenizer ids start at 1) and would not match the layer's shape.
embedding_matrix = np.zeros((max_features,601))

something_tw = embeddings_index_tw.get("something")
something_ft = embeddings_index_ft.get("something")

# Fallback row used for words without any usable embedding: the two vectors
# of the placeholder word "something", with the flag column left at 0.
something = np.zeros((601,))
something[:300,] = something_ft
something[300:600,] = something_tw
something[600,] = 0


In [None]:
def all_caps(word):
    """Return True when ``word`` has more than one character and ``word.isupper()`` holds."""
    if len(word) <= 1:
        return False
    return word.isupper()

def embed_word(embedding_matrix,i,word):
    """Fill row ``i`` of ``embedding_matrix`` with the embeddings of ``word``.

    Nothing is written when ``word`` is missing from ``embeddings_index_ft``.
    Otherwise columns 0-299 get that vector, column 600 gets 1 if
    ``all_caps(word)`` else 0, and columns 300-599 get the
    ``embeddings_index_tw`` vector or, failing that, the
    ``embeddings_index_pa`` vector of the lower-cased word (left untouched
    when neither exists).
    NOTE(review): if the tokenizer lower-cases its input, the all-caps flag
    can never be 1 - confirm the Tokenizer settings.
    """
    primary_vector = embeddings_index_ft.get(word)
    if primary_vector is None:
        return

    embedding_matrix[i,:300] = primary_vector
    embedding_matrix[i,600] = np.array([1]) if all_caps(word) else np.array([0])

    secondary_vector = embeddings_index_tw.get(word)
    fallback_vector = embeddings_index_pa.get(str.lower(word))
    if secondary_vector is not None:
        embedding_matrix[i,300:600] = secondary_vector
    elif fallback_vector is not None:
        embedding_matrix[i,300:600] = fallback_vector
            
# Fill one matrix row per token id. A word is only embedded when
# `embeddings_index_ft` knows it (directly, or after the spelling helpers);
# the second embedding is optional. Everything else gets the `something` row.
for word, i in word_index.items():

    # Ids at or above max_features have no row.
    if i >= max_features:
        continue

    # Direct hit.
    if embeddings_index_ft.get(word) is not None:
        embed_word(embedding_matrix,i,word)
        continue

    # Very long unknown tokens go straight to the fallback row.
    # change to > 20 for better score.
    if len(word) > 20:
        embedding_matrix[i] = something
        continue

    # Try the corrected word first, then the corrected de-duplicated form.
    word2 = correction(word)
    if embeddings_index_ft.get(word2) is None:
        word2 = correction(singlify(word))

    if embeddings_index_ft.get(word2) is not None:
        embed_word(embedding_matrix,i,word2)
    else:
        embedding_matrix[i] = something


In [None]:
# Display the shape of the embedding matrix as a sanity check (601 columns per row).
embedding_matrix.shape

In [None]:
# The raw embedding dictionaries are not needed once the matrix is built; release their memory.
del embeddings_index_tw, embeddings_index_ft, embeddings_index_pa
gc.collect()

In [None]:
class RocAucEvaluation(Callback):
    """Keras callback that tracks validation ROC-AUC during training.

    Every ``interval`` epochs the attached model predicts on the validation
    data and the ROC-AUC is printed. A new best score saves the model weights
    to ``best_weights.h5``; more than three evaluations in a row without
    improvement stop the training.

    Parameters
    ----------
    validation_data : pair ``(X_val, y_val)``
        Inputs and labels used for the evaluation.
    interval : int
        Evaluate on epochs whose 0-based index is a multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself. super(Callback, self)
        # would start the lookup *after* Callback and skip its __init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.max_score = 0
        self.not_better_count = 0

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate ROC-AUC, checkpoint on improvement and stop early on stagnation."""
        if epoch % self.interval != 0:
            return

        y_pred = self.model.predict(self.X_val, verbose=1)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

        if (score > self.max_score):
            print("*** New High Score (previous: %.6f) \n" % self.max_score)
            # Save the model this callback is attached to, not whatever a
            # global named `model` happens to point at.
            self.model.save_weights("best_weights.h5")
            self.max_score=score
            self.not_better_count = 0
        else:
            self.not_better_count += 1
            if self.not_better_count > 3:
                print("Epoch %05d: early stopping, high score = %.6f" % (epoch,self.max_score))
                self.model.stop_training = True


In [None]:
def get_model(features,clipvalue=1.,num_filters=40,dropout=0.5,embed_size=601):
    """Build and compile the two-input recurrent classifier.

    Parameters
    ----------
    features : array-like with a ``shape`` attribute
        Only ``features.shape[1]`` is read, to size the auxiliary input.
    clipvalue : float
        Gradient clip value passed to the Adam optimizer.
    num_filters : int
        Units of each recurrent layer (per direction).
    dropout : float
        Rate of the spatial dropout applied to the embeddings.
    embed_size : int
        Width of the embedding vectors; must match ``embedding_matrix``.

    Reads the module-level ``maxlen``, ``max_features`` and
    ``embedding_matrix``. Returns a compiled Keras ``Model`` taking
    ``[token_ids, features]`` and producing one sigmoid output.
    """
    stats_in = Input(shape=(features.shape[1],))
    tokens_in = Input(shape=(maxlen, ))

    # Layer 1: frozen embedding initialised from the pre-built matrix.
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(tokens_in)

    # Layer 2: spatial dropout over the embedding channels.
    embedded = SpatialDropout1D(dropout)(embedded)

    # Layer 3: bidirectional CuDNN LSTM returning the full sequence.
    lstm_seq = Bidirectional(CuDNNLSTM(num_filters, return_sequences=True))(embedded)

    # Layer 4: bidirectional CuDNN GRU returning the sequence plus two states.
    # Only the first returned state is used below.
    # NOTE(review): for a bidirectional GRU these two states look like the
    # forward and backward final states - confirm which one is intended.
    gru_seq, state_first, state_second = Bidirectional(
        CuDNNGRU(num_filters, return_sequences=True, return_state = True)
    )(lstm_seq)

    # Layer 5: average pool, first state, max pool and the hand-crafted features, concatenated.
    pooled_avg = GlobalAveragePooling1D()(gru_seq)
    pooled_max = GlobalMaxPooling1D()(gru_seq)
    merged = concatenate([pooled_avg, state_first, pooled_max, stats_in])

    # Layer 6: output dense layer.
    prediction = Dense(1, activation="sigmoid")(merged)

    network = Model(inputs=[tokens_in, stats_in], outputs=prediction)
    network.compile(loss='binary_crossentropy',
                    optimizer=optimizers.adam(clipvalue=clipvalue),
                    metrics=['accuracy'])
    return network


In [None]:
# Build the model once up front. The K.clear_session() call below runs after
# this, and the cross-validation loop builds a fresh model for every fold.
model = get_model(features)

# Mini-batch size used for fitting and for out-of-fold prediction.
batch_size = 512

# Maximum epochs per fold (epochs=100 with early stopping was used for the best score).
epochs = 7
gc.collect()
K.clear_session()

# Number of cross-validation folds.
num_folds = 5 #number of folds

# Accumulator for the fold-averaged test predictions, one column per row of `test`.
y_test = np.zeros((test.shape[0],1))

# Per-fold ROC-AUC scores and out-of-fold predictions for the training rows.
scores = []
oof_predict = np.zeros((train.shape[0],1))

# Shuffled K-fold split with a fixed seed so folds are reproducible.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=239)


In [None]:
def f1_smart(y_true, y_pred):
    """Find the score threshold that maximises F1 in a single sorted pass.

    ``y_true`` holds 0/1 labels and ``y_pred`` the matching scores (1-D
    arrays of equal length). Candidate thresholds lie between consecutive
    sorted scores. Returns ``(best_f1, threshold)`` where the threshold is
    the midpoint of the two scores around the best cut.
    """
    order = np.argsort(y_pred)
    n_pos = y_true.sum()
    # Positives that fall at or below each candidate cut (false negatives there).
    positives_below = np.cumsum(y_true[order[:-1]])
    # Denominator 2*TP + FP + FN for each cut, written as a decreasing range.
    denominators = np.arange(y_true.shape[0] + n_pos - 1, n_pos, -1)
    half_f1 = (n_pos - positives_below) / denominators
    best = np.argmax(half_f1)
    lower, upper = y_pred[order[best]], y_pred[order[best + 1]]
    return 2 * half_f1[best], (lower + upper) / 2


In [None]:
# Per-fold optimal F1 thresholds.
bestscore = []

# A dedicated fold counter (instead of a leftover global `i`) makes sure the
# model summary is printed exactly once, on the first fold.
for fold_idx, (train_index, test_index) in enumerate(kf.split(x_train)):
    filepath="weights_best.h5"

    # Split labels, token sequences and hand-crafted features for this fold.
    kfold_y_train,kfold_y_test = y_train[train_index], y_train[test_index]
    kfold_X_train = x_train[train_index]
    kfold_X_features = features[train_index]
    kfold_X_valid = x_train[test_index]
    kfold_X_valid_features = features[test_index]

    # Start every fold from a clean backend session and a fresh model.
    gc.collect()
    K.clear_session()

    model = get_model(features)

    # Keep the best weights by validation loss, halve the learning rate on a
    # plateau, and stop when the validation loss stops improving.
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=2, save_best_only=True, mode='min')
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
    earlystopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=2, verbose=2, mode='auto')

    if fold_idx == 0:
        # summary() prints by itself and returns None, so it is not wrapped in print().
        model.summary()

    model.fit([kfold_X_train,kfold_X_features], kfold_y_train, batch_size=batch_size, epochs=epochs, verbose=1,
              validation_data=([kfold_X_valid,kfold_X_valid_features], kfold_y_test),
              callbacks = [checkpoint, reduce_lr, earlystopping])
    gc.collect()

    # Restore the best checkpoint of this fold before predicting.
    model.load_weights(filepath)

    # Each fold contributes 1/num_folds of the final test prediction.
    y_test += model.predict([x_test,test_features], batch_size=1024,verbose=1) / num_folds

    gc.collect()

    # Out-of-fold predictions and this fold's ROC-AUC.
    oof_predict[test_index] = model.predict([kfold_X_valid, kfold_X_valid_features],batch_size=batch_size, verbose=1)
    cv_score = roc_auc_score(kfold_y_test, oof_predict[test_index])

    f1, threshold = f1_smart(np.squeeze(kfold_y_test), np.squeeze(oof_predict[test_index]))
    print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(f1, threshold))
    bestscore.append(threshold)
    scores.append(cv_score)
    print('score: ',cv_score)

print("Done")
print('Total CV score is {}'.format(np.mean(scores)))



In [None]:
from sklearn.metrics import f1_score
def threshold_search(y_true, y_proba):
    """Grid-search the decision threshold that maximises F1.

    Thresholds 0.00, 0.01, ..., 0.99 are tried in increasing order and a
    prediction counts as positive when ``y_proba > threshold``. The first
    threshold reaching the highest F1 wins. Returns
    ``{'threshold': best_threshold, 'f1': best_f1}`` (both 0 when no
    threshold scores above 0).
    """
    best = {'threshold': 0, 'f1': 0}
    for step in range(100):
        cutoff = step * 0.01
        candidate_f1 = f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        if candidate_f1 > best['f1']:
            best = {'threshold': cutoff, 'f1': candidate_f1}
    return best

# Best global threshold over all out-of-fold predictions.
search_result = threshold_search(y_true=y_train, y_proba=oof_predict)
print(search_result)

# `bestscore` holds the per-fold optimal thresholds, so this is their mean.
mean_fold_threshold = np.mean(bestscore)
print("Mean of Best Score ::: {}".format(mean_fold_threshold))

In [None]:
#sum((y_test>.38).reshape(-1)==1)
#sum(y_train)

In [None]:
# Take an independent copy of the id column, so that adding the prediction
# column does not write into a slice of `test`.
sub = test[['qid']].copy()
y_test = y_test.reshape((-1, 1))
# Binarise the fold-averaged probabilities with the globally searched threshold
# (alternative: np.mean(bestscore)).
pred_test_y = (y_test>search_result['threshold']).astype(int)
# A single column takes a 1-D vector, so flatten the (n, 1) array.
sub['prediction'] = pred_test_y.ravel()
sub.to_csv("submission.csv", index=False)