In [None]:
import pandas as pd, numpy as np
import re

In [None]:
# Read the train and test CSVs into DataFrames (paths are relative to the notebook).
train_data,test_data=(
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv','../input/test.csv')
)

In [None]:
# Build the corpus used to fit the tokenizer from BOTH the train and test questions.
# pd.concat is used because Series.append returns a new Series instead of mutating
# in place (its result was previously discarded, so the test questions were never
# included) and Series.append no longer exists in pandas >= 2.0.
all_text=pd.concat([train_data['question_text'],test_data['question_text']],
                   ignore_index=True)
# Normalisation steps, in order:
#   1. newlines -> single space
#   2. drop every character that is not a letter, digit or whitespace
#   3. trim leading/trailing whitespace
#   4. lower-case
all_text=all_text.apply(lambda x:re.sub('\n',' ',x))
all_text=all_text.apply(lambda x:re.sub(r'[^a-zA-Z0-9\s]+','',x))  # raw string: '\s' is a regex escape, not a str escape
all_text=all_text.apply(lambda x:x.strip())
all_text=all_text.apply(lambda x:x.lower())

In [None]:
# Whitespace-tokenise every cleaned question into a list of words.
# A bare split() is used instead of split(' ') so that runs of spaces left behind
# by the punctuation removal do not yield empty-string tokens.
all_text_tokenized=all_text.apply(lambda x:x.split())

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
def _clean_question(text):
    """Normalise one question the same way the tokenizer corpus was normalised.

    Newlines become spaces, every character that is not a letter, digit or
    whitespace is removed, and the result is stripped and lower-cased.
    """
    text=re.sub('\n',' ',text)
    text=re.sub(r'[^a-zA-Z0-9\s]+','',text)
    return text.strip().lower()


# Fit the vocabulary on the pre-tokenised corpus.
t=Tokenizer()
t.fit_on_texts(list(all_text_tokenized))
# Encode the questions AFTER applying the same cleaning that produced the
# vocabulary; encoding the raw text would tokenise differently from the corpus the
# vocabulary was built on, so affected words would not be found in it.
encoded_text_train=t.texts_to_sequences(train_data['question_text'].apply(_clean_question))
encoded_text_test=t.texts_to_sequences(test_data['question_text'].apply(_clean_question))

In [None]:
# Sequence length used for padding and the width of each embedding vector.
maxnumwords,embed_size=70,300
# One row per tokenizer index plus one extra row
# (presumably row 0 is kept for the padding index -- confirm).
vocab_size=1+len(t.word_index)
# Embedding table, zero-initialised; rows are filled from the pre-trained vectors later.
embed_matrix=np.zeros(shape=(vocab_size,embed_size))

In [None]:
# Pad (at the end, 'post') or truncate every encoded question to maxnumwords entries.
padded_text_train,padded_text_test=(
    pad_sequences(encoded,maxlen=maxnumwords,padding='post')
    for encoded in (encoded_text_train,encoded_text_test)
)

In [None]:
# Location of the pre-trained GloVe vectors text file that is read to fill embed_matrix.
embed_path="../input/embeddings/glove.840B.300d/glove.840B.300d.txt"


In [None]:
# Copy the pre-trained vector of every vocabulary word into its row of embed_matrix.
# Words that do not occur in the embeddings file keep their all-zero row.
# The file is opened in a `with` block so the handle is always closed, and with an
# explicit encoding so decoding does not depend on the platform default.
with open(embed_path,encoding='utf-8') as embed_file:
    for line in embed_file:
        # Split from the right so that exactly `embed_size` numeric fields are
        # peeled off; a token that itself contains spaces stays in one piece
        # instead of corrupting the float conversion below.
        line_arr=line.rstrip().rsplit(' ',embed_size)
        row=t.word_index.get(line_arr[0])
        if row is not None:
            embed_matrix[row]=np.asarray(line_arr[1:],dtype='float32')


In [None]:
# Draw one random ordering of the training rows and apply it to both the padded
# inputs and the labels so they stay aligned.
indices=np.random.permutation(len(padded_text_train))
x_train=padded_text_train[indices]
# Pick the label column first, then reorder it by the shuffled row labels.
y_train=np.array(train_data['target'].loc[indices])

In [None]:
from keras import backend as K
def prec(y_true,y_pred):
    """Precision of `y_pred` against `y_true`, computed with Keras backend ops.

    Both the product y_true*y_pred and y_pred itself are clipped to [0, 1] and
    rounded before being summed, so predictions are binarised by rounding.
    K.epsilon() in the denominator avoids division by zero when nothing is
    predicted positive.
    """
    hits=K.round(K.clip(y_true*y_pred,0,1))
    flagged=K.round(K.clip(y_pred,0,1))
    return K.sum(hits)/(K.sum(flagged)+K.epsilon())

def rec(y_true,y_pred):
    """Recall of `y_pred` against `y_true`, computed with Keras backend ops.

    The product y_true*y_pred and y_true itself are clipped to [0, 1] and
    rounded before being summed. K.epsilon() in the denominator avoids division
    by zero when there are no positive labels.
    """
    hits=K.round(K.clip(y_true*y_pred,0,1))
    actual=K.round(K.clip(y_true,0,1))
    return K.sum(hits)/(K.sum(actual)+K.epsilon())

def f1_score(y_true,y_pred):
    """F1 score: harmonic mean of the `prec` and `rec` metrics defined alongside.

    K.epsilon() is added to the denominator so the result is 0 rather than
    undefined when precision and recall are both zero.
    """
    p=prec(y_true,y_pred)
    r=rec(y_true,y_pred)
    return (2*p*r)/(p+r+K.epsilon())
        


In [None]:
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
# Where the best model seen so far is written.
file_path="model_sepcnn1.h5"
# Stop training once 'val_loss' has failed to decrease for 3 consecutive epochs.
early_stop=EarlyStopping(patience=3,mode='min',monitor='val_loss')
# Save the model only when 'val_f1_score' reaches a new maximum.
check_point=ModelCheckpoint(
    filepath=file_path,
    monitor="val_f1_score",
    mode="max",
    save_best_only=True,
    verbose=1,
)

In [None]:
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import GlobalMaxPooling1D
from keras.layers import AveragePooling1D
from keras.layers import GlobalAveragePooling1D
from keras.layers import BatchNormalization
from keras.layers import Input
from keras.layers import concatenate
from keras.layers import Dropout
from keras.layers import SpatialDropout1D
from keras.layers import Average
from keras.layers import Multiply
from keras.layers import Add
from keras.layers import SeparableConv1D
from keras.layers import Activation

In [None]:
def model_lin_sep_cnn(vocab_size=vocab_size,embed_size=embed_size,maxnumwords=maxnumwords,
                      embed_matrix=embed_matrix,filters=64,kernel_size=5,
                      depth_multiplier=2,dr_rate=0.2,pool_size=3,blocks=1):
    """Build a sequential (single-path) separable-convolution text classifier.

    Layout: frozen Embedding initialised from `embed_matrix`, then `blocks`
    repetitions of [Dropout, SeparableConv1D(relu), SeparableConv1D(no
    activation), MaxPooling1D], then two SeparableConv1D(relu) layers with
    2*filters, GlobalAveragePooling1D, Dropout and a single sigmoid unit.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_size: width of each embedding vector.
        maxnumwords: input sequence length given to the Embedding layer.
        embed_matrix: initial (non-trainable) embedding weights.
        filters: filter count of the convolutions inside the repeated blocks.
        kernel_size: kernel width shared by all convolutions.
        depth_multiplier: depth multiplier of every SeparableConv1D.
        dr_rate: rate used by all Dropout layers.
        pool_size: pool size of each MaxPooling1D.
        blocks: how many times the conv/pool block is repeated.

    Returns:
        The un-compiled Sequential model.
    """
    def sep_conv(n_filters,activation=None):
        # All separable convolutions share every setting except width and activation.
        return SeparableConv1D(n_filters,kernel_size,depth_multiplier=depth_multiplier,
                               activation=activation,
                               depthwise_initializer='random_uniform',
                               bias_initializer='random_uniform',
                               padding='same')

    stack=[Embedding(vocab_size,embed_size,input_length=maxnumwords,
                     weights=[embed_matrix],trainable=False)]
    for _ in range(blocks):
        stack.append(Dropout(dr_rate))
        stack.append(sep_conv(filters,'relu'))
        stack.append(sep_conv(filters))  # second conv of the block has no activation
        stack.append(MaxPooling1D(pool_size))

    stack.append(sep_conv(2*filters,'relu'))
    stack.append(sep_conv(2*filters,'relu'))
    stack.append(GlobalAveragePooling1D())
    stack.append(Dropout(dr_rate))
    stack.append(Dense(1,activation='sigmoid'))

    model=Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [None]:
def model_branched_sep_cnn(vocab_size=vocab_size,embed_size=embed_size,maxnumwords=maxnumwords,
                      embed_matrix=embed_matrix,filters=64,kernel_size=5,
                      depth_multiplier=2,dr_rate=0.2,pool_size=3,blocks=1):
    """Build a branched convolutional text classifier with the functional API.

    Layout: frozen Embedding initialised from `embed_matrix`, SpatialDropout1D,
    then three parallel Conv1D + MaxPooling1D branches with kernel widths
    kernel_size-2, kernel_size and kernel_size+2. The branch outputs are
    concatenated along axis=1, passed through ReLU, followed by `blocks`
    repetitions of [Dropout, SeparableConv1D(relu, 2*filters), MaxPooling1D],
    GlobalAveragePooling1D and a single sigmoid unit.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_size: width of each embedding vector.
        maxnumwords: length of the integer input sequence.
        embed_matrix: initial (non-trainable) embedding weights.
        filters: filter count of each branch convolution.
        kernel_size: centre kernel width; branches use -2 / +0 / +2 around it.
        depth_multiplier: depth multiplier of every SeparableConv1D.
        dr_rate: rate used by the dropout layers.
        pool_size: pool size of each MaxPooling1D.
        blocks: how many separable-conv blocks follow the concatenation.

    Returns:
        The un-compiled functional Model.
    """
    inp=Input(shape=(maxnumwords,))
    # Use the `embed_matrix` argument; the previous code referenced the undefined
    # name `embedding_matrix` and raised NameError as soon as the function ran.
    e=Embedding(vocab_size,embed_size,weights=[embed_matrix],trainable=False)(inp)
    dr=SpatialDropout1D(dr_rate)(e)

    # Three parallel convolutions over the same dropped-out embeddings, differing
    # only in kernel width; each is followed by its own max-pooling.
    pooled_branches=[]
    for width in (kernel_size-2,kernel_size,kernel_size+2):
        conv=Conv1D(filters,width,bias_initializer='random_uniform',
                    padding='same')(dr)
        pooled_branches.append(MaxPooling1D(pool_size)(conv))

    # NOTE(review): axis=1 joins the branches end-to-end along the step axis, not
    # along the channel axis -- confirm this is the intended merge.
    x=concatenate(pooled_branches,axis=1)
    x=Activation('relu')(x)
    #x=BatchNormalization()(x)
    for _ in range(blocks):
        x=Dropout(dr_rate)(x)
        x=SeparableConv1D(2*filters,kernel_size,depth_multiplier=depth_multiplier,activation='relu',
                                  depthwise_initializer='random_uniform',bias_initializer='random_uniform',
                                 padding='same')(x)
        x=MaxPooling1D(pool_size)(x)

    # Disabled experiment kept for reference:
    # x=Conv1D(filters,kernel_size,activation='relu',bias_initializer='random_uniform',
    #          padding='same')(x)
    x=GlobalAveragePooling1D()(x)
    x=Dense(1,activation='sigmoid')(x)
    model=Model(inputs=inp,outputs=x)
    return model

In [None]:
from keras.optimizers import SGD
from keras.optimizers import adam
# Plain-momentum SGD optimizer (no Nesterov), kept available as an alternative.
sgd=SGD(
    lr=0.01,
    momentum=0.6,
    decay=1e-6,
    nesterov=False,
)
# Adam optimizer instance; this is the one handed to model.compile.
Adam=adam(
    lr=0.0001,
    decay=1e-6,
)

In [None]:
# Instantiate the sequential separable-CNN variant with three conv/pool blocks.
model=model_lin_sep_cnn(filters=64,kernel_size=5,
                     depth_multiplier=6,dr_rate=0.2,pool_size=3,blocks=3)

# Alternative architecture (disabled):
#model=model_branched_sep_cnn(filters=32,kernel_size=5,
                      #depth_multiplier=2,dr_rate=0.2,pool_size=3,blocks=3)

# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

In [None]:
# Binary cross-entropy objective, optimised with the Adam instance defined earlier;
# F1, precision and recall are tracked as metrics during training.
model.compile(
    optimizer=Adam,
    loss='binary_crossentropy',
    metrics=[f1_score,prec,rec],
)

In [None]:
# Train for at most 35 epochs with 5% of the data held out for validation;
# the early-stopping and checkpoint callbacks are attached.
model.fit(
    x_train,y_train,
    batch_size=512,
    epochs=35,
    validation_split=0.05,
    callbacks=[early_stop,check_point],
)

In [None]:
# Score the padded test questions and round each prediction to a 0/1 integer label.
predictions=model.predict(padded_text_test,batch_size=1024)
pred=np.round(predictions).astype(int).ravel()
# Two-column submission: question id and predicted label, written without the index.
sub=pd.DataFrame({'qid':test_data['qid'],'prediction':pred},
                 columns=['qid','prediction'])
sub.to_csv('submission.csv',index=False)