In [74]:
# Imports
import pandas as pd
import sys
import glob
import errno
import csv
import numpy as np
from nltk.corpus import stopwords
import re
import nltk.data
import nltk
import os
from collections import OrderedDict
from subprocess import check_call
from shutil import copyfile
from sklearn.metrics import log_loss
%matplotlib inline
import matplotlib.pyplot as plt
import mpld3
mpld3.enable_notebook()
import seaborn as sns
from collections import Counter
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import ensemble, metrics, model_selection, naive_bayes
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from tqdm import tqdm
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import Merge,Lambda,Input,GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D,TimeDistributed
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from keras.layers.merge import concatenate
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.sequence import pad_sequences
from keras import initializers
from keras import backend as K
from sklearn.linear_model import SGDClassifier as sgd

In [2]:
# Read data
train = "../data/train.csv"
test = "../data/test.csv"
wv = "../../../../glove.6B/glove.6B.100d.txt"
X_train = pd.read_csv( train, header=0,delimiter="," )
X_test = pd.read_csv( test, header=0,delimiter="," )

word_vecs = {}
with open(wv) as f:
    for line in f:
       vals = line.split()
       word_vecs[vals[0]] = np.array(vals[1::],dtype=float)
authors = ['EAP','MWS','HPL']

Y_train = LabelEncoder().fit_transform(X_train['author'])

In [3]:
# Clean data
def getWordVectors(X_train,X_test,word_vecs):
    X_train['word_vectors'] = [ [ word_vecs[word] for word in sentence if word in word_vecs] for sentence in X_train['text']]
    X_test['word_vectors'] = [ [ word_vecs[word] for word in sentence if word in word_vecs] for sentence in X_test['text']] 
    return X_train,X_test

def getSentenceVectors(X_train,X_test):
    X_train['sentence_vectors'] =[np.mean(sentence,axis = 0) for sentence in X_train['word_vectors']]
    X_test['sentence_vectors'] =[np.mean(sentence,axis = 0) for sentence in X_test['word_vectors']] 
    return X_train,X_test

def clean(X_train,X_test):
    X_train['words'] = [re.sub("[^a-zA-Z]"," ", data).lower().split() for data in X_train['text']]
    X_test['words'] = [re.sub("[^a-zA-Z]"," ", data).lower().split() for data in X_test['text']]
    return X_train,X_test
X_train,X_test = clean(X_train,X_test)
X_train,X_test = getWordVectors(X_train,X_test,word_vecs)
X_train,X_test = getSentenceVectors(X_train,X_test)
#X_train.head()

In [37]:
# Feature Engineering
# Punctuation
punctuations = [{"id":1,"p":"[;:]"},{"id":2,"p":"[,.]"},{"id":3,"p":"[?]"},{"id":4,"p":"[\']"},{"id":5,"p":"[\"]"},{"id":6,"p":"[;:,.?\'\"]"}]
for p in punctuations:
    punctuation = p["p"]
    _train =  [ sentence.split() for sentence in X_train['text'] ]
    X_train['punc_'+str(p["id"])] = [len([word for word in sentence if bool(re.search(punctuation, word))])*100.0/len(sentence) for sentence in _train]    

    _test =  [ sentence.split() for sentence in X_test['text'] ]
    X_test['punc_'+str(p["id"])] = [len([word for word in sentence if bool(re.search(punctuation, word))])*100.0/len(sentence) for sentence in _test]    



In [38]:
# Feature Engineering
# Stop Words
_dist_train = [x for x in X_train['words']]
X_train['stop_word'] = [len([word for word in sentence if word in stopwords.words('english')])*100.0/len(sentence) for sentence in _dist_train]

_dist_test = [x for x in X_test['words']]
X_test['stop_word'] = [len([word for word in sentence if word in stopwords.words('english')])*100.0/len(sentence) for sentence in _dist_test]    

In [39]:
# Feature Engineering
# tfidf - words - nb+svd
def tfidfWords(X_train,X_test):
    tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
    full_tfidf = tfidf_vec.fit_transform(X_train['text'].values.tolist() + X_test['text'].values.tolist())
    train_tfidf = tfidf_vec.transform(X_train['text'].values.tolist())
    test_tfidf = tfidf_vec.transform(X_test['text'].values.tolist())
    return train_tfidf,test_tfidf,full_tfidf
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model

def runSVD(full_tfidf,train_tfidf,test_tfidf):   
    n_comp = 20
    svd_obj = TruncatedSVD(n_components=n_comp)
    svd_obj.fit(full_tfidf)
    train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
    test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))

    train_svd.columns = ['svd_word_'+str(i) for i in range(n_comp)]
    test_svd.columns = ['svd_word_'+str(i) for i in range(n_comp)]
    return train_svd,test_svd

def do_tfidf_MNB(X_train,X_test,Y_train):
    train_tfidf,test_tfidf,full_tfidf = tfidfWords(X_train,X_test)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([X_train.shape[0], 3])
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    for dev_index, val_index in kf.split(X_train):
        dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
        dev_y, val_y = Y_train[dev_index], Y_train[val_index]
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_tfidf)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / 5.
    return pred_train,pred_full_test

def do_tfidf_SVD(X_train,X_test,Y_train):
    train_tfidf,test_tfidf,full_tfidf = tfidfWords(X_train,X_test)
    train_svd,test_svd = runSVD(full_tfidf,train_tfidf,test_tfidf)
    return train_svd,test_svd

pred_train,pred_test = do_tfidf_MNB(X_train,X_test,Y_train)
X_train["tfidf_words_nb_eap"] = pred_train[:,0]
X_train["tfidf_words_nb_hpl"] = pred_train[:,1]
X_train["tfidf_words_nb_mws"] = pred_train[:,2]
X_test["tfidf_words_nb_eap"] = pred_test[:,0]
X_test["tfidf_words_nb_hpl"] = pred_test[:,1]
X_test["tfidf_words_nb_mws"] = pred_test[:,2]

# pred_train,pred_test = do_tfidf_SVD(X_train,X_test,Y_train)
# print pred_train
# # X_train["tfidf_words_nb_eap"] = pred_train[:,0]
# # X_train["tfidf_words_nb_hpl"] = pred_train[:,1]
# # X_train["tfidf_words_nb_mws"] = pred_train[:,2]
# # X_test["tfidf_words_nb_eap"] = pred_test[:,0]
# # X_test["tfidf_words_nb_hpl"] = pred_test[:,1]
# # X_test["tfidf_words_nb_mws"] = pred_test[:,2]

('Mean cv score : ', 0.84221619836128525)


In [40]:
# Feature Engineering
# tfidf - chars - nb+svd
def tfidfWords(X_train,X_test):
    tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,5),analyzer='char')
    full_tfidf = tfidf_vec.fit_transform(X_train['text'].values.tolist() + X_test['text'].values.tolist())
    train_tfidf = tfidf_vec.transform(X_train['text'].values.tolist())
    test_tfidf = tfidf_vec.transform(X_test['text'].values.tolist())
    return train_tfidf,test_tfidf
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model

def do(X_train,X_test,Y_train):
    train_tfidf,test_tfidf = tfidfWords(X_train,X_test)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([X_train.shape[0], 3])
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    for dev_index, val_index in kf.split(X_train):
        dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
        dev_y, val_y = Y_train[dev_index], Y_train[val_index]
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_tfidf)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / 5.
    return pred_train,pred_full_test
pred_train,pred_test = do(X_train,X_test,Y_train)
X_train["tfidf_chars_nb_eap"] = pred_train[:,0]
X_train["tfidf_chars_nb_hpl"] = pred_train[:,1]
X_train["tfidf_chars_nb_mws"] = pred_train[:,2]
X_test["tfidf_chars_nb_eap"] = pred_test[:,0]
X_test["tfidf_chars_nb_hpl"] = pred_test[:,1]
X_test["tfidf_chars_nb_mws"] = pred_test[:,2]

('Mean cv score : ', 0.7904152589474216)


In [41]:
# Feature Engineering
# count - words - nb
def countWords(X_train,X_test):
    count_vec = CountVectorizer(stop_words='english', ngram_range=(1,3))
    count_vec.fit(X_train['text'].values.tolist() + X_test['text'].values.tolist())
    train_count = count_vec.transform(X_train['text'].values.tolist())
    test_count = count_vec.transform(X_test['text'].values.tolist())
    return train_count,test_count
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model

def do_count_MNB(X_train,X_test,Y_train):
    train_count,test_count=countWords(X_train,X_test)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([X_train.shape[0], 3])
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    for dev_index, val_index in kf.split(X_train):
        dev_X, val_X = train_count[dev_index], train_count[val_index]
        dev_y, val_y = Y_train[dev_index], Y_train[val_index]
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_count)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / 5.
    return pred_train,pred_full_test

pred_train,pred_test = do_count_MNB(X_train,X_test,Y_train)
X_train["count_words_nb_eap"] = pred_train[:,0]
X_train["count_words_nb_hpl"] = pred_train[:,1]
X_train["count_words_nb_mws"] = pred_train[:,2]
X_test["count_words_nb_eap"] = pred_test[:,0]
X_test["count_words_nb_hpl"] = pred_test[:,1]
X_test["count_words_nb_mws"] = pred_test[:,2]

('Mean cv score : ', 0.45091841616567468)


In [42]:
# Feature Engineering
# count - chars - nb
def countChars(X_train,X_test):
    count_vec = CountVectorizer(ngram_range=(1,7),analyzer='char')
    count_vec.fit(X_train['text'].values.tolist() + X_test['text'].values.tolist())
    train_count = count_vec.transform(X_train['text'].values.tolist())
    test_count = count_vec.transform(X_test['text'].values.tolist())
    return train_count,test_count
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model

def do_count_chars_MNB(X_train,X_test,Y_train):
    train_count,test_count=countChars(X_train,X_test)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([X_train.shape[0], 3])
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    for dev_index, val_index in kf.split(X_train):
        dev_X, val_X = train_count[dev_index], train_count[val_index]
        dev_y, val_y = Y_train[dev_index], Y_train[val_index]
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_count)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / 5.
    return pred_train,pred_full_test

pred_train,pred_test = do_count_chars_MNB(X_train,X_test,Y_train)
X_train["count_chars_nb_eap"] = pred_train[:,0]
X_train["count_chars_nb_hpl"] = pred_train[:,1]
X_train["count_chars_nb_mws"] = pred_train[:,2]
X_test["count_chars_nb_eap"] = pred_test[:,0]
X_test["count_chars_nb_hpl"] = pred_test[:,1]
X_test["count_chars_nb_mws"] = pred_test[:,2]

('Mean cv score : ', 3.750763922681903)


In [22]:
# load the GloVe vectors in a dictionary:

def loadWordVecs():
    embeddings_index = {}
    f = open(wv)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings_index))
    return embeddings_index

def sent2vec(embeddings_index,s): # this function creates a normalized vector for the whole sentence
    words = str(s).lower().decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(100)
    return v / np.sqrt((v ** 2).sum())

def doGlove(x_train,x_test):
    embeddings_index = loadWordVecs()
    # create sentence vectors using the above function for training and validation set
    xtrain_glove = [sent2vec(embeddings_index,x) for x in tqdm(x_train)]
    xtest_glove = [sent2vec(embeddings_index,x) for x in tqdm(x_test)]
    xtrain_glove = np.array(xtrain_glove)
    xtest_glove = np.array(xtest_glove)
    return xtrain_glove,xtest_glove,embeddings_index

glove_vecs_train,glove_vecs_test,embeddings_index = doGlove(X_train['text'],X_test['text'])
X_train[['sent_vec_'+str(i) for i in range(100)]] = pd.DataFrame(glove_vecs_train.tolist())
X_test[['sent_vec_'+str(i) for i in range(100)]] = pd.DataFrame(glove_vecs_test.tolist())



  0%|          | 6/19579 [00:00<06:10, 52.81it/s]

Found 400000 word vectors.


100%|██████████| 19579/19579 [01:50<00:00, 177.82it/s]
100%|██████████| 8392/8392 [00:46<00:00, 179.84it/s]


In [96]:
# CNN

def doAddNN(X_train,X_test,pred_train,pred_test):
    X_train["nn_eap"] = pred_train[:,0]
    X_train["nn_hpl"] = pred_train[:,1]
    X_train["nn_mws"] = pred_train[:,2]
    X_test["nn_eap"] = pred_test[:,0]
    X_test["nn_hpl"] = pred_test[:,1]
    X_test["nn_mws"] = pred_test[:,2]
    return X_train,X_test

def initNN():
    # create a simple 3 layer sequential neural net
    model = Sequential()

    model.add(Dense(1024, input_dim=50, activation='relu'))
    model.add(Dropout(0.6))
    model.add(BatchNormalization())

    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.6))
    model.add(BatchNormalization())

    model.add(Dense(3))
    model.add(Activation('softmax'))

    # compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

def initCNN(word_index,embedding_matrix,max_len):
    units = 10 # Number of nodes in the Dense layers
    dropout = 0.5 # Percentage of nodes to drop
    nb_filter = 3 # Number of filters to use in Convolution1D
    filter_length = 50 # Length of filter for Convolution1D
    # Initialize weights and biases for the Dense layers
    weights = initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=2)
    bias = bias_initializer='zeros'

    model = Sequential()
#     model.add(Embedding(len(word_index) + 1,
#                          100,
#                          weights = [embedding_matrix],
#                          input_length = max_len,
#                          trainable = False))
    model.add(Dense(1024, input_dim=100, activation='relu'))
    model.add(Dropout(0.6))
    model.add(BatchNormalization())

#     model.add(Dense(1024, activation='relu'))
#     model.add(Dropout(0.6))
#     model.add(BatchNormalization())

    model.add(Dense(3))
    model.add(Activation('softmax'))
#     model1.add(Conv1D(filters = nb_filter, 
#                              kernel_size = filter_length, 
#                              padding = 'same'))
#     model1.add(BatchNormalization())
#     model1.add(Activation('relu'))
#     model1.add(Dropout(dropout))

# #     model1.add(Conv1D(filters = nb_filter, 
# #                              kernel_size = filter_length, 
# #                              padding = 'same'))
# #     model1.add(BatchNormalization())
# #     model1.add(Activation('relu'))
# #     model1.add(Dropout(dropout))
    
    
#     model1.add(Flatten())
#     model1.add(Dense(3))
#     model1.add(Activation('softmax'))
    
    
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

def doNN(X_train,X_test,embeddings_index):
    #glove_vecs_train,glove_vecs_test = doGlove(X_train['text'],X_test['text'])
    # scale the data before any neural net:
    token = text.Tokenizer(num_words=None)
    max_len = 70

    token.fit_on_texts(list(X_train['text']) + list(X_test['text']))
    xtrain_seq = token.texts_to_sequences(X_train['text'])
    xtest_seq = token.texts_to_sequences(X_test['text'])

    # zero pad the sequences
    xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
    xtest_pad = sequence.pad_sequences(xtest_seq, maxlen=max_len)

    word_index = token.word_index

    # create an embedding matrix for the words we have in the dataset
    embedding_matrix = np.zeros((len(word_index) + 1, 100))
    for word, i in tqdm(word_index.items()):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # we need to binarize the labels for the neural net
    ytrain_enc = np_utils.to_categorical(Y_train)
    #yvalid_enc = np_utils.to_categorical(yvalid)



    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([xtrain_pad.shape[0], 3])
    for dev_index, val_index in kf.split(xtrain_pad):
        dev_X, val_X = xtrain_pad[dev_index], xtrain_pad[val_index]
        dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
        model = initCNN(word_index,embedding_matrix,max_len)
        model.fit(dev_X, y=dev_y, batch_size=32, epochs=100, verbose=1,validation_data=(val_X, val_y))
        pred_val_y = model.predict(val_X)
        pred_test_y = model.predict(xtest_pad)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
    return doAddNN(X_train,X_test,pred_train,pred_full_test/5)
X_train,X_test = doNN(X_train,X_test,embeddings_index)

100%|██████████| 29451/29451 [00:00<00:00, 312497.34it/s]


ValueError: Error when checking input: expected dense_141_input to have shape (None, 100) but got array with shape (15663, 70)

In [122]:
class Length(layers.Layer):
    """
    Compute the length of vectors. This is used to compute a Tensor that has the same shape with y_true in margin_loss.
    Using this layer as model's output can directly predict labels by using `y_pred = np.argmax(model.predict(x), 1)`
    inputs: shape=[None, num_vectors, dim_vector]
    output: shape=[None, num_vectors]
    """
    def call(self, inputs, **kwargs):
        return K.sqrt(K.sum(K.square(inputs), -1))

    def compute_output_shape(self, input_shape):
        return input_shape[:-1]


class Mask(layers.Layer):
    """
    Mask a Tensor with shape=[None, num_capsule, dim_vector] either by the capsule with max length or by an additional 
    input mask. Except the max-length capsule (or specified capsule), all vectors are masked to zeros. Then flatten the
    masked Tensor.
    For example:
        ```
        x = keras.layers.Input(shape=[8, 3, 2])  # batch_size=8, each sample contains 3 capsules with dim_vector=2
        y = keras.layers.Input(shape=[8, 3])  # True labels. 8 samples, 3 classes, one-hot coding.
        out = Mask()(x)  # out.shape=[8, 6]
        # or
        out2 = Mask()([x, y])  # out2.shape=[8,6]. Masked with true labels y. Of course y can also be manipulated.
        ```
    """
    def call(self, inputs, **kwargs):
        if type(inputs) is list:  # true label is provided with shape = [None, n_classes], i.e. one-hot code.
            assert len(inputs) == 2
            inputs, mask = inputs
        else:  # if no true label, mask by the max length of capsules. Mainly used for prediction
            # compute lengths of capsules
            x = K.sqrt(K.sum(K.square(inputs), -1))
            # generate the mask which is a one-hot code.
            # mask.shape=[None, n_classes]=[None, num_capsule]
            mask = K.one_hot(indices=K.argmax(x, 1), num_classes=x.get_shape().as_list()[1])

        # inputs.shape=[None, num_capsule, dim_capsule]
        # mask.shape=[None, num_capsule]
        # masked.shape=[None, num_capsule * dim_capsule]
        masked = K.batch_flatten(inputs * K.expand_dims(mask, -1))
        return masked

    def compute_output_shape(self, input_shape):
        if type(input_shape[0]) is tuple:  # true label provided
            return tuple([None, input_shape[0][1] * input_shape[0][2]])
        else:  # no true label provided
            return tuple([None, input_shape[1] * input_shape[2]])


def squash(vectors, axis=-1):
    """
    The non-linear activation used in Capsule. It drives the length of a large vector to near 1 and small vector to 0
    :param vectors: some vectors to be squashed, N-dim tensor
    :param axis: the axis to squash
    :return: a Tensor with same shape as input vectors
    """
    s_squared_norm = K.sum(K.square(vectors), axis, keepdims=True)
    scale = s_squared_norm / (1 + s_squared_norm) / K.sqrt(s_squared_norm + K.epsilon())
    return scale * vectors


class CapsuleLayer(layers.Layer):
    """
    The capsule layer. It is similar to Dense layer. Dense layer has `in_num` inputs, each is a scalar, the output of the 
    neuron from the former layer, and it has `out_num` output neurons. CapsuleLayer just expand the output of the neuron
    from scalar to vector. So its input shape = [None, input_num_capsule, input_dim_capsule] and output shape = \
    [None, num_capsule, dim_capsule]. For Dense Layer, input_dim_capsule = dim_capsule = 1.
    
    :param num_capsule: number of capsules in this layer
    :param dim_capsule: dimension of the output vectors of the capsules in this layer
    :param num_routing: number of iterations for the routing algorithm
    """
    def __init__(self, num_capsule, dim_capsule, num_routing=3,
                 kernel_initializer='glorot_uniform',
                 **kwargs):
        super(CapsuleLayer, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.num_routing = num_routing
        self.kernel_initializer = initializers.get(kernel_initializer)

    def build(self, input_shape):
        assert len(input_shape) >= 3, "The input Tensor should have shape=[None, input_num_capsule, input_dim_capsule]"
        self.input_num_capsule = input_shape[1]
        self.input_dim_capsule = input_shape[2]

        # Transform matrix
        self.W = self.add_weight(shape=[self.num_capsule, self.input_num_capsule,
                                        self.dim_capsule, self.input_dim_capsule],
                                 initializer=self.kernel_initializer,
                                 name='W')

        self.built = True

    def call(self, inputs, training=None):
        # inputs.shape=[None, input_num_capsule, input_dim_capsule]
        # inputs_expand.shape=[None, 1, input_num_capsule, input_dim_capsule]
        inputs_expand = K.expand_dims(inputs, 1)

        # Replicate num_capsule dimension to prepare being multiplied by W
        # inputs_tiled.shape=[None, num_capsule, input_num_capsule, input_dim_capsule]
        inputs_tiled = K.tile(inputs_expand, [1, self.num_capsule, 1, 1])

        # Compute `inputs * W` by scanning inputs_tiled on dimension 0.
        # x.shape=[num_capsule, input_num_capsule, input_dim_capsule]
        # W.shape=[num_capsule, input_num_capsule, dim_capsule, input_dim_capsule]
        # Regard the first two dimensions as `batch` dimension,
        # then matmul: [input_dim_capsule] x [dim_capsule, input_dim_capsule]^T -> [dim_capsule].
        # inputs_hat.shape = [None, num_capsule, input_num_capsule, dim_capsule]
        inputs_hat = K.map_fn(lambda x: K.batch_dot(x, self.W, [2, 3]), elems=inputs_tiled)

        """
        # Begin: routing algorithm V1, dynamic ------------------------------------------------------------#
        # The prior for coupling coefficient, initialized as zeros.
        b = K.zeros(shape=[self.batch_size, self.num_capsule, self.input_num_capsule])
        def body(i, b, outputs):
            c = tf.nn.softmax(b, dim=1)  # dim=2 is the num_capsule dimension
            outputs = squash(K.batch_dot(c, inputs_hat, [2, 2]))
            if i != 1:
                b = b + K.batch_dot(outputs, inputs_hat, [2, 3])
            return [i-1, b, outputs]
        cond = lambda i, b, inputs_hat: i > 0
        loop_vars = [K.constant(self.num_routing), b, K.sum(inputs_hat, 2, keepdims=False)]
        shape_invariants = [tf.TensorShape([]),
                            tf.TensorShape([None, self.num_capsule, self.input_num_capsule]),
                            tf.TensorShape([None, self.num_capsule, self.dim_capsule])]
        _, _, outputs = tf.while_loop(cond, body, loop_vars, shape_invariants)
        # End: routing algorithm V1, dynamic ------------------------------------------------------------#
        """
        # Begin: Routing algorithm ---------------------------------------------------------------------#
        # In forward pass, `inputs_hat_stopped` = `inputs_hat`;
        # In backward, no gradient can flow from `inputs_hat_stopped` back to `inputs_hat`.
        inputs_hat_stopped = K.stop_gradient(inputs_hat)
        
        # The prior for coupling coefficient, initialized as zeros.
        # b.shape = [None, self.num_capsule, self.input_num_capsule].
        b = tf.zeros(shape=[K.shape(inputs_hat)[0], self.num_capsule, self.input_num_capsule])

        assert self.num_routing > 0, 'The num_routing should be > 0.'
        for i in range(self.num_routing):
            # c.shape=[batch_size, num_capsule, input_num_capsule]
            c = tf.nn.softmax(b, dim=1)

            # At last iteration, use `inputs_hat` to compute `outputs` in order to backpropagate gradient
            if i == self.num_routing - 1:
                # c.shape =  [batch_size, num_capsule, input_num_capsule]
                # inputs_hat.shape=[None, num_capsule, input_num_capsule, dim_capsule]
                # The first two dimensions as `batch` dimension,
                # then matmal: [input_num_capsule] x [input_num_capsule, dim_capsule] -> [dim_capsule].
                # outputs.shape=[None, num_capsule, dim_capsule]
                outputs = squash(K.batch_dot(c, inputs_hat, [2, 2]))  # [None, 10, 16]
            else:  # Otherwise, use `inputs_hat_stopped` to update `b`. No gradients flow on this path.
                outputs = squash(K.batch_dot(c, inputs_hat_stopped, [2, 2]))

                # outputs.shape =  [None, num_capsule, dim_capsule]
                # inputs_hat.shape=[None, num_capsule, input_num_capsule, dim_capsule]
                # The first two dimensions as `batch` dimension,
                # then matmal: [dim_capsule] x [input_num_capsule, dim_capsule]^T -> [input_num_capsule].
                # b.shape=[batch_size, num_capsule, input_num_capsule]
                b += K.batch_dot(outputs, inputs_hat_stopped, [2, 3])
        # End: Routing algorithm -----------------------------------------------------------------------#

        return outputs

    def compute_output_shape(self, input_shape):
        return tuple([None, self.num_capsule, self.dim_capsule])


def PrimaryCap(inputs, dim_capsule, n_channels, kernel_size, strides, padding):
    """
    Apply Conv2D `n_channels` times and concatenate all capsules
    :param inputs: 4D tensor, shape=[None, width, height, channels]
    :param dim_capsule: the dim of the output vector of capsule
    :param n_channels: the number of types of capsules
    :return: output tensor, shape=[None, num_capsule, dim_capsule]
    """
    output = layers.Conv2D(filters=dim_capsule*n_channels, kernel_size=kernel_size, strides=strides, padding=padding,
                           name='primarycap_conv2d')(inputs)
    outputs = layers.Reshape(target_shape=[-1, dim_capsule], name='primarycap_reshape')(output)
    return layers.Lambda(squash, name='primarycap_squash')(outputs)

In [156]:
# Test - magic-embeddings-keras-a-toy-example

#STEMMING WORDS
import nltk.stem as stm
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Dropout, Embedding
from keras.layers import Flatten, Input, SpatialDropout1D, Reshape
from keras.models import Model
from keras.optimizers import Adam
    

def processText(X_train,X_test):
    #PROCESS TEXT: RAW
    stemmer = stm.SnowballStemmer("english")
    X_train["stem_text"] = X_train.text.apply(lambda x: (" ").join([stemmer.stem(z) for z in re.sub("[^a-zA-Z0-9]"," ", x).split(" ")]))
    X_test["stem_text"] = X_test.text.apply(lambda x: (" ").join([stemmer.stem(z) for z in re.sub("[^a-zA-Z0-9]"," ", x).split(" ")]))

    tok_raw = Tokenizer()
    tok_raw.fit_on_texts(X_train.text.str.lower())
    tok_stem = Tokenizer()
    tok_stem.fit_on_texts(X_train.stem_text)
    X_train["seq_text_stem"] = tok_stem.texts_to_sequences(X_train.stem_text)
    X_test["seq_text_stem"] = tok_stem.texts_to_sequences(X_test.stem_text)
    return X_train,X_test

#EXTRACT DATA FOR KERAS MODEL
def get_keras_data(dataset, maxlen):
    return pad_sequences(dataset.seq_text_stem, maxlen=maxlen)

# Model
def doAddModel(X_train,X_test,pred_train,pred_test):
    X_train["nn_eap"] = pred_train[:,0]
    X_train["nn_hpl"] = pred_train[:,1]
    X_train["nn_mws"] = pred_train[:,2]
    X_test["nn_eap"] = pred_test[:,0]
    X_test["nn_hpl"] = pred_test[:,1]
    X_test["nn_mws"] = pred_test[:,2]
    return X_train,X_test

def initModel(n_stem_seq,maxlen):
    embed_dim = 50
    dropout_rate = 0.9
    emb_dropout_rate = 0.9

    input_text = Input(shape=[maxlen], name="stem_input")

    emb_lstm = SpatialDropout1D(emb_dropout_rate,) (Embedding(n_stem_seq, embed_dim
                                                ,input_length = maxlen
                                                               ) (input_text))
    dense = Dropout(dropout_rate) (Dense(1024) (Flatten() (emb_lstm)))
    dense = Reshape((128, 8)) (dense)
    dense = Flatten() (Mask()(CapsuleLayer(128, 8)(dense)))

    output = Dense(3, activation="softmax")(dense)

    model = Model([input_text], output)

    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss="categorical_crossentropy", optimizer=optimizer)
    return model

#def doModel(X_train,X_test):
#X_train,X_test = processText(X_train,X_test)
maxlen = 70
print "Getting Keras Data\n"
stem_train = get_keras_data(X_train,maxlen)
stem_test = get_keras_data(X_test,maxlen)

# we need to binarize the labels for the neural net
ytrain_enc = np_utils.to_categorical(Y_train)
#yvalid_enc = np_utils.to_categorical(yvalid)

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([X_train.shape[0], 3])
for dev_index, val_index in kf.split(X_train):
    dev_X, val_X = stem_train[dev_index], stem_train[val_index]
    dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
    n_stem_seq = np.max( [np.max(stem_train[dev_index]), np.max(stem_train[val_index])])+1
    model = initModel(n_stem_seq,maxlen)
    model.fit(dev_X, y=dev_y, batch_size=1024, epochs=100, verbose=1,validation_data=(val_X, val_y))
    pred_val_y = model.predict(val_X)
    pred_test_y = model.predict(stem_test)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
#return doAddModel(X_train,X_test,pred_train,pred_full_test/5)

X_train,X_test = doModel(X_train,X_test)

Getting Keras Data



TypeError: ('Keyword argument not understood:', 'return_sequences')

Unnamed: 0,id,text,author,sent_vec_0,sent_vec_1,sent_vec_2,sent_vec_3,sent_vec_4,sent_vec_5,sent_vec_6,...,sent_vec_92,sent_vec_93,sent_vec_94,sent_vec_95,sent_vec_96,sent_vec_97,sent_vec_98,sent_vec_99,stem_text,seq_text_stem
0,id26305,"This process, however, afforded me no means of...",EAP,-0.004493,0.01127,0.110208,0.011623,-0.044851,0.038956,-0.034797,...,-0.032212,-0.015696,-0.039146,-0.034641,-0.026215,-0.079093,0.068413,0.062398,this process howev afford me no mean of asce...,"[27, 1876, 161, 743, 22, 37, 201, 2, 1652, 1, ..."
1,id17569,It never once occurred to me that the fumbling...,HPL,0.053792,-0.001588,0.131171,0.007966,-0.123774,0.076791,0.056179,...,-0.056828,0.019743,-0.083541,0.018332,-0.020753,-0.027814,0.011478,0.054295,it never onc occur to me that the fumbl might ...,"[10, 99, 138, 672, 4, 22, 9, 1, 3675, 85, 23, ..."
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,-0.036566,0.031117,0.036176,-0.002317,0.015307,0.039265,-0.082055,...,-0.02957,0.050657,-0.152419,-0.024303,-0.033989,-0.062024,0.138996,0.046701,in his left hand was a gold snuff box from wh...,"[7, 15, 164, 122, 8, 6, 935, 4166, 636, 24, 18..."
3,id27763,How lovely is spring As we looked from Windsor...,MWS,-0.064576,0.122922,0.063774,-0.009112,-0.012569,0.140672,-0.011279,...,0.00269,0.046515,-0.23811,0.006557,-0.143425,0.060398,0.095089,0.036426,how love is spring as we look from windsor ter...,"[133, 106, 26, 749, 16, 35, 94, 24, 903, 2393,..."
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,0.040296,0.095128,0.140202,-0.028749,-0.001092,0.065856,-0.09049,...,0.043574,-0.036604,-0.119386,-0.016236,-0.116524,-0.016949,0.043204,0.084453,find noth els not even gold the superintend ...,"[207, 194, 882, 20, 67, 935, 1, 3204, 1281, 15..."


In [216]:
# Model
# XGBoost
def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, seed_val=0, child=1, colsample=0.3):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 3
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = child
    param['subsample'] = 0.8
    param['colsample_bytree'] = colsample
    param['seed'] = seed_val
    num_rounds = 2000

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit = model.best_ntree_limit)
    if test_X2 is not None:
        xgtest2 = xgb.DMatrix(test_X2)
        pred_test_y2 = model.predict(xgtest2, ntree_limit = model.best_ntree_limit)
    return pred_test_y, pred_test_y2, model

def do(X_train,X_test,Y_train):
    drop_columns=["id","text","words","word_vectors","sentence_vectors"]
    x_train = X_train.drop(drop_columns+['author'],axis=1)
    x_test = X_test.drop(drop_columns,axis=1)
    y_train = Y_train
    
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([x_train.shape[0], 3])
    for dev_index, val_index in kf.split(x_train):
        dev_X, val_X = x_train.loc[dev_index], x_train.loc[val_index]
        dev_y, val_y = y_train[dev_index], y_train[val_index]
        pred_val_y, pred_test_y, model = runXGB(dev_X, dev_y, val_X, val_y, x_test, seed_val=0, colsample=0.7)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("cv scores : ", cv_scores)
    return pred_full_test/5
result = do(X_train,X_test,Y_train)

[0]	train-mlogloss:0.999628	test-mlogloss:0.999482
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[20]	train-mlogloss:0.391298	test-mlogloss:0.397139
[40]	train-mlogloss:0.319198	test-mlogloss:0.333202
[60]	train-mlogloss:0.296807	test-mlogloss:0.320042
[80]	train-mlogloss:0.281649	test-mlogloss:0.314684
[100]	train-mlogloss:0.26805	test-mlogloss:0.311341
[120]	train-mlogloss:0.257358	test-mlogloss:0.309619
[140]	train-mlogloss:0.247042	test-mlogloss:0.308337
[160]	train-mlogloss:0.237471	test-mlogloss:0.307778
[180]	train-mlogloss:0.229021	test-mlogloss:0.306946
[200]	train-mlogloss:0.220467	test-mlogloss:0.306314
[220]	train-mlogloss:0.21224	test-mlogloss:0.306209
[240]	train-mlogloss:0.2049	test-mlogloss:0.305659
[260]	train-mlogloss:0.198042	test-mlogloss:0.306052
[280]	train-mlogloss:0.191251	test-mlogloss:0.306039
[300]	train-mlogloss:0.185274	test-mlogloss:0.306677
Stopping. B

In [217]:
# Write Results

def writeResult(result,test):
    # count number of files
    path, dirs, files = os.walk("../results").next()
    file_count = len(files)/2+1

    # Write the test results
    data=OrderedDict()
    data["id"]=test["id"] 
    data["EAP"]=result[0]#["EAP"]
    data["HPL"]=result[1]#["HPL"]	
    data["MWS"]=result[2]#["MWS"]
    output = pd.DataFrame(data=data)
    filename = "../results/result"+str(file_count)+".csv"
    output.to_csv( filename, index=False )
    filename = "../results/result"+str(file_count)+"compr.csv"
    output.to_csv( filename, index=False )
    check_call(['gzip', filename])

writeResult(result.T,X_test)

In [13]:
# STOP HERE

Index([u'id', u'text', u'author', u'words', u'word_vectors',
       u'sentence_vectors', u'sent_vec_0', u'sent_vec_1', u'sent_vec_2',
       u'sent_vec_3', u'sent_vec_4', u'sent_vec_5', u'sent_vec_6',
       u'sent_vec_7', u'sent_vec_8', u'sent_vec_9', u'sent_vec_10',
       u'sent_vec_11', u'sent_vec_12', u'sent_vec_13', u'sent_vec_14',
       u'sent_vec_15', u'sent_vec_16', u'sent_vec_17', u'sent_vec_18',
       u'sent_vec_19', u'sent_vec_20', u'sent_vec_21', u'sent_vec_22',
       u'sent_vec_23', u'sent_vec_24', u'sent_vec_25', u'sent_vec_26',
       u'sent_vec_27', u'sent_vec_28', u'sent_vec_29', u'sent_vec_30',
       u'sent_vec_31', u'sent_vec_32', u'sent_vec_33', u'sent_vec_34',
       u'sent_vec_35', u'sent_vec_36', u'sent_vec_37', u'sent_vec_38',
       u'sent_vec_39', u'sent_vec_40', u'sent_vec_41', u'sent_vec_42',
       u'sent_vec_43', u'sent_vec_44', u'sent_vec_45', u'sent_vec_46',
       u'sent_vec_47', u'sent_vec_48', u'sent_vec_49', u'nn_eap', u'nn_hpl',
       u'nn_mws

In [97]:
X_train.head()

Unnamed: 0,id,text,author,sent_vec_0,sent_vec_1,sent_vec_2,sent_vec_3,sent_vec_4,sent_vec_5,sent_vec_6,...,sent_vec_90,sent_vec_91,sent_vec_92,sent_vec_93,sent_vec_94,sent_vec_95,sent_vec_96,sent_vec_97,sent_vec_98,sent_vec_99
0,id26305,"This process, however, afforded me no means of...",EAP,-0.004493,0.01127,0.110208,0.011623,-0.044851,0.038956,-0.034797,...,-0.064199,-0.051707,-0.032212,-0.015696,-0.039146,-0.034641,-0.026215,-0.079093,0.068413,0.062398
1,id17569,It never once occurred to me that the fumbling...,HPL,0.053792,-0.001588,0.131171,0.007966,-0.123774,0.076791,0.056179,...,-0.042077,0.016603,-0.056828,0.019743,-0.083541,0.018332,-0.020753,-0.027814,0.011478,0.054295
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,-0.036566,0.031117,0.036176,-0.002317,0.015307,0.039265,-0.082055,...,-0.065868,-0.024816,-0.02957,0.050657,-0.152419,-0.024303,-0.033989,-0.062024,0.138996,0.046701
3,id27763,How lovely is spring As we looked from Windsor...,MWS,-0.064576,0.122922,0.063774,-0.009112,-0.012569,0.140672,-0.011279,...,0.014403,-0.109733,0.00269,0.046515,-0.23811,0.006557,-0.143425,0.060398,0.095089,0.036426
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,0.040296,0.095128,0.140202,-0.028749,-0.001092,0.065856,-0.09049,...,0.049627,-0.031102,0.043574,-0.036604,-0.119386,-0.016236,-0.116524,-0.016949,0.043204,0.084453


In [207]:
X_train = X_train.drop(['sent_vec_'+str(i) for i in range(300)],axis=1)

In [None]:
word_index

In [None]:
1+2