In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC

## Deep Learning
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

## Machine Learning
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

## NLTK
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

Using TensorFlow backend.


## Load Dataset

In [2]:
train = pd.read_csv('./Data/train.csv')
test = pd.read_csv('./Data/test.csv')

In [3]:
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [5]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actua;' to binary array if it's not already
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2
        
    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0/rows * vsota

We use the LabelEncoder from scikit-learn to convert text labels to integers, 0, 1 2

In [6]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.author.values)

In [7]:
y

array([0, 1, 0, ..., 0, 0, 1])

In [8]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.text.values, y, stratify=y, random_state=42, test_size=0.1)

In [9]:
print(xtrain.shape)
print(xvalid.shape)

(17621,)
(1958,)


# Basic models

### Tf-Idf

In [10]:
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                     analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 3),
                      use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')
# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

### Count Vectorizer (Use word counts as features)

In [11]:
ctv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                     ngram_range=(1, 3), stop_words='english')
# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

## Logistic Regression

In [12]:
# Fitting Logistic Regression on TF-IDF
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.626 


In [13]:
# Fitting Logistic Regression on Count Vectors
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.528 


## Naive Bayes

In [14]:
# Fitting a simple Naive Bayes on TF-IDF
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.578 


In [15]:
# Fitting a simple Naive Bayes on Count Vectors
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.485 


## SVM

### First use SVD to reduce features

In [16]:
# Apply SVD on TF-IDF, I chose 120 components. 120-200 components are good enough for SVM model.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

Now we apply SVM

In [17]:
# Fitting a simple SVM
clf = SVC(C=1.0, probability=True)  # Since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.733 


## Xgboost

In [18]:
# Fitting a simple xgboost on tf-idf
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, subsample=0.8,
                        nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.782 


In [19]:
# Fitting a simple xgboost on Count vectors
# Fitting a simple xgboost on tf-idf
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_ctv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_ctv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.772 


In [20]:
# Fitting a simple xgboost on tf-idf svd features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.772 


In [21]:
# Fitting a simple xgboost on tf-idf svd features
clf = xgb.XGBClassifier(nthread=10)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.812 


## Grid Search

In [22]:
mll_scorer = metrics.make_scorer(multiclass_logloss, 
                                 greater_is_better=False,
                                 needs_proba=True)

In [23]:
### Pipeline
# Initialise SVD
svd = TruncatedSVD()

# Intialise Standard Scaler
scl = preprocessing.StandardScaler()

# Logistic Regression
lr_model = LogisticRegression()

# Create pipeline
clf = pipeline.Pipeline([('svd', svd),
                        ('scl', scl),
                        ('lr', lr_model)])

In [24]:
param_grid = {'svd__n_components': [120, 180],
              'lr__C': [0.1, 1.0, 10],
              'lr__penalty': ['l1', 'l2']}

In [25]:
# Initialise Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, 
                    scoring=mll_scorer, verbose=10,
                    n_jobs=-1, iid=True, refit=True, cv=2)
# Fit grid search model
model.fit(xtrain_tfv, ytrain)  # Can use full data here but here using only xtrain
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" %(param_name, best_parameters[param_name]))

Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV] lr__penalty=l1, lr__C=0.1, svd__n_components=120 ................
[CV] lr__penalty=l1, lr__C=0.1, svd__n_components=120 ................
[CV] lr__penalty=l1, lr__C=0.1, svd__n_components=180 ................
[CV] lr__penalty=l1, lr__C=0.1, svd__n_components=180 ................
[CV]  lr__penalty=l1, lr__C=0.1, svd__n_components=120, score=-0.778793, total=  12.6s
[CV] lr__penalty=l2, lr__C=0.1, svd__n_components=120 ................
[CV]  lr__penalty=l1, lr__C=0.1, svd__n_components=120, score=-0.779101, total=  13.1s
[CV] lr__penalty=l2, lr__C=0.1, svd__n_components=120 ................
[CV]  lr__penalty=l1, lr__C=0.1, svd__n_components=180, score=-0.751959, total=  16.7s
[CV] lr__penalty=l2, lr__C=0.1, svd__n_components=180 ................
[CV]  lr__penalty=l1, lr__C=0.1, svd__n_components=180, score=-0.749493, total=  18.3s
[CV] lr__penalty=l2, lr__C=0.1, svd__n_components=180 ................
[CV]  lr__penalty=l2, l

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   22.7s


[CV]  lr__penalty=l2, lr__C=0.1, svd__n_components=120, score=-0.783922, total=  10.5s
[CV] lr__penalty=l1, lr__C=1.0, svd__n_components=120 ................
[CV]  lr__penalty=l2, lr__C=0.1, svd__n_components=180, score=-0.745771, total=  17.1s
[CV] lr__penalty=l1, lr__C=1.0, svd__n_components=180 ................
[CV]  lr__penalty=l2, lr__C=0.1, svd__n_components=180, score=-0.745090, total=  19.0s
[CV] lr__penalty=l1, lr__C=1.0, svd__n_components=180 ................
[CV]  lr__penalty=l1, lr__C=1.0, svd__n_components=120, score=-0.782720, total=  13.3s
[CV] lr__penalty=l2, lr__C=1.0, svd__n_components=120 ................
[CV]  lr__penalty=l1, lr__C=1.0, svd__n_components=120, score=-0.772513, total=  12.5s
[CV] lr__penalty=l2, lr__C=1.0, svd__n_components=120 ................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   36.3s


[CV]  lr__penalty=l2, lr__C=1.0, svd__n_components=120, score=-0.774216, total=  13.1s
[CV] lr__penalty=l2, lr__C=1.0, svd__n_components=180 ................
[CV]  lr__penalty=l2, lr__C=1.0, svd__n_components=120, score=-0.769028, total=  14.1s
[CV] lr__penalty=l2, lr__C=1.0, svd__n_components=180 ................
[CV]  lr__penalty=l1, lr__C=1.0, svd__n_components=180, score=-0.739432, total=  19.1s
[CV] lr__penalty=l1, lr__C=10, svd__n_components=120 .................
[CV]  lr__penalty=l1, lr__C=1.0, svd__n_components=180, score=-0.738692, total=  27.4s
[CV] lr__penalty=l1, lr__C=10, svd__n_components=120 .................
[CV]  lr__penalty=l2, lr__C=1.0, svd__n_components=180, score=-0.743542, total=  15.4s
[CV] lr__penalty=l1, lr__C=10, svd__n_components=180 .................
[CV]  lr__penalty=l2, lr__C=1.0, svd__n_components=180, score=-0.747544, total=  16.5s
[CV] lr__penalty=l1, lr__C=10, svd__n_components=180 .................
[CV]  lr__penalty=l1, lr__C=10, svd__n_components=12

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.1min


[CV]  lr__penalty=l1, lr__C=10, svd__n_components=120, score=-0.761928, total=  15.0s
[CV] lr__penalty=l2, lr__C=10, svd__n_components=120 .................
[CV]  lr__penalty=l2, lr__C=10, svd__n_components=120, score=-0.782794, total=  12.5s
[CV] lr__penalty=l2, lr__C=10, svd__n_components=180 .................
[CV]  lr__penalty=l1, lr__C=10, svd__n_components=180, score=-0.740813, total=  20.8s
[CV] lr__penalty=l2, lr__C=10, svd__n_components=180 .................


[Parallel(n_jobs=-1)]: Done  20 out of  24 | elapsed:  1.5min remaining:   17.5s


[CV]  lr__penalty=l2, lr__C=10, svd__n_components=120, score=-0.770673, total=   9.7s
[CV]  lr__penalty=l1, lr__C=10, svd__n_components=180, score=-0.739227, total=  23.1s
[CV]  lr__penalty=l2, lr__C=10, svd__n_components=180, score=-0.738877, total=  11.2s
[CV]  lr__penalty=l2, lr__C=10, svd__n_components=180, score=-0.734521, total=   7.8s


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.6min finished


Best score: -0.737
Best parameters set:
	lr__C: 10
	lr__penalty: 'l2'
	svd__n_components: 180


In [26]:
# Using this technique for MultinomialNB
nb_model = MultinomialNB()

# Create Pipeline
clf = pipeline.Pipeline([('nb', nb_model)])

# Parameter Grid
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialise Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                     verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, ytrain)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] nb__alpha=0.001 .................................................
[CV] nb__alpha=0.001 .................................................
[CV] nb__alpha=0.01 ..................................................
[CV] nb__alpha=0.01 ..................................................
[CV] ................. nb__alpha=0.001, score=-0.620470, total=   0.1s
[CV] .................. nb__alpha=0.01, score=-0.510778, total=   0.0s
[CV] nb__alpha=0.1 ...................................................
[CV] ................. nb__alpha=0.001, score=-0.641454, total=   0.1s
[CV] .................. nb__alpha=0.01, score=-0.522989, total=   0.1s
[CV] nb__alpha=0.1 ...................................................
[CV] nb__alpha=1 .....................................................
[CV] nb__alpha=1 .....................................................
[CV] ..................... nb__alpha=1, score=-0.662953, total=   0.0s
[CV] nb__alpha=10

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    0.4s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.5s finished


## Word Vectors

In [27]:
import re
# Load Glove vectors
embeddings_index = {}
f = open('glove.840B.300d.txt')
for line in tqdm(f):
    #values = line.split()
    values = re.sub(r'[^\x00-\x7F]+','', line).split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

2196017it [06:13, 5874.73it/s]

Found 2185161 word vectors.





In [28]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [37]:
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
xvalid_glove = [sent2vec(x) for x in tqdm(xvalid)]

xtrain_glove = np.array(xtrain_glove)
xvaild_glove = np.array(xvalid_glove)

print(type(test_data_dict))
print(type(xtrain_glove))
print(type(xvalid_glove))

100%|██████████| 17621/17621 [00:05<00:00, 2948.90it/s]
100%|██████████| 1958/1958 [00:00<00:00, 2876.51it/s]


<class 'dict'>
<class 'numpy.ndarray'>
<class 'list'>


In [30]:
## Xgboost on Glove features
# Fitting a simple xgboost on glove features
clf = xgb.XGBClassifier(nthread=10, silent=False)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.812 


In [31]:
# Fitting a simple xgboost on glove features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.693 


In [36]:
print(type(test_data_dict))
print(type(xtrain_glove))
print(type(xvalid_glove))

<class 'dict'>
<class 'numpy.ndarray'>
<class 'list'>


## Deep Learning

In [44]:
# Scale the data before neural net
scl = preprocessing.StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.fit_transform(xvalid_glove)

In [45]:
# we need to binarize the labels for the neural net
ytrain_enc = np_utils.to_categorical(ytrain)
yvalid_enc = np_utils.to_categorical(yvalid)

In [46]:
# Craete a simple 3 layer sequential neural net
model = Sequential()

model.add(Dense(300, input_dim=300, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

model.add(Dense(3))
model.add(Activation('softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [47]:
model.fit(xtrain_glove_scl, y=ytrain_enc, batch_size=64, 
          epochs=5, verbose=1, 
          validation_data=(xvalid_glove_scl, yvalid_enc))

Train on 17621 samples, validate on 1958 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f1848d316a0>

### Use LSTMs

In [48]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
max_len = 70

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# zero pad the sequences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index

In [49]:
# Create an embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector



  0%|          | 0/25943 [00:00<?, ?it/s][A[A

 70%|██████▉   | 18128/25943 [00:00<00:00, 181277.58it/s][A[A

100%|██████████| 25943/25943 [00:00<00:00, 215290.19it/s][A[A

In [50]:
# A simple LSTM with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1, 300,
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

In [52]:
# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


<keras.callbacks.History at 0x7f1821444160>

In [53]:
# A simple bidirectional LSTM with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [54]:
# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/100

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-54-95a2251d3047>", line 4, in <module>
    verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])
  File "/usr/local/lib/python3.5/dist-packages/keras/models.py", line 867, in fit
    initial_epoch=initial_epoch)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 1598, in fit
    validation_steps=validation_steps)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 1183, in _fit_loop
    outs = f(ins_batch)
  File "/usr/local/lib/python3.5/dist-packages/keras/backend/tensorflow_backend.py", line 2273, in __call__
    **self.session_kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 895, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.5/dist

KeyboardInterrupt: 

In [None]:
# GRU with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

## Ensembling

In [32]:
# this is the main ensembling class. how to use it is in the next cell!
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
import pandas as pd
import os
import sys
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="[%(asctime)s] %(levelname)s %(message)s",
    datefmt="%H:%M:%S", stream=sys.stdout)
logger = logging.getLogger(__name__)


class Ensembler(object):
    def __init__(self, model_dict, num_folds=3, task_type='classification', optimize=roc_auc_score,
                 lower_is_better=False, save_path=None):
        """
        Ensembler init function
        :param model_dict: model dictionary, see README for its format
        :param num_folds: the number of folds for ensembling
        :param task_type: classification or regression
        :param optimize: the function to optimize for, e.g. AUC, logloss, etc. Must have two arguments y_test and y_pred
        :param lower_is_better: is lower value of optimization function better or higher
        :param save_path: path to which model pickles will be dumped to along with generated predictions, or None
        """

        self.model_dict = model_dict
        self.levels = len(self.model_dict)
        self.num_folds = num_folds
        self.task_type = task_type
        self.optimize = optimize
        self.lower_is_better = lower_is_better
        self.save_path = save_path

        self.training_data = None
        self.test_data = None
        self.y = None
        self.lbl_enc = None
        self.y_enc = None
        self.train_prediction_dict = None
        self.test_prediction_dict = None
        self.num_classes = None

    def fit(self, training_data, y, lentrain):
        """
        :param training_data: training data in tabular format
        :param y: binary, multi-class or regression
        :return: chain of models to be used in prediction
        """

        self.training_data = training_data
        self.y = y

        if self.task_type == 'classification':
            self.num_classes = len(np.unique(self.y))
            logger.info("Found %d classes", self.num_classes)
            self.lbl_enc = LabelEncoder()
            self.y_enc = self.lbl_enc.fit_transform(self.y)
            kf = StratifiedKFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, self.num_classes)
        else:
            self.num_classes = -1
            self.y_enc = self.y
            kf = KFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, 1)

        self.train_prediction_dict = {}
        for level in range(self.levels):
            self.train_prediction_dict[level] = np.zeros((train_prediction_shape[0],
                                                          train_prediction_shape[1] * len(self.model_dict[level])))

        for level in range(self.levels):

            if level == 0:
                temp_train = self.training_data
            else:
                temp_train = self.train_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):
                validation_scores = []
                foldnum = 1
                for train_index, valid_index in kf.split(self.train_prediction_dict[0], self.y_enc):
                    logger.info("Training Level %d Fold # %d. Model # %d", level, foldnum, model_num)

                    if level != 0:
                        l_training_data = temp_train[train_index]
                        l_validation_data = temp_train[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])
                    else:
                        l0_training_data = temp_train[0][model_num]
                        if type(l0_training_data) == list:
                            l_training_data = [x[train_index] for x in l0_training_data]
                            l_validation_data = [x[valid_index] for x in l0_training_data]
                        else:
                            l_training_data = l0_training_data[train_index]
                            l_validation_data = l0_training_data[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])

                    logger.info("Predicting Level %d. Fold # %d. Model # %d", level, foldnum, model_num)

                    if self.task_type == 'classification':
                        temp_train_predictions = model.predict_proba(l_validation_data)
                        self.train_prediction_dict[level][valid_index,
                        (model_num * self.num_classes):(model_num * self.num_classes) +
                                                       self.num_classes] = temp_train_predictions

                    else:
                        temp_train_predictions = model.predict(l_validation_data)
                        self.train_prediction_dict[level][valid_index, model_num] = temp_train_predictions
                    validation_score = self.optimize(self.y_enc[valid_index], temp_train_predictions)
                    validation_scores.append(validation_score)
                    logger.info("Level %d. Fold # %d. Model # %d. Validation Score = %f", level, foldnum, model_num,
                                validation_score)
                    foldnum += 1
                avg_score = np.mean(validation_scores)
                std_score = np.std(validation_scores)
                logger.info("Level %d. Model # %d. Mean Score = %f. Std Dev = %f", level, model_num,
                            avg_score, std_score)

            logger.info("Saving predictions for level # %d", level)
            train_predictions_df = pd.DataFrame(self.train_prediction_dict[level])
            train_predictions_df.to_csv(os.path.join(self.save_path, "train_predictions_level_" + str(level) + ".csv"),
                                        index=False, header=None)

        return self.train_prediction_dict

    def predict(self, test_data, lentest):
        self.test_data = test_data
        if self.task_type == 'classification':
            test_prediction_shape = (lentest, self.num_classes)
        else:
            test_prediction_shape = (lentest, 1)

        self.test_prediction_dict = {}
        for level in range(self.levels):
            self.test_prediction_dict[level] = np.zeros((test_prediction_shape[0],
                                                         test_prediction_shape[1] * len(self.model_dict[level])))
        self.test_data = test_data
        for level in range(self.levels):
            if level == 0:
                temp_train = self.training_data
                temp_test = self.test_data
            else:
                temp_train = self.train_prediction_dict[level - 1]
                temp_test = self.test_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):

                logger.info("Training Fulldata Level %d. Model # %d", level, model_num)
                if level == 0:
                    model.fit(temp_train[0][model_num], self.y_enc)
                else:
                    model.fit(temp_train, self.y_enc)

                logger.info("Predicting Test Level %d. Model # %d", level, model_num)

                if self.task_type == 'classification':
                    if level == 0:
                        temp_test_predictions = model.predict_proba(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict_proba(temp_test)
                    self.test_prediction_dict[level][:, (model_num * self.num_classes): (model_num * self.num_classes) +
                                                                                        self.num_classes] = temp_test_predictions

                else:
                    if level == 0:
                        temp_test_predictions = model.predict(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict(temp_test)
                    self.test_prediction_dict[level][:, model_num] = temp_test_predictions

            test_predictions_df = pd.DataFrame(self.test_prediction_dict[level])
            test_predictions_df.to_csv(os.path.join(self.save_path, "test_predictions_level_" + str(level) + ".csv"),
                                       index=False, header=None)

        return self.test_prediction_dict

In [50]:
print(xtrain_tfv.shape)
print(xtrain_ctv.shape)
print(xtrain_glove.shape)
print(xvalid_tfv.shape)
print(xvalid_ctv.shape)
print(xvalid_glove.shape)

(17621, 15102)
(17621, 400266)
(17621, 300)
(1958, 15102)
(1958, 400266)


AttributeError: 'list' object has no attribute 'shape'

In [39]:
# specify the data to be used for every level of ensembling:
train_data_dict = {0: [xtrain_tfv, xtrain_ctv, xtrain_tfv, xtrain_ctv], 1: [xtrain_glove]}
test_data_dict = {0: [xvalid_tfv, xvalid_ctv, xvalid_tfv, xvalid_ctv], 1: [xvalid_glove]}

model_dict = {0: [LogisticRegression(), LogisticRegression(), MultinomialNB(alpha=0.1), MultinomialNB()],

              1: [xgb.XGBClassifier(silent=True, n_estimators=120, max_depth=7)]}

ens = Ensembler(model_dict=model_dict, num_folds=3, task_type='classification',
                optimize=multiclass_logloss, lower_is_better=True, save_path='')

ens.fit(train_data_dict, ytrain, lentrain=xtrain_glove.shape[0])
preds = ens.predict(test_data_dict, lentest=len(xvalid_glove))

[17:18:01] INFO Found 3 classes
[17:18:01] INFO Training Level 0 Fold # 1. Model # 0
[17:18:02] INFO Predicting Level 0. Fold # 1. Model # 0
[17:18:02] INFO Level 0. Fold # 1. Model # 0. Validation Score = 0.679328
[17:18:02] INFO Training Level 0 Fold # 2. Model # 0
[17:18:02] INFO Predicting Level 0. Fold # 2. Model # 0
[17:18:02] INFO Level 0. Fold # 2. Model # 0. Validation Score = 0.670841
[17:18:02] INFO Training Level 0 Fold # 3. Model # 0
[17:18:03] INFO Predicting Level 0. Fold # 3. Model # 0
[17:18:03] INFO Level 0. Fold # 3. Model # 0. Validation Score = 0.672830
[17:18:03] INFO Level 0. Model # 0. Mean Score = 0.674333. Std Dev = 0.003624
[17:18:03] INFO Training Level 0 Fold # 1. Model # 1
[17:18:06] INFO Predicting Level 0. Fold # 1. Model # 1
[17:18:06] INFO Level 0. Fold # 1. Model # 1. Validation Score = 0.574758
[17:18:06] INFO Training Level 0 Fold # 2. Model # 1
[17:18:10] INFO Predicting Level 0. Fold # 2. Model # 1
[17:18:10] INFO Level 0. Fold # 2. Model # 1. Val

In [40]:
# check error:
multiclass_logloss(yvalid, preds[1])

0.42348974089710179

In [41]:
testdf = pd.read_csv('./Data/test.csv')
xtest = testdf.text.values

In [47]:
xtest[:5]

array([ 'Still, as I urged our leaving Ireland with such inquietude and impatience, my father thought it best to yield.',
       'If a fire wanted fanning, it could readily be fanned with a newspaper, and as the government grew weaker, I have no doubt that leather and iron acquired durability in proportion, for, in a very short time, there was not a pair of bellows in all Rotterdam that ever stood in need of a stitch or required the assistance of a hammer.',
       'And when they had broken down the frail door they found only this: two cleanly picked human skeletons on the earthen floor, and a number of singular beetles crawling in the shadowy corners.',
       'While I was thinking how I should possibly manage without them, one actually tumbled out of my head, and, rolling down the steep side of the steeple, lodged in the rain gutter which ran along the eaves of the main building.',
       'I am not sure to what limit his knowledge may extend.'], dtype=object)

In [42]:
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                     analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 3),
                      use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')
# Fitting TF-IDF to both training and test sets (semi-supervised learning)
xtest_tfv = tfv.fit_transform(list(xtest))

ctv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                     ngram_range=(1, 3), stop_words='english')
# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
xtest_ctv = ctv.fit_transform(list(xtest))

In [43]:
# create sentence vectors using the above function for training and validation set
xtest_glove = [sent2vec(x) for x in tqdm(xtest)]

xtest_glove = np.array(xtest_glove)

100%|██████████| 8392/8392 [00:02<00:00, 2909.11it/s]


In [44]:
test_data_dict2 = {0: [xtest_tfv, xtest_ctv, xtest_tfv, xtest_ctv], 1: [xtest_glove]}

In [48]:
print(xtest_tfv.shape)
print(xtest_ctv.shape)
print(xtest_glove.shape)

(8392, 7594)
(8392, 179824)
(8392, 300)


In [46]:
preds_test = ens.predict(test_data_dict2, lentest=xtest_glove.shape[0])

[17:28:30] INFO Training Fulldata Level 0. Model # 0
[17:28:31] INFO Predicting Test Level 0. Model # 0


ValueError: X has 7594 features per sample; expecting 15102