In [1]:
import numpy as np

import pandas as pd

from collections import defaultdict

import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

np.random.seed(7)

Using TensorFlow backend.


In [2]:
df = pd.read_csv('../Data/train.csv')
a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
y = np.array([a2c[a] for a in df.author])
y = to_categorical(y)

In [3]:
y[:5]

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  1.],
       [ 0.,  1.,  0.]])

# 1. Few Preprocessings

In traditional NLP tasks, preprocessings play an important role, but...

## Low-frequency words

In my experience, fastText is very fast, but I need to delete rare words to avoid overfitting.

NOTE: Some keywords are rare words, such like Cthulhu in Cthulhu Mythos of Howard Phillips Lovecraft. But these are useful for this task.

## Removing Stopwords

Nothing. To identify author from a sentence, some stopwords play an important role because one has specific usages of them.

## Stemming and Lowercase

Nothing. This reason is the same for stopwords removing. And I guess some stemming rules provided by libraries is bad for this task because all author is the older author.

## Cutting long sentence

Too long documents are cut.

## Punctuation

Because I guess each author has unique punctuations's usage in the novel, I separate them from words.

e.g. Don't worry -> Don ' t worry

## Character didtribution per author

In [4]:
counter = {name : defaultdict(int) for name in set(df.author)}
for (text, author) in zip(df.text, df.author):
    text = text.replace(' ', '')
    for c in text:
        counter[author][c] += 1

chars = set()
for v in counter.values():
    chars |= v.keys()
    
names = [author for author in counter.keys()]

print('c ', end='')
for n in names:
    print(n, end='   ')
print()
for c in chars:    
    print(c, end=' ')
    for n in names:
        print(counter[n][c], end=' ')
    print()

c EAP   MWS   HPL   
M 1065 415 645 
t 82426 63142 62235 
' 1334 476 1710 
J 164 66 210 
ἶ 0 0 2 
æ 36 0 10 
Π 0 0 1 
i 60952 46080 44250 
q 1030 677 779 
â 6 0 0 
n 62636 50291 50879 
H 864 669 741 
E 435 445 281 
ñ 0 0 7 
m 22792 20471 17622 
N 411 204 345 
l 35371 27819 30273 
Σ 0 0 1 
y 17001 14877 12534 
s 53841 45962 43915 
D 491 227 334 
k 4277 3707 5204 
; 1354 2662 1143 
Υ 0 0 1 
à 10 0 0 
b 13245 9611 10636 
P 442 365 320 
Ν 0 0 1 
C 395 308 439 
u 26311 21025 19519 
f 22354 18351 16272 
S 729 578 841 
Y 282 234 111 
I 4846 4917 3480 
α 0 0 2 
T 2217 1230 1583 
W 739 681 732 
g 16088 12601 14951 
v 9624 7948 6529 
F 383 232 269 
K 86 35 176 
U 166 46 94 
. 8406 5761 5908 
G 313 246 318 
ü 1 0 5 
e 114885 97515 88259 
ï 0 0 7 
j 683 682 424 
L 458 307 249 
Å 0 0 1 
c 24127 17911 18338 
a 68525 55274 56815 
" 2987 1469 513 
é 47 0 15 
ô 8 0 0 
h 51580 43738 42770 
x 1951 1267 1061 
ä 1 0 6 
? 510 419 169 
è 15 0 0 
V 156 57 67 
Ο 0 0 3 
A 1258 943 1167 
Q 21 7 10 
X 17 4 5 
p 1

## Summary of character distribution
1. HPL and EAP used non ascii characters like a ä.
2. The number of punctuations seems to be good feature

## Preprocessing

My preproceeings are

1. Separate punctuation from words
2. Remove lower frequency words ( <= 2)
3. Cut a longer document which contains 256 words


In [5]:
def preprocess(text):
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?!')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text

In [6]:
def create_docs(df, n_gram_max=2):
    def add_ngram(q, n_gram_max):
        ngrams = []
        for n in range(2, n_gram_max+1):
            for w_index in range(len(q)-n+1):
                ngrams.append('--'.join(q[w_index:w_index+n]))
        return q + ngrams
        
    docs = []
    for doc in df.text:
        doc = preprocess(doc).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
    
    return docs

In [7]:
min_count = 2

docs = create_docs(df)
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

maxlen = 256

docs = pad_sequences(sequences=docs, maxlen=maxlen)

# 2. Model: FastText by Keras

FastText is very fast and strong baseline algorithm for text classification based on Continuous Bag-of-Words model a.k.a Word2vec.

FastText contains only three layers:

1. Embeddings layer: Input words (and word n-grams) are all words in a sentence/document
2. Mean/AveragePooling Layer: Taking average vector of Embedding vectors
3. Softmax layer

# My FastText parameters are:

* The dimension of word vector is 20
* Optimizer is Adam
* Inputs are words and word bi-grams
    * you can change this parameter by passing the max n-gram size to argument of create_docs function.

In [8]:
input_dim = np.max(docs) + 1
embedding_dims = 20

In [9]:
def create_model(embedding_dims=20, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [10]:
epochs = 25
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2)

model = create_model()
hist = model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])

Train on 15663 samples, validate on 3916 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25


# 2.1 Change Preprocessings

## 2.1.1 Do lowercase

In [11]:
docs = create_docs(df)
tokenizer = Tokenizer(lower=True, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=True, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

maxlen = 256

docs = pad_sequences(sequences=docs, maxlen=maxlen)

input_dim = np.max(docs) + 1

In [12]:
epochs = 16
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2)

model = create_model()
hist = model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])

Train on 15663 samples, validate on 3916 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [13]:
test_df = pd.read_csv('../Data/test.csv')
docs = create_docs(test_df)
docs = tokenizer.texts_to_sequences(docs)
docs = pad_sequences(sequences=docs, maxlen=maxlen)
y = model.predict_proba(docs)

result = pd.read_csv('../Data/sample_submission.csv')
for a, i in a2c.items():
    result[a] = y[:, i]



In [14]:
result.to_csv('fastText_result.csv', index=False)