In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator so NumPy-driven randomness is repeatable between runs.
np.random.seed(7)

Using TensorFlow backend.


In [4]:
# Training sentences with their author tags.
df = pd.read_csv('train.csv')

# Author tag -> class index; the index is also the column position in the one-hot matrix.
a2c = dict(EAP=0, HPL=1, MWS=2)

# Integer class per row (an unknown tag raises KeyError), then one-hot encoded.
y = to_categorical(np.array([a2c[author] for author in df.author]))

In [7]:
# Show the one-hot encoded author labels (column order follows a2c: EAP, HPL, MWS).
y

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

In [8]:
# Per-author tally of every non-space character found in that author's sentences.
counter = {name: defaultdict(int) for name in set(df.author)}
for sentence, author in zip(df.text, df.author):
    tally = counter[author]
    for c in sentence.replace(' ', ''):
        tally[c] += 1

# Every character seen for at least one author.
chars = set().union(*(v.keys() for v in counter.values()))

names = list(counter.keys())

# Header row: a 'c' column followed by one column per author.
print('c ' + ''.join(n + '   ' for n in names))
# One row per character; characters an author never used show up as 0
# (reading a missing key from the defaultdict yields 0).
for c in chars:
    print(c + ' ' + ''.join('{} '.format(counter[n][c]) for n in names))

c HPL   MWS   EAP   
' 1710 476 1334 
T 1583 1230 2217 
ö 3 0 16 
u 19519 21025 26311 
î 0 0 1 
ê 2 0 28 
B 533 395 835 
, 8581 12045 17594 
m 17622 20471 22792 
æ 10 0 36 
V 67 57 156 
Z 51 2 23 
. 5908 5761 8406 
s 43915 45962 53841 
Υ 1 0 0 
G 318 246 313 
Π 1 0 0 
ô 0 0 8 
y 12534 14877 17001 
i 44250 46080 60952 
δ 2 0 0 
l 30273 27819 35371 
a 56815 55274 68525 
K 176 35 86 
r 40590 44042 51221 
q 779 677 1030 
ä 6 0 1 
A 1167 943 1258 
U 94 46 166 
k 5204 3707 4277 
o 50996 53386 67145 
j 424 682 683 
ἶ 2 0 0 
b 10636 9611 13245 
R 237 385 258 
J 210 66 164 
N 345 204 411 
X 5 4 17 
f 16272 18351 22354 
Æ 4 0 1 
ñ 7 0 0 
α 2 0 0 
h 42770 43738 51580 
L 249 307 458 
" 513 1469 2987 
â 0 0 6 
F 269 232 383 
w 15554 16062 17507 
M 645 415 1065 
S 841 578 729 
ü 5 0 1 
W 732 681 739 
E 281 445 435 
d 33366 35315 36862 
Σ 1 0 0 
à 0 0 10 
t 62235 63142 82426 
Ο 3 0 0 
c 18338 17911 24127 
p 10965 12361 17422 
C 439 308 395 
ï 7 0 0 
n 50879 50291 62636 
? 169 419 510 
D 334 227 491 


In [9]:
def preprocess(text):
    """Surround punctuation with spaces so a whitespace split isolates it.

    An apostrophe that is directly followed by a space first gets a space
    put in front of it. After that every occurrence of one of the marks
    , . : ; " ? ! is wrapped in a single space on each side.

    Returns the padded string; text without any of these characters comes
    back unchanged.
    """
    spaced = text.replace("' ", " ' ")
    # The marks never produce one another when padded, so the order in
    # which they are handled does not affect the result.
    for mark in ',.:;"?!':
        spaced = spaced.replace(mark, ' ' + mark + ' ')
    return spaced

In [10]:
def create_docs(df, n_gram_max=2):
    """Turn every entry of ``df.text`` into a string of tokens plus n-grams.

    Each text is passed through ``preprocess`` and split on whitespace.
    For every n from 2 to ``n_gram_max`` the consecutive n-token windows are
    joined with '--' and appended after the original tokens; the combined
    list is then joined with single spaces.

    Returns a list with one such string per entry of ``df.text``.
    """
    def add_ngram(q, n_gram_max):
        """Return ``q`` followed by its '--'-joined n-grams (n = 2..n_gram_max)."""
        joined = [
            '--'.join(q[start:start + size])
            for size in range(2, n_gram_max + 1)
            for start in range(len(q) - size + 1)
        ]
        return q + joined

    return [
        ' '.join(add_ngram(preprocess(raw).split(), n_gram_max))
        for raw in df.text
    ]

In [11]:
# Tokens (words and '--' n-grams) seen fewer than this many times are dropped.
min_count = 2

docs = create_docs(df)

# First pass: fit on everything just to obtain the token frequencies.
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(docs)

# Keras numbers tokens from 1 in descending frequency and texts_to_sequences
# keeps only indices strictly below num_words. The +1 is therefore required
# so that the last token meeting min_count is not silently discarded.
num_words = sum(1 for count in tokenizer.word_counts.values() if count >= min_count) + 1

# Second pass: same settings, now capped to the frequent vocabulary.
tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

# Every sequence is padded or truncated to this many token indices.
maxlen = 256

docs = pad_sequences(sequences=docs, maxlen=maxlen)

In [12]:
# Width of each token vector. The visible cells never pass this to
# create_model, which falls back to its own default of 20.
embedding_dims = 20
# The embedding table needs one row for every index up to the largest one in docs.
input_dim = docs.max() + 1

In [13]:
def create_model(embedding_dims=20, optimizer='adam'):
    """Build and compile the averaged-embedding classifier.

    The network embeds each token index, averages the embeddings over the
    sequence axis and feeds the result to a 3-way softmax layer. The size of
    the embedding table is taken from the module-level ``input_dim``.

    Parameters:
        embedding_dims: length of each token embedding vector.
        optimizer: optimizer passed to ``compile``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    layers = [
        Embedding(input_dim=input_dim, output_dim=embedding_dims),
        GlobalAveragePooling1D(),
        Dense(3, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [14]:
# Upper bound on training epochs; early stopping normally ends the run sooner.
epochs = 25

# Pin random_state so the 80/20 split does not depend on how much of NumPy's
# global RNG was consumed by previously executed cells. The held-out rows are
# then the same on every run of this cell, whatever the execution order.
# x_test / y_test are used as the validation set during fitting.
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2,
                                                    random_state=7)

model = create_model()
# Stop once val_loss has gone 2 epochs without improving.
hist = model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])

Train on 15663 samples, validate on 3916 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25


In [16]:
# Same pipeline as the case-sensitive run, but with lower-casing enabled.
docs = create_docs(df)

# First pass: fit on everything just to obtain the token frequencies.
tokenizer = Tokenizer(lower=True, filters='')
tokenizer.fit_on_texts(docs)

# Keras numbers tokens from 1 in descending frequency and texts_to_sequences
# keeps only indices strictly below num_words. The +1 is therefore required
# so that the last token meeting min_count is not silently discarded.
num_words = sum(1 for count in tokenizer.word_counts.values() if count >= min_count) + 1

# Second pass: same settings, now capped to the frequent vocabulary.
tokenizer = Tokenizer(num_words=num_words, lower=True, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

# Every sequence is padded or truncated to this many token indices.
maxlen = 256

docs = pad_sequences(sequences=docs, maxlen=maxlen)

# Refresh the embedding table size read by create_model for the new vocabulary.
input_dim = np.max(docs) + 1

In [17]:
# Upper bound on training epochs; early stopping normally ends the run sooner.
epochs = 25

# Pin random_state so the 80/20 split does not depend on how much of NumPy's
# global RNG was consumed by previously executed cells. The held-out rows are
# then the same on every run of this cell, whatever the execution order.
# x_test / y_test are used as the validation set during fitting.
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2,
                                                    random_state=7)

model = create_model()
# Stop once val_loss has gone 2 epochs without improving.
hist = model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])


Train on 15663 samples, validate on 3916 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25


In [19]:
# Lower-cased pipeline again, this time with shorter padded sequences.
docs = create_docs(df)

# First pass: fit on everything just to obtain the token frequencies.
tokenizer = Tokenizer(lower=True, filters='')
tokenizer.fit_on_texts(docs)

# Keras numbers tokens from 1 in descending frequency and texts_to_sequences
# keeps only indices strictly below num_words. The +1 is therefore required
# so that the last token meeting min_count is not silently discarded.
num_words = sum(1 for count in tokenizer.word_counts.values() if count >= min_count) + 1

# Second pass: same settings, now capped to the frequent vocabulary.
tokenizer = Tokenizer(num_words=num_words, lower=True, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

# Every sequence is padded or truncated to this many token indices.
maxlen = 128

docs = pad_sequences(sequences=docs, maxlen=maxlen)

# Refresh the embedding table size read by create_model for the new vocabulary.
input_dim = np.max(docs) + 1

In [20]:
# Upper bound on training epochs; early stopping normally ends the run sooner.
epochs = 25

# Pin random_state so the 80/20 split does not depend on how much of NumPy's
# global RNG was consumed by previously executed cells. The held-out rows are
# then the same on every run of this cell, whatever the execution order.
# x_test / y_test are used as the validation set during fitting.
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2,
                                                    random_state=7)

model = create_model()
# Stop once val_loss has gone 2 epochs without improving.
hist = model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])

Train on 15663 samples, validate on 3916 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25


In [21]:
# Run the test sentences through the same n-gram, tokenizer and padding steps
# that produced the training input of the most recently fitted model.
test_df = pd.read_csv('test.csv')
docs = create_docs(test_df)
docs = tokenizer.texts_to_sequences(docs)
docs = pad_sequences(sequences=docs, maxlen=maxlen)

# The final layer is a softmax, so predict() already returns per-class
# probabilities. Sequential.predict_proba is deprecated and absent from
# newer Keras releases.
# NOTE: from here on `docs` and `y` hold test-set data, replacing the training
# arrays of the same names; re-run the earlier cells before training again.
y = model.predict(docs)


In [30]:
# Inspect the test set loaded from test.csv.
test_df

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...
5,id27337,"""The thick and peculiar mist, or smoke, which ..."
6,id24265,"That which is not matter, is not at all unless..."
7,id25917,I sought for repose although I did not hope fo...
8,id04951,"Upon the fourth day of the assassination, a pa..."
9,id14549,"""The tone metaphysical is also a good one."


In [28]:
# Model output for the first test sentence; the three values are later
# written to the EAP, HPL and MWS submission columns in that order.
y[0]

array([ 0.00775385,  0.00236759,  0.98987854], dtype=float32)

In [29]:
# Second value of the first prediction, i.e. the one that ends up in the HPL column.
y[0][1]

0.0023675908

In [24]:
# Empty submission table. The column names are passed as an ordered list:
# a set has no defined order and is rejected by current pandas versions.
my_sub = pd.DataFrame(columns=['id', 'EAP', 'HPL', 'MWS'])
my_sub

Unnamed: 0,id,EAP,HPL,MWS


In [31]:
# Build the submission table in one step rather than appending one row at a
# time with .loc, which re-allocates the frame on every insert and leaves the
# probability columns as object dtype. Column i of y maps to EAP, HPL, MWS.
my_sub = pd.DataFrame(
    {
        'id': test_df['id'].values[:len(y)],
        'EAP': y[:, 0],
        'HPL': y[:, 1],
        'MWS': y[:, 2],
    },
    columns=['id', 'EAP', 'HPL', 'MWS'],
)

In [32]:
# Show the filled-in submission table.
my_sub

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,7.753852e-03,2.367591e-03,9.898785e-01
1,id24541,9.997495e-01,2.504703e-04,2.786851e-09
2,id00134,6.740056e-03,9.884935e-01,4.766435e-03
3,id27757,9.457802e-01,5.264156e-02,1.578200e-03
4,id04081,7.790537e-01,1.452995e-01,7.564667e-02
5,id27337,9.945274e-01,4.808387e-03,6.641285e-04
6,id24265,9.300900e-01,6.726805e-02,2.641895e-03
7,id25917,9.026050e-04,3.414609e-02,9.649513e-01
8,id04951,9.999908e-01,8.917546e-06,2.950157e-07
9,id14549,9.878228e-01,2.431972e-03,9.745225e-03


In [33]:
# Write the submission table to CSV, leaving out the DataFrame index.
my_sub.to_csv('roberto_new_keras_fasttext.csv',index=False)