In [90]:
import numpy as np
import pandas as pd
from collections import defaultdict
import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers.recurrent import LSTM, GRU
from keras.layers.recurrent import LSTM, GRU
from keras.layers import GlobalAveragePooling1D
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D

from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

np.random.seed(7)

In [91]:
df = pd.read_csv('train.csv')
a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
y = np.array([a2c[a] for a in df.author])
y = to_categorical(y)

In [92]:
y

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

In [93]:
counter = {name : defaultdict(int) for name in set(df.author)}
for (text, author) in zip(df.text, df.author):
    text = text.replace(' ', '')
    for c in text:
        counter[author][c] += 1

chars = set()
for v in counter.values():
    chars |= v.keys()
    
names = [author for author in counter.keys()]

print('c ', end='')
for n in names:
    print(n, end='   ')
print()
for c in chars:    
    print(c, end=' ')
    for n in names:
        print(counter[n][c], end=' ')
    print()

c EAP   MWS   HPL   
c 24127 17911 18338 
ï 0 0 7 
ö 16 0 3 
t 82426 63142 62235 
n 62636 50291 50879 
Y 282 234 111 
e 114885 97515 88259 
r 51221 44042 40590 
Σ 0 0 1 
y 17001 14877 12534 
C 395 308 439 
S 729 578 841 
Z 23 2 51 
ä 1 0 6 
Å 0 0 1 
L 458 307 249 
δ 0 0 2 
Æ 1 0 4 
Ν 0 0 1 
: 176 339 47 
p 17422 12361 10965 
K 86 35 176 
z 634 400 529 
é 47 0 15 
f 22354 18351 16272 
g 16088 12601 14951 
X 17 4 5 
H 864 669 741 
O 414 282 503 
I 4846 4917 3480 
P 442 365 320 
B 835 395 533 
ü 1 0 5 
ñ 0 0 7 
; 1354 2662 1143 
W 739 681 732 
è 15 0 0 
l 35371 27819 30273 
v 9624 7948 6529 
Ο 0 0 3 
D 491 227 334 
î 1 0 0 
o 67145 53386 50996 
. 8406 5761 5908 
w 17507 16062 15554 
Υ 0 0 1 
s 53841 45962 43915 
u 26311 21025 19519 
ë 0 0 12 
? 510 419 169 
j 683 682 424 
ç 1 0 0 
x 1951 1267 1061 
α 0 0 2 
A 1258 943 1167 
k 4277 3707 5204 
q 1030 677 779 
" 2987 1469 513 
b 13245 9611 10636 
R 258 385 237 
, 17594 12045 8581 
N 411 204 345 
T 2217 1230 1583 
E 435 445 281 
J 164 66 210 

In [94]:
names

['EAP', 'MWS', 'HPL']

In [95]:
def preprocess(text):
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?!')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text

In [96]:
def create_docs(df, n_gram_max=2):
    def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams
        
    docs = []
    for doc in df.text:
        doc = preprocess(doc).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
    
    return docs

In [97]:
min_count = 2

docs = create_docs(df)
print(docs[0])
print('-'*20)

tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)
print(docs[0])
print('-'*20)

maxlen = 256

docs = pad_sequences(sequences=docs, maxlen=maxlen)
print(docs[0])
print('-'*20)


This process , however , afforded me no means of ascertaining the dimensions of my dungeon ; as I might make its circuit , and return to the point whence I set out , without being aware of the fact ; so perfectly uniform seemed the wall . This--process process--, ,--however however--, ,--afforded afforded--me me--no no--means means--of of--ascertaining ascertaining--the the--dimensions dimensions--of of--my my--dungeon dungeon--; ;--as as--I I--might might--make make--its its--circuit circuit--, ,--and and--return return--to to--the the--point point--whence whence--I I--set set--out out--, ,--without without--being being--aware aware--of of--the the--fact fact--; ;--so so--perfectly perfectly--uniform uniform--seemed seemed--the the--wall wall--.
--------------------
[174, 6008, 1, 224, 1, 2481, 26, 46, 469, 3, 20045, 2, 4827, 3, 15, 10367, 14, 21, 7, 120, 282, 59, 9408, 1, 5, 482, 6, 2, 393, 4601, 7, 533, 106, 1, 206, 182, 1587, 3, 2, 506, 14, 49, 2645, 11508, 142, 2, 725, 4, 20046, 2

In [98]:
docs.shape

(19579, 256)

In [99]:
input_dim = np.max(docs) + 1
embedding_dims = 20

In [100]:
def create_model(embedding_dims=30, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [111]:
def create_model2(embedding_dims=30, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(SpatialDropout1D(0.3))
    model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
    model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))
    
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.8))
    
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [112]:
epochs = 50
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2)

model = create_model2()
hist = model.fit(x_train, y_train,
                 batch_size=512,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=3, monitor='val_loss')])

Train on 15663 samples, validate on 3916 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


KeyboardInterrupt: 

In [47]:
docs = create_docs(df)
tokenizer = Tokenizer(lower=True, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=True, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

maxlen = 256

docs = pad_sequences(sequences=docs, maxlen=maxlen)

input_dim = np.max(docs) + 1

In [48]:
epochs = 25
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2)

model = create_model()
hist = model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])


Train on 15663 samples, validate on 3916 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25


In [85]:
docs = create_docs(df)
tokenizer = Tokenizer(lower=True, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=True, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

maxlen = 128

docs = pad_sequences(sequences=docs, maxlen=maxlen)

input_dim = np.max(docs) + 1
print(input_dim)

75104


In [87]:
epochs = 25
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2)

model = create_model2()
hist = model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 #callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
                )

ValueError: Input 0 is incompatible with layer gru_3: expected ndim=3, found ndim=2

In [None]:
test_df = pd.read_csv('test.csv')
docs = create_docs(test_df)
docs = tokenizer.texts_to_sequences(docs)
docs = pad_sequences(sequences=docs, maxlen=maxlen)
y = model.predict_proba(docs)


In [None]:
my_sub = pd.DataFrame(columns={'id', 'EAP','HPL', 'MWS'})
my_sub=my_sub[['id', 'EAP','HPL', 'MWS']]
my_sub

In [None]:
for i in range(len(y)):
    my_sub.loc[i] = [test_df['id'][i], y[i][0], y[i][1], y[i][2]]

In [None]:
my_sub

In [None]:
my_sub.to_csv('roberto_new_keras_fasttext.csv',index=False)