In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
import keras
%config Completer.use_jedi = False

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, GlobalAveragePooling1D , Embedding
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split


In [None]:
import zipfile
# Unpack the training archive into the current working directory.
with zipfile.ZipFile('../input/spooky-author-identification/train.zip', mode='r') as archive:
    archive.extractall(path='./')

In [None]:
# Unpack the test archive into the current working directory.
with zipfile.ZipFile('../input/spooky-author-identification/test.zip', mode='r') as archive:
    archive.extractall(path='./')

In [None]:
# Unpack the sample-submission archive into the current working directory.
with zipfile.ZipFile('../input/spooky-author-identification/sample_submission.zip', mode='r') as archive:
    archive.extractall(path='./')

In [None]:
# Read the training CSV from the working directory into a DataFrame
# (presumably the file unpacked from train.zip -- verify the archive contents).
train = pd.read_csv('./train.csv')

In [None]:
# Show the first rows of the training frame as a quick sanity check of the load.
train.head()

In [None]:
# Author code -> integer class id. The ids are also used later as column
# indices into the model's probability output.
a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
# Integer class id for every training row, then one-hot encoded targets.
author_ids = [a2c[author] for author in train.author]
y = to_categorical(np.asarray(author_ids))

In [None]:
def preprocess(text):
    """Pad punctuation with spaces so that ``str.split()`` yields it as separate tokens.

    An apostrophe followed by a space is detached from the preceding word first;
    after that every occurrence of ``, . : ; ? !`` is surrounded by one space on
    each side.

    Returns the padded string (the input is returned unchanged if nothing matched).
    """
    spaced = text.replace("' " , " ' ")
    # Replacing a mark that does not occur is a no-op, so no membership check is needed.
    for mark in ',.:;?!':
        spaced = spaced.replace(mark, ' {} '.format(mark))
    return spaced

In [None]:
def create_docs(df, n_gram_max=2):
    """Turn every entry of ``df.text`` into a space-joined string of tokens plus word n-grams.

    Each text is passed through ``preprocess`` and split on whitespace. For every
    ``n`` from 2 to ``n_gram_max`` the consecutive n-token windows are joined with
    ``'--'`` and appended after the single tokens.

    Returns a list of strings, one per text, in the same order as ``df.text``.
    """
    def add_ngram(q, n_gram_max):
        # All windows of size n (n = 2..n_gram_max), ordered by n, then by start position.
        joined = [
            '--'.join(q[start:start + n])
            for n in range(2, n_gram_max + 1)
            for start in range(len(q) - n + 1)
        ]
        return q + joined

    return [
        ' '.join(add_ngram(preprocess(text).split(), n_gram_max))
        for text in df.text
    ]

In [None]:
# Vocabulary construction: keep only tokens that occur at least `min_count` times.
min_count = 2
docs = create_docs(train)
# First pass: fit an unrestricted tokenizer just to obtain the token frequencies.
tokenizer = Tokenizer(lower = False , filters='')
tokenizer.fit_on_texts(docs)
# Tokenizer(num_words=N) reserves index 0 and keeps only indices < N, i.e. the
# N - 1 most frequent tokens. Add 1 so that every token with count >= min_count
# is retained instead of silently dropping one of them.
num_words = sum(1 for count in tokenizer.word_counts.values() if count >= min_count) + 1
# Second pass: same settings, now restricted to the frequent vocabulary.
tokenizer = Tokenizer(num_words=num_words, lower = False , filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)
# Bring every id sequence to a fixed length of `maxlen` (padding / truncating).
maxlen = 256
docs = pad_sequences(sequences=docs , maxlen=maxlen)

In [None]:
# Embedding width; not referenced by the other cells visible here
# (create_model carries its own default of 20).
embedding_dim = 20
# Embedding table size: largest token id in the padded training matrix, plus one.
input_dim = docs.max() + 1

In [None]:
def create_model(embedding_dims=20, optimizer='adam'):
    """Build and compile the averaged-embedding classifier.

    The network embeds each token id, averages the embeddings over the sequence
    axis and feeds the result to a 3-way softmax layer.

    Args:
        embedding_dims: width of the embedding vectors.
        optimizer: optimizer passed to ``compile``.

    Returns:
        The compiled ``Sequential`` model. The embedding table size is read from
        the module-level ``input_dim``.
    """
    layers = [
        Embedding(input_dim=input_dim, output_dim=embedding_dims),
        GlobalAveragePooling1D(),
        Dense(3, activation='softmax'),
    ]
    classifier = Sequential(layers)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# Upper bound on epochs; early stopping normally ends training sooner.
epochs = 50
# Hold out 20% of the rows for validation. The fixed seed makes the split
# identical on every run so results are comparable.
x_train , x_test , y_train, y_test = train_test_split(
    docs , y , test_size=0.2, random_state=42)
model = create_model()
hist = model.fit(x_train , y_train ,
                batch_size = 16,
                validation_data = (x_test , y_test),
                epochs = epochs,
                # Stop after 3 epochs without val_loss improvement and roll back to the
                # best epoch's weights, so predictions do not use the degraded final weights.
                callbacks = [EarlyStopping(patience = 3 , monitor = 'val_loss',
                                           restore_best_weights = True)])

In [None]:
test_df = pd.read_csv('./test.csv')
# Same pipeline as for the training texts: n-gram documents -> token ids ->
# fixed-length matrix. The already-fitted tokenizer is reused, not refitted.
docs = pad_sequences(
    sequences=tokenizer.texts_to_sequences(create_docs(test_df)),
    maxlen=maxlen,
)

In [None]:
# `Sequential.predict_proba` was removed from tf.keras (TF 2.6). The model ends in a
# softmax layer, so `predict` already returns the per-class probabilities.
# NOTE: this rebinds `y`, which held the training targets in earlier cells.
y = model.predict(docs)
result = pd.read_csv('./sample_submission.csv')
# Fill each author column with the probability column of its class id.
for a, i in a2c.items():
    result[a] = y[:, i]
# Last expression of the cell so the filled-in frame is actually displayed.
result.head()

In [None]:
# Write the submission frame to CSV, omitting the DataFrame index column.
result.to_csv('fast_text.csv' , index = False)