In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [3]:
train.shape

(19579, 3)

In [4]:
train[:5]

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

dataset = train.append(test)
dataset.shape
word_dict = CountVectorizer().fit(dataset["text"])
dict_size = len(word_dict.vocabulary_) + 1
dict_size

28301

In [6]:
import numpy as np

np.array(list(word_dict.vocabulary_.values())).min()

0

In [None]:
# import nltk
# nltk.download('punkt')

In [7]:
from nltk import word_tokenize

invalid_word_idx = 0

train["encoded"] = train["text"].apply(lambda x: 
                              [word_dict.vocabulary_[word.lower()] + 1 
                               if word.lower() in word_dict.vocabulary_ 
                               else invalid_word_idx
                               for word in word_tokenize(x) 
                               if word.isalpha() ])
train["encoded_len"] = train["encoded"].apply(lambda x: len(x))
train[:5]                              

Unnamed: 0,id,text,author,encoded,encoded_len
0,id26305,"This process, however, afforded me no means of...",EAP,"[24990, 19301, 12123, 511, 15384, 16656, 15404...",41
1,id17569,It never once occurred to me that the fumbling...,HPL,"[13598, 16558, 17118, 16985, 25230, 15384, 248...",14
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[12624, 11896, 14355, 11397, 27456, 0, 10915, ...",36
3,id27763,How lovely is spring As we looked from Windsor...,MWS,"[12122, 14826, 13567, 23393, 1457, 27521, 1475...",34
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,"[9668, 16754, 8114, 16745, 8785, 10915, 24887,...",27


In [8]:
train["encoded_len"].describe()

count    19579.000000
mean        26.639512
std         19.003268
min          0.000000
25%         15.000000
50%         23.000000
75%         34.000000
max        862.000000
Name: encoded_len, dtype: float64

Явно има изречения с по 0 думи, ще ги филтрирам не виждам как ще носят информация :)

In [9]:
train = train [train["encoded_len"] > 2]
train["encoded_len"].describe()

count    19572.000000
mean        26.648426
std         19.000813
min          3.000000
25%         15.000000
50%         23.000000
75%         34.000000
max        862.000000
Name: encoded_len, dtype: float64

In [10]:
max_sentence_len = int(train["encoded_len"].describe()["max"])
max_sentence_len

862

Ще подравня всички изречения да имат равен брой думи, за да мога да ползвам фиксиран ембединг

In [11]:
train_padded = pd.DataFrame()
def padd_sentence(sentence):
    diff = max_sentence_len - len(sentence)
    for _ in range(diff):
        sentence.append(invalid_word_idx)
    
    return sentence
    
train_padded["encoded"] = train["encoded"].apply(padd_sentence)
train_padded["encoded_len"] = train_padded["encoded"].apply(lambda x: len(x))
train_padded[:5]

Unnamed: 0,encoded,encoded_len
0,"[24990, 19301, 12123, 511, 15384, 16656, 15404...",862
1,"[13598, 16558, 17118, 16985, 25230, 15384, 248...",862
2,"[12624, 11896, 14355, 11397, 27456, 0, 10915, ...",862
3,"[12122, 14826, 13567, 23393, 1457, 27521, 1475...",862
4,"[9668, 16754, 8114, 16745, 8785, 10915, 24887,...",862


In [12]:
x_train = np.array([ sentence for sentence in train_padded["encoded"]])
x_train = x_train / dict_size
x_train.shape

(19572, 862)

In [13]:
train["author"] = train["author"].replace("EAP",0)
train["author"] = train["author"].replace("HPL",1)
train["author"] = train["author"].replace("MWS",2)

from sklearn.preprocessing import OneHotEncoder

y_train = OneHotEncoder().fit_transform(train["author"].as_matrix().reshape(-1, 1))
y_train.shape

(19572, 3)

In [14]:
y_train = y_train.todense()

In [15]:
import keras
from keras.layers import Dense, Embedding, Bidirectional, Input, LSTM, Bidirectional
from keras.models import Model, Sequential
from keras.layers.recurrent import SimpleRNN
from keras.layers.wrappers import TimeDistributed
from keras.layers import Dropout, Reshape, Flatten

Using TensorFlow backend.


In [None]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [16]:
sequence_input = Input(shape=(max_sentence_len,))
embedding_layer = Embedding(input_dim=max_sentence_len, output_dim=128 )
embedded_sequences = embedding_layer(sequence_input)
l_lstm = Bidirectional(LSTM(64))(embedded_sequences)
dropout = Dropout(0.5)(l_lstm)
preds = Dense(3, activation="softmax")(dropout)
model = Model(sequence_input, preds)

In [17]:
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["acc"])


In [18]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 862)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 862, 128)          110336    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 387       
Total params: 209,539
Trainable params: 209,539
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x_train, y_train,epochs=16, validation_split=0.2, batch_size=256)

In [None]:
model.save("bidirection_lstm.h5") # loss > 1 

Ще пробвам нов модел с по-малък embedding layer и 1 lstm

In [20]:
sequence_input = Input(shape=(max_sentence_len,))
embedding_layer = Embedding(input_dim=max_sentence_len, output_dim=64 )
embedded_sequences = embedding_layer(sequence_input)
lstm = LSTM(64)(embedded_sequences)
preds = Dense(3, activation="softmax")(lstm)
model = Model(sequence_input, preds)

In [21]:
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["acc"])


In [22]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 862)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 862, 64)           55168     
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 195       
Total params: 88,387
Trainable params: 88,387
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x_train, y_train,epochs=12, validation_split=0.2, batch_size=256)

In [None]:
model.save("lstm.h5") # loss > 1

Без успех, следващото моля.

In [23]:
model = Sequential([
    Embedding(input_dim=max_sentence_len, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation="softmax")
])

In [24]:
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])


In [25]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 128)         110336    
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 387       
Total params: 242,307
Trainable params: 242,307
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x_train, y_train, batch_size=512,epochs=15,validation_split=0.2)

In [None]:
# wont save it loss > 1

Ще пробвам и с нормална feed forward, но с embedding

In [26]:
model = Sequential([
    Embedding(max_sentence_len, 84, input_length=max_sentence_len),
    Dropout(0.2),
    Flatten(),
    Dense(64, activation="tanh"),
    Dropout(0.2),
    Dense(32, activation="tanh"),
    Dense(16, activation="tanh"),
    Dense(3, activation="softmax")
])

In [27]:
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])


In [28]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 862, 84)           72408     
_________________________________________________________________
dropout_2 (Dropout)          (None, 862, 84)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 72408)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                4634176   
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_7 (Dense)              (None, 16)                528       
__________

In [None]:
model.fit(x_train, y_train, batch_size=256,epochs=12,validation_split=0.2)

Ще натренирам мой собствен елементарен Embedding layer

In [29]:
input_layer = Input(shape=(max_sentence_len,))
embedded = Dense(64, activation="relu")(input_layer)
decoded = Dense(max_sentence_len, activation="softmax")(embedded)

embedded_encoded = Model(input_layer, decoded)

In [30]:
embedded_encoded.compile(optimizer="adadelta", loss="binary_crossentropy")

In [31]:
embedded_encoded.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 862)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                55232     
_________________________________________________________________
dense_10 (Dense)             (None, 862)               56030     
Total params: 111,262
Trainable params: 111,262
Non-trainable params: 0
_________________________________________________________________


In [None]:
embedded_encoded.fit(x_train, x_train, batch_size=256,epochs=512,validation_split=0.2)

In [None]:
embedded_encoded.save("embedding_64.h5")

In [33]:
embedding = embedded_encoded.layers[1]

In [35]:
input_layer = Input((max_sentence_len,))
embedded_layer = embedding(input_layer)
hidden_1 = Dense(1024, activation="relu")(embedded_layer)
dropout = Dropout(0.2)(hidden_1)
hidden_2 = Dense(256, activation="relu")(dropout)

pred = Dense(3, activation="softmax")(hidden_2)

model = Model(input_layer, pred)

In [None]:
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])


In [36]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 862)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                55232     
_________________________________________________________________
dense_11 (Dense)             (None, 1024)              66560     
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_12 (Dense)             (None, 256)               262400    
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 771       
Total params: 384,963
Trainable params: 384,963
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x_train, y_train, batch_size=256,epochs=128,validation_split=0.2)

С нашия ембединг резултатите са малко по-добри

In [37]:
model = Sequential([
    Embedding(max_sentence_len, output_dim=64),
    LSTM(32),
    Dropout(0.5),
    Dense(3, activation="softmax")
])

In [38]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["acc"])

In [39]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 64)          55168     
_________________________________________________________________
lstm_5 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 3)                 99        
Total params: 67,683
Trainable params: 67,683
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x_train, y_train, epochs=12, batch_size=128, validation_split=0.2) # loss > 1