In [96]:
import pandas as pd

In [97]:
# Read the training and test splits from the local data/ directory.
train, test = pd.read_csv("data/train.csv"), pd.read_csv("data/test.csv")

In [98]:
train.shape

(19579, 3)

In [99]:
train[:5]

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [100]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the bag-of-words vocabulary on train + test text so that every token
# seen in either split gets its own column.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way.
dataset = pd.concat([train, test])
word_dict = CountVectorizer().fit(dataset["text"])
# Number of distinct tokens; used as the input width of the dense model.
dict_size = len(word_dict.vocabulary_)
dict_size

28300

In [101]:
import numpy as np

# Smallest column index the fitted vectorizer assigned to any token.
np.min(list(word_dict.vocabulary_.values()))

0

In [102]:
x_train = word_dict.transform(train["text"])
x_train.shape

(19579, 28300)

In [115]:
def one_hot_y(y):
    """One-hot encode the ``author`` column of ``y``.

    Author codes are mapped to integers (EAP -> 0, HPL -> 1, MWS -> 2) and the
    resulting column is passed through ``OneHotEncoder``.

    Args:
        y: DataFrame with an ``author`` column holding the author codes
            (values that are already integer-encoded pass through unchanged).

    Returns:
        Whatever ``OneHotEncoder().fit_transform`` returns for the encoded
        column, one row per row of ``y``.

    Unlike the previous version, this neither modifies ``y`` in place nor
    reads the global ``train`` frame: it encodes exactly the frame it is given.
    """
    from sklearn.preprocessing import OneHotEncoder

    author_to_index = {"EAP": 0, "HPL": 1, "MWS": 2}
    # map() builds a new Series, so the caller's frame keeps its string labels.
    # Unknown values (e.g. labels that were already replaced by integers) are
    # passed through untouched, matching what the chained replace() calls did.
    labels = y["author"].map(lambda author: author_to_index.get(author, author))

    # .values replaces Series.as_matrix(), which was removed in pandas 1.0.
    return OneHotEncoder().fit_transform(labels.values.reshape(-1, 1))

In [116]:
y_train = one_hot_y(train)
y_train.shape

(19579, 3)

In [104]:
# The dense network below is fed a dense array. toarray() yields a plain
# ndarray (todense() returns the discouraged np.matrix type), and the guard
# makes the cell safe to re-run once x_train has already been converted.
if hasattr(x_train, "toarray"):
    x_train = x_train.toarray()

In [105]:
# Densify the one-hot targets. toarray() yields a plain ndarray (todense()
# returns the discouraged np.matrix type), and the guard makes the cell safe
# to re-run once y_train has already been converted.
if hasattr(y_train, "toarray"):
    y_train = y_train.toarray()

In [106]:
import keras
from keras.layers import Dense, Embedding, Bidirectional, LSTM, GlobalMaxPooling1D, Dropout
from keras.models import  Sequential

In [107]:
# Bag-of-words classifier: two tanh hidden layers feeding a 3-way softmax.
model = Sequential()
model.add(Dense(16, input_shape=(dict_size,), activation="tanh"))
model.add(Dense(32, activation="tanh"))
model.add(Dense(3, activation="softmax"))

In [108]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [109]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 16)                452816    
_________________________________________________________________
dense_34 (Dense)             (None, 32)                544       
_________________________________________________________________
dense_35 (Dense)             (None, 3)                 99        
Total params: 453,459
Trainable params: 453,459
Non-trainable params: 0
_________________________________________________________________


In [110]:
from keras.callbacks import EarlyStopping
# Stop a fit once val_loss has gone 2 consecutive epochs without improving.
# This single callback instance is passed to each model.fit call below.
# NOTE(review): with the 2-3 epoch budgets used below, patience=2 can hardly
# ever cut a run short — confirm the intended patience/epoch combination.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)

In [111]:
# Train the bag-of-words model. validation_split=0.2 holds out 20% of the rows
# (3916 of 19579 per the output below) to compute the val_loss that
# early_stopping monitors.
model.fit(x_train, y_train, epochs=2, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

Train on 15663 samples, validate on 3916 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f1027257a20>

In [112]:
model.save("count_vect_dense_0.39.h5")

In [172]:
from keras.models import load_model
def predict(x_test, model_name=""):
    """Load the Keras model stored at ``model_name`` and score ``x_test`` with it.

    Args:
        x_test: Feature array, passed to the model as its single input.
        model_name: Path of the saved model file handed to ``load_model``.

    Returns:
        The output of the loaded model's ``predict`` call.
    """
    loaded = load_model(model_name)
    return loaded.predict([x_test])

In [197]:
# Vectorise the test texts with the fitted vocabulary, densify them, and score
# them with the saved bag-of-words model.
x_test = word_dict.transform(test["text"]).todense()
prediction = predict(x_test, "count_vect_dense_0.39.h5")

In [198]:
prediction.shape

(8392, 3)

In [199]:
prediction[:5]

array([[  8.27568695e-02,   2.68017855e-02,   8.90441298e-01],
       [  9.94316995e-01,   5.60619356e-03,   7.68447426e-05],
       [  3.03895418e-02,   9.67859566e-01,   1.75093161e-03],
       [  8.21188927e-01,   1.78083554e-01,   7.27521547e-04],
       [  8.82422626e-01,   6.79892153e-02,   4.95881550e-02]], dtype=float32)

In [200]:
# Column labels for the three softmax outputs. The order has to match the
# integer codes assigned in one_hot_y (EAP=0, HPL=1, MWS=2).
classes=["EAP", "HPL", "MWS"]

In [202]:
def to_submission(prediction):
    """Wrap ``prediction`` in a submission DataFrame.

    The frame gets one column per entry of the global ``classes`` list and is
    indexed by the ``id`` column of the global ``test`` frame.
    """
    frame = pd.DataFrame(data=prediction, index=test["id"], columns=classes)
    return frame

In [203]:
submission_count_vect_dense = to_submission(prediction)
submission_count_vect_dense[:5]

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.082757,0.026802,0.890441
id24541,0.994317,0.005606,7.7e-05
id00134,0.03039,0.96786,0.001751
id27757,0.821189,0.178084,0.000728
id04081,0.882423,0.067989,0.049588


In [204]:
submission_count_vect_dense.to_csv("subm_count_vect_dense.csv")
submission_count_vect_dense.shape

(8392, 3)

In [151]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [152]:
# Mean length of a training text. len() is applied to each raw string, so this
# is measured in characters, not in tokens.
avg_text_sample_len = sum(map(len, train["text"])) / len(train["text"])
avg_text_sample_len

149.05740844782676

In [153]:
# Vocabulary cap: passed to Tokenizer(num_words=...) and used as the input
# size of the Embedding layers below.
max_features = 20000
# Length (in tokens) that pad_sequences pads/truncates every sequence to.
max_len = 120

In [154]:
tokenizer = Tokenizer(num_words=max_features)

In [155]:
# Stack the train and test texts into the corpus the tokenizer is fitted on.
# Series.append was removed in pandas 2.0; pd.concat builds the same Series.
corpus = pd.concat([train["text"], test["text"]])
corpus.shape

(27971,)

In [156]:
tokenizer.fit_on_texts(corpus)

In [157]:
# Turn each training text into a sequence of token ids, then pad/truncate
# every sequence to max_len.
x_train = pad_sequences(tokenizer.texts_to_sequences(train["text"]), maxlen=max_len)

In [158]:
x_train.shape

(19579, 120)

In [159]:
y_train.shape

(19579, 3)

In [161]:
# y_train only has .todense() while it is still the sparse output of
# one_hot_y. If it was already densified earlier in the notebook, calling
# .todense() again raises AttributeError, so convert only when needed.
if hasattr(y_train, "todense"):
    y_train = y_train.todense()

In [162]:
x_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,   26, 3334,  139, 1295,   22,   36,  285,    2, 6426,
          1, 2341,    2,   10, 4400,   16,    5,   80,  187,   47, 3805,
          3,  303,    4,    1,  263, 2208,    5,  320,   74,  136,  127,
        922,    2,    1,  306,   40, 1488, 4175,   98,    1,  443], dtype=int32)

А сега нека да пробваме и LSTM layer.

In [163]:
# Sequence classifier: embedding -> bidirectional LSTM -> max over time steps
# -> dropout-regularised dense head with a 3-way softmax.
model = Sequential()
model.add(Embedding(max_features, 64, input_length=x_train.shape[1]))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.3))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(3, activation="softmax"))

In [164]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [165]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 120, 64)           1280000   
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 120, 64)           24832     
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 64)                0         
_________________________________________________________________
dropout_21 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_36 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_22 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_37 (Dense)             (None, 3)                 195       
Total para

In [166]:
model.fit(x_train, y_train, epochs=3, batch_size=256, validation_split=0.2, callbacks=[early_stopping])

Train on 15663 samples, validate on 3916 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f0c8085be48>

In [170]:
model.save("lstm_max_pool.h5")

In [206]:
# Encode the test texts exactly like the training texts: token ids, then
# pad/truncate to max_len.
x_test = pad_sequences(tokenizer.texts_to_sequences(test["text"]), maxlen=max_len)

In [207]:
prediction = predict(x_test,model_name="lstm_max_pool.h5")

In [208]:
submission_lstm = to_submission(prediction)

In [209]:
submission_lstm[:5]

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.024993,0.0126,0.962407
id24541,0.999607,0.000332,6.1e-05
id00134,0.006904,0.992085,0.001011
id27757,0.990787,0.008419,0.000794
id04081,0.547632,0.298268,0.1541


In [210]:
# Unweighted mean of the LSTM and bag-of-words predictions; the frames are
# aligned on their shared id index.
ensembled = submission_lstm.add(submission_count_vect_dense).div(2)
ensembled[:5]

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.053875,0.019701,0.926424
id24541,0.996962,0.002969,6.9e-05
id00134,0.018647,0.979972,0.001381
id27757,0.905988,0.093251,0.000761
id04081,0.715027,0.183128,0.101844


In [211]:
ensembled.to_csv("ensembled_lstm+count_vect_dense.csv")

In [217]:
# Load a previously written submission file (TF-IDF + MNB, judging by its
# name). index_col="id" gives it the same id index as the frames built by
# to_submission, so the arithmetic below aligns row by row.
tf_mnb = pd.read_csv("./submission/submit_Tfidf_MNB_text_boosted.csv", index_col="id")

In [218]:
tf_mnb[:5]

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.048463,0.001949,0.949587
id24541,0.978842,0.016705,0.004453
id00134,0.005973,0.992195,0.001832
id27757,0.777307,0.219594,0.003098
id04081,0.59694,0.320374,0.082687


In [223]:
# Weighted mean of the three submissions: weight 1 each for the two neural
# models and weight 3 for tf_mnb (weights sum to 5).
ensembled = submission_lstm.add(submission_count_vect_dense).add(tf_mnb.mul(3)).div(5)
ensembled[:5]

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.050628,0.00905,0.940322
id24541,0.98609,0.011211,0.002699
id00134,0.011042,0.987306,0.001652
id27757,0.828779,0.169057,0.002163
id04081,0.644175,0.265476,0.09035


In [224]:
ensembled.to_csv("ensembled_lstm+count_vect_dense+tf_idf_mb.csv")

ensembled_lstm+count_vect_dense+tf_idf_mb.csv

0.32914

In [54]:
from keras.layers import Flatten

In [74]:
# Variant of the LSTM classifier that flattens all time steps (instead of
# max-pooling them) before a deeper dropout-regularised dense head.
model = Sequential()
model.add(Embedding(max_features, 64, input_length=x_train.shape[1]))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.4))
model.add(Dense(3, activation="softmax"))

In [75]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [76]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 120, 64)           640000    
_________________________________________________________________
bidirectional_8 (Bidirection (None, 120, 64)           24832     
_________________________________________________________________
flatten_5 (Flatten)          (None, 7680)              0         
_________________________________________________________________
dropout_16 (Dropout)         (None, 7680)              0         
_________________________________________________________________
dense_25 (Dense)             (None, 128)               983168    
_________________________________________________________________
dropout_17 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 64)                8256      
__________

In [77]:
# Train the flattened-LSTM model with the same 20% validation hold-out.
# NOTE(review): the saved output below reads "Epoch x/4" although epochs=3
# here, so that output comes from an earlier version of this cell — re-run
# the cell to refresh it.
model.fit(x_train, y_train, epochs=3, batch_size=256, validation_split=0.2, callbacks=[early_stopping])

Train on 15663 samples, validate on 3916 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4


<keras.callbacks.History at 0x7f0e6e708c50>

In [65]:
model.save("flat_lstm.h5")