In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import model_selection, preprocessing, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import keras
from keras import layers, models, optimizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical

import os

# Seed for the train/hold-out split so a fresh "Restart & Run All" reproduces it.
SEED = 42
# Width of the one-hot target vectors built below.
NUM_CLASSES = 58

trainDF = pd.read_csv("train.csv")
testDF = pd.read_csv("test.csv")

# split the labelled data into a training part and a hold-out part
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    trainDF['title'], trainDF['Category'], random_state=SEED)

submission_x = testDF['title']

# label encode the target variable
# The encoder is fitted once, on every label in the training file, and the two
# splits are only *transformed*. Re-fitting on the hold-out labels rebuilds the
# class -> index mapping from whatever classes happen to be in that split, so
# the same category could end up with different indices in train_y and test_y.
encoder = preprocessing.LabelEncoder()
encoder.fit(trainDF['Category'])
train_y = encoder.transform(train_y)
test_y = encoder.transform(test_y)
# keep the integer-encoded hold-out labels before they are one-hot encoded
test_y_1dim = test_y

# one-hot encode the integer labels into rows of length NUM_CLASSES
train_y = to_categorical(train_y, num_classes=NUM_CLASSES)
test_y = to_categorical(test_y, num_classes=NUM_CLASSES)

print("done")

Using TensorFlow backend.


done


In [2]:
# train_y and test_y are converted from 1-dimensional label arrays to one-hot rows of shape (58,)
# i.e. label 20 becomes [0, 0, ..., 1, 0, 0, ..., 0], where the element at index 20 is 1
# so the output layer of the neural network has 58 softmax neurons, and each element is the probability of that class

# TF-IDF Vectors as Features
'''TF-IDF score represents the relative importance of a term 
in the document and the entire corpus. TF-IDF score is composed
by two terms: the first computes the normalized Term Frequency (TF), 
the second term is the Inverse Document Frequency (IDF), computed 
as the logarithm of the number of the documents in the corpus divided 
by the number of documents where the specific term appears.
TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
IDF(t) = log_e(Total number of documents / Number of documents with term t in it)
'''
print("Starting...")
# Word-level TF-IDF: each row is a title, each column the tf-idf score of a term.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    max_features=10000,  # the original carried the note "80091" here; meaning not shown -- TODO confirm
)
# The vocabulary is learned from every title in the training file.
tfidf_vect.fit(trainDF['title'])
# Project the training split, the hold-out split and the submission titles
# through the same fitted vectorizer.
xtrain_tfidf, xtest_tfidf, xsubmission_tfidf = (
    tfidf_vect.transform(titles) for titles in (train_x, test_x, submission_x)
)

print("Done")

Starting...
Done


In [3]:
from sklearn.metrics import accuracy_score
from keras.models import load_model

from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers.merge import concatenate
from numpy import argmax

In [4]:
# load models from file
def load_all_models(n_models):
    """Load the saved ensemble members from the working directory.

    Reads ``model_1.h5`` through ``model_<n_models>.h5`` with ``load_model``,
    printing one progress line per file.

    Args:
        n_models: how many consecutive model files to load, starting at 1.

    Returns:
        A list of the loaded models, in file-number order.
    """
    loaded = []
    # files are numbered from 1, not 0
    for index in range(1, n_models + 1):
        filename = 'model_' + str(index) + '.h5'
        loaded.append(load_model(filename))
        print('>loaded %s' % filename)
    return loaded


# define stacked model from multiple member input models
def define_stacked_model(members, n_classes=58, dropout_rate=0.3):
    """Build and compile a meta-learner stacked on top of the member models.

    Every layer of every member is frozen and renamed in place, the member
    outputs are concatenated, and a small dense network with a softmax head is
    placed on top.

    Args:
        members: list of already-loaded Keras models. They are mutated: their
            layers are marked non-trainable and get an ``ensemble_<k>_`` name
            prefix.
        n_classes: number of units in the softmax output layer.
        dropout_rate: fraction of units dropped after each hidden layer.

    Returns:
        The compiled multi-input Keras ``Model``; it has one input per member.
    """
    # update all layers in all models to not be trainable
    for index, member in enumerate(members, start=1):
        for layer in member.layers:
            # make not trainable
            layer.trainable = False
            # rename to avoid 'unique layer name' issue
            layer.name = 'ensemble_' + str(index) + '_' + layer.name
    # define multi-headed input
    ensemble_visible = [member.input for member in members]
    # concatenate merge output from each model
    ensemble_outputs = [member.output for member in members]
    merge = concatenate(ensemble_outputs)
    # Hidden stack: Dense -> Dropout, repeated with shrinking widths.
    # The rate is passed as a single float. The previous `Dropout(0, 3)` set
    # rate=0 (with 3 landing in the second positional argument), which applies
    # no dropout at all.
    hidden = merge
    for units in (200, 160, 120, 80):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(dropout_rate)(hidden)
    output = Dense(n_classes, activation='softmax')(hidden)
    model = Model(inputs=ensemble_visible, outputs=output)
    # compile
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [5]:
# fit a stacked model
def fit_stacked_model(model, inputX, inputy, epochs=10, batch_size=128, verbose=1):
    """Train the stacked model, feeding the same features to every input head.

    Args:
        model: multi-input model whose ``input`` attribute is a list with one
            entry per ensemble member.
        inputX: feature matrix; the same object is passed to every head.
        inputy: targets, forwarded unchanged to ``model.fit``.
        epochs: number of training epochs.
        batch_size: mini-batch size.
        verbose: verbosity level forwarded to ``model.fit``.

    Returns:
        None.
    """
    # one copy of the (same) input per ensemble head
    X = [inputX] * len(model.input)
    # fit model
    model.fit(X, inputy, epochs=epochs, verbose=verbose, batch_size=batch_size)

In [6]:
# make a prediction with a stacked model
def predict_stacked_model(model, inputX):
    """Run the stacked model on ``inputX`` and return its raw predictions.

    The same ``inputX`` object is supplied once for each entry of
    ``model.input``, because every ensemble head consumes the same features.

    Args:
        model: multi-input model whose ``input`` attribute is a list.
        inputX: feature matrix to predict on.

    Returns:
        Whatever ``model.predict`` returns for the replicated inputs.
    """
    head_count = len(model.input)
    return model.predict([inputX] * head_count)

In [7]:
# Show the shapes of the training and hold-out feature matrices.
print(xtrain_tfidf.shape, xtest_tfidf.shape)
# Load the saved base models that feed the stacked ensemble.
n_members = 5
members = load_all_models(n_models=n_members)
print('Loaded %d models' % len(members))

(499961, 10000) (166654, 10000)
>loaded model_1.h5
>loaded model_2.h5
>loaded model_3.h5
>loaded model_4.h5
>loaded model_5.h5
Loaded 5 models


In [8]:
# Build the meta-learner on top of the loaded base models.
stacked_model = define_stacked_model(members=members)
# Train the meta-learner on the hold-out split from train_test_split.
# NOTE(review): this is the only labelled data kept apart from train_x, so no
# unseen rows remain for judging the stacked model afterwards.
fit_stacked_model(model=stacked_model, inputX=xtest_tfidf, inputy=test_y)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# Every ensemble head receives the same hold-out feature matrix.
X = [xtest_tfidf] * len(stacked_model.input)
# Returns [loss, accuracy] for the compiled stacked model.
# NOTE(review): these are the same rows the stacked model was fitted on, so the
# score is optimistic rather than a true held-out estimate.
score = stacked_model.evaluate(X, test_y)
score



[0.632153760491017, 0.8098635673522949]

In [10]:
# Feed the submission features to every ensemble head.
X = [xsubmission_tfidf] * len(stacked_model.input)
# One row of class probabilities per submission title.
prediction = stacked_model.predict(X)
# Peek at the first ten class probabilities of the first row.
prediction[0][:10]

array([3.1399035e-05, 9.8280590e-03, 9.1453679e-05, 2.3663719e-03,
       4.3039350e-03, 9.8239648e-01, 7.5929273e-07, 2.1882850e-04,
       4.8091199e-05, 6.5912370e-04], dtype=float32)

In [11]:
# Collapse each row of class probabilities to the index of its largest entry.
# NOTE(review): this overwrites `prediction`, so the cell is not re-runnable
# without first re-running the prediction cell.
prediction = prediction.argmax(axis=1)
prediction

array([ 5,  5,  5, ..., 35, 33, 34], dtype=int64)

In [13]:
print("Generate Submission File ... ")
# NOTE(review): `prediction` holds encoded class indices; confirm they match
# the raw Category values expected in the submission file.
my_submission = pd.DataFrame({"itemid": testDF["itemid"], "Category": prediction})
my_submission.to_csv('submission.csv', index=False)
print("Done!")

Generate Submission File ... 
Done!
