In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.tree import DecisionTreeClassifier

from keras import layers, models, optimizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
import keras
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Fixed seed so the train / hold-out split is identical after a kernel restart.
SEED = 42
# Width of the one-hot label vectors, i.e. the number of classes the classifier predicts.
NUM_CLASSES = 58

trainDF = pd.read_csv("train.csv")
testDF = pd.read_csv("test.csv")

# Split the labelled data into a training part and a hold-out part.
# Note: despite their names, test_x / test_y are carved out of train.csv;
# the rows of test.csv are kept separately in submission_x.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    trainDF['title'], trainDF['Category'], random_state=SEED
)

submission_x = testDF['title']

# Label-encode the target variable.
# The encoder is fitted exactly once, on the complete label column, and is then
# only *applied* to each split. Re-fitting it on the hold-out labels would build
# a second, independent mapping: if any category were absent from the hold-out
# split, every later category would receive a different integer code than it
# has in the training labels, silently corrupting the evaluation.
encoder = preprocessing.LabelEncoder()
encoder.fit(trainDF['Category'])
train_y = encoder.transform(train_y)
test_y = encoder.transform(test_y)
# Keep the integer-coded hold-out labels before they are one-hot encoded below.
test_y_1dim = test_y

# Planned stacking scheme: split the dataset into 3 parts (train, valid, test).
# Train on the training part, predict the validation part, and use
# (valid_x, valid_y) as features to train another model. That last model then
# predicts test_x (the concatenation of the previous two models' predictions
# for test_x) and is scored against test_y.

# One-hot encode the labels: each integer label becomes a vector of length
# NUM_CLASSES, e.g. 20 -> [0, 0, ..., 1, 0, ..., 0] with the 1 at index 20.
# The network's softmax output layer therefore has NUM_CLASSES neurons, each
# giving the predicted probability of the corresponding class.
train_y = keras.utils.to_categorical(train_y, num_classes=NUM_CLASSES)
test_y = keras.utils.to_categorical(test_y, num_classes=NUM_CLASSES)

print("done")

Using TensorFlow backend.


done


In [2]:
# TF-IDF vectors as features
#
# The TF-IDF score represents the relative importance of a term in a document
# and in the entire corpus. It is the product of two terms: the normalized
# Term Frequency (TF) and the Inverse Document Frequency (IDF), the latter
# computed as the logarithm of the number of documents in the corpus divided
# by the number of documents in which the term appears.
#   TF(t)  = (number of times term t appears in a document) / (total number of terms in the document)
#   IDF(t) = log_e(total number of documents / number of documents containing term t)
print("Starting...")

# Word-level TF-IDF: matrix of TF-IDF scores of every term in every document.
# NOTE(review): 80091 is presumably the size of the full vocabulary - confirm.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=10000)  # 80091

# Fit the vocabulary and IDF weights on the training split only. Fitting on
# trainDF['title'] would also include the hold-out titles (test_x), leaking
# hold-out information into the features and making the hold-out score
# optimistic.
xtrain_tfidf = tfidf_vect.fit_transform(train_x)

# The hold-out and submission titles are only transformed with the fitted vectorizer.
xtest_tfidf = tfidf_vect.transform(test_x)
xsubmission_tfidf = tfidf_vect.transform(submission_x)

print("Done")

Starting...
Done


In [3]:
# fit model on dataset
def build_model(trainX, trainy, epochs=15, batch_size=128, dropout_rate=0.3):
    """Build, compile and fit a fully connected softmax classifier.

    The network is a stack of five ReLU ``Dense`` layers (256, 200, 160, 120
    and 80 units), each followed by ``Dropout``, and a final softmax layer.
    The input and output widths are taken from the arguments themselves rather
    than from notebook globals, so the function works for any feature matrix
    and any number of classes.

    Parameters
    ----------
    trainX : 2-D array-like with a ``shape`` attribute
        Feature matrix; ``trainX.shape[1]`` sets the input dimension.
    trainy : 2-D array-like with a ``shape`` attribute
        One-hot encoded targets; ``trainy.shape[1]`` sets the number of
        output units.
    epochs : int, default 15
        Number of training epochs passed to ``model.fit``.
    batch_size : int, default 128
        Mini-batch size passed to ``model.fit``.
    dropout_rate : float, default 0.3
        Rate used for every ``Dropout`` layer.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    n_features = trainX.shape[1]
    n_classes = trainy.shape[1]
    hidden_units = (256, 200, 160, 120, 80)

    # define model
    model = Sequential()
    for position, units in enumerate(hidden_units):
        if position == 0:
            # Only the first layer needs to be told the input width.
            model.add(Dense(units, input_dim=n_features, activation='relu'))
        else:
            model.add(Dense(units, activation='relu'))
        model.add(Dropout(dropout_rate))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    # fit model
    model.fit(trainX, trainy, epochs=epochs, verbose=1, batch_size=batch_size)
    return model

In [4]:
# Train n_members networks on the same TF-IDF training features. Every call to
# build_model constructs and fits a new network, which is then written to its
# own HDF5 file: model_1.h5 ... model_<n_members>.h5.
n_members = 5
for i in range(n_members):
    # Files are numbered from 1 while the loop index starts at 0.
    filename = 'model_%d.h5' % (i + 1)
    model = build_model(xtrain_tfidf, train_y)
    model.save(filename)
    print('>Saved %s' % filename)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               2560256   
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 200)               51400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 160)               32160     
_________________________________________________________________
dropout_3 (Dropout)          (None, 160)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 120)              

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
>Saved model_3.h5
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 256)               2560256   
_________________________________________________________________
dropout_16 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 200)               51400     
_________________________________________________________________
dropout_17 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_21 (Dense)             (None, 160)               32160     
_______________________________________________________________

Epoch 13/15
Epoch 14/15
Epoch 15/15
>Saved model_5.h5
