In [1]:
import os
# Expose only the GPU with index 1 to CUDA. This has to be set before the
# deep-learning backend is imported in the following cell.
GPU_INDEX = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_INDEX

In [2]:
from __future__ import print_function

import keras
from keras.models import Sequential
from keras.layers import Conv1D, Dense, Dropout, Flatten, BatchNormalization
from keras.callbacks import EarlyStopping, CSVLogger

from loader import load_20news
from custom_layer import MonteCarloLRF, SeparableMonteCarloLRF, SeparableMonteCarloMaxPoolingV2, RandomLRF

import numpy as np
import sklearn as sk
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Experiment configuration.
# The first three values are forwarded to `load_20news` and are also encoded
# in the name of the probability cache file.
top_words = 10000
sparse = False
remove_short_documents = True

# Identifier of this run; used as the prefix of the CSV training log.
notebook = 'mcNet_top10k_temptative_42'

In [4]:
# Load the 20 Newsgroups data through the project loader, using the settings
# from the configuration cell. `graph_data` is used later as the input of the
# pairwise-distance computation.
loader_options = dict(data_home='data',
                      top_words=top_words,
                      sparse=sparse,
                      remove_short_documents=remove_short_documents,
                      verbose=False)
(input_shape, nb_classes), (X_train, X_test, Y_train, Y_test), graph_data = \
    load_20news(**loader_options)

In [5]:
# Force both splits into an explicit 3-D layout built from their own first
# three dimensions (presumably (samples, words, channels) -- TODO confirm
# against the loader).
train_dims = (X_train.shape[0], X_train.shape[1], X_train.shape[2])
test_dims = (X_test.shape[0], X_test.shape[1], X_test.shape[2])
X_train = X_train.reshape(*train_dims)
X_test = X_test.reshape(*test_dims)

# The width of the label matrix gives the number of output units of the model.
num_classes = Y_train.shape[1]

In [6]:
%%time
# The neighbour-probability matrix is expensive to compute, so it is cached on
# disk: the computation in the next cell only runs when this file is missing.
# The file name encodes the loader settings so different configurations do not
# share a cache.
path = ('probabilities_' +
        'top' + str(top_words) +
        '_sparse' + str(sparse) +
        '_removeShorts' + str(remove_short_documents) +
        '_tfidf.pkl')
if os.path.isfile(path):
  # NOTE(security): pickle.load can execute arbitrary code; only load cache
  # files that were produced by this notebook.
  # The context manager closes the file handle deterministically.
  with open(path, "rb") as cache_file:
    probabilities = pickle.load(cache_file, encoding='latin1')

CPU times: user 57.9 ms, sys: 125 ms, total: 183 ms
Wall time: 182 ms


In [7]:
# Build the neighbour-probability matrix from pairwise distances between the
# rows of `graph_data`, then cache it. Skipped entirely when the cache exists.
if not(os.path.isfile(path)):
  # Explicit import: `import sklearn as sk` alone does not guarantee that the
  # `sk.metrics.pairwise` submodule has been loaded.
  from sklearn.metrics import pairwise_distances

  METRIC = 'cosine'#'euclidean'
  distances = pairwise_distances(graph_data, metric=METRIC, n_jobs=-2)

  # enforce exact zero on the diagonal (self-distance)
  np.fill_diagonal(distances, 0.)

  # max normalize, row by row (each row ends up in [0, 1])
  #distances /= distances.max()
  distances /= distances.max(axis=1, keepdims=True)

  # use tricube kernel (because of flatness around 0)
  probabilities = (1. - np.abs(distances) ** 3) ** 3

  # remove auto connections (which are taken anyway in LRF)
  np.fill_diagonal(probabilities, 0.)

  # normalize proba: every row sums to 1
  probabilities /= np.sum(probabilities, axis=1, keepdims=True)

  # normalize by neighbour frequency (idf): divide each COLUMN j by its
  # column sum. The sums must broadcast as a (1, n) row vector; a (n, 1)
  # column vector would only rescale whole rows, which the renormalization
  # below cancels out, making this step a no-op.
  # NOTE: cache files written before this fix hold the un-weighted matrix
  # and have to be deleted to be regenerated.
  probabilities /= np.sum(probabilities, axis=0, keepdims=True)

  # renormalize proba: every row sums to 1 again
  probabilities /= np.sum(probabilities, axis=1, keepdims=True)

  # pickled for later use; the context manager guarantees flush and close
  with open(path, "wb") as cache_file:
    pickle.dump(probabilities, cache_file)

In [8]:
# Baseline switch: when enabled, the learned/cached probabilities are replaced
# by a uniform distribution over all other nodes (only the shape is reused).
USE_UNIFORM_PROBABILITIES = True
if USE_UNIFORM_PROBABILITIES:
  probabilities = np.ones(probabilities.shape)

  # remove auto connections (which are taken anyway in LRF)
  np.fill_diagonal(probabilities, 0.)

  # renormalize proba: every row sums to 1
  probabilities /= np.sum(probabilities, axis=-1, keepdims=True)

  # Two-sided check: without abs() a row summing to far less than 1 would
  # pass silently.
  assert (np.abs(np.sum(probabilities, axis=-1) - 1) < 0.000001).all()

In [9]:
# Mini-batch size used by the training cell.
batch_size = 64

# Per-sample input shape taken from the training tensor.
sample_shape = (X_train.shape[1], X_train.shape[2])

# Network: one RandomLRF layer driven by the neighbour probabilities, followed
# by a dense classifier head with dropout.
layer_stack = [
    RandomLRF(probabilities, LRF_size=2, filters=1, activation='relu',
              input_shape=sample_shape),
    # Disabled variants kept for reference:
    #BatchNormalization(axis=1),
    #Conv1D(1, kernel_size=1, activation='relu',
    #       padding='same',kernel_initializer='he_uniform'),
    #SeparableMonteCarloMaxPoolingV2(LRF_size=4, new_size=2500),
    Dropout(0.2),
    Flatten(),
    Dense(500, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
]

model = Sequential()
for layer in layer_stack:
    model.add(layer)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
random_lrf_1 (RandomLRF)     (None, 10000, 1)          3         
_________________________________________________________________
dropout_1 (Dropout)          (None, 10000, 1)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 500)               5000500   
_________________________________________________________________
dropout_2 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 20)                10020     
Total params: 5,010,523
Trainable params: 5,010,523
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Multi-class classification set-up: softmax output + categorical
# cross-entropy, optimised with Adam.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Stop after 2 epochs without an improvement of at least 0.001, and log
# per-epoch metrics to '<notebook>_log.csv'.
early_stopper = EarlyStopping(min_delta=0.001, patience=2)
csv = CSVLogger(notebook + '_log.csv')

# NOTE(review): the test split doubles as validation data, so early stopping
# is driven by the same samples the final score is reported on.
fit_options = dict(batch_size=batch_size,
                   epochs=20,
                   verbose=1,
                   callbacks=[early_stopper, csv],
                   validation_data=(X_test, Y_test))
history = model.fit(X_train, Y_train, **fit_options)

# `score` holds [loss, accuracy], matching the compiled loss and metric list.
score = model.evaluate(X_test, Y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]

print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Train on 10168 samples, validate on 7071 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Test loss: 1.0503242312386694
Test accuracy: 0.7182859567499383


In [11]:
%%bash
# Show the last lines of the per-epoch training log written by CSVLogger.
# The file name is hard-coded: keep it in sync with the `notebook` variable
# ('<notebook>_log.csv') defined in the configuration cell.
tail mcNet_top10k_temptative_42_log.csv

epoch,acc,loss,val_acc,val_loss
0,0.33910306840314347,2.8039873434182323,0.43925894492334394,2.423607136838727
1,0.6700432729664795,1.7680579985716884,0.6802432469811842,1.494418034676935
2,0.8084185680566484,0.9697367282574051,0.707113562353833,1.1471810740100126
3,0.8675255704638903,0.6222516213881509,0.7150332342447107,1.0446805082458601
4,0.9093233675679102,0.4255000485809275,0.7184273793359842,1.0285227775084822
5,0.9335169157674236,0.31233567494186804,0.7199830293082193,1.0240958286906348
6,0.9510228167266755,0.2307115395785691,0.7174374204750126,1.0321892972057232
7,0.9674468923046493,0.1742295026368836,0.7182859567499383,1.0503242302945708
