In [2]:
from keras.models import Sequential, Model, load_model
# Location of the saved Keras model checkpoint (HDF5).
# NOTE(review): absolute, machine-specific path; the notebook will not run
# unchanged on another machine.
model_path = "/Users/g/git/char-embeddings/output/model.hdf5"

# Load the saved model from disk so its layers and weights can be inspected.
trained_model = load_model(model_path)

Using TensorFlow backend.


In [3]:
# Show the weight arrays of layer index 1 of the loaded model (listed as the
# Embedding layer in trained_model.summary()).
trained_model.layers[1].get_weights()

[array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.02276734, -0.00849825,  0.01119453, ...,  0.01045316,
         -0.00365897, -0.00579622],
        [ 0.02850626, -0.20829438,  0.20986378, ...,  0.19447061,
         -0.03668197, -0.04913757],
        ..., 
        [ 0.25639474, -0.27427381,  0.08741026, ...,  0.16504318,
         -0.10606523, -0.13719946],
        [ 0.27301911, -0.25047618,  0.09171401, ...,  0.19639358,
         -0.05847714, -0.11588186],
        [ 0.25345701, -0.27562574,  0.10642631, ...,  0.17800714,
         -0.08167854, -0.0818819 ]], dtype=float32)]

In [4]:
from __future__ import print_function
from keras.models import Model
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM, Input
from keras.layers.merge import concatenate
from keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file
from keras.layers.normalization import BatchNormalization
from keras.callbacks import Callback, ModelCheckpoint
from sklearn.decomposition import PCA
from keras.utils import plot_model
import numpy as np
import random
import sys
import csv
import os
import h5py
import time

# Pretrained character-embedding text file; opened relative to the working
# directory when the embeddings are parsed.
embeddings_path = "glove.840B.300d-char.txt"
# Output width of the Embedding layer; also the PCA target size when use_pca is set.
embedding_dim = 300
# NOTE(review): not referenced in the visible cells; presumably meant for a later fit call.
batch_size = 128
# When True, the embedding matrix is passed through PCA before being used as weights.
use_pca = False
# Passed to the Adam optimizer as `lr` and `decay` respectively.
lr = 0.001
lr_decay = 1e-4
# Number of characters in each input sequence.
maxlen = 40
# Passed to the LSTM layer as its `implementation` argument.
consume_less = 2   # 0 for cpu, 2 for gpu

# Read the whole training corpus into memory. The context manager closes the
# file handle as soon as the read finishes (the previous bare open().read()
# left it to the garbage collector).
# NOTE(review): absolute, machine-specific path; consider making it configurable.
with open("/Users/g/git/neural-authorship-identification/charlm/pan15traincorpus.txt") as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

# Vocabulary: every distinct character of the corpus, sorted so that the
# character <-> index assignment is deterministic for a given corpus.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Slide a window of maxlen characters over the corpus, advancing `step`
# characters at a time; each window is paired with the character that
# immediately follows it.
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))


print('Vectorization...')
# X holds one row of character indices per sequence; y is the one-hot encoding
# of the character that follows each sequence.
# The builtin int/bool are used as dtypes: the np.int / np.bool aliases were
# deprecated in NumPy 1.20 and removed in 1.24, and they were plain aliases of
# the builtins, so the resulting arrays are unchanged.
X = np.zeros((len(sentences), maxlen), dtype=int)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t] = char_indices[char]
    # Mark the target character for this sequence. Without this assignment y
    # stays all-False and next_chars is never used.
    y[i, char_indices[next_chars[i]]] = 1



corpus length: 396105
total chars: 72
nb sequences: 132022
Vectorization...


In [5]:
# test code to sample on 10% for functional model testing

def random_subset(X, y, p=0.1):
    """Return a random fraction of the rows of ``X`` with the matching ``y``.

    Rows are drawn without replacement, so the result is a true subset: no
    row appears twice. (Drawing indices with ``np.random.randint`` samples
    with replacement and can return duplicate rows.)

    Parameters
    ----------
    X : 2-D array indexed as ``X[idx, :]``.
    y : array whose first axis lines up with the rows of ``X``.
    p : float in ``[0, 1]``; fraction of rows to keep. The number of rows
        returned is ``int(X.shape[0] * p)``.

    Returns
    -------
    tuple ``(X_subset, y_subset)`` selected with the same row indices.

    Raises
    ------
    ValueError
        If ``p`` is outside ``[0, 1]``.
    """
    if not 0 <= p <= 1:
        raise ValueError("p must be within [0, 1], got {!r}".format(p))

    n_rows = X.shape[0]
    n_keep = int(n_rows * p)
    # A prefix of a random permutation is a uniform sample without
    # replacement; it also handles n_rows == 0 and n_keep == 0 cleanly.
    idx = np.random.permutation(n_rows)[:n_keep]
    return (X[idx, :], y[idx])


# Loading of pretrained vectors follows the approach described at
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
print('Processing pretrained character embeds...')
embedding_vectors = {}
with open(embeddings_path, 'r') as f:
    for line in f:
        # Each line: the character first, then its space-separated components.
        line_split = line.strip().split(" ")
        embedding_vectors[line_split[0]] = np.array(line_split[1:], dtype=float)

# One row per corpus character, placed at that character's index. Characters
# with no pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((len(chars), 300))
for char, i in char_indices.items():
    if char in embedding_vectors:
        embedding_matrix[i] = embedding_vectors[char]

# Optional: project the embedding matrix onto embedding_dim principal
# components with sklearn's PCA and show the result.
if use_pca:
    pca = PCA(n_components=embedding_dim)
    embedding_matrix_pca = np.array(
        pca.fit(embedding_matrix).transform(embedding_matrix))
    print(embedding_matrix_pca)
    print(embedding_matrix_pca.shape)


Processing pretrained character embeds...


In [6]:
# Build a two-output character model: an embedding feeds an LSTM, whose output
# drives both an auxiliary softmax and a deeper main softmax head.
print('Build model...')
# Each sample is a sequence of maxlen values (X is built as integer character indices).
main_input = Input(shape=(maxlen,))
# The embedding is initialised from the pretrained matrix (or its PCA projection).
# NOTE(review): embedding_matrix is created with 300 columns, so without PCA
# this presumably requires embedding_dim == 300 - confirm before changing it.
embedding_layer = Embedding(
    len(chars), embedding_dim, input_length=maxlen,
    weights=[embedding_matrix_pca] if use_pca else [embedding_matrix])
# Alternative kept for reference: the same embedding without pretrained weights.
# embedding_layer = Embedding(
#     len(chars), embedding_dim, input_length=maxlen)
embedded = embedding_layer(main_input)

# RNN Layer
rnn = LSTM(256, implementation=consume_less)(embedded)

# Auxiliary head: softmax over the vocabulary taken directly from the LSTM output.
aux_output = Dense(len(chars))(rnn)
aux_output = Activation('softmax', name='aux_out')(aux_output)

# Hidden Layers
# Each hidden block is Dense (no bias) -> BatchNormalization -> ReLU.
hidden_1 = Dense(512, use_bias=False)(rnn)
hidden_1 = BatchNormalization()(hidden_1)
hidden_1 = Activation('relu')(hidden_1)

hidden_2 = Dense(256, use_bias=False)(hidden_1)
hidden_2 = BatchNormalization()(hidden_2)
hidden_2 = Activation('relu')(hidden_2)

# Main head: softmax over the vocabulary on top of the hidden blocks.
main_output = Dense(len(chars))(hidden_2)
main_output = Activation('softmax', name='main_out')(main_output)

# Output order is [main, aux]; loss_weights below is given in the same order.
model = Model(inputs=main_input, outputs=[main_output, aux_output])

optimizer = Adam(lr=lr, decay=lr_decay)
# The same loss is applied to both outputs; the auxiliary one is weighted 0.2.
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer, loss_weights=[1., 0.2])
model.summary()

Build model...
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 40)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 40, 300)       21600       input_1[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 256)           570368      embedding_1[0][0]                
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 512)           131072      lstm_1[0][0]                     
____________________________________________________________________________

In [7]:
# Copy the weights of layer index 1 (the Embedding layer in both summaries)
# from the loaded model into the newly built model.
# NOTE(review): in the recorded run this raises ValueError because the two
# embedding matrices have different row counts: (72, 300) here versus
# (58, 300) in the loaded model. The row count here is len(chars), so the
# loaded model was presumably built from a different character set. A direct
# copy cannot work; rows would have to be remapped character by character,
# which needs the loaded model's character-to-index mapping.
model.layers[1].set_weights(trained_model.layers[1].get_weights())

ValueError: Layer weight shape (72, 300) not compatible with provided weight shape (58, 300)

In [9]:
# Shape of the first weight array of layer index 1 in the newly built model.
model.layers[1].get_weights()[0].shape

(72, 300)

In [8]:
# Shape of the first weight array of layer index 1 in the loaded model, for
# comparison with the newly built model.
trained_model.layers[1].get_weights()[0].shape

(58, 300)

In [10]:
# Print the layer-by-layer summary of the loaded model.
trained_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_6 (InputLayer)             (None, 100)           0                                            
____________________________________________________________________________________________________
embedding_6 (Embedding)          (None, 100, 300)      17400       input_6[0][0]                    
____________________________________________________________________________________________________
lstm_5 (LSTM)                    (None, 256)           570368      embedding_6[0][0]                
____________________________________________________________________________________________________
dense_19 (Dense)                 (None, 512)           131072      lstm_5[0][0]                     
___________________________________________________________________________________________

In [11]:
# Print the layer-by-layer summary of the newly built model for comparison.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 40)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 40, 300)       21600       input_1[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 256)           570368      embedding_1[0][0]                
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 512)           131072      lstm_1[0][0]                     
___________________________________________________________________________________________