In [1]:
import os

import keras
import numpy as np
import pandas as pd

Using TensorFlow backend.


In [2]:
DATADIR = 'data/'
# Keras' Conv2D layer expects a trailing channel axis, so every 2-D matrix is
# treated as a grayscale image, i.e. an image with num_channels = 1.
NUMCHANNELS = 1

class InteractionData:
    """Interaction data of query ``qid`` with its K top docs.

    The matrix is read from ``<dataPathBase>/<qid>.txt``, a headerless,
    whitespace-delimited text file. Each row vector is a histogram of
    interaction data for one document.

    Parameters:
        qid: identifier of the query; also the stem of the data file name.
        dataPathBase: folder holding the ``<qid>.txt`` files (with or
            without a trailing path separator).

    Attributes:
        qid: the identifier passed in.
        matrix: 2-D numpy array with the file's contents.
    """
    def __init__(self, qid, dataPathBase=DATADIR):
        self.qid = qid
        # os.path.join avoids the doubled separator ("data//x.txt") that plain
        # string formatting produced when the folder already ends with "/".
        dataFile = os.path.join(dataPathBase, "{}.txt".format(self.qid))
        # sep=r"\s+" is the supported spelling of the deprecated
        # delim_whitespace=True option.
        df = pd.read_csv(dataFile, sep=r"\s+", header=None)
        self.matrix = df.to_numpy()

In [3]:
class PairedInstance:
    """One labelled pair of ids parsed from a single tab-separated line.

    The line is expected to hold ``<id_a>\\t<id_b>\\t<label>``; any fields
    after the third are ignored. ``label`` is converted to ``int``.
    """
    def __init__(self, line):
        fields = line.strip().split('\t')
        self.id_a, self.id_b = fields[0], fields[1]
        self.label = int(fields[2])

    def __str__(self):
        # Human-readable form, e.g. "(q1, q2)"
        id_pair = (self.id_a, self.id_b)
        return "({}, {})".format(*id_pair)

    def getKey(self):
        """Return the dictionary key for this pair, "<id_a>-<id_b>"."""
        id_pair = (self.id_a, self.id_b)
        return "{}-{}".format(*id_pair)
                    
# Separate instances for training/test sets etc. Only the id pairs (and their
# labels) are loaded here; the matrices themselves are loaded later, in
# batches, by a Keras data-generator subclass.
class PairedInstanceIds:
    '''
    Each line in this file should comprise three tab separated fields
    <id1> <id2> <label (1/0)>

    After construction, self.data maps "<id1>-<id2>" to the parsed
    PairedInstance. If the same pair occurs more than once, the last
    occurrence wins.
    '''
    def __init__(self, idpairLabelsFile):
        self.data = {}

        # Stream the file instead of materialising every line first
        with open(idpairLabelsFile) as f:
            for line in f:
                # A blank line (e.g. an extra newline at the end of the file)
                # carries no pair; parsing it would fail with an IndexError.
                if not line.strip():
                    continue
                instance = PairedInstance(line)
                self.data[instance.getKey()] = instance


In [4]:
# Load every labelled id pair from the data folder
allPairs = PairedInstanceIds(os.path.join(DATADIR, "pairs.txt"))
allPairsList = list(allPairs.data.values())

# Shuffle with a dedicated, seeded generator so that the train/test split is
# identical on every Restart & Run All, without touching numpy's global RNG.
SPLIT_SEED = 42
np.random.RandomState(SPLIT_SEED).shuffle(allPairsList)
num_pairs = len(allPairsList)

# Fraction of the pairs used for training; the remainder is held out
TRAIN_RATIO=0.8
num_training = int(TRAIN_RATIO*num_pairs)

#get the ids
train_pairs = allPairsList[0:num_training]
test_pairs = allPairsList[num_training:]

print ('{}/{} pairs for training'.format(num_training, num_pairs))


3960/4950 pairs for training


In [5]:
# K: number of top docs (Default: 10); used as the first dimension (rows) of
# each input matrix in the generator's default `dim`.
K=10
# M: bin-size (Default: 20); used as the second dimension (columns) of each
# input matrix in the generator's default `dim`.
M=20
# Default number of id pairs per batch produced by the data generator
BATCH_SIZE=20

'''
The files need to reside in the folder data/.
Each file is a whitespace-delimited matrix of values that is read using
pandas.read_csv (see InteractionData).
'''
class PairCmpDataGenerator(keras.utils.Sequence):
    """Generates batches of paired interaction matrices for Keras.

    Each batch is a tuple ``(X, Y)``: ``X`` is a list of two arrays shaped
    ``(n, *dim)`` (first and second member of every pair) and ``Y`` is an
    integer array with the ``n`` pair labels.

    Parameters:
        paired_instances_ids: indexable collection of objects exposing
            ``id_a``, ``id_b`` and ``label``.
        dataFolder: folder the per-id matrix files are read from.
        batch_size: number of pairs per batch.
        dim: shape of one input sample, ``(rows, columns, channels)``.
    """
    def __init__(self, paired_instances_ids, dataFolder=DATADIR, batch_size=BATCH_SIZE, dim=(K, M, NUMCHANNELS)):
        'Initialization'
        # Let the Keras base class set up any state of its own
        super().__init__()
        self.paired_instances_ids = paired_instances_ids
        self.dim = dim
        self.batch_size = batch_size
        self.dataDir = dataFolder
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        # Only full batches are counted; a trailing remainder smaller than
        # batch_size is not served.
        return int(np.floor(len(self.paired_instances_ids) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs = [self.paired_instances_ids[k] for k in indexes]

        # Generate data
        X = self.__data_generation(list_IDs)

        return X

    def on_epoch_end(self):
        'Update indexes after each epoch'
        # The order is reset to sequential; no shuffling is performed here
        self.indexes = np.arange(len(self.paired_instances_ids))

    def __data_generation(self, list_IDs):
        """Read the matrices for `list_IDs` and pack them into one batch.

        Returns:
            (X, Y) where X is a list of two arrays of shape
            (len(list_IDs), *self.dim) and Y holds the integer labels.
        """
        # Size the buffers by the number of pairs actually supplied, so a
        # short batch never exposes uninitialised np.empty rows.
        num_pairs = len(list_IDs)
        X = [np.empty((num_pairs, *self.dim)) for _ in range(2)]
        Y = np.empty((num_pairs), dtype=int)

        # Generate data
        for i, paired_instance in enumerate(list_IDs):
            #read from the data file and construct the instances
            a_matrix = InteractionData(paired_instance.id_a, self.dataDir).matrix
            b_matrix = InteractionData(paired_instance.id_b, self.dataDir).matrix

            # Append the channel axis to each matrix using its OWN shape;
            # reusing a's shape for b could silently scramble a b matrix that
            # has the same element count but a different layout.
            X[0][i,] = a_matrix[:, :, np.newaxis]
            X[1][i,] = b_matrix[:, :, np.newaxis]
            Y[i] = paired_instance.label

        return X, Y

In [6]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, Conv2D, MaxPooling2D
from keras.layers.merge import concatenate
from tensorflow.keras import layers
# Number of units in the final Dense layer of the shared encoder (build_siamese)
HIDDEN_LAYER_DIM = 16

In [7]:
def build_siamese(input_shape):
    """Build the (uncompiled) siamese network for a pair of inputs.

    Both inputs of shape `input_shape` go through the same convolutional
    encoder (shared weights); the two encodings are concatenated and fed to a
    single sigmoid unit.

    Parameters:
        input_shape: shape of one input sample, excluding the batch axis.

    Returns:
        A Keras Model taking [input_a, input_b] and producing the prediction.
    """
    left_input = Input(shape=input_shape, dtype='float32')
    right_input = Input(shape=input_shape, dtype='float32')

    # Encoder applied to both members of the pair
    shared_encoder = Sequential([
        Conv2D(32, (5,5), activation='relu', input_shape=input_shape),
        MaxPooling2D(padding='same'),
        Conv2D(64, (3,3), activation='relu'),
        MaxPooling2D(padding='same'),
        Flatten(),
        Dropout(0.2),
        Dense(HIDDEN_LAYER_DIM, activation='sigmoid'),
    ])

    # Calling the same model object twice reuses its weights for both inputs
    pair_encoding = concatenate(
        [shared_encoder(left_input), shared_encoder(right_input)], axis=-1)

    # Logistic regression (single sigmoid unit) on top of the merged encoding;
    # this is where the (pred, true) label loss is backpropagated from.
    output = Dense(1, activation='sigmoid')(pair_encoding)

    return Model([left_input, right_input], outputs=output)

In [8]:
# Use the same (K, M, NUMCHANNELS) shape as the data generator's default `dim`
# rather than a separately hard-coded channel count.
siamese_model = build_siamese((K, M, NUMCHANNELS))
siamese_model.compile(loss = keras.losses.BinaryCrossentropy(), optimizer = keras.optimizers.Adam())
siamese_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 10, 20, 1)    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 10, 20, 1)    0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 16)           22416       input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 32)           0           sequential_1[1][0]         

In [9]:
# Number of passes over the training pairs
EPOCHS=3
training_generator = PairCmpDataGenerator(train_pairs, dataFolder=DATADIR)
# Model.fit accepts a keras.utils.Sequence directly; fit_generator is the
# deprecated spelling of the same call. The returned History is kept so the
# per-epoch losses can be inspected afterwards.
history = siamese_model.fit(training_generator,
                    use_multiprocessing=True, epochs=EPOCHS, workers=4)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 2/3


<keras.callbacks.callbacks.History at 0x64718d320>