In [None]:
# Load the previously saved word vectors from disk. matrix_generator reads this
# module-level object both to test vocabulary membership and to look up embeddings.
# NOTE(review): "sg" presumably stands for skip-gram word2vec -- confirm against the
# cell/notebook that trained and saved 'word2vec_sg.wordvectors'.
word_vectors_sg = KeyedVectors.load('word2vec_sg.wordvectors')

def matrix_generator(dataframe, list_IDs_temp, min_len = 1, max_len = 100, dim = (100, 200)):
    """
    Build one minibatch of zero-padded word-vector matrices and their labels.

    For every requested row the title (plus the abstract, unless the abstract
    is the literal string 'NULL') is stripped of non-alphanumeric characters,
    tokenized with ``utils.simple_preprocess`` and reduced to the tokens that
    exist in the module-level ``word_vectors_sg`` vocabulary. The surviving
    tokens are truncated to ``max_len``, looked up in ``word_vectors_sg`` and
    zero-padded to ``max_len`` rows.

    Input:
    dataframe: table with "title", "abstract" and "label" columns; rows are
        addressed positionally (``iloc``)
    list_IDs_temp: positional row indices making up the minibatch
    min_len: minimum number of in-vocabulary tokens a text needs to be embedded
    max_len: number of token rows per document; longer texts are truncated,
        shorter ones are zero-padded
    dim: shape of one document matrix; it must hold exactly
        max_len * (word-vector size) values because each padded matrix is
        reshaped to it

    Output:
    X: array of shape (len(list_IDs_temp), *dim), one matrix per document
    y: integer labels shifted to start at 0 ("label" - 1), passed through
        ``np.squeeze``

    Note: a document with fewer than ``min_len`` usable tokens (or none at all)
    keeps an all-zero matrix and label 0, so batch positions stay aligned with
    ``list_IDs_temp``.
    """
    doc_shape = tuple(dim)
    batch_size = len(list_IDs_temp)
    X = np.zeros((batch_size, *doc_shape))
    y = np.zeros((1, batch_size), dtype = int)

    for position, idx in enumerate(list_IDs_temp):
        row = dataframe.iloc[idx]  # fetch the row once instead of once per column

        # A 'NULL' abstract means only the title carries text.
        text = str(row["title"])
        if row["abstract"] != 'NULL':
            text += ' ' + str(row["abstract"])

        line = re.sub('[^a-zA-Z0-9]', ' ', text)  # remove non-letters and non-numbers
        # Infrequent words are missing from the vocabulary, so drop them.
        tokens = [word for word in utils.simple_preprocess(line) if word in word_vectors_sg]

        if len(tokens) < min_len:
            continue  # too short: leave the zero matrix / zero label in place

        # Truncate in every case; previously a title-only text longer than
        # max_len reached np.pad with a negative pad width and raised.
        tokens = tokens[:max_len]
        if not tokens:
            continue  # nothing to embed (only reachable when min_len or max_len is 0)

        matrix = word_vectors_sg[tokens]  # (n_tokens, vector_size) document matrix
        padding = ((0, max_len - len(tokens)), (0, 0))
        X[position] = np.pad(matrix, padding, mode='constant').reshape(doc_shape)
        y[0, position] = row["label"] - 1
    return X, np.squeeze(y)

In [None]:
# For model training
class DataGenerator(Sequence):
    """Keras ``Sequence`` that yields ``(X, y)`` minibatches for model training.

    ``X`` comes from ``matrix_generator`` and ``y`` is the label vector passed
    through ``to_categorical`` with ``num_classes=n_classes``.

    Parameters:
    dataframe: table read by ``matrix_generator``; it must also have a
        "patent_id" column, whose length defines the number of samples
    batch_size: number of samples per batch
    text_length: number of token rows per document matrix
    n_classes: number of label classes for the categorical encoding
    shuffle: whether to reshuffle the sample order at the end of each epoch
    """
    def __init__(self, dataframe, batch_size=100, text_length = 100,
                 n_classes=125, shuffle=True):
        """Store the batch configuration and build the first epoch's order."""
        # Let the Keras base class set up its own state before ours.
        super().__init__()
        self.dataframe = dataframe
        # NOTE(review): 200 is presumably the word-vector size -- confirm it
        # matches the loaded embeddings.
        self.dim = (text_length, 200)
        self.batch_size = batch_size
        self.list_IDs = list(range(len(dataframe.loc[:, "patent_id"])))
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch (a trailing partial batch is dropped)."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Generate the batch at position ``index`` as an ``(X, y)`` pair."""
        # Positions of this batch within the (possibly shuffled) epoch order
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Map them to dataframe row ids
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Build ``(X, y)`` for the given row ids; X has shape (n_samples, *dim)."""
        # Forward the configured text length; it used to be ignored, so every
        # batch came out with matrix_generator's default shape.
        X, y = matrix_generator(self.dataframe, list_IDs_temp,
                                max_len=self.dim[0], dim=self.dim)
        return X, to_categorical(y, num_classes=self.n_classes)

In [None]:
# For prediction assessments
class DataGenerator2(Sequence):
    """Keras ``Sequence`` that yields only the label batches, for prediction assessments.

    Each batch is the label vector returned by ``matrix_generator`` passed
    through ``to_categorical`` with ``num_classes=n_classes``. The instance
    also supports ``next()``, cycling through the batches endlessly.

    Parameters:
    dataframe: table read by ``matrix_generator``; it must also have a
        "patent_id" column, whose length defines the number of samples
    batch_size: number of samples per batch
    text_length: number of token rows per document matrix
    n_classes: number of label classes for the categorical encoding
    shuffle: whether to reshuffle the sample order at the end of each epoch
    """
    def __init__(self, dataframe, batch_size=100, text_length = 100,
                 n_classes=125, shuffle=False):
        """Store the batch configuration and reset the ``next()`` cursor."""
        # Let the Keras base class set up its own state before ours.
        super().__init__()
        self.dataframe = dataframe
        # NOTE(review): 200 is presumably the word-vector size -- confirm it
        # matches the loaded embeddings.
        self.dim = (text_length, 200)
        self.batch_size = batch_size
        self.list_IDs = list(range(len(dataframe.loc[:, "patent_id"])))
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()
        self.n = 0              # index of the batch the next call to next() returns
        self.max = len(self)    # number of batches before next() wraps around

    def __len__(self):
        """Return the number of full batches per epoch (a trailing partial batch is dropped)."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Generate the encoded labels of the batch at position ``index``."""
        # Positions of this batch within the (possibly shuffled) epoch order
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Map them to dataframe row ids
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Build the encoded labels for the given row ids."""
        # Labels still come from matrix_generator so that documents it leaves
        # empty keep the same label (0) they receive during training. Forward
        # the configured text length; it used to be ignored.
        _, y = matrix_generator(self.dataframe, list_IDs_temp,
                                max_len=self.dim[0], dim=self.dim)
        return to_categorical(y, num_classes=self.n_classes)

    def __next__(self):
        """Return the next label batch, wrapping to the first after the last."""
        if self.n >= self.max:
            self.n = 0
        y = self.__getitem__(self.n)
        self.n += 1
        return y