We are given a sequential corpus, i.e., a sequence of document collections arranged in chronological order. This notebook demonstrates how to use the orthogonal Procrustes method to align the embedding models that were learned separately on each document collection.

In [1]:
import gensim
import os
import sys

# Directory added to the import search path before the project-local imports
# below (presumably where `utils` and `semshift` live -- confirm).
MOD_DIR = "../modules"

# Guarded so that re-running this cell does not append the same entry twice.
if MOD_DIR not in sys.path:
    sys.path.append (MOD_DIR)

from utils import seqitertools, w2vutils
from semshift import embeddings, alignment

from utils.w2vutils import EpochLogger

In [2]:
# Directory that is searched for the trained "*.model" files.
# The default is the original absolute location; set the SC_DOCS_DIR
# environment variable to run the notebook on another machine without editing it.
DATA_DIR = os.environ.get ("SC_DOCS_DIR", "/hg191/corpora/legaldata/sc-docs/")

In [3]:
# Wrap every "*.model" file found under DATA_DIR in a TrainedModel.
# NOTE(review): the alignment below chains models in this order, so
# iter_files must yield the files chronologically -- confirm (e.g. sorted names).
models = list (map (embeddings.TrainedModel, seqitertools.iter_files (DATA_DIR, "*.model")))

In [4]:
def sequential_align (trainedmodels):
    """Chain-align a sequence of trained models, each one to its predecessor.

    The embedding of the first model (its ``m`` attribute) is taken unchanged
    as the initial reference. Every later embedding is passed, together with
    the previous entry of the result, to
    ``alignment.smart_procrustes_align_gensim``; whatever that call returns is
    stored and becomes the reference for the next model.

    Args:
        trainedmodels: iterable of objects exposing their embedding as ``.m``,
            in the order in which they should be chained.

    Returns:
        A list with one entry per input model, in input order (empty when the
        input is empty).
    """
    chain = []
    reference = None
    for position, trained in enumerate (trainedmodels):
        if position == 0:
            # Nothing to align against yet: the first embedding anchors the chain.
            current = trained.m
        else:
            current = alignment.smart_procrustes_align_gensim (reference, trained.m)
        chain.append (current)
        reference = current
    return chain

In [5]:
# Align each model to the one before it, following the order of `models`;
# entry i of the result corresponds to models[i].
aligned_models = sequential_align (models)