In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import brown

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from unidecode import unidecode

import gensim
from gensim.models import Word2Vec



In [2]:
# Pre-trained GoogleNews Word2Vec vectors, stored in the binary word2vec format.
# Download link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
GOOGLE_NEWS_VECTORS = '../../../../../GoogleNews-vectors-negative300.bin.gz'

# Bind the loaded vectors to the module-level name `model`.
model = gensim.models.KeyedVectors.load_word2vec_format(
    GOOGLE_NEWS_VECTORS,
    binary=True,
)

In [3]:
def _embedding_matrix(features, dim):
    """Stack the embedding of every feature into a ``(len(features), dim)`` float32 matrix."""
    T = np.zeros((len(features), dim), dtype=np.float32)
    for idx, feat in enumerate(features):
        # Features missing from the embedding model keep their all-zero row.
        if feat in model:
            T[idx, :] = model[feat]
    return T


def get_transformation(D1, D2):
    """Build the linear maps that take two count matrices into word-embedding space.

    The i-th row of the matrix built for ``D_j`` is the embedding, looked up in
    the module-level ``model``, of the i-th column/feature (word) of ``D_j``.
    Features that are not in ``model`` get an all-zero row.

    Parameters
    ----------
    D1, D2 : pandas.DataFrame
        Feature matrices; only their column labels are read here.

    Returns
    -------
    (T1, T2) : tuple of numpy.ndarray
        float32 matrices of shape ``(len(D1.columns), dim)`` and
        ``(len(D2.columns), dim)``, where ``dim`` is ``model.vector_size``
        (300 is used when the model object does not expose that attribute).
    """
    # Take the embedding dimension from the model instead of hard-coding it,
    # so any embedding model can be plugged in.
    dim = getattr(model, "vector_size", 300)

    T1 = _embedding_matrix(list(D1.columns), dim)
    T2 = _embedding_matrix(list(D2.columns), dim)

    return T1, T2

def cosine_sim(D1, D2):
    """Pairwise cosine similarity between the rows of ``D1`` and the rows of ``D2``.

    Each row is first mapped into embedding space: row k of ``D_j`` becomes the
    linear combination ``a_1 * e_1 + ... + a_m * e_m``, where ``e_i`` is the
    i-th row of the transformation matrix returned by ``get_transformation``
    for ``D_j`` and ``a_i`` is the value in column i of that row.

    Returns the result of ``cosine_similarity`` on the two projected matrices
    (one row per row of ``D1``, one column per row of ``D2``).
    """
    # Per-dataset transformation matrices (features x embedding dimension)
    embed_1, embed_2 = get_transformation(D1, D2)

    # Project every row onto the embedding space
    projected_1 = D1.values @ embed_1
    projected_2 = D2.values @ embed_2

    return cosine_similarity(projected_1, projected_2)

In [4]:
# Dataset 1: number of sampled documents (n1) and maximum number of features (m1)
n1, m1 = 25, 40

# Dataset 2: number of sampled documents (n2) and maximum number of features (m2)
n2, m2 = 30, 30

# Dimension of the word embeddings
m3 = 300

# Number of files available in the Brown corpus
brown_size = len(brown.fileids())

def construct_dataset(m, n, corpus, corpus_size, seed=None):
    """Sample ``n`` corpus files and build their bag-of-words count matrix.

    Parameters
    ----------
    m : int
        Passed to ``CountVectorizer(max_features=m)``; upper bound on the
        number of feature columns.
    n : int
        Number of files to draw. Positions are drawn independently, so the
        same file can be selected more than once.
    corpus : object
        Must provide ``fileids()`` and ``sents(fileid)``, where ``sents``
        yields sentences as sequences of string tokens.
    corpus_size : int
        Exclusive upper bound for the sampled positions into ``corpus.fileids()``.
    seed : int, optional
        When given, sampling uses a dedicated ``numpy.random.RandomState(seed)``
        so the selection is reproducible. When ``None`` (default), the global
        NumPy random state is used.

    Returns
    -------
    df_doc : pandas.DataFrame
        One row per sampled file, with the joined text in column ``"doc"``.
    df_feat : pandas.DataFrame
        Token counts, one row per sampled file and one column per vocabulary term.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    sampled_positions = rng.randint(low=0, high=corpus_size, size=n)

    # Fetch the file id list once instead of once per sampled document.
    fileids = corpus.fileids()

    docs = []
    for idx in sampled_positions:
        text = " ".join(" ".join(sent) for sent in corpus.sents(fileids[idx]))
        # Normalise the corpus' quote tokens (`` '' `) to plain quote characters.
        text = text.replace('``', '"').replace("''", '"').replace('`', "'")
        docs.append(text)

    df_doc = pd.DataFrame({"doc": docs})

    vec = CountVectorizer(max_features=m, stop_words='english', ngram_range=(1, 1))
    X = vec.fit_transform(df_doc["doc"]).toarray()

    # `get_feature_names` was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old releases that lack it.
    if hasattr(vec, "get_feature_names_out"):
        vocab = list(vec.get_feature_names_out())
    else:
        vocab = vec.get_feature_names()

    df_feat = pd.DataFrame(X, columns=vocab)

    return df_doc, df_feat

# Build the two datasets from randomly sampled corpus files
df_doc1, D1 = construct_dataset(m1, n1, brown, brown_size)
df_doc2, D2 = construct_dataset(m2, n2, brown, brown_size)

# Similarity matrix: rows follow dataset 1, columns follow dataset 2
similarity = cosine_sim(D1, D2)

# First (row-major) position at which the maximum similarity is attained
max_similarity = similarity.max()
id_1, id_2 = np.unravel_index(similarity.argmax(), similarity.shape)

print(max_similarity, similarity[id_1, id_2])

doc1 = df_doc1["doc"].loc[id_1]
doc2 = df_doc2["doc"].loc[id_2]

# Show the first 1000 characters of each document in the best-matching pair
print(doc1[:1000], "\n\n", doc2[:1000], sep="\n")

0.9085390539774225 0.9085390539774225
Too many people think that the primary purpose of a higher education is to help you make a living ; ; this is not so , for education offers all kinds of dividends , including how to pull the wool over a husband's eyes while you are having an affair with his wife . If it were not for an old professor who made me read the classics I would have been stymied on what to do , and now I understand why they are classics ; ; those who wrote them knew people and what made people tick . I worked for my Uncle ( an Uncle by marriage so you will not think this has a mild undercurrent of incest ) who ran one of those antique shops in New Orleans' Vieux Carre , the old French Quarter . The arrangement I had with him was to work four hours a day . The rest of the time I devoted to painting or to those other activities a young and healthy man just out of college finds interesting . I had a one-room studio which overlooked an ancient courtyard filled with flowers and