In [1]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from fastai.text import *

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Batch size handed to load_data when the cached DataBunch is reloaded.
bs = 48
# Local root of the IMDB data; used below as the directory holding data_lm.pkl.
# (untar_data presumably downloads/extracts the archive on first use — confirm.)
path = untar_data(URLs.IMDB)

In [3]:
# Build the language-model DataBunch only when no cached copy exists, then
# always load from the cache. This keeps the notebook runnable on a fresh
# machine while still skipping the expensive tokenisation on later runs.
# NOTE(review): assumes DataBunch.save() writes relative to `path`, i.e. to the
# same location load_data() reads from — confirm against the fastai version in use.
if not (path/'data_lm.pkl').exists():
    data_lm = (TextList.from_folder(path)
               .filter_by_folder(include=['train', 'test', 'unsup'])
               .split_by_rand_pct(0.1)
               .label_for_lm()
               .databunch(bs=bs))
    data_lm.save('data_lm.pkl')

data_lm = load_data(path, 'data_lm.pkl', bs = bs)

In [4]:
# Create an AWD_LSTM language-model learner on data_lm; its encoder embedding
# weights are what the rest of this notebook inspects.
# NOTE(review): no training step is visible in this notebook, so the embeddings
# below are whatever weights the learner is created with (presumably the
# pretrained ones — confirm).
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)

### 0. Prior Inspection

In [5]:
# Vocabulary size: number of entries in the index-to-string table.
len(data_lm.vocab.itos)

60000

In [6]:
# Shape of the encoder embedding matrix; the first dimension should match the
# vocabulary size shown above.
learn.model.state_dict()['0.encoder.weight'].shape

torch.Size([60000, 400])

### 1. Prepare Test Data

In [12]:
# Vocab ids of the probe words whose embeddings are compared in the sections below.
learn.data.vocab.numericalize(['good', 'nice', 'horrible', 'king', 'queen', 'sun'])

[68, 359, 503, 733, 1495, 2259]

In [8]:
# Encoder embedding matrix taken from the model's state dict; it is indexed by
# vocab id in the helper functions below.
w_mat = learn.model.state_dict()['0.encoder.weight']

In [28]:
def encode_word_vocab(word, w_mat):
    """Return the embedding of `word` as a one-row NumPy matrix.

    The word is located in the notebook-level `data_lm.vocab.itos` list (a
    linear scan, not a constant-time lookup) and the matching row of `w_mat`
    is returned with shape (1, emb_dim), i.e. one sample with emb_dim
    features — the layout sklearn's pairwise functions expect and the same
    layout `encode_word` produces.

    Args:
        word: token string to look up.
        w_mat: embedding weight tensor indexed by vocab id along its first axis.

    Returns:
        NumPy array of shape (1, emb_dim).

    Raises:
        ValueError: if `word` is not present in `data_lm.vocab.itos`.
    """
    idx = data_lm.vocab.itos.index(word)
    # Row vector (1, emb_dim): a column (emb_dim, 1) would be read by sklearn
    # as emb_dim separate one-feature samples.
    return w_mat[idx].cpu().numpy().reshape(1, -1)

In [9]:
def encode_word(idx, w_mat):
    """Fetch row `idx` of `w_mat` as a NumPy array with a leading unit axis.

    The selected row is moved to the CPU, converted to NumPy, and given an
    extra first dimension, so a single vector is presented as a one-row
    matrix for sklearn's pairwise helpers.
    """
    row = w_mat[idx].cpu().numpy()
    # Prepend the sample axis: (emb_dim,) -> (1, emb_dim).
    return row[np.newaxis, ...]

In [13]:
# Resolve each probe word to its vocab id at run time instead of hard-coding
# ids copied from a previous cell's output, so the embeddings stay correct if
# the vocabulary is ever rebuilt.
probe_words = ['good', 'nice', 'horrible', 'king', 'queen', 'sun']
probe_ids = learn.data.vocab.numericalize(probe_words)
# Each embedding is a one-row matrix, as returned by encode_word.
(good_emb, nice_emb, horrible_emb,
 king_emb, queen_emb, sun_emb) = [encode_word(idx, w_mat) for idx in probe_ids]

### 2. Test Cosine Similarity

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Related pair. cosine_similarity returns a pairwise matrix; with two one-row
# inputs it is 1x1, so [0][0] extracts the single score.
print(f'king vs queen: {cosine_similarity(king_emb, queen_emb)[0][0]}')

king vs queen: 0.6706327199935913


In [18]:
# Unrelated pair, for contrast with king/queen.
print(f'king vs sun: {cosine_similarity(king_emb, sun_emb)[0][0]}')

king vs sun: 0.1983110010623932


In [17]:
# Near-synonym pair.
print(f'good vs nice: {cosine_similarity(good_emb, nice_emb)[0][0]}')

good vs nice: 0.3581615090370178


In [19]:
# Unrelated pair, for contrast with good/nice.
print(f'good vs sun: {cosine_similarity(good_emb, sun_emb)[0][0]}')

good vs sun: 0.04677487164735794


### 3. Test Inner Product

In [24]:
# Raw (unnormalised) inner product: element-wise product summed over all entries.
print(f'king v.s. queen: {(king_emb * queen_emb).sum()}')

king v.s. queen: 13.336496353149414


In [25]:
# Raw inner product for the unrelated king/sun pair.
print(f'king v.s. sun: {(king_emb * sun_emb).sum()}')

king v.s. sun: 3.981299877166748


In [26]:
# Raw inner product for the near-synonym good/nice pair.
print(f'good v.s. nice: {(good_emb * nice_emb).sum()}')

good v.s. nice: 5.647521018981934


In [27]:
# Raw inner product for the unrelated good/sun pair.
print(f'good v.s. sun: {(good_emb * sun_emb).sum()}')

good v.s. sun: 0.8564869165420532


### 4. Normalized Cosine Similarity