### 使用训练好的emb来分析数据

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('../../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from sgd_nlp.core.embedding.submodule.corpus_factory import *
from sgd_nlp.core.embedding.word2vec import *
from sgd_nlp.core.embedding.glove import *
from torch.optim.lr_scheduler import ExponentialLR
import torch
import os
import time
import math
import pickle

In [3]:
class glove_context:
    corpus_factory= 'save/glove/corpus_obj.cf'
    model_path='save/glove/glove_weights.path'
    emb_dim = 300
    sparse_emb = False
    win_width=11
    device = torch.device('cuda')


class skip_gram_context:
    corpus_factory= 'save/skipgram/corpus_obj.cf'
    model_path='save/skipgram/skipgram_weights.path'
    emb_dim = 300
    sparse_emb = True
    win_width=11
    device = torch.device('cuda')
    
class cbow_context:
    corpus_factory= 'save/cbow/corpus_obj.cf'
    model_path='save/cbow/cbow_weights.path'
    emb_dim = 300
    sparse_emb = True
    win_width=11
    device = torch.device('cuda')
    

In [4]:
def load_corpus_factory(obj_file_name):
    with open(obj_file_name, 'rb') as fin:
        print("!!! load corpus factory success !!!")
        return pickle.load(fin)

    
def load_emb(context):
    corpus_factory = load_corpus_factory(context.corpus_factory)
    
    model = None
    if 'skip' in context.model_path:
        model = SkipGram(emb_dim=context.emb_dim,
                     token_num=corpus_factory.token_num(),
                     sparse_emb=context.sparse_emb).to(context.device)
    
    elif 'cbow' in  context.model_path:
        model = Cbow(emb_dim=context.emb_dim,
                token_num=corpus_factory.token_num(),
                win_width=context.win_width,
                sparse_emb=context.sparse_emb).to(context.device)
        
    elif 'glove' in  context.model_path:
        model = Glove(emb_dim=context.emb_dim,
                token_num=corpus_factory.token_num(),
                sparse_emb=context.sparse_emb).to(context.device)
        
    model.load_state_dict(torch.load(context.model_path))
    print("!!! Load model weights success !!!")
    
    avg_emb = None
    
    if 'glove' in context.model_path:
        emb_l = model.emb_i.weight
        emb_r = model.emb_j.weight
        avg_emb = (emb_l+emb_r)/2
    
    else:
        emb_l = model.emb_i.weight
        emb_r = model.emb_o.weight
        avg_emb = (emb_l+emb_r)/2
    
    return avg_emb, corpus_factory 
    


In [5]:
avg_emb, corpus_factory = load_emb(skip_gram_context)
print(type(avg_emb))
print(avg_emb)
print('avg_emb', avg_emb.shape)
print('token_num',corpus_factory.vocab.token_num())
print()
print(corpus_factory.vocab.log_info())

!!! load corpus factory success !!!
!!! Load model weights success !!!
<class 'torch.Tensor'>
tensor([[-0.6650,  5.1770,  0.9831,  ..., -1.3951, -0.8118,  0.3744],
        [ 1.6138, -2.2751, -1.0278,  ...,  0.6881, -0.6176,  0.2612],
        [-0.9439,  0.4284,  0.1687,  ..., -0.3068,  0.4402,  0.1262],
        ...,
        [-0.2844,  0.2892, -0.4906,  ...,  0.0661,  0.0971,  0.1666],
        [ 0.1695,  0.6495,  0.3074,  ..., -0.2193, -0.6466, -1.4928],
        [-0.6326,  0.1975,  0.5403,  ...,  1.4104, -0.5175,  0.1229]],
       device='cuda:0', grad_fn=<DivBackward0>)
avg_emb torch.Size([23707, 300])
token_num 23697

****** VOCAB LOG INFO ******
corpus_word_num: 887249
vocab_size: 23697
word_freq_count: 
[('the', 26050), ('you', 24618), ('i', 24456), ('to', 19978), ('and', 19046), ('a', 15636), ('it', 9891), ('is', 9517), ('rachel:', 9312), ('ross:', 9226), ('that', 8881), ('chandler:', 8492), ('monica:', 8423), ('joey:', 8332), ('oh', 7807), ('phoebe:', 7527), ('in', 7509), ('of', 71

### 老友记6个人的亲密关系排序

``` python
['monica', 'phoebe', 'rachel', 'joey', 'chandler', 'ross',]
```

In [10]:
def analysis(emb, corpus_factory):
    friends = ['monica', 'phoebe', 'rachel', 'joey', 'chandler', 'ross']
    friends_id = corpus_factory.vocab[friends]
    
    friends_emb = emb[friends_id]
    
    # 计算6个人间的协方差
    cov = friends_emb.mm(friends_emb.transpose(0, 1))    
    print('\n emb向量内积')
    print(cov)

    sort_id = torch.argsort(cov, dim = 1, descending=True)
    
    print("每个人最亲近的关系排序")
    for i in range(6):
       
        sortid = sort_id[i].tolist()
        print([friends[j] for j in sortid])

### 找出每个人最相关的top-20的词
def find_topk(emb, corpus_factory ,k=20):
    friends = ['monica', 'phoebe', 'rachel', 'joey', 'chandler', 'ross']
    friends_id = corpus_factory.vocab[friends]
    friends_emb = emb[friends_id]
    
    cov = friends_emb.mm(emb.transpose(0, 1))   # [6, vocab_token_num]
    print('\n emb向量内积')
    print(cov)
    sort_id = torch.argsort(cov, dim = 1, descending=True)[:, :20]  # [6, 20]

    print("每个人关系最紧密的topk-20的词")
    for i in range(6):
       
        sortid = sort_id[i].tolist()
        print([corpus_factory.vocab.to_tokens(sortid)])
            



In [7]:
avg_emb, corpus_factory = load_emb(skip_gram_context)
analysis(avg_emb, corpus_factory)
find_topk(avg_emb, corpus_factory)

!!! load corpus factory success !!!
!!! Load model weights success !!!
每个人最亲近的关系排序
['monica', 'phoebe', 'joey', 'rachel', 'ross', 'chandler']
['phoebe', 'monica', 'joey', 'rachel', 'chandler', 'ross']
['rachel', 'ross', 'joey', 'monica', 'chandler', 'phoebe']
['joey', 'ross', 'monica', 'phoebe', 'rachel', 'chandler']
['chandler', 'ross', 'joey', 'monica', 'rachel', 'phoebe']
['ross', 'joey', 'rachel', 'chandler', 'monica', 'phoebe']
每个人关系最紧密的topk-20的词
[['monica', 'mum', 'chiefs', 'vs', 'outisde', 'knockers', 'liam:', 'vais', '20', "amy's>", 'oo-oo', "'fighting'", 'schoolyard', 'hinges', 'doddle', "screendon't", "a-doin'", 'tilts', 'provocatively', 'section']]
[['phoebe', 'daddy', 'stuart', 'pluck', 'somewhat', 'seasons', 'cheques', 'anybody', "it'sit's", 'parrot', 'futile', 'testicles', 'emillio', 'g-go', 'futon', '905', 'duncan', '-make', 'wallet', 'purchased']]
[['rachel', 'tratt', 'sally', 'podium', 'suite', '904', 'pirate;', 'blessing', 'mid-term', 'imp', 'owner', 'middle;', 'aggre

In [8]:
avg_emb, corpus_factory = load_emb(cbow_context)
analysis(avg_emb, corpus_factory)
find_topk(avg_emb, corpus_factory)

!!! load corpus factory success !!!
!!! Load model weights success !!!
每个人最亲近的关系排序
['monica', 'rachel', 'joey', 'chandler', 'ross', 'phoebe']
['phoebe', 'rachel', 'joey', 'chandler', 'monica', 'ross']
['rachel', 'phoebe', 'monica', 'ross', 'joey', 'chandler']
['joey', 'chandler', 'ross', 'monica', 'phoebe', 'rachel']
['chandler', 'joey', 'phoebe', 'monica', 'rachel', 'ross']
['ross', 'joey', 'rachel', 'monica', 'chandler', 'phoebe']
每个人关系最紧密的topk-20的词
[['monica', 'bang', 'frantically', 'petes', 'comforting', 'traipsing', 'mouths', 'deadpan', 'drags', 'screams', '30%', 'erica', 'handing', 'disgust', 'shoves', 'shoulder', 'passionately', 'glaring', 'what-whats', 'th--fun']]
[['sr:', 'phoebe', 'poses', 'fear', 'buffay', 'manages', 'joining', ':', 'husband', 'milk', 'cartwheel', 'ewwww', 'knit', 'borkow', 'stabby', 'wrapped', 'mike', 'hodge', 'slept', 'painless']]
[['rachel', 'slamming', 'tilts', '123', 'returns', 'amy', 'greep', 'gate', 'dumped', 'strangely', 'mail', "how're", 'winks', 'f

In [11]:
avg_emb, corpus_factory = load_emb(glove_context)
analysis(avg_emb, corpus_factory)
find_topk(avg_emb, corpus_factory)

!!! load corpus factory success !!!
!!! Load model weights success !!!

 emb向量内积
tensor([[0.0117, 0.0072, 0.0075, 0.0051, 0.0071, 0.0063],
        [0.0072, 0.0177, 0.0072, 0.0064, 0.0068, 0.0065],
        [0.0075, 0.0072, 0.0137, 0.0069, 0.0077, 0.0073],
        [0.0051, 0.0064, 0.0069, 0.0096, 0.0059, 0.0067],
        [0.0071, 0.0068, 0.0077, 0.0059, 0.0114, 0.0066],
        [0.0063, 0.0065, 0.0073, 0.0067, 0.0066, 0.0097]], device='cuda:0',
       grad_fn=<MmBackward0>)
每个人最亲近的关系排序
['monica', 'rachel', 'phoebe', 'chandler', 'ross', 'joey']
['phoebe', 'rachel', 'monica', 'chandler', 'ross', 'joey']
['rachel', 'chandler', 'monica', 'ross', 'phoebe', 'joey']
['joey', 'rachel', 'ross', 'phoebe', 'chandler', 'monica']
['chandler', 'rachel', 'monica', 'phoebe', 'ross', 'joey']
['ross', 'rachel', 'joey', 'chandler', 'phoebe', 'monica']

 emb向量内积
tensor([[ 0.1195,  0.0043,  0.0049,  ...,  0.0340,  0.0336,  0.1354],
        [ 0.1373,  0.0044,  0.0053,  ...,  0.0429,  0.0395, -0.0699],
       

### 结果分析
[结果分析.txt](./%E7%BB%93%E6%9E%9C%E5%88%86%E6%9E%90.txt)