### 使用训练好的emb来分析数据

In [1]:
# Environment setup: switch the working directory to the sgd_nlp checkout and
# add its ./python folder to the module search path so `sgd_nlp` can be imported.
# NOTE(review): the absolute path below is machine-specific.
%cd /playground/sgd_deep_learning/sgd_nlp/
import sys 
sys.path.append('./python')

/playground/sgd_deep_learning/sgd_nlp


In [3]:
from sgd_nlp.embedding import *
from torch.optim.lr_scheduler import ExponentialLR
import torch
import time
import math
import pickle
import os
import sys

In [4]:
# Root directory (relative to the current working directory) under which the
# per-model corpus objects and weight files referenced below are stored.
SAVE_HOME = r'./apps/embedding/save/'

class glove_context:
    """Settings used by ``load_emb`` to restore the trained GloVe embeddings."""
    # Pickled corpus-factory object (read by load_corpus_factory).
    corpus_factory = os.path.join(SAVE_HOME, 'glove/corpus_obj.cf')
    # Saved state dict; load_emb also picks the model class from this path.
    model_path = os.path.join(SAVE_HOME, 'glove/glove_weights.path')
    # Constructor arguments forwarded to the model.
    emb_dim = 300
    sparse_emb = False
    # load_emb only forwards win_width to the CBOW model; unused for GloVe there.
    win_width = 11
    # Use the GPU when one is present, otherwise fall back to CPU so the
    # notebook still runs on hosts without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class skip_gram_context:
    """Settings used by ``load_emb`` to restore the trained skip-gram embeddings."""
    # Pickled corpus-factory object (read by load_corpus_factory).
    corpus_factory = os.path.join(SAVE_HOME, 'skipgram/corpus_obj.cf')
    # Saved state dict; load_emb also picks the model class from this path.
    model_path = os.path.join(SAVE_HOME, 'skipgram/skipgram_weights.path')
    # Constructor arguments forwarded to the model.
    emb_dim = 300
    sparse_emb = True
    # load_emb only forwards win_width to the CBOW model; unused for skip-gram there.
    win_width = 11
    # Use the GPU when one is present, otherwise fall back to CPU so the
    # notebook still runs on hosts without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
class cbow_context:
    """Settings used by ``load_emb`` to restore the trained CBOW embeddings."""
    # Pickled corpus-factory object (read by load_corpus_factory).
    corpus_factory = os.path.join(SAVE_HOME, 'cbow/corpus_obj.cf')
    # Saved state dict; load_emb also picks the model class from this path.
    model_path = os.path.join(SAVE_HOME, 'cbow/cbow_weights.path')
    # Constructor arguments forwarded to the model.
    emb_dim = 300
    sparse_emb = True
    # Passed to the Cbow constructor as its win_width argument.
    win_width = 11
    # Use the GPU when one is present, otherwise fall back to CPU so the
    # notebook still runs on hosts without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
def load_corpus_factory(obj_file_name):
    """Load a pickled corpus-factory object from disk.

    Args:
        obj_file_name: Path of the pickle file to read.

    Returns:
        The object stored in the pickle file.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Whatever ``pickle.load`` raises for unreadable content
            (e.g. ``EOFError`` for an empty file).
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only point this at files produced by a trusted training run.
    with open(obj_file_name, 'rb') as fin:
        corpus_factory = pickle.load(fin)

    # Announce success only once deserialization has actually finished.
    print("!!! load corpus factory success !!!")
    return corpus_factory

    
def load_emb(context):
    """Restore a trained embedding model and return its averaged embedding table.

    The model class is chosen by substring match on ``context.model_path``,
    checked in this order: ``'skip'`` -> SkipGram, ``'cbow'`` -> Cbow,
    ``'glove'`` -> Glove.

    Args:
        context: Settings object exposing ``corpus_factory`` (pickle path),
            ``model_path`` (state-dict path), ``emb_dim``, ``sparse_emb``,
            ``device`` and, for CBOW, ``win_width``.

    Returns:
        A ``(avg_emb, corpus_factory)`` tuple. ``avg_emb`` is the elementwise
        mean of the model's two embedding weight matrices (``emb_i`` and
        ``emb_j`` for GloVe, ``emb_i`` and ``emb_o`` otherwise), computed
        without autograd tracking; ``corpus_factory`` is the unpickled object.

    Raises:
        ValueError: If ``context.model_path`` matches none of the known models.
    """
    corpus_factory = load_corpus_factory(context.corpus_factory)
    token_num = corpus_factory.token_num()
    model_path = context.model_path

    # `second_table` names the attribute holding the second embedding matrix
    # that gets averaged with `emb_i`.
    if 'skip' in model_path:
        model = SkipGram(emb_dim=context.emb_dim,
                         token_num=token_num,
                         sparse_emb=context.sparse_emb)
        second_table = 'emb_o'
    elif 'cbow' in model_path:
        model = Cbow(emb_dim=context.emb_dim,
                     token_num=token_num,
                     win_width=context.win_width,
                     sparse_emb=context.sparse_emb)
        second_table = 'emb_o'
    elif 'glove' in model_path:
        model = Glove(emb_dim=context.emb_dim,
                      token_num=token_num,
                      sparse_emb=context.sparse_emb)
        second_table = 'emb_j'
    else:
        # Previously this fell through with model=None and crashed later with
        # an unhelpful AttributeError.
        raise ValueError(
            "cannot infer model type from model_path %r "
            "(expected it to contain 'skip', 'cbow' or 'glove')" % (model_path,))

    model = model.to(context.device)

    # map_location lets weights saved on one device be restored on another
    # (e.g. CUDA-trained weights on a CPU-only host).
    state_dict = torch.load(model_path, map_location=context.device)
    model.load_state_dict(state_dict)
    print("!!! Load model weights success !!!")

    # The result is only used for analysis, so skip building an autograd graph.
    with torch.no_grad():
        avg_emb = (model.emb_i.weight + getattr(model, second_table).weight) / 2

    return avg_emb, corpus_factory
    


In [6]:
# Load the skip-gram embeddings and print a quick summary of the tensor and
# of the vocabulary attached to the corpus factory.
avg_emb, corpus_factory = load_emb(skip_gram_context)
print(type(avg_emb))
print(avg_emb)
print('avg_emb', avg_emb.shape)
# NOTE(review): in the recorded output the embedding has 23707 rows while
# vocab.token_num() reports 23697 -- confirm where the extra rows come from.
print('token_num',corpus_factory.vocab.token_num())
print()
print(corpus_factory.vocab.log_info())

!!! load corpus factory success !!!
!!! Load model weights success !!!
<class 'torch.Tensor'>
tensor([[-9.2314e-01,  2.0012e+00, -1.9468e+00,  ..., -2.2160e-01,
         -1.5083e+00,  1.6486e+00],
        [ 2.7495e-01, -8.3205e-01,  2.2257e+00,  ...,  7.7470e-01,
         -1.0426e+00, -2.7914e+00],
        [ 1.5068e+00, -7.9358e-01,  1.1863e+00,  ..., -4.7400e-01,
         -6.3848e-01, -8.9149e-01],
        ...,
        [-9.9248e-01,  6.0319e-02,  9.2521e-01,  ...,  4.8005e-01,
          4.0357e-01,  4.3409e-01],
        [-2.3199e-01, -4.9969e-01, -3.6279e-01,  ...,  8.7473e-01,
         -1.1759e+00, -2.7884e-01],
        [-5.3532e-01,  4.0148e-01,  1.0868e+00,  ..., -1.5217e+00,
          8.2015e-01,  1.3501e-03]], device='cuda:0', grad_fn=<DivBackward0>)
avg_emb torch.Size([23707, 300])
token_num 23697

****** VOCAB LOG INFO ******
corpus_word_num: 887249
vocab_size: 23697
word_freq_count: 
[('the', 26050), ('you', 24618), ('i', 24456), ('to', 19978), ('and', 19046), ('a', 15636), ('

### 老友记6个人的亲密关系排序

``` python
['monica', 'phoebe', 'rachel', 'joey', 'chandler', 'ross',]
```

In [7]:
def analysis(emb, corpus_factory, friends=None):
    """Print how close each character's embedding is to the others'.

    Closeness is the raw inner product between embedding rows; for every
    character the names are printed in descending order of that score. The
    character's own name takes part in the ranking (diagonal entry).

    Args:
        emb: 2-D tensor of embeddings indexed by token id.
        corpus_factory: Object whose ``vocab[list_of_tokens]`` yields the
            matching token ids.
        friends: Optional sequence of names to compare. Defaults to the six
            main characters of "Friends".

    Returns:
        None. Results are printed.
    """
    if friends is None:
        friends = ['monica', 'phoebe', 'rachel', 'joey', 'chandler', 'ross']
    friends = list(friends)

    friends_id = corpus_factory.vocab[friends]
    friends_emb = emb[friends_id]

    # Pairwise inner products (Gram matrix) between the selected rows. This is
    # not a covariance: the vectors are neither centred nor normalised.
    cov = friends_emb.mm(friends_emb.transpose(0, 1))
    print('\n emb向量内积')
    print(cov)

    sort_id = torch.argsort(cov, dim=1, descending=True)

    print("每个人最亲近的关系排序")
    # Iterate over the actual rows instead of assuming exactly six names.
    for ranked in sort_id.tolist():
        print([friends[j] for j in ranked])

### 找出每个人最相关的top-20的词
def find_topk(emb, corpus_factory, k=20, friends=None):
    """Print, for each character, the ``k`` tokens with the largest inner product.

    Args:
        emb: 2-D tensor of embeddings indexed by token id.
        corpus_factory: Object whose ``vocab[list_of_tokens]`` yields token ids
            and whose ``vocab.to_tokens(list_of_ids)`` maps ids back to tokens.
        k: Number of top-scoring tokens to show per character. Values larger
            than the vocabulary simply show every token.
        friends: Optional sequence of names to query. Defaults to the six
            main characters of "Friends".

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got %r" % (k,))
    if friends is None:
        friends = ['monica', 'phoebe', 'rachel', 'joey', 'chandler', 'ross']
    friends = list(friends)

    friends_id = corpus_factory.vocab[friends]
    friends_emb = emb[friends_id]

    # Inner product of every selected row with every vocabulary row:
    # one score row per name, one column per row of `emb`.
    cov = friends_emb.mm(emb.transpose(0, 1))
    print('\n emb向量内积')
    print(cov)

    # Honour the caller's k (the slice used to be hard-coded to 20).
    sort_id = torch.argsort(cov, dim=1, descending=True)[:, :k]

    print(f"每个人关系最紧密的topk-{k}的词")
    # Iterate over the actual rows instead of assuming exactly six names.
    for ranked in sort_id.tolist():
        print([corpus_factory.vocab.to_tokens(ranked)])
            



In [8]:
# Skip-gram: reload the embeddings, then print the pairwise ranking of the six
# characters and each character's top-scoring vocabulary tokens.
avg_emb, corpus_factory = load_emb(skip_gram_context)
analysis(avg_emb, corpus_factory)
find_topk(avg_emb, corpus_factory)

!!! load corpus factory success !!!
!!! Load model weights success !!!

 emb向量内积
tensor([[ 3.9516e+02,  3.1970e+01,  1.7339e+01,  4.8221e-01,  4.1077e+01,
          1.1537e+00],
        [ 3.1970e+01,  4.0726e+02, -7.8354e+00, -1.5132e+01, -3.7384e+01,
          2.9914e+01],
        [ 1.7339e+01, -7.8354e+00,  4.5861e+02,  6.2885e+01,  3.0006e+01,
         -3.7074e+01],
        [ 4.8221e-01, -1.5132e+01,  6.2885e+01,  5.1977e+02,  3.3129e+01,
          2.1989e+01],
        [ 4.1077e+01, -3.7384e+01,  3.0006e+01,  3.3129e+01,  4.1682e+02,
          4.7946e+01],
        [ 1.1537e+00,  2.9914e+01, -3.7074e+01,  2.1989e+01,  4.7946e+01,
          4.1671e+02]], device='cuda:0', grad_fn=<MmBackward0>)
每个人最亲近的关系排序
['monica', 'chandler', 'phoebe', 'rachel', 'ross', 'joey']
['phoebe', 'monica', 'ross', 'rachel', 'joey', 'chandler']
['rachel', 'joey', 'chandler', 'monica', 'phoebe', 'ross']
['joey', 'rachel', 'chandler', 'ross', 'monica', 'phoebe']
['chandler', 'ross', 'monica', 'joey', 'rachel',

In [9]:
# CBOW: load the embeddings, then print the pairwise ranking of the six
# characters and each character's top-scoring vocabulary tokens.
avg_emb, corpus_factory = load_emb(cbow_context)
analysis(avg_emb, corpus_factory)
find_topk(avg_emb, corpus_factory)

!!! load corpus factory success !!!
!!! Load model weights success !!!

 emb向量内积
tensor([[12.4432,  2.1130,  3.0443,  2.3352,  3.1008,  0.8121],
        [ 2.1130, 16.7685,  2.9680,  1.5070,  1.4071,  0.8910],
        [ 3.0443,  2.9680, 11.7208,  2.0097,  1.6701,  1.6479],
        [ 2.3352,  1.5070,  2.0097, 12.6702,  3.6263,  1.6323],
        [ 3.1008,  1.4071,  1.6701,  3.6263, 10.4537,  1.7439],
        [ 0.8121,  0.8910,  1.6479,  1.6323,  1.7439,  8.2177]],
       device='cuda:0', grad_fn=<MmBackward0>)
每个人最亲近的关系排序
['monica', 'chandler', 'rachel', 'joey', 'phoebe', 'ross']
['phoebe', 'rachel', 'monica', 'joey', 'chandler', 'ross']
['rachel', 'monica', 'phoebe', 'joey', 'chandler', 'ross']
['joey', 'chandler', 'monica', 'rachel', 'ross', 'phoebe']
['chandler', 'joey', 'monica', 'ross', 'rachel', 'phoebe']
['ross', 'chandler', 'rachel', 'joey', 'phoebe', 'monica']

 emb向量内积
tensor([[-0.9886,  0.4517,  0.7783,  ...,  0.7079, -2.0263,  2.0932],
        [-4.7495, -0.1710,  0.3887,  ...,

In [10]:
# GloVe: load the embeddings, then print the pairwise ranking of the six
# characters and each character's top-scoring vocabulary tokens.
avg_emb, corpus_factory = load_emb(glove_context)
analysis(avg_emb, corpus_factory)
find_topk(avg_emb, corpus_factory)

!!! load corpus factory success !!!
!!! Load model weights success !!!

 emb向量内积
tensor([[ 1.0646e+00,  1.3017e-02, -2.3883e-03,  1.1614e-01, -4.0152e-03,
          7.0150e-02],
        [ 1.3017e-02,  4.7444e+00,  3.7048e-01,  2.7121e-01, -3.4167e-02,
          2.2401e-01],
        [-2.3883e-03,  3.7048e-01,  1.0681e+00,  1.0799e-02, -3.5361e-02,
          4.9005e-02],
        [ 1.1614e-01,  2.7121e-01,  1.0799e-02,  6.5963e-01, -3.0109e-03,
          4.9508e-02],
        [-4.0152e-03, -3.4167e-02, -3.5361e-02, -3.0109e-03,  3.5225e-01,
          6.5839e-02],
        [ 7.0150e-02,  2.2401e-01,  4.9005e-02,  4.9508e-02,  6.5839e-02,
          3.4042e-01]], device='cuda:0', grad_fn=<MmBackward0>)
每个人最亲近的关系排序
['monica', 'joey', 'ross', 'phoebe', 'rachel', 'chandler']
['phoebe', 'rachel', 'joey', 'ross', 'monica', 'chandler']
['rachel', 'phoebe', 'ross', 'joey', 'monica', 'chandler']
['joey', 'phoebe', 'monica', 'ross', 'rachel', 'chandler']
['chandler', 'ross', 'joey', 'monica', 'phoebe',

### 结果分析
[结果分析.txt](./%E7%BB%93%E6%9E%9C%E5%88%86%E6%9E%90.txt)