In [1]:
## requires: pytorch, transformers, flash-attn, pandas, numpy
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face checkpoint id shared by the tokenizer and the model.
model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights in float16 and request the FlashAttention-2 attention
# implementation; `trust_remote_code=True` allows the modelling code that
# ships with the checkpoint to be executed.
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.float16,
)
# The model is created first and only afterwards moved to the GPU.
model = model.to("cuda")
# Put the module in evaluation mode; the returned module is the cell's output.
model.eval()

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.87s/it]


MiniCPMModel(
  (embed_tokens): Embedding(122753, 2304)
  (layers): ModuleList(
    (0-39): 40 x MiniCPMDecoderLayer(
      (self_attn): MiniCPMFlashAttention2(
        (q_proj): Linear(in_features=2304, out_features=2304, bias=False)
        (k_proj): Linear(in_features=2304, out_features=2304, bias=False)
        (v_proj): Linear(in_features=2304, out_features=2304, bias=False)
        (o_proj): Linear(in_features=2304, out_features=2304, bias=False)
        (rotary_emb): MiniCPMRotaryEmbedding()
      )
      (mlp): MiniCPMMLP(
        (gate_proj): Linear(in_features=2304, out_features=5760, bias=False)
        (up_proj): Linear(in_features=2304, out_features=5760, bias=False)
        (down_proj): Linear(in_features=5760, out_features=2304, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): MiniCPMRMSNorm()
      (post_attention_layernorm): MiniCPMRMSNorm()
    )
  )
  (norm): MiniCPMRMSNorm()
)

In [8]:
query = [
    "What is the capital of France?",
    "What is the capital of Italy?",
    "Where is the Eiffel Tower?",
]

# Tokenise all questions as a single padded batch (truncated to 512 tokens)
# and move the resulting tensors to the GPU.
batch_dict = tokenizer(
    query,
    max_length=512,
    padding=True,
    truncation=True,
    return_tensors='pt',
    return_attention_mask=True,
).to("cuda")
attention_mask = batch_dict["attention_mask"]

# Forward pass. Gradient tracking is not disabled in this cell, so the
# tensors derived from `hidden` below still carry autograd history.
outputs = model(**batch_dict)
hidden = outputs.last_hidden_state
hidden

tensor([[[ -5.5703, -17.1250,  -0.3840,  ...,  -0.7285,   1.3389,   3.2480],
         [ -2.2031,  -6.3750,   8.9688,  ...,  -4.9102,  -3.3594,   3.4863],
         [  4.1680,  -4.1484,   1.1309,  ...,  -2.8320,  -5.7852,  -0.4636],
         ...,
         [  4.1562,   2.1934,  -0.9971,  ...,  -1.7559,  -6.4180,  -0.4285],
         [  3.4590,   0.4956,  -0.5093,  ...,  -3.2441,  -5.9414,   0.2297],
         [  4.0977,  -2.1309,  -1.6572,  ...,  -2.3574,  -6.5664,  -1.1680]],

        [[ -5.5703, -17.1250,  -0.3840,  ...,  -0.7285,   1.3389,   3.2480],
         [ -2.3281,  -6.4648,   9.0547,  ...,  -4.8359,  -3.3379,   3.4766],
         [ -1.2227,  -8.3906,   2.4277,  ...,  -2.2012,  -7.9492,   1.8818],
         ...,
         [ -1.2764,  -0.7939,   0.7637,  ...,  -0.9521,  -9.0078,   1.6562],
         [ -2.9512,  -2.9316,   0.4612,  ...,   0.1503,  -7.5820,   1.1572],
         [ -2.1094,  -5.2891,  -0.8242,  ...,  -1.5107,  -8.1719,   1.2773]],

        [[ -2.1094,  -6.8438,   8.9688,  ...

In [46]:
## Position weights: the running count of unmasked tokens, zeroed wherever the
## mask is 0 (a mask row [1, 1, 1, 0] becomes [1, 2, 3, 0]).
attention_mask_ = attention_mask.cumsum(dim=1) * attention_mask

## Append a trailing axis so the weights broadcast against the hidden states,
## then collapse dimension 1 to obtain one weighted sum per sentence.
weights = attention_mask_.unsqueeze(-1).float()
s = (hidden * weights).sum(dim=1)

## Total weight of each sentence in the batch (kept as a column for broadcasting).
d = attention_mask_.sum(dim=1, keepdim=True).float()

## Weighted average = weighted sum / total weight.
reps = s / d

reps

tensor([[ 3.2854, -0.3959, -0.6500,  ..., -2.6827, -6.2625, -0.2065],
        [-2.2898, -3.8697,  0.6259,  ..., -1.3842, -8.1982,  1.8221],
        [ 4.9791, -2.4095,  0.4534,  ...,  5.7205, -1.8437,  5.3055]],
       device='cuda:0', grad_fn=<DivBackward0>)

In [45]:
## Inspect the total position weight per sentence (the same expression that is assigned to `d`).
attention_mask_.sum(dim=1, keepdim=True).float()

tensor([[36.],
        [36.],
        [45.]], device='cuda:0')

In [34]:
## Inspect the un-normalised weighted sum per sentence (the same expression that is assigned to `s`).
torch.sum(hidden * attention_mask_.unsqueeze(-1).float(), dim=1)

tensor([[ 118.2734,  -14.2524,  -23.3992,  ...,  -96.5781, -225.4492,
           -7.4358],
        [ -82.4316, -139.3105,   22.5315,  ...,  -49.8329, -295.1348,
           65.5967],
        [ 224.0586, -108.4258,   20.4042,  ...,  257.4219,  -82.9683,
          238.7461]], device='cuda:0', grad_fn=<SumBackward1>)

In [61]:
# Scale every pooled vector to unit L2 norm, then detach it from autograd
# and copy it to host memory as a NumPy array.
unit_reps = F.normalize(reps, p=2, dim=1)
lst = unit_reps.detach().cpu().numpy()
lst

array([[ 0.01396157, -0.00168243, -0.00276215, ..., -0.01140055,
        -0.02661312, -0.00087776],
       [-0.00965947, -0.01632464,  0.00264028, ..., -0.0058395 ,
        -0.03458437,  0.00768673],
       [ 0.02103702, -0.01018018,  0.00191576, ...,  0.02416953,
        -0.00778995,  0.02241604]], dtype=float32)

In [63]:
## Quick check that the embeddings are normalised: the L2 norm of every row
## of `lst` should be 1 up to floating-point rounding.
np.linalg.norm(lst, axis=1)

array([1.        , 1.        , 0.99999994], dtype=float32)

In [64]:
def weighted_mean_pooling(hidden, attention_mask):
    """Pool per-token states into one vector per sequence with position weights.

    Within each row of ``attention_mask`` the i-th unmasked token receives
    weight ``i`` (1, 2, 3, ...) and masked tokens receive weight 0, so tokens
    that come later in the sequence contribute more to the average.

    Args:
        hidden: Tensor whose dimension 0 is the batch, dimension 1 the token
            position and last dimension the feature size.
        attention_mask: Tensor of 0/1 values covering the first two
            dimensions of ``hidden``.

    Returns:
        A float tensor with one pooled vector per batch row. A row whose mask
        is entirely zero yields a zero vector rather than NaN.
    """
    weights = attention_mask * attention_mask.cumsum(dim=1)
    # Add a trailing axis so the weights broadcast over the feature dimension.
    weighted_sum = torch.sum(hidden * weights.unsqueeze(-1).float(), dim=1)
    # The weights are non-negative integers, so any row with at least one
    # unmasked token already sums to >= 1; clamping only affects fully masked
    # rows, where it prevents a 0 / 0 division.
    total_weight = weights.sum(dim=1, keepdim=True).float().clamp(min=1.0)
    return weighted_sum / total_weight

@torch.no_grad()
def encode(input_texts, max_length=512):
    """Embed one or more texts as L2-normalised vectors.

    Uses the notebook-level ``tokenizer`` and ``model`` and pools the final
    hidden states with ``weighted_mean_pooling``.

    Args:
        input_texts: A single string or an iterable of strings.
        max_length: Token limit handed to the tokenizer; longer inputs are
            truncated. Defaults to 512.

    Returns:
        A NumPy array with one L2-normalised row per input text.
    """
    # Normalise the input to a plain list so a bare string and any iterable of
    # strings (for example a pandas Series) are handled the same way.
    if isinstance(input_texts, str):
        input_texts = [input_texts]
    else:
        input_texts = list(input_texts)

    # Put the batch on whatever device the model lives on instead of
    # repeating a hard-coded device string.
    batch_dict = tokenizer(
        input_texts,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True,
    ).to(model.device)

    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = weighted_mean_pooling(hidden, attention_mask)
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

In [76]:
ans = ['Paris', 'Rome', 'Beijing']

# Encode every candidate answer once. Each answer is still encoded on its own
# (not as a padded batch) so the vectors match what a per-pair call produces,
# but the model is no longer re-run for the same answer on every query.
# NOTE(review): the later retrieval cell prepends "Query: " to its queries,
# while the questions here are encoded without that prefix; confirm whether
# that is intended.
ans_embeddings = [encode([a]) for a in ans]

# For each question, print its similarity (dot product of the normalised
# embeddings) with every candidate answer.
for q in query:
    emb = encode([q])
    for a, emb_a in zip(ans, ans_embeddings):
        sim = np.dot(emb, emb_a.T)
        print(f"Q: {q}, A: {a}, sim: {sim[0][0]:.4f}")

Q: What is the capital of France?, A: Paris, sim: 0.5011
Q: What is the capital of France?, A: Rome, sim: 0.3838
Q: What is the capital of France?, A: Beijing, sim: 0.3836
Q: What is the capital of Italy?, A: Paris, sim: 0.3739
Q: What is the capital of Italy?, A: Rome, sim: 0.4121
Q: What is the capital of Italy?, A: Beijing, sim: 0.3550
Q: Where is the Eiffel Tower?, A: Paris, sim: 0.4380
Q: Where is the Eiffel Tower?, A: Rome, sim: 0.3241
Q: Where is the Eiffel Tower?, A: Beijing, sim: 0.3052


In [5]:
# Prefix placed in front of every query before it is embedded; passages are
# embedded as-is.
INSTRUCTION = "Query: "

passages = ["beijing", "shanghai"]
queries = [f"{INSTRUCTION}{text}" for text in ["中国的首都是哪里？"]]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

# Rows index queries, columns index passages; entries are dot products of the
# normalised embeddings.
scores = np.matmul(embeddings_query, embeddings_doc.T)
# Roughly [[0.354, 0.186]]; the trailing digits differ from run to run.
print(scores.tolist())

[[0.35365185141563416, 0.18593288958072662]]


In [81]:
# Load the story table from CSV and display it.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# suggests the file was saved together with its index. Consider `index_col=0`,
# but confirm first, because later cells write this frame back out.
df = pd.read_csv("stories/stories_cn_oesz.csv")
df

Unnamed: 0,name,url,category,content
0,沙漏的启示_睡前故事,http://www.oesz.cn/shuiqian/11151.html,睡前故事,\n\n\t小男孩弗兰克非常健谈，如果他发现件新鲜东西，肯定会提出许多许多的问题来。他的母亲...
1,酒爷爷与手套奶奶,http://www.oesz.cn/shuiqian/11150.html,睡前故事,\n\n\t酒爷爷与手套奶奶酒爷爷喝醉了酒，在路上抱了一只小狗回家。醒来后，酒爷爷看见一只小...
2,南瓜哪儿去了_睡前故事,http://www.oesz.cn/shuiqian/11149.html,睡前故事,\n\n\t南瓜哪儿去了_睡前故事“南瓜?”小兔子说，“请原谅我，我不知道南瓜是你家的，它就...
3,蜘蛛夫妇的裁缝店,http://www.oesz.cn/shuiqian/11148.html,睡前故事,\n\n\t蜘蛛夫妇的裁缝店蜘蛛先生发现了一个商机，它决定和他的夫人一起开一家裁缝店，蜘蛛太...
4,知错能改才能进步,http://www.oesz.cn/shuiqian/11147.html,睡前故事,\n\n\t小猫咪咪看着太阳好，于是在外面玩，但是它抓不到蝴蝶，于是看到蜻蜓抓到蝴蝶心里很不...
...,...,...,...,...
8111,圆球公主_【公主童话故事】_ 故事大全阅读频道,http://www.oesz.cn/a/gongzhu/855.html,故事大全,\n\n\t有一天，王子照常在院子里的凉台上玩黄金球，抛抛转转。不一会儿，看见一个从来没有见...
8112,白雪公主,http://www.oesz.cn/a/gongzhu/854.html,故事大全,\n\n\t严冬时节，鹅毛一样的大雪片在天空中到处飞舞着，有一个王后坐在王宫里的一扇窗子边，...
8113,骄做的公主_【公主童话故事】_ 故事大全阅读频道,http://www.oesz.cn/a/gongzhu/853.html,故事大全,\n\n\t从前有一位非常漂亮的公主，但她也非常的骄做。人们都知道她是一位骄做的公主。很多王...
8114,苦命姑娘_【公主童话故事】_ 故事大全阅读频道,http://www.oesz.cn/a/gongzhu/852.html,故事大全,\n\n\t苦命姑娘_【公主童话故事】_ 故事大全阅读频道相传，从前一位国王和王后有七个女孩...


In [84]:
# Fields that are merged, in this order, into the text that gets embedded.
columns = ['name', 'content', 'category']


def _join_fields(row):
    """Render the non-missing fields of one row as 'col: value' parts joined by '|'."""
    parts = []
    for col in columns:
        if pd.notna(row[col]):
            parts.append(f"{col}: {str(row[col]).strip()}")
    return '|'.join(parts)


df.loc[:, 'combined'] = df.apply(_join_fields, axis=1)

df

Unnamed: 0,name,url,category,content,combined
0,沙漏的启示_睡前故事,http://www.oesz.cn/shuiqian/11151.html,睡前故事,\n\n\t小男孩弗兰克非常健谈，如果他发现件新鲜东西，肯定会提出许多许多的问题来。他的母亲...,name: 沙漏的启示_睡前故事|content: 小男孩弗兰克非常健谈，如果他发现件新鲜东...
1,酒爷爷与手套奶奶,http://www.oesz.cn/shuiqian/11150.html,睡前故事,\n\n\t酒爷爷与手套奶奶酒爷爷喝醉了酒，在路上抱了一只小狗回家。醒来后，酒爷爷看见一只小...,name: 酒爷爷与手套奶奶|content: 酒爷爷与手套奶奶酒爷爷喝醉了酒，在路上抱了一...
2,南瓜哪儿去了_睡前故事,http://www.oesz.cn/shuiqian/11149.html,睡前故事,\n\n\t南瓜哪儿去了_睡前故事“南瓜?”小兔子说，“请原谅我，我不知道南瓜是你家的，它就...,name: 南瓜哪儿去了_睡前故事|content: 南瓜哪儿去了_睡前故事“南瓜?”小兔子...
3,蜘蛛夫妇的裁缝店,http://www.oesz.cn/shuiqian/11148.html,睡前故事,\n\n\t蜘蛛夫妇的裁缝店蜘蛛先生发现了一个商机，它决定和他的夫人一起开一家裁缝店，蜘蛛太...,name: 蜘蛛夫妇的裁缝店|content: 蜘蛛夫妇的裁缝店蜘蛛先生发现了一个商机，它决...
4,知错能改才能进步,http://www.oesz.cn/shuiqian/11147.html,睡前故事,\n\n\t小猫咪咪看着太阳好，于是在外面玩，但是它抓不到蝴蝶，于是看到蜻蜓抓到蝴蝶心里很不...,name: 知错能改才能进步|content: 小猫咪咪看着太阳好，于是在外面玩，但是它抓不...
...,...,...,...,...,...
8111,圆球公主_【公主童话故事】_ 故事大全阅读频道,http://www.oesz.cn/a/gongzhu/855.html,故事大全,\n\n\t有一天，王子照常在院子里的凉台上玩黄金球，抛抛转转。不一会儿，看见一个从来没有见...,name: 圆球公主_【公主童话故事】_ 故事大全阅读频道|content: 有一天，王子照...
8112,白雪公主,http://www.oesz.cn/a/gongzhu/854.html,故事大全,\n\n\t严冬时节，鹅毛一样的大雪片在天空中到处飞舞着，有一个王后坐在王宫里的一扇窗子边，...,name: 白雪公主|content: 严冬时节，鹅毛一样的大雪片在天空中到处飞舞着，有一个...
8113,骄做的公主_【公主童话故事】_ 故事大全阅读频道,http://www.oesz.cn/a/gongzhu/853.html,故事大全,\n\n\t从前有一位非常漂亮的公主，但她也非常的骄做。人们都知道她是一位骄做的公主。很多王...,name: 骄做的公主_【公主童话故事】_ 故事大全阅读频道|content: 从前有一位非...
8114,苦命姑娘_【公主童话故事】_ 故事大全阅读频道,http://www.oesz.cn/a/gongzhu/852.html,故事大全,\n\n\t苦命姑娘_【公主童话故事】_ 故事大全阅读频道相传，从前一位国王和王后有七个女孩...,name: 苦命姑娘_【公主童话故事】_ 故事大全阅读频道|content: 苦命姑娘_【公...


In [87]:
# Embed only the first row's combined text as a smoke test; `encode` returns
# one row per input, so take row 0 of its result.
first_text = df.combined[0]
first_embedding = encode(first_text)[0]

# (Re)create the column filled with None, then store the vector in row 0.
df.loc[:, 'embedding'] = None
df.at[0, 'embedding'] = first_embedding
df

Unnamed: 0,name,url,category,content,combined,embedding
0,沙漏的启示_睡前故事,http://www.oesz.cn/shuiqian/11151.html,睡前故事,\n\n\t小男孩弗兰克非常健谈，如果他发现件新鲜东西，肯定会提出许多许多的问题来。他的母亲...,name: 沙漏的启示_睡前故事|content: 小男孩弗兰克非常健谈，如果他发现件新鲜东...,"[-0.00498313, 0.06906862, 0.021018654, -0.0089..."
1,酒爷爷与手套奶奶,http://www.oesz.cn/shuiqian/11150.html,睡前故事,\n\n\t酒爷爷与手套奶奶酒爷爷喝醉了酒，在路上抱了一只小狗回家。醒来后，酒爷爷看见一只小...,name: 酒爷爷与手套奶奶|content: 酒爷爷与手套奶奶酒爷爷喝醉了酒，在路上抱了一...,
2,南瓜哪儿去了_睡前故事,http://www.oesz.cn/shuiqian/11149.html,睡前故事,\n\n\t南瓜哪儿去了_睡前故事“南瓜?”小兔子说，“请原谅我，我不知道南瓜是你家的，它就...,name: 南瓜哪儿去了_睡前故事|content: 南瓜哪儿去了_睡前故事“南瓜?”小兔子...,
3,蜘蛛夫妇的裁缝店,http://www.oesz.cn/shuiqian/11148.html,睡前故事,\n\n\t蜘蛛夫妇的裁缝店蜘蛛先生发现了一个商机，它决定和他的夫人一起开一家裁缝店，蜘蛛太...,name: 蜘蛛夫妇的裁缝店|content: 蜘蛛夫妇的裁缝店蜘蛛先生发现了一个商机，它决...,
4,知错能改才能进步,http://www.oesz.cn/shuiqian/11147.html,睡前故事,\n\n\t小猫咪咪看着太阳好，于是在外面玩，但是它抓不到蝴蝶，于是看到蜻蜓抓到蝴蝶心里很不...,name: 知错能改才能进步|content: 小猫咪咪看着太阳好，于是在外面玩，但是它抓不...,
...,...,...,...,...,...,...
8111,圆球公主_【公主童话故事】_ 故事大全阅读频道,http://www.oesz.cn/a/gongzhu/855.html,故事大全,\n\n\t有一天，王子照常在院子里的凉台上玩黄金球，抛抛转转。不一会儿，看见一个从来没有见...,name: 圆球公主_【公主童话故事】_ 故事大全阅读频道|content: 有一天，王子照...,
8112,白雪公主,http://www.oesz.cn/a/gongzhu/854.html,故事大全,\n\n\t严冬时节，鹅毛一样的大雪片在天空中到处飞舞着，有一个王后坐在王宫里的一扇窗子边，...,name: 白雪公主|content: 严冬时节，鹅毛一样的大雪片在天空中到处飞舞着，有一个...,
8113,骄做的公主_【公主童话故事】_ 故事大全阅读频道,http://www.oesz.cn/a/gongzhu/853.html,故事大全,\n\n\t从前有一位非常漂亮的公主，但她也非常的骄做。人们都知道她是一位骄做的公主。很多王...,name: 骄做的公主_【公主童话故事】_ 故事大全阅读频道|content: 从前有一位非...,
8114,苦命姑娘_【公主童话故事】_ 故事大全阅读频道,http://www.oesz.cn/a/gongzhu/852.html,故事大全,\n\n\t苦命姑娘_【公主童话故事】_ 故事大全阅读频道相传，从前一位国王和王后有七个女孩...,name: 苦命姑娘_【公主童话故事】_ 故事大全阅读频道|content: 苦命姑娘_【公...,


In [88]:
# Reset the column, then embed the first 10 rows one text at a time.
df.loc[:, 'embedding'] = None

embeddings = []
for i in range(10):
    # Progress message every 100 rows.
    if not i % 100:
        print(f"Processing row {i}")
    row_vectors = encode(df.combined[i])
    # `encode` returns a 2-D array; collect each of its rows separately.
    for vector in row_vectors:
        embeddings.append(vector)

embeddings

Processing row 0


[array([-0.00498313,  0.06906862,  0.02101865, ...,  0.00796567,
         0.01396937,  0.00395185], dtype=float32),
 array([ 0.0116552 , -0.03210025, -0.01857175, ...,  0.00095958,
        -0.01962594,  0.01796045], dtype=float32),
 array([ 0.03651097, -0.01062996,  0.0077906 , ...,  0.02704671,
        -0.02432447, -0.02019073], dtype=float32),
 array([ 0.04392185,  0.02777353,  0.01988   , ...,  0.00897329,
        -0.00743456, -0.00084875], dtype=float32),
 array([-0.01278089,  0.0137676 ,  0.02348899, ..., -0.00573102,
        -0.02369383,  0.00437704], dtype=float32),
 array([ 0.03910792,  0.01769857,  0.00804373, ...,  0.03112897,
        -0.00180959,  0.01069625], dtype=float32),
 array([ 0.01713266, -0.01267827,  0.01214441, ...,  0.03826563,
        -0.00708288,  0.00719004], dtype=float32),
 array([ 0.01166246, -0.00011373,  0.00472806, ...,  0.00481325,
        -0.00488158,  0.01568454], dtype=float32),
 array([ 0.02854568, -0.00621637,  0.00772317, ..., -0.00466084,
       

In [90]:
# Copy each computed vector into the frame row with the matching index label.
for i in range(len(embeddings)):
    embedding = embeddings[i]
    print(i)
    df.at[i, 'embedding'] = embedding

0
1
2
3
4
5
6
7
8
9


In [94]:
# Serialise each embedding as a complete list literal before writing. An
# ndarray held in an object column is written through its string form, and
# NumPy abbreviates arrays with more than 1000 elements using "...", so the
# 2304-dimensional vectors would otherwise be truncated in the CSV. The text
# written here can be parsed back with `json.loads` or `ast.literal_eval`.
# Rows without an embedding are passed through unchanged, and `df` itself
# keeps its ndarray values because `assign` works on a copy.
df.assign(
    embedding=df['embedding'].map(
        lambda v: str(v.tolist()) if isinstance(v, np.ndarray) else v
    )
).to_csv("stories/stories_cn_oesz_embeddings.csv", index=False)