In [1]:
from transformers import BertTokenizer, BertModel
import torch
import os

# 设置 HTTP 和 HTTPS 代理
os.environ['HTTP_PROXY'] = 'http://localhost:7890'
os.environ['HTTPS_PROXY'] = 'http://localhost:7890'
# 加载预训练的BERT模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')




In [4]:
import spacy
import spacy.cli

# Ensure the model is installed
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    spacy.cli.download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

# Now you can use the model
sentence = "This is a test sentence."
doc = nlp(sentence)

for token in doc:
    print(f"Token: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Token: This, POS: PRON, Dep: nsubj, Head: is
Token: is, POS: AUX, Dep: ROOT, Head: is
Token: a, POS: DET, Dep: det, Head: sentence
Token: test, POS: NOUN, Dep: compound, Head: sentence
Token: sentence, POS: NOUN, Dep: attr, Head: is
Token: ., POS: PUNCT, Dep: punct, Head: is


In [5]:
import spacy

# 加载英文小型模型
nlp = spacy.load('en_core_web_sm')


In [6]:
# 示例句子
sentence = "This is a test sentence."

# 进行语法解析
doc = nlp(sentence)

# 打印每个词的词性和依存关系
for token in doc:
    print(f"Token: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

# 可视化依存关系（需要安装displacy）
from spacy import displacy

displacy.render(doc, style='dep', jupyter=True)


Token: This, POS: PRON, Dep: nsubj, Head: is
Token: is, POS: AUX, Dep: ROOT, Head: is
Token: a, POS: DET, Dep: det, Head: sentence
Token: test, POS: NOUN, Dep: compound, Head: sentence
Token: sentence, POS: NOUN, Dep: attr, Head: is
Token: ., POS: PUNCT, Dep: punct, Head: is


In [10]:
from transformers.models.bert.modeling_bert import BertAttention, BertModel, BertPreTrainedModel

class CustomBertAttention(BertAttention):
    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, past_key_value=None):
        # 调用父类的前向方法
        self_attention_outputs = super().forward(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            past_key_value=past_key_value
        )
        
        attention_output = self_attention_outputs[0]
        attention_weights = self_attention_outputs[1] if output_attentions else None
        
        # 假设我们希望提升第 3 个位置的权重
        boost_index = 3
        boost_factor = 2.0
        
        if attention_weights is not None:
            attention_weights[:, :, :, boost_index] *= boost_factor
        
        return (attention_output, attention_weights)

class CustomBertModel(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.bert.encoder.layer[0].attention = CustomBertAttention(config)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)
        return outputs

# 使用自定义模型
custom_model = CustomBertModel.from_pretrained('bert-base-uncased')




In [11]:
def get_sentence_embedding(model, tokenizer, sentence):
    # 对句子进行编码
    inputs = tokenizer(sentence, return_tensors='pt')
    
    # 获取模型输出
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 使用 [CLS] token 的隐藏状态作为句子的 embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    
    return cls_embedding

# 示例句子
sentence = "This is a test sentence."

# 计算原始模型的 embedding
original_embedding = get_sentence_embedding(model, tokenizer, sentence)

# 计算自定义模型的 embedding
custom_embedding = get_sentence_embedding(custom_model, tokenizer, sentence)


In [15]:
from scipy.spatial.distance import cosine

# Flatten the embeddings to 1-D
original_embedding = original_embedding.flatten()
custom_embedding = custom_embedding.flatten()

# 计算欧氏距离
euclidean_distance = torch.norm(original_embedding - custom_embedding).item()

# 计算余弦相似度
cosine_similarity = 1 - cosine(original_embedding.numpy(), custom_embedding.numpy())

print(f"Euclidean Distance: {euclidean_distance}")
print(f"Cosine Similarity: {cosine_similarity}")


Euclidean Distance: 0.0
Cosine Similarity: 0.9999999907397544
