<a href="https://colab.research.google.com/github/rax125896343/rax/blob/main/raxlabmeeting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install hanlp
!pip install transformers tokenizers

Collecting hanlp
  Downloading hanlp-2.1.0b50-py3-none-any.whl (651 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m651.1/651.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hanlp-common>=0.0.19 (from hanlp)
  Downloading hanlp_common-0.0.19.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hanlp-downloader (from hanlp)
  Downloading hanlp_downloader-0.0.25.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hanlp-trie>=0.0.4 (from hanlp)
  Downloading hanlp_trie-0.0.5.tar.gz (6.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynvml (from hanlp)
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece>=0.1.91 (from hanlp)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K  

In [3]:
import os
import re
import json
from collections import Counter, OrderedDict
from tqdm import tqdm

# 数据处理及可视化
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib
matplotlib.rc("font", family='SimHei') # 用来显示中文，对于macos系统需要换一个支持的字体

# 自然语言处理
import hanlp
import torch
from transformers import (
    BertTokenizer,
    GPT2LMHeadModel,
    TextGenerationPipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    pipeline
    )
import math

In [4]:
from transformers import BertTokenizer, GPT2LMHeadModel
ckpt_path = "uer/gpt2-chinese-cluecorpussmall" # checkpoint模型路径   #采用的模型为gpt2-chinese-cluecorpussmall
tokenizer = BertTokenizer.from_pretrained(ckpt_path) # 分词器
model = GPT2LMHeadModel.from_pretrained(ckpt_path) # 语言模型

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/217 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/421M [00:00<?, ?B/s]

In [8]:
def word_probability(model, tokenizer, context, word):
    # 构建输入文本
    input_text = context + word  # 将上下文和词语拼接
    input_ids = tokenizer.encode(input_text, add_special_tokens=False, return_tensors="pt")

    # 使用模型生成下一个词的预测概率分布
    with torch.no_grad():
        outputs = model(input_ids)

    predicted_logits = outputs.logits[0, -1]  # 获取最后一个词的输出概率分布
    predicted_probs = torch.softmax(predicted_logits, dim=-1)  # 转换为概率分布

    # 获取词语在词表中的索引
    word_index = tokenizer.encode(word, add_special_tokens=False)[0]

    # 获取给定词语的概率
    word_prob = predicted_probs[word_index].item()
    return word_prob

word_freq0 = word_probability(model, tokenizer, "", "的")
word_freq1 = word_probability(model, tokenizer, "", "西瓜")
word_freq2 = word_probability(model, tokenizer, "", "视语")

print('='*20 + 'P(的) vs P(西瓜) vs P(桌子)' + '='*20)
print(f'P(的): {word_freq0}')
print(f'P(西瓜): {word_freq1}')
print(f'P(视语): {word_freq2}')

P(的): 0.008103590458631516
P(西瓜): 0.004497095942497253
P(视语): 1.61291281983722e-05


In [11]:
def transition_probability(model, tokenizer, context, word):
    # 构建输入文本
    input_text = context + word
    input_ids = tokenizer.encode(input_text, add_special_tokens=False, return_tensors="pt")

    # 使用模型生成下一个词的预测概率分布
    with torch.no_grad():
        outputs = model(input_ids)

    predicted_logits = outputs.logits[0, -1]  # 获取最后一个词的输出概率分布
    predicted_probs = torch.softmax(predicted_logits, dim=-1)  # 转换为概率分布

    # 获取给定词语在词表中的索引
    word_index = tokenizer.encode(word, add_special_tokens=False)[0]

    # 获取给定词语的转移概率
    transition_prob = predicted_probs[word_index].item()
    return transition_prob

tp1 = transition_probability(model, tokenizer, '我喜欢吃', '西瓜')
tp2 = transition_probability(model, tokenizer, '我喜欢吃', '桌子')

print('='*10 + 'P(我喜欢吃西瓜) vs P(我喜欢吃桌子)' + '='*10)
print(f'P(我喜欢吃西瓜): {tp1}')
print(f'P(我喜欢吃桌子): {tp2}')

P(我喜欢吃西瓜): 0.0026784883812069893
P(我喜欢吃桌子): 0.0012899352004751563


In [12]:
words = ['美丽', '西瓜', '桌子']  #到时候可以建立一个list文档，然后直接引用就可以了

# 计算转移概率，使用生成模型的方法
for word in words:
    input_text = "我喜欢吃" + word
    input_ids = tokenizer.encode(input_text, add_special_tokens=False, return_tensors="pt")

    with torch.no_grad():
        outputs = model(input_ids)

    predicted_logits = outputs.logits[0, -1]  # 取最后一个词的输出
    predicted_probs = torch.softmax(predicted_logits, dim=-1)

    word_id = tokenizer.encode(word, add_special_tokens=False)[0]
    word_prob = predicted_probs[word_id].item()

    word_prob = math.log(word_prob)  # 对数概率 #这里可以解除掉演示一下

    print(f'词语"{word}"的对数概率: {word_prob}')


词语"美丽"的对数概率: -7.319270577729962
词语"西瓜"的对数概率: -5.922502680451066
词语"桌子"的对数概率: -6.653163294060036


In [13]:
for word in words:
    input_text = "我喜欢吃" + word
    input_ids = tokenizer.encode(input_text, add_special_tokens=False, return_tensors="pt")
    with torch.no_grad():
        outputs = model(input_ids)
    predicted_logits = outputs.logits[0, -1]  # 取最后一个词的输出
    predicted_probs = torch.softmax(predicted_logits, dim=-1)
    word_id = tokenizer.encode(word, add_special_tokens=False)[0]
    word_prob = predicted_probs[word_id].item()
    log_word_prob = math.log(word_prob)  # 对数概率
    s = -math.log(word_prob)
    print(f'surprisal(我喜欢吃"{word}"): {s}')


surprisal(我喜欢吃"美丽"): 7.319270577729962
surprisal(我喜欢吃"西瓜"): 5.922502680451066
surprisal(我喜欢吃"桌子"): 6.653163294060036


In [14]:
context = "人们吃"  #最高频率预测词

# 将上下文文本转换为输入张量
input_ids = tokenizer.encode(context, add_special_tokens=False, return_tensors="pt")

# 使用模型生成下一个词的预测概率分布
with torch.no_grad():
    outputs = model(input_ids)

predicted_logits = outputs.logits[0, -1]  # 获取最后一个词的输出概率分布
predicted_probs = torch.softmax(predicted_logits, dim=-1)  # 转换为概率分布

# 获取概率最高的词的索引
next_word_index = torch.argmax(predicted_probs).item()

# 使用分词器将索引转换为词
next_word = tokenizer.decode([next_word_index])

# 打印下一个可能出现的词和对应的概率
print(f"下一个可能出现的词: {next_word}")
print(f"概率: {predicted_probs[next_word_index].item()}")

下一个可能出现的词: 的
概率: 0.12039685994386673


In [15]:
#entropy
def entropy_cal(model, tokenizer, context):
    # 构建输入文本
    input_text = context
    input_ids = tokenizer.encode(input_text, add_special_tokens=False, return_tensors="pt")

    # 使用模型生成下一个词的预测概率分布
    with torch.no_grad():
        outputs = model(input_ids)

    predicted_logits = outputs.logits[0, -1]  # 获取最后一个词的输出概率分布
    predicted_probs = torch.softmax(predicted_logits, dim=-1)  # 转换为概率分布

    # 计算概率分布的熵
    entropy = -torch.sum(predicted_probs * torch.log2(predicted_probs + 1e-20))
    return entropy.item()

context1 = "蝴"
context2 = "。"

e1 = entropy_cal(model, tokenizer, context1)
e2 = entropy_cal(model, tokenizer, context2)

print('='*10 + 'entropy(蝴) vs entropy(。)' + '='*10)
print(f'entropy(蝴): {e1}')
print(f'entropy(。): {e2}')

entropy(蝴): 0.05716872215270996
entropy(。): 9.412870407104492


In [None]:
## 0. 分词
sent_ex = '这个门被锁了，锁很难被打开。'
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
tks = tok(sent_ex)
print('0. 分词结果：')
print(tks)

## 1. 词性标注
pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
print('1. 词性标注：')
print(pos(tks))

Downloading https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220616_012050.zip to /root/.hanlp/tok/coarse_electra_small_20220616_012050.zip
Decompressing /root/.hanlp/tok/coarse_electra_small_20220616_012050.zip to /root/.hanlp/tok
Downloading https://file.hankcs.com/hanlp/utils/char_table_20210602_202632.json.zip to /root/.hanlp/utils/char_table_20210602_202632.json.zip
Decompressing /root/.hanlp/utils/char_table_20210602_202632.json.zip to /root/.hanlp/utils
Downloading https://file.hankcs.com/hanlp/transformers/electra_zh_small_20210706_125427.zip to /root/.hanlp/transformers/electra_zh_small_20210706_125427.zip
Decompressing /root/.hanlp/transformers/electra_zh_small_20210706_125427.zip to /root/.hanlp/transformers


0. 分词结果：
['这个', '门', '被', '锁', '了', '，', '锁', '很难', '被', '打开', '。']


Downloading https://file.hankcs.com/hanlp/pos/pos_ctb_electra_small_20220215_111944.zip to /root/.hanlp/pos/pos_ctb_electra_small_20220215_111944.zip
Decompressing /root/.hanlp/pos/pos_ctb_electra_small_20220215_111944.zip to /root/.hanlp/pos


1. 词性标注：
['DT', 'NN', 'SB', 'VV', 'SP', 'PU', 'VV', 'AD', 'SB', 'VV', 'PU']


In [16]:
#获取词向量

word2vec = hanlp.load(hanlp.pretrained.word2vec.MERGE_SGNS_BIGRAM_CHAR_300_ZH) # 加载word2vec词向量
word2vec('中国')

Downloading http://download.hanlp.com/embeddings/extra/merge_sgns_bigram_char300_20220130_214613.txt.zip to /root/.hanlp/thirdparty/download.hanlp.com/embeddings/extra/merge_sgns_bigram_char300_20220130_214613.txt.zip
Decompressing /root/.hanlp/thirdparty/download.hanlp.com/embeddings/extra/merge_sgns_bigram_char300_20220130_214613.txt.zip to /root/.hanlp/thirdparty/download.hanlp.com/embeddings/extra
Loading word2vec from cache [5m[33m...[0m[0m

99.82% 3.6 GB/3.6 GB Loading word2vec from text file [5m[33m...[0m[0m ETA: 0 s



tensor([ 1.4234e-02,  8.3600e-02,  2.4145e-02, -1.0256e-01, -1.0829e-01,
        -2.6786e-02, -9.6481e-02,  9.0537e-02, -5.4941e-02,  4.5936e-02,
        -4.2577e-02, -5.1776e-02,  4.9661e-02, -3.2703e-02, -6.6407e-03,
         9.8313e-03,  4.2377e-02, -7.1969e-02,  6.7363e-02, -1.2679e-01,
         1.3423e-03,  1.8129e-02,  1.3923e-02,  6.0298e-02,  2.9974e-02,
         3.4969e-02,  4.7053e-02, -1.4874e-02,  6.6235e-02, -1.5579e-01,
        -1.1716e-01,  8.8726e-02,  6.0976e-02, -8.0692e-02, -3.1017e-02,
        -1.3132e-02,  5.4841e-02,  4.0733e-02, -1.5295e-01, -7.8516e-02,
         6.6119e-02,  2.9393e-02, -3.0162e-02, -4.3704e-02,  8.3047e-03,
        -7.7654e-02, -1.5644e-02,  6.2678e-02,  7.3149e-02, -1.9128e-02,
         2.7543e-02, -1.4893e-02, -1.2223e-02,  9.6474e-02,  2.1985e-02,
         4.4640e-02, -2.4626e-02,  9.8536e-02, -1.3777e-01,  5.1621e-02,
         9.5042e-02, -3.2784e-02,  2.8697e-02, -1.3267e-02,  1.1536e-02,
        -9.0047e-02, -7.2654e-02, -8.7082e-04, -3.6

In [17]:
print(torch.nn.functional.cosine_similarity(
    word2vec('国王')-word2vec('王妃'),
    word2vec('男')-word2vec('女'), dim=0)
      )
print(torch.nn.functional.cosine_similarity(
    word2vec('公主')-word2vec('王妃'),
    word2vec('男')-word2vec('女'), dim=0)
      )

tensor(0.1429)
tensor(0.0366)


In [18]:
print(torch.nn.functional.cosine_similarity(
    word2vec('日本')-word2vec('东京'),
    word2vec('中国')-word2vec('北京'), dim=0)
      )
print(torch.nn.functional.cosine_similarity(
    word2vec('韩国')-word2vec('东京'),
    word2vec('中国')-word2vec('北京'), dim=0)
      )

tensor(0.4674)
tensor(0.3933)


In [19]:
# 单个词
print(word2vec.most_similar('北京'))
print('\n')

{'上海': 0.6443496942520142, '天津': 0.6384099721908569, '西安': 0.6117184162139893, '南京': 0.6113559603691101, '北京市': 0.6093109846115112, '海淀': 0.6049214601516724, '广州': 0.5977935791015625, '京城': 0.595507025718689, '沈阳': 0.5865166187286377, '深圳': 0.580772876739502}




In [20]:
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline
ckpt_path = "uer/gpt2-chinese-cluecorpussmall" # checkpoint模型路径
tokenizer = BertTokenizer.from_pretrained(ckpt_path) # 分词器
model = GPT2LMHeadModel.from_pretrained(ckpt_path) # 语言模型

In [21]:
model.config.output_hidden_states = True
inputs = tokenizer('小明喜欢吃西瓜。小明喜欢打篮球。小明经常去花店', return_tensors="pt")
outputs = model(**inputs)

print('\n' + '='*10 + '最后一层输出的内隐表征维度: ' + '='*10)
print(str(outputs.hidden_states[-1].shape) + '  1 x 输入字数 x 表征维度')


torch.Size([1, 25, 768])  1 x 输入字数 x 表征维度


In [22]:
# 句法

import hanlp

In [38]:
Hanlp = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH) # 选择使用的模型
doc = Hanlp('编写这个代码好难！', tasks=['dep', 'con']) # 在tasks中选择需要的任务，如果不设置就进行所有任务（运行起来会慢一点）
doc.pretty_print()



In [39]:
Hanlp = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
doc = Hanlp('编写这个代码好难！')
tree = doc['con']



In [40]:
# 叶结点的位置
for i in range(len(tree.leaves())):
    print(tree.leaf_treeposition(i))

(0, 0, 0, 0, 0)
(0, 0, 0, 1, 0, 0, 0)
(0, 0, 0, 1, 0, 1, 0, 0)
(0, 0, 0, 1, 1, 0, 0)
(0, 1, 0, 0, 0)
(0, 1, 1, 0, 0)
(0, 2, 0)


In [42]:
tree[0, 0, 0, 0]

['VV', ['编写']]

In [43]:
# 转为括号表示法
bracket_form = tree.pformat().replace ('\n', '').replace(' ', '') # 去掉换行和空格
bracket_form

'(TOP(IP(IP(VP(VV编写)(NP(DP(DT这)(CLP(M个)))(NP(NN代码)))))(VP(ADVP(AD好))(VP(VA难)))(PU！)))'

In [44]:
# 转换为Chomsky Normal Form，可以用tree.un_chomsky_normal_form()转换回来
tree.chomsky_normal_form()
bracket_form = tree.pformat().replace ('\n', '').replace(' ', '')
print(bracket_form)

(TOP(IP(IP(VP(VV编写)(NP(DP(DT这)(CLP(M个)))(NP(NN代码)))))(IP|<VP-PU>(VP(ADVP(AD好))(VP(VA难)))(PU！))))


In [45]:
# 输出中有些节点只派生出一支，是冗余的（例如最外层的TOP根结点只派生出IP，以及句子中的IP只派生出VP），可以选择压缩节点
tree.collapse_unary(collapseRoot=True, joinChar='|') # 压缩冗余节点，压缩的节点用｜来表示
bracket_form = tree.pformat().replace ('\n', '').replace(' ', '')
bracket_form

'(TOP|IP(IP|VP(VV编写)(NP(DP(DT这)(CLP(M个)))(NP(NN代码))))(IP|<VP-PU>(VP(ADVP(AD好))(VP(VA难)))(PU！)))'

In [46]:
import re
import pandas as pd
# 计算括号表示法中每个词的括号数
bracket_clean= re.sub("([^()])", "", bracket_form) # 只保留括号
print(bracket_clean)

# 计算左括号数
left_bracket = [len(re.findall("\(", i)) for i in bracket_clean]
left_bracket_count = []
for i in left_bracket:
    if len(left_bracket_count) == 0 or (i == 1 and j != 1):
        left_bracket_count.append(1)
    elif i == 1 and j == 1:
        left_bracket_count[-1] += 1
    j = i
print("左括号数:", left_bracket_count)

# 计算右括号数
right_bracket = [len(re.findall("\)", i)) for i in bracket_clean]
right_bracket_count = []; j = 0
for i in right_bracket:
    if i == 1 and j != 1:
        right_bracket_count.append(1)
    elif i == 1 and j == 1:
        right_bracket_count[-1] += 1
    j = i
print("右括号数:", right_bracket_count)

# 可以保存为 dataframe 进行进一步的句法特征分析
df_bracket = pd.DataFrame([tree.leaves(), left_bracket_count, right_bracket_count]).T
df_bracket.columns = ['word', 'left_bracket', 'right_bracket']
# df_bracket.to_csv('bracket.csv', index=False) # 保存为csv文件
df_bracket

((()((()(()))(())))(((())(()))()))
左括号数: [3, 3, 2, 2, 4, 2, 1]
右括号数: [1, 1, 3, 4, 2, 3, 3]


Unnamed: 0,word,left_bracket,right_bracket
0,编写,3,1
1,这,3,1
2,个,2,3
3,代码,2,4
4,好,4,2
5,难,2,3
6,！,1,3


In [47]:
# 句法树的属性
print("Terminal nodes:", tree.leaves())
print("Tree depth:", tree.height())
print("Tree productions:", tree.productions())
print("Part of Speech:", tree.pos())

Terminal nodes: ['编写', '这', '个', '代码', '好', '难', '！']
Tree depth: 7
Tree productions: [TOP|IP -> IP|VP IP|<VP-PU>, IP|VP -> VV NP, VV -> '编写', NP -> DP NP, DP -> DT CLP, DT -> '这', CLP -> M, M -> '个', NP -> NN, NN -> '代码', IP|<VP-PU> -> VP PU, VP -> ADVP VP, ADVP -> AD, AD -> '好', VP -> VA, VA -> '难', PU -> '！']
Part of Speech: [('编写', 'VV'), ('这', 'DT'), ('个', 'M'), ('代码', 'NN'), ('好', 'AD'), ('难', 'VA'), ('！', 'PU')]


In [48]:
# 句法树的嵌套结构
for i in tree.subtrees():  # 根据Tree productions，遍历所有的子树，每一棵子树都是一个Tree对象，可以进行之前相同的操作
    print(i)

(TOP|IP
  (IP|VP (VV 编写) (NP (DP (DT 这) (CLP (M 个))) (NP (NN 代码))))
  (IP|<VP-PU> (VP (ADVP (AD 好)) (VP (VA 难))) (PU ！)))
(IP|VP (VV 编写) (NP (DP (DT 这) (CLP (M 个))) (NP (NN 代码))))
(VV 编写)
(NP (DP (DT 这) (CLP (M 个))) (NP (NN 代码)))
(DP (DT 这) (CLP (M 个)))
(DT 这)
(CLP (M 个))
(M 个)
(NP (NN 代码))
(NN 代码)
(IP|<VP-PU> (VP (ADVP (AD 好)) (VP (VA 难))) (PU ！))
(VP (ADVP (AD 好)) (VP (VA 难)))
(ADVP (AD 好))
(AD 好)
(VP (VA 难))
(VA 难)
(PU ！)


In [49]:
# 通过索引访问句法树的子树
treepositions = tree.treepositions() # 所有节点的索引
treepositions

[(),
 (0,),
 (0, 0),
 (0, 0, 0),
 (0, 1),
 (0, 1, 0),
 (0, 1, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 1, 0, 1),
 (0, 1, 0, 1, 0),
 (0, 1, 0, 1, 0, 0),
 (0, 1, 1),
 (0, 1, 1, 0),
 (0, 1, 1, 0, 0),
 (1,),
 (1, 0),
 (1, 0, 0),
 (1, 0, 0, 0),
 (1, 0, 0, 0, 0),
 (1, 0, 1),
 (1, 0, 1, 0),
 (1, 0, 1, 0, 0),
 (1, 1),
 (1, 1, 0)]

In [50]:
for i in treepositions: # 遍历所有节点
    print(tree[i])

(TOP|IP
  (IP|VP (VV 编写) (NP (DP (DT 这) (CLP (M 个))) (NP (NN 代码))))
  (IP|<VP-PU> (VP (ADVP (AD 好)) (VP (VA 难))) (PU ！)))
(IP|VP (VV 编写) (NP (DP (DT 这) (CLP (M 个))) (NP (NN 代码))))
(VV 编写)
编写
(NP (DP (DT 这) (CLP (M 个))) (NP (NN 代码)))
(DP (DT 这) (CLP (M 个)))
(DT 这)
这
(CLP (M 个))
(M 个)
个
(NP (NN 代码))
(NN 代码)
代码
(IP|<VP-PU> (VP (ADVP (AD 好)) (VP (VA 难))) (PU ！))
(VP (ADVP (AD 好)) (VP (VA 难)))
(ADVP (AD 好))
(AD 好)
好
(VP (VA 难))
(VA 难)
难
(PU ！)
！


In [53]:
#依存句法

Hanlp = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
doc = Hanlp('编写这个代码好难！')
doc['dep']



[(6, 'dep'),
 (4, 'det'),
 (2, 'clf'),
 (1, 'dobj'),
 (6, 'advmod'),
 (0, 'root'),
 (6, 'punct')]

In [54]:
# 可以保存为 dataframe 进行进一步的句法特征分析
df_dep = pd.DataFrame(doc['dep'], columns=['head', 'rel'])
df_dep['word'] = doc['tok/fine']
df_dep = df_dep[['word', 'head', 'rel']]
df_dep

Unnamed: 0,word,head,rel
0,编写,6,dep
1,这,4,det
2,个,2,clf
3,代码,1,dobj
4,好,6,advmod
5,难,0,root
6,！,6,punct


In [55]:
#只需要将要处理的句子放在list中，一起进行特征抽取即可。这对所有特征都适用，不仅是句法特征。

sentences = ['编写这个代码好难！', '大家加油！']
docs = Hanlp(sentences)
docs.pretty_print()

In [56]:
# 提取出来的特征直接索引即可
print("句子数量为:", docs.count_sentences())
for i in range(docs.count_sentences()):
    print(docs['tok/fine'][i])

句子数量为: 2
['编写', '这', '个', '代码', '好', '难', '！']
['大家', '加油', '！']


In [59]:
#主题预测
# 从huggingface平台上找到对应的模型路径
model_path = 'uer/roberta-base-finetuned-chinanews-chinese'  #这里做了修改，遇到BUG怎么修复

# 使用transformers工具包加载模型
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# 利用pipeline快速进行语言任务
text = '编写这个代码好难！'
text_classification = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
res = text_classification(text)[0]
print("="*20, "单个句子主题分析计算", "="*20)
print(f"\nInput: {text}\nPrediction: {res['label']}, Score: {res['score']:.3f}")


# pipeline可以实现批量句子的计算
text_lst = ['编写这个代码好难！', '小明昨天篮球进了五个三分球']
res_lst = text_classification(text_lst)
print("\n\n")
print("="*20, "多个句子批量进行主题分析计算", "="*20)
for text, res in zip(text_lst, res_lst):
    print(f"\nInput: {text}\nPrediction: {res['label']}, Score: {res['score']:.3f}")

Downloading (…)okenizer_config.json:   0%|          | 0.00/295 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.



Input: 编写这个代码好难！
Prediction: culture, Score: 0.327




Input: 编写这个代码好难！
Prediction: culture, Score: 0.327

Input: 小明昨天篮球进了五个三分球
Prediction: sports, Score: 1.000


In [72]:
# 从huggingface平台上找到对应的模型路径
model_path = "google/flan-t5-large"      #同样记得修改自己找的模型

# 使用transformers工具包加载模型
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)


print("\n\n")
print("="*20, "上下文学习实现文本翻译", "="*20)
text = "translate English to Spanish: How old are you?"

# 调用模型分词器，对输入文本进行分词并转换为模型可处理的tensor形式
input_ids = tokenizer(text, return_tensors="pt").input_ids

# 调用模型的generate方法
outputs = model.generate(input_ids)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens = True)
print(f"Input: {text}\nOutput: {decoded_output}")



print("\n\n")
print("="*20, "上下文学习实现主题文本生成", "="*20)
text = '''Generate sentences with the topic :
medicine =>
'''

# 调用模型分词器，对输入文本进行分词并转换为模型可处理的tensor形式
input_ids = tokenizer(text, return_tensors="pt").input_ids

# 调用模型的generate方法
outputs = model.generate(input_ids)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens = True)
print(f"Input: {text}\nOutput: {decoded_output}")




Input: translate English to Spanish: How old are you?
Output: Cuánto edad te es?



Input: Generate sentences with the topic : 
medicine =>

Output: medicine is the science of medicine.
