<a href="https://colab.research.google.com/github/shirong52/LLM_study/blob/main/spaCy_use.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spacy



In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [4]:
nlp

<spacy.lang.en.English at 0x7f0e2fbfe0d0>

###列表解析

In [5]:
introduction_doc = nlp("This tutorial is about Natural Language Processing in Spacy.")
type(introduction_doc)
# [token.text for token in introduction_doc]

spacy.tokens.doc.Doc

In [6]:
[token.text for token in introduction_doc]

['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'Spacy',
 '.']

###句子检测

In [7]:
import pathlib
file_name = 'introduction.txt'
introduction_doc = nlp(pathlib.Path(file_name).read_text(encoding='utf-8'))
print([token.text for token in introduction_doc])

['This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'Spacy', '.']


In [8]:
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)
about_doc = nlp(about_text)
sentences = list(about_doc.sents)
len(sentences)

2

In [9]:
for sentence in sentences:
    print(sentence)

Gus Proto is a Python developer currently working for a London-based Fintech company.
He is interested in learning Natural Language Processing.


In [10]:
ellipsis_text = (
    "Gus, can you, ... never mind, I forgot"
    " what I was saying. So, do you think"
    " we should ..."
)

from spacy.language import Language
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
  for token in doc[: -1]:
    if token.text == "...":  # 将省略号后面的词元标记为句子的开始
      doc[token.i + 1].is_sent_start = True
  return doc

custom_nlp = spacy.load("en_core_web_sm")
custom_nlp.add_pipe("set_custom_boundaries", before="parser")  # 将自定义的管道组件“set_custom_boundaries”添加到 SpaCy 的处理管道中，并且指定它在“parser”组件之前运行
custom_ellipsis_doc = custom_nlp(ellipsis_text)  # 将自定义的文本 ellipsis_text 传递给 SpaCy 模型进行处理
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)  # 获取文档中的句子迭代器。由于 SpaCy 的 Doc 对象的 sents 属性是一个生成器，所以需要用 list() 将其转换为列表，方便后续操作

for sentence in custom_ellipsis_sentences:
  print(sentence)

Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...


###tokens

In [11]:
import spacy
nlp = spacy.load("en_core_web_sm")
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

about_doc = nlp(about_text)
for token in about_doc:
  # Proto 是第二个单词，它在文本中的起始索引位置是 4（因为 Gus 占用了 4 个字符，包括空格）
  print(token, token.idx)  # token.idx 是该单词或标点符号在原始文本中的起始索引位置


Gus 0
Proto 4
is 10
a 13
Python 15
developer 22
currently 32
working 42
for 50
a 54
London 56
- 62
based 63
Fintech 69
company 77
. 84
He 86
is 89
interested 92
in 103
learning 106
Natural 115
Language 123
Processing 132
. 142


In [12]:
print(
    f"{'Text with Whitespace':22}"
    f"{'Is Alphanumeric?':15}"
    f"{'Is Punctuation?':18}"
    f"{'Is Stop Word?'}"
)

for token in about_doc:
  print(
      f"{str(token.text_with_ws):22}"
      f"{str(token.is_alpha):15}"
      f"{str(token.is_punct):18}"
      f"{str(token.is_stop)}"
  )

Text with Whitespace  Is Alphanumeric?Is Punctuation?   Is Stop Word?
Gus                   True           False             False
Proto                 True           False             False
is                    True           False             True
a                     True           False             True
Python                True           False             False
developer             True           False             False
currently             True           False             False
working               True           False             False
for                   True           False             True
a                     True           False             True
London                True           False             False
-                     False          True              False
based                 True           False             False
Fintech               True           False             False
company               True           False             False
.                  

In [13]:
custom_about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London@based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)
print([token.text for token in nlp(custom_about_text)[8:15]])

['for', 'a', 'London@based', 'Fintech', 'company', '.', 'He']


In [14]:
import re
from spacy.tokenizer import Tokenizer

custom_nlp = spacy.load("en_core_web_sm")
prefix_re = spacy.util.compile_prefix_regex(
    custom_nlp.Defaults.prefixes
)
suffix_re = spacy.util.compile_suffix_regex(
    custom_nlp.Defaults.suffixes
)
# 定义了一个自定义的中缀列表 custom_infixes，其中包含了一个正则表达式 r"@"，表示 @ 符号可以作为单词的中缀
custom_infixes = [r"@"]

# 使用 + 将自定义的中缀规则 custom_infixes 添加到默认的中缀规则列表中
infix_re = spacy.util.compile_infix_regex(
    list(custom_nlp.Defaults.infixes) + custom_infixes
)

custom_nlp.tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=None,
)
custom_tokenizer_about_doc = custom_nlp(custom_about_text)
print([token.text for token in custom_tokenizer_about_doc[8:15]])

['for', 'a', 'London', '@', 'based', 'Fintech', 'company']


###stop words

In [15]:
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

326

In [16]:
for stop_word in list(spacy_stopwords)[:10]:
  print(stop_word)

used
well
go
us
whereby
whereafter
another
‘ll
whence
become


In [17]:
custom_about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London@based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

nlp = spacy.load("en_core_web_sm")
about_doc = nlp(custom_about_text)
print([token for token in about_doc if not token.is_stop])

[Gus, Proto, Python, developer, currently, working, London@based, Fintech, company, ., interested, learning, Natural, Language, Processing, .]


###词形还原

In [18]:
import spacy
nlp = spacy.load("en_core_web_sm")
conference_help_text = (
    "Gus is helping organize a developer"
    " conference on Applications of Natural Language"
    " Processing. He keeps organizing local Python meetups"
    " and several internal talks at his workplace."
)

conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
  if str(token) != str(token.lemma_):
    print(f"{str(token):>20} : {str(token.lemma_)}")

                  is : be
          Processing : processing
                  He : he
               keeps : keep
          organizing : organize
             meetups : meetup
               talks : talk


###词频

In [19]:
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")
complete_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech company. He is"
    " interested in learning Natural Language Processing."
    " There is a developer conference happening on 21 July"
   ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number'
    " available at +44-1234567891. Gus is helping organize it."
    " He keeps organizing local Python meetups and several"
    " internal talks at his workplace. Gus is also presenting"
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    " Apart from his work, he is very passionate about music."
    " Gus is learning to play the Piano. He has enrolled"
    " himself in the weekend batch of Great Piano Academy."
    " Great Piano Academy is situated in Mayfair or the City"
    " of London and has world-class piano instructors."
)

complete_doc = nlp(complete_text)
words = [
    token.text
    for token in complete_doc
    if not token.is_stop and not token.is_punct
]

print(Counter(words).most_common(5))

[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]


###词性标注

In [20]:
import spacy
nlp = spacy.load("en_core_web_sm")
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

about_doc = nlp(about_text)

for token in about_doc:
  print(
      f"""
      TOKEN: {str(token)}
      =====
      TAG: {str(token.tag_):10} POS: {token.pos_}
      EXPLANATION: {spacy.explain(token.tag_)}"""
  )


      TOKEN: Gus
      =====
      TAG: NNP        POS: PROPN
      EXPLANATION: noun, proper singular

      TOKEN: Proto
      =====
      TAG: NNP        POS: PROPN
      EXPLANATION: noun, proper singular

      TOKEN: is
      =====
      TAG: VBZ        POS: AUX
      EXPLANATION: verb, 3rd person singular present

      TOKEN: a
      =====
      TAG: DT         POS: DET
      EXPLANATION: determiner

      TOKEN: Python
      =====
      TAG: NNP        POS: PROPN
      EXPLANATION: noun, proper singular

      TOKEN: developer
      =====
      TAG: NN         POS: NOUN
      EXPLANATION: noun, singular or mass

      TOKEN: currently
      =====
      TAG: RB         POS: ADV
      EXPLANATION: adverb

      TOKEN: working
      =====
      TAG: VBG        POS: VERB
      EXPLANATION: verb, gerund or present participle

      TOKEN: for
      =====
      TAG: IN         POS: ADP
      EXPLANATION: conjunction, subordinating or preposition

      TOKEN: a
      =====
      TA

In [21]:
nouns = []
adjectives = []
for token in about_doc:
  if token.pos_ == "NOUN":
    nouns.append(token)
  if token.pos_ == "ADJ":
    adjectives.append(token)

nouns

[developer, company]

In [22]:
adjectives

[interested]

###可视化

In [23]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

about_interest_text = (
    "He is interested in learning Natural Language Processing."
)
about_interest_doc = nlp(about_interest_text)
displacy.serve(about_interest_doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [25]:
displacy.render(about_interest_doc, style="dep", jupyter=True)

###预处理功能

In [26]:
import spacy
nlp = spacy.load("en_core_web_sm")
complete_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech company. He is"
    " interested in learning Natural Language Processing."
    " There is a developer conference happening on 21 July"
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number'
    " available at +44-1234567891. Gus is helping organize it."
    " He keeps organizing local Python meetups and several"
    " internal talks at his workplace. Gus is also presenting"
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    " Apart from his work, he is very passionate about music."
    " Gus is learning to play the Piano. He has enrolled"
    " himself in the weekend batch of Great Piano Academy."
    " Great Piano Academy is situated in Mayfair or the City"
    " of London and has world-class piano instructors."
)
complete_doc = nlp(complete_text)

def is_token_allowed(token):
  return bool(
      token
      and str(token).strip()
      and not token.is_stop
      and not token.is_punct
  )

def preprocess_token(token):
  return token.lemma_.strip().lower()

complete_filtered_tokens = [
    preprocess_token(token)
    for token in complete_doc
    if is_token_allowed(token)
]

complete_filtered_tokens

['gus',
 'proto',
 'python',
 'developer',
 'currently',
 'work',
 'london',
 'base',
 'fintech',
 'company',
 'interested',
 'learn',
 'natural',
 'language',
 'processing',
 'developer',
 'conference',
 'happen',
 '21',
 'july',
 '2019',
 'london',
 'title',
 'application',
 'natural',
 'language',
 'processing',
 'helpline',
 'number',
 'available',
 '+44',
 '1234567891',
 'gus',
 'helping',
 'organize',
 'keep',
 'organize',
 'local',
 'python',
 'meetup',
 'internal',
 'talk',
 'workplace',
 'gus',
 'present',
 'talk',
 'talk',
 'introduce',
 'reader',
 'use',
 'case',
 'natural',
 'language',
 'processing',
 'fintech',
 'apart',
 'work',
 'passionate',
 'music',
 'gus',
 'learn',
 'play',
 'piano',
 'enrol',
 'weekend',
 'batch',
 'great',
 'piano',
 'academy',
 'great',
 'piano',
 'academy',
 'situate',
 'mayfair',
 'city',
 'london',
 'world',
 'class',
 'piano',
 'instructor']

###基于规则的匹配

In [28]:
import spacy
nlp = spacy.load("en_core_web_sm")
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

about_doc = nlp(about_text)

# Matcher 是 SpaCy 提供的一个工具，用于在文档中匹配特定的模式
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

def extract_full_name(nlp_doc):
  pattern = [{"POS": "PROPN"}, {"POS": "PROPN"}]
  # 将定义的模式添加到 Matcher 对象中。
  # "FULL_NAME" 是模式的名称，[pattern] 是一个包含模式的列表
  matcher.add("FULL_NAME", [pattern])
  # matches 是一个包含匹配结果的列表，每个匹配结果是一个元组，包含匹配的 ID、起始索引和结束索引。
  matches = matcher(nlp_doc)
  for _, start, end in matches:
    span = nlp_doc[start:end]
    yield span.text

next(extract_full_name(about_doc))

'Gus Proto'

In [40]:
conference_org_text = ("There is a developer conference"
    " happening on 21 July 2019 in London. It is titled"
    ' "Applications of Natural Language Processing".'
    " There is a helpline number available"
    " at (123) 456-7891")

matcher = Matcher(nlp.vocab) # 这个代码在这里要再运行一次，否则结果不对，还是上面的matcher

def extract_phone_number(nlp_doc):
    pattern = [
        {"ORTH": "("},
        {"SHAPE": "ddd"},
        {"ORTH": ")"},
        {"SHAPE": "ddd"},
        {"ORTH": "-", "OP": "?"},
        {"SHAPE": "dddd"},
    ]
    matcher.add("PHONE_NUMBER",[pattern])
    matches = matcher(nlp_doc)
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
        return span.text

conference_org_doc = nlp(conference_org_text)
extract_phone_number(conference_org_doc)

'(123) 456-7891'

###依赖分析

In [41]:
import spacy
nlp = spacy.load("en_core_web_sm")
piano_text = "Gus is learning piano"
piano_doc = nlp(piano_text)

for token in piano_doc:
  # """表示这是一个多行字符串的开始
  # token.tag_是一个属性，表示单词的词性标注
  # token.head.text表示当前单词所依赖的单词的文本内容
  # token.dep_是一个属性，表示单词的依存关系类型
  print(
      f"""
      TOKEN: {token.text}
      =====
      {token.tag_ = }
      {token.head.text = }
      {token.dep_ = }"""
  )


      TOKEN: Gus
      =====
      token.tag_ = 'NNP'
      token.head.text = 'learning'
      token.dep_ = 'nsubj'

      TOKEN: is
      =====
      token.tag_ = 'VBZ'
      token.head.text = 'learning'
      token.dep_ = 'aux'

      TOKEN: learning
      =====
      token.tag_ = 'VBG'
      token.head.text = 'learning'
      token.dep_ = 'ROOT'

      TOKEN: piano
      =====
      token.tag_ = 'NN'
      token.head.text = 'learning'
      token.dep_ = 'dobj'


In [42]:
displacy.serve(piano_doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


###树和子树导航

In [44]:
import spacy
nlp = spacy.load("en_core_web_sm")
one_line_about_text = (
    "Gus Proto is a Python developer"
    " currently working for a London-based Fintech company"
)
one_line_about_doc = nlp(one_line_about_text)

# Extract children of `developer`
print([token.text for token in one_line_about_doc[5].children])

# Extract next neighboring node of `developer`
print (one_line_about_doc[5].nbor())

# Extract previous neighboring node of `developer`
print (one_line_about_doc[5].nbor(-1))

# Extract all tokens on the left of `developer`
print([token.text for token in one_line_about_doc[5].lefts])

# Extract tokens on the right of `developer`
print([token.text for token in one_line_about_doc[5].rights])

# Print subtree of `developer`
print (list(one_line_about_doc[5].subtree))

['a', 'Python', 'working']
currently
Python
['a', 'Python']
['working']
[a, Python, developer, currently, working, for, a, London, -, based, Fintech, company]


###浅层句法分析

####名词短语检测

In [45]:
import spacy
nlp = spacy.load("en_core_web_sm")

conference_text = (
    "There is a developer conference happening on 21 July 2019 in London."
)

conference_doc = nlp(conference_text)

for chunk in conference_doc.noun_chunks:
  print(chunk)

a developer conference
21 July
London


####动词短语检测

In [46]:
!pip install textacy

Collecting textacy
  Downloading textacy-0.13.0-py3-none-any.whl.metadata (5.3 kB)
Collecting cytoolz>=0.10.1 (from textacy)
  Downloading cytoolz-1.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting floret~=0.10.0 (from textacy)
  Downloading floret-0.10.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.1 kB)
Collecting jellyfish>=0.8.0 (from textacy)
  Downloading jellyfish-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Collecting pyphen>=0.10.0 (from textacy)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textacy-0.13.0-py3-none-any.whl (210 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cytoolz-1.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m33.4 MB/s[0m e

In [48]:
import textacy

about_talk_text = (
    "The talk will introduce reader about use"
    " cases of Natural Language Processing in"
    " Fintech, making use of"
    " interesting examples along the way."
)

patterns = [{"POS": "AUX"}, {"POS": "VERB"}]
about_talk_doc = textacy.make_spacy_doc(
    about_talk_text, lang="en_core_web_sm"
)

verb_phrases = textacy.extract.token_matches(
    about_talk_doc, patterns=patterns
)

for chunk in verb_phrases:
  print(chunk.text)

for chunk in about_talk_doc.noun_chunks:
  print(chunk)

will introduce
The talk
reader
use
cases
Natural Language Processing
Fintech
use
interesting examples
the way


###命名实体识别

In [49]:
import spacy
nlp = spacy.load("en_core_web_sm")

piano_class_text = (
    "Great Piano Academy is situated"
    " in Mayfair or the City of London and has"
    " world-class piano instructors."
)

piano_class_doc = nlp(piano_class_text)

for ent in piano_class_doc.ents:
  print(
      f"""
      {ent.text = }
      {ent.start_char = }
      {ent.end_char = }
      {ent.label_ = }
      spacy.explain('{ent.label_}') = {spacy.explain(ent.label_)}
      """
  )


      ent.text = 'Great Piano Academy'
      ent.start_char = 0
      ent.end_char = 19
      ent.label_ = 'ORG'
      spacy.explain('ORG') = Companies, agencies, institutions, etc.
      

      ent.text = 'Mayfair'
      ent.start_char = 35
      ent.end_char = 42
      ent.label_ = 'FAC'
      spacy.explain('FAC') = Buildings, airports, highways, bridges, etc.
      

      ent.text = 'the City of London'
      ent.start_char = 46
      ent.end_char = 64
      ent.label_ = 'GPE'
      spacy.explain('GPE') = Countries, cities, states
      


In [50]:
displacy.serve(piano_class_doc, style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [52]:
survey_text = (
    "Out of 5 people surveyed, James Robert,"
    " Julie Fuller and Benjamin Brooks like"
    " apples. Kelly Cox and Matthew Evans"
    " like oranges."
)

def replace_person_names(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "[REDACTED] "
    return token.text_with_ws

def redact_names(nlp_doc):
    with nlp_doc.retokenize() as retokenizer:
        for ent in nlp_doc.ents:
            retokenizer.merge(ent)
    tokens = map(replace_person_names, nlp_doc)
    return "".join(tokens)

survey_doc = nlp(survey_text)
print(redact_names(survey_doc))

Out of 5 people surveyed, [REDACTED] , [REDACTED] and [REDACTED] like apples. [REDACTED] and [REDACTED] like oranges.
