In [None]:
import spacy
!python -m spacy download en_core_web_sm

In [8]:
import json
import spacy

with open("D:\Datascience_workspace_2023\MY_NLP\spacy-codes\data\countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

nlp = spacy.blank("en")
doc = nlp("Czech Republic may help Slovakia protect its airspace")

# Import the PhraseMatcher and initialize it
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", patterns)

# Call the matcher on the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])

[Czech Republic, Slovakia]


In [10]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
import json

with open("D:\Datascience_workspace_2023\MY_NLP\spacy-codes\data\countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())
with open("D:\Datascience_workspace_2023\MY_NLP\spacy-codes\data\country_text.txt", encoding="utf8") as f:
    TEXT = f.read()

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", patterns)

# Create a doc and reset existing entities
doc = nlp(TEXT)
doc.ents = []

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = Span(doc, start, end, label="GPE")

    # Overwrite the doc.ents and add the span
    doc.ents = list(doc.ents) + [span]

    # Get the span's root head token
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    print(span_root_head.text, "-->", span.text)

# Print the entities in the document
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE"])

in --> Namibia
in --> South Africa
Africa --> Cambodia
of --> Kuwait
as --> Somalia
Somalia --> Haiti
Haiti --> Mozambique
in --> Somalia
for --> Rwanda
Britain --> Singapore
War --> Sierra Leone
of --> Afghanistan
invaded --> Iraq
in --> Sudan
of --> Congo
earthquake --> Haiti
[('Namibia', 'GPE'), ('South Africa', 'GPE'), ('Cambodia', 'GPE'), ('Kuwait', 'GPE'), ('Somalia', 'GPE'), ('Haiti', 'GPE'), ('Mozambique', 'GPE'), ('Somalia', 'GPE'), ('Rwanda', 'GPE'), ('Singapore', 'GPE'), ('Sierra Leone', 'GPE'), ('Afghanistan', 'GPE'), ('Iraq', 'GPE'), ('Sudan', 'GPE'), ('Congo', 'GPE'), ('Haiti', 'GPE')]


In [None]:
import spacy
from spacy.matcher import Matcher

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

# Define the modified drug pattern
pattern = [{"TEXT": "O", "IS_SPACE": True, "OP": "?"}, {"TEXT": "N", "IS_SPACE": True, "OP": "?"}, {"TEXT": "1", "IS_SPACE": True, "OP": "?"}, {"TEXT": "5", "IS_SPACE": True, "OP": "?"}, {"TEXT": "3", "IS_SPACE": True, "OP": "?"}]

# Initialize the Matcher with the current NLP model
matcher = Matcher(nlp.vocab)

# Add the drug pattern to the matcher
matcher.add("DrugMatcher", pattern)

# Process the text
text = "The patient received O N 15 and ON-15 medications."
doc = nlp(text)

# Use the matcher on the processed text
matches = matcher(doc)

# Extract matched spans from the document
drug_spans = [doc[start:end] for _, start, end in matches]

# Print the matched drug spans
for span in drug_spans:
    print(span.text)


In [2]:
import spacy
from spacy.matcher import Matcher

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher
matcher = Matcher(nlp.vocab)

# Define a sequence of patterns
pattern_sequence = [
    {"LOWER": "start"},
    {"POS": "ADJ"},
    {"POS": "NOUN"},
    {"LOWER": "end"}
]

# Add the pattern sequence to the matcher
matcher.add("PatternSequence", pattern_sequence)

# Example text
text = "The journey starts with a beautiful sunrise and ends with a warm embrace."

# Process the text
doc = nlp(text)

# Apply the matcher to find matches
matches = matcher(doc)

# Extract matched spans and entities
matched_sequences = []
for match_id, start, end in matches:
    label = nlp.vocab.strings[match_id]
    sequence_text = doc[start:end].text
    matched_sequences.append((sequence_text, label))

# Print the identified pattern sequences
print(matched_sequences)


ValueError: [E178] Each pattern should be a list of dicts, but got: {'LOWER': 'start'}. Maybe you accidentally passed a single pattern to Matcher.add instead of a list of patterns? If you only want to add one pattern, make sure to wrap it in a list. For example: `matcher.add('PatternSequence', [pattern])`

Introduction to spaCy

Hi, I'm Ines! I'm one of the core developers of spaCy, a popular library for advanced Natural Language Processing in Python.

In this lesson, we'll take a look at the most important concepts of spaCy and how to get started.

In [5]:
 
# Import spaCy
import spacy

# Create a blank English nlp object
nlp = spacy.blank("en")
# contains the processing pipeline
# includes language-specific rules for tokenization etc.
# The Doc object

# Created by processing a string of text with the nlp object
doc = nlp("Hello world!")

for token in doc:
    print(token.text)




Hello
world
!


In [7]:
# Index into the Doc to get a single Token
token = doc[1]

# Get the token text via the .text attribute
print(token.text)


# Token objects represent the tokens in a document – for example, a word or a punctuation character.

# To get a token at a specific position, you can index into the doc.

# Token objects also provide various attributes that let you access more information about the tokens. For example, the .text attribute returns the verbatim token text.

world


In [9]:
import spacy

# Load the small English pipeline
nlp = spacy.load("en_core_web_sm")

# Process a text
doc = nlp("She ate the pizza")

# Iterate over the tokens
for token in doc:
    # Print the text and the predicted part-of-speech tag
    print(token.text, token.pos_,token.dep_,token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [17]:
# Predicting Named Entities
# Process a text
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text,ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [18]:
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [23]:
import spacy
from spacy.matcher import Matcher

 

nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]


# Add the pattern to the matcher
matcher.add("IPHONE_X_PATTERN", [pattern])

# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])

Matches: ['iPhone X']


In [None]:
# Part 1
# Write one pattern that only matches mentions of the full iOS versions: “iOS 7”, “iOS 11” and “iOS 10”.