# SpaCy

In [1]:
import spacy
nlp=spacy.load("en_core_web_sm")

In [3]:
text="Apple Inc. is located in Cupertino, California. It is a great company."

doc=nlp(text)

In [4]:
doc

Apple Inc. is located in Cupertino, California. It is a great company.

In [5]:
# Tokenization
print("Tokens:")
for token in doc:
    print(token.text)
    

Tokens:
Apple
Inc.
is
located
in
Cupertino
,
California
.
It
is
a
great
company
.


In [6]:
# Part-of-Speech (POS) Tagging
print("\nPOS Tags:")
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")



POS Tags:
Apple: PROPN (NNP)
Inc.: PROPN (NNP)
is: AUX (VBZ)
located: VERB (VBN)
in: ADP (IN)
Cupertino: PROPN (NNP)
,: PUNCT (,)
California: PROPN (NNP)
.: PUNCT (.)
It: PRON (PRP)
is: AUX (VBZ)
a: DET (DT)
great: ADJ (JJ)
company: NOUN (NN)
.: PUNCT (.)


In [7]:
# Lemmatization
print("\nLemmas:")
for token in doc:
    print(f"{token.text}: {token.lemma_}")




Lemmas:
Apple: Apple
Inc.: Inc.
is: be
located: locate
in: in
Cupertino: Cupertino
,: ,
California: California
.: .
It: it
is: be
a: a
great: great
company: company
.: .


In [8]:
# Named Entity Recognition (NER)
print("\nNamed Entities:")
for ent in doc.ents:
    print(ent)
    print(f"{ent.text}: {ent.label_}")



Named Entities:
Apple Inc.
Apple Inc.: ORG
Cupertino
Cupertino: GPE
California
California: GPE


In [9]:
# Dependency Parsing
print("\nDependency Parsing:")
for token in doc:
    print(f"{token.text}: {token.dep_}, head: {token.head.text}")


Dependency Parsing:
Apple: compound, head: Inc.
Inc.: nsubjpass, head: located
is: auxpass, head: located
located: ROOT, head: located
in: prep, head: located
Cupertino: pobj, head: in
,: punct, head: Cupertino
California: appos, head: Cupertino
.: punct, head: located
It: nsubj, head: is
is: ROOT, head: is
a: det, head: company
great: amod, head: company
company: attr, head: is
.: punct, head: is


In [10]:
#Sentence Segmentation
print("\nSentences:")
for sent in doc.sents:
    print(sent.text)


Sentences:
Apple Inc. is located in Cupertino, California.
It is a great company.


In [11]:
#Word Vectors (if using a model that has them)
nlp_lg = spacy.load("en_core_web_lg") #load large model
doc_lg = nlp_lg("king queen apple") #process text
print("\nWord Vectors and Similarity:")
king = doc_lg[0]
queen = doc_lg[1]
apple = doc_lg[2]

print(f"Similarity between king and queen: {king.similarity(queen)}")
print(f"Similarity between king and apple: {king.similarity(apple)}")

OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a Python package or a valid path to a data directory.

In [12]:
print(spacy.info())

{'spacy_version': '3.8.7', 'location': '/usr/local/lib/python3.11/dist-packages/spacy', 'platform': 'Linux-6.6.56+-x86_64-with-glibc2.35', 'python_version': '3.11.13', 'pipelines': {'en_core_web_sm': '3.8.0'}}


In [13]:
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm") 
matcher = Matcher(nlp.vocab) 

pattern = [{"LOWER": "apple"}, {"LOWER": "ic."}] #look for apple inc.
pattern1 = [{"LOWER": "located"}, {"LOWER": "in"}]
matcher.add("AppleIncPattern", [pattern])
matcher.add("AppleIncPattern1", [pattern1])
matches = matcher(doc)

for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(f"Matched span: {span.text}, pattern ID: {string_id}")

Matched span: located in, pattern ID: AppleIncPattern1
