spaCy models: https://spacy.io/models/en

In [1]:
!pip install -U spacy
!pip install -U spacy-lookups-data
!pip install -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<1.3.0,>=1.2.0 (from thinc<8.4.0,>=8.3.4->spacy)
  Downloading blis-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading spacy-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading blis-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Tokenization

In [8]:
import spacy

# Load the small English pipeline; if it is not installed, download it once
# and load it again.
try:
    nlp = spacy.load('en_core_web_sm')
    print("Model 'en_core_web_sm' loaded successfully.")
except OSError:
    # OSError is treated as "model package missing": fetch it, then retry the load.
    print("'en_core_web_sm' model not found. Downloading...")
    import spacy.cli
    spacy.cli.download("en_core_web_sm")  # Download the model
    nlp = spacy.load('en_core_web_sm')
    print("Model 'en_core_web_sm' downloaded and loaded successfully.")
except Exception as e:
    # Report the problem and re-raise it. Calling exit() inside a notebook
    # shuts the kernel down, which discards all state and hides the traceback.
    print(f"An error occurred: {e}")
    raise

Model 'en_core_web_sm' loaded successfully.


In [9]:
# Run the sample sentence through the pipeline, then list every token
# together with its coarse-grained part-of-speech tag.
text = "Apple isn't looking at buyig U.K. Startup for $1 billion"
doc = nlp(text)

print(f"Text: {text}")
print("Tokens:")
for tok in doc:
    surface, pos = tok.text, tok.pos_
    print(f"{surface} ({pos})")

Text: Apple isn't looking at buyig U.K. Startup for $1 billion
Tokens:
Apple (PROPN)
is (AUX)
n't (PART)
looking (VERB)
at (ADP)
buyig (NOUN)
U.K. (PROPN)
Startup (PROPN)
for (ADP)
$ (SYM)
1 (NUM)
billion (NUM)


## Part-of-Speech (POS) Tagging

In [10]:
# Echo the parsed Doc; it is displayed as the original sentence text.
doc

Apple isn't looking at buyig U.K. Startup for $1 billion

In [11]:
# One row per token: surface text, lemma, coarse POS tag, stop-word flag.
# The text columns are left-aligned to fixed widths so the rows line up.
for tok in doc:
    print(f'{tok.text:<15} {tok.lemma_:<15} {tok.pos_:<10} {tok.is_stop}')

Apple           Apple           PROPN      False
is              be              AUX        True
n't             not             PART       True
looking         look            VERB       False
at              at              ADP        True
buyig           buyig           NOUN       False
U.K.            U.K.            PROPN      False
Startup         Startup         PROPN      False
for             for             ADP        True
$               $               SYM        False
1               1               NUM        False
billion         billion         NUM        False


## Dependency Parsing (Noun Chunks)

In [12]:
# For each noun chunk print the chunk text, its root token, and the
# dependency label of that root.
for noun_phrase in doc.noun_chunks:
    head = noun_phrase.root
    print(f'{noun_phrase.text:<30} {head.text:<30} {head.dep_}')

Apple                          Apple                          nsubj
buyig U.K. Startup             Startup                        pobj


## Named Entity Recognition (NER)

In [13]:
# Echo the Doc again so the sentence is visible next to the entity listing.
doc

Apple isn't looking at buyig U.K. Startup for $1 billion

In [14]:
# List every named entity found in the sentence with its label.
for entity in doc.ents:
    print(f'{entity.text:<20} {entity.label_}')

Apple                ORG
U.K. Startup         ORG
$1 billion           MONEY


## Sentence Segmentation

In [15]:
# Echo the Doc before splitting it into sentences.
doc

Apple isn't looking at buyig U.K. Startup for $1 billion

In [16]:
# Print each detected sentence on its own line.
for sentence in doc.sents:
    print(sentence)

Apple isn't looking at buyig U.K. Startup for $1 billion


In [17]:
# Second sample: the text contains "..." directly followed by another sentence.
doc1 = nlp(
    "Welcome to Colemerg. Thank you very much...Please give me a five"
)
doc1

Welcome to Colemerg. Thank you very much...Please give me a five

In [18]:
# Sentence boundaries as produced by the pipeline in its current state.
for sentence in doc1.sents:
    print(sentence)

Welcome to Colemerg.
Thank you very much...Please give me a five


In [19]:
from spacy.language import Language

@Language.component("set_rule")
def set_rule(doc):
    """Mark the token that follows every '...' token as a sentence start.

    Registered with spaCy under the component name "set_rule".

    Args:
        doc: The Doc being processed; its tokens are modified in place.

    Returns:
        The same Doc, so the next pipeline component can continue with it.
    """
    last_index = len(doc) - 1
    for tok in doc:
        # The final token has no successor, so it is never used as a trigger.
        if tok.i < last_index and tok.text == '...':
            doc[tok.i + 1].is_sent_start = True
    return doc

In [20]:
# Insert the custom boundary rule ahead of the parser. The membership check
# makes the cell safe to re-run: adding a component whose name is already
# in the pipeline raises an error.
if "set_rule" not in nlp.pipe_names:
    nlp.add_pipe("set_rule", before='parser')

# Re-parse the sample so the new component takes part in sentence segmentation.
doc1 = nlp("Welcome to Colemerg. Thank you very much...Please give me a five")

In [21]:
# Sentence boundaries after re-parsing with the "set_rule" component added.
for sentence in doc1.sents:
    print(sentence)

Welcome to Colemerg.
Thank you very much...
Please give me a five


In [22]:
# Show the individual tokens; "..." comes out as a single token, which is
# the text the custom rule matches on.
for tok in doc1:
    print(tok.text)

Welcome
to
Colemerg
.
Thank
you
very
much
...
Please
give
me
a
five


## Visualization

In [23]:
from spacy import displacy

In [24]:
# Echo the Doc that the visualizations below are drawn from.
doc

Apple isn't looking at buyig U.K. Startup for $1 billion

In [25]:
# Dependency-arc diagram with displaCy's default layout.
displacy.render(doc, style="dep")

In [26]:
# Same dependency diagram, drawn in compact mode with a distance setting of 100.
displacy.render(
    doc,
    style="dep",
    options=dict(compact=True, distance=100),
)

In [27]:
# Render the entity view of the sentence.
displacy.render(doc, style="ent")