In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory,
# printing one full path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import spacy

In [2]:
# Create a blank English pipeline and run a sample sentence through it.
nlp = spacy.blank('en')
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

# Print every token of the document on its own line.
print(*doc, sep="\n")

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


As you know, the pipeline is blank at this point. Let's verify that by listing its components:

In [3]:
# List the names of the components in the current pipeline; an empty list
# means no components have been added.
nlp.pipe_names

[]

Let's install a trained pipeline and load it so that we can perform different operations with it.

In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 85.7 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
# Replace the blank pipeline with the trained small English pipeline and
# tokenize the same sentence again.
nlp = spacy.load('en_core_web_sm')
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

# One token per line.
print("\n".join(str(token) for token in doc))

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [8]:
# Names of the components that this trained pipeline contains.
nlp.pipe_names 


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [9]:
# Show the pipeline as (name, component object) pairs.
# What does each of these pipeline components do?
nlp.pipeline


[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7d58dcb36870>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7d58dc8ad850>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7d58dc945620>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7d58dcb45990>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7d58dc65b490>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7d58dc9457e0>)]

In [10]:
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

# One row per token: token | part-of-speech tag | lemma.
for tok in doc:
    print(f"{tok} | {tok.pos_} | {tok.lemma_}")

Captain | PROPN | Captain
america | PROPN | america
ate | VERB | eat
100 | NUM | 100
$ | NUM | $
of | ADP | of
samosa | PROPN | samosa
. | PUNCT | .
Then | ADV | then
he | PRON | he
said | VERB | say
I | PRON | I
can | AUX | can
do | VERB | do
this | PRON | this
all | DET | all
day | NOUN | day
. | PUNCT | .


Each row printed above shows three things that the trained spaCy pipeline provides for a token:
- the word (token) itself
- its part-of-speech (POS) tag, i.e. which type of word it is
- its lemma, i.e. its base form

### Named Entity Recognition

In [12]:

doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# One row per named entity: text | label | explanation of the label.
for entity in doc.ents:
    label = entity.label_
    print(f"{entity.text} | {label} | {spacy.explain(label)}")

Tesla Inc | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


### Visualizing the entities to make them easier to read

In [13]:
# displaCy is spaCy's built-in visualizer.
from spacy import displacy

# Render `doc` using the entity ('ent') visualization style.
displacy.render(doc, style='ent')

### Adding a component to a blank pipeline

In [14]:
# Start from a blank English pipeline and copy only the trained `ner`
# component into it from the full en_core_web_sm pipeline.
nlp = spacy.blank("en")
source_nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("ner", source=source_nlp)

# Display the component names of the new pipeline.
nlp.pipe_names

['ner']

In [15]:
# Run the custom pipeline and print each entity's text followed by its label.
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Tesla Inc ORG
$45 billion MONEY


We have successfully added a new component to the blank pipeline. In the same way, we can build our own customized pipelines.