In [18]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Step 0: Initialisation

In [19]:
import spacy
# Load the `en_core_web_sm` spaCy pipeline.
nlp = spacy.load('en_core_web_sm')
# Sample text used throughout the notebook. The line breaks inside the literal
# are part of the text and show up as '\n' tokens in the outputs below.
about_text = """Christmas is celebrated as the birthday of Jesus. It is celebrated on 25 December every year. It is the most important festival of the Christians. 
One of the major attractions of the Christmas festival is the Christmas Tree. A Christmas Tree is a decorated tree. 
It is usually an evergreen conifer traditionally associated with the celebration of Christmas. Candles, toffees and cakes are tied with ribbon and paper napkins to the tree. 
The Christmas Tree in ancient times was considered a symbol of the continuity of life. 
It is believed that evil spirits stay away from the house and keep the flow of positive energy. 
They are available in many sizes in the market. A person disguised as Santa Claus distributes sweets among the children. 
Family reunions and the exchange of gifts are a widespread feature of the season. Children get dressed in new clothes. Prayers are offered in churches."""
# Run the pipeline over the sample text once; the following cells read their
# results (sentences, tags, lemmas, entities, ...) from `about_doc`.
about_doc = nlp(about_text)

# Step 1: Sentence segmentation

In [20]:
# Materialise the sentence spans of the parsed document.
sentences = [span for span in about_doc.sents]

# Print one sentence per call.
for span in sentences:
    print(span)

Christmas is celebrated as the birthday of Jesus.
It is celebrated on 25 December every year.
It is the most important festival of the Christians. 

One of the major attractions of the Christmas festival is the Christmas Tree.
A Christmas Tree is a decorated tree. 

It is usually an evergreen conifer traditionally associated with the celebration of Christmas.
Candles, toffees and cakes are tied with ribbon and paper napkins to the tree. 

The Christmas Tree in ancient times was considered a symbol of the continuity of life. 

It is believed that evil spirits stay away from the house and keep the flow of positive energy. 

They are available in many sizes in the market.
A person disguised as Santa Claus distributes sweets among the children. 

Family reunions and the exchange of gifts are a widespread feature of the season.
Children get dressed in new clothes.
Prayers are offered in churches.


# Step 2: Tokenization

In [21]:
import re
from spacy.tokenizer import Tokenizer
# Infix pattern: matches a '-' or '~' character.
infix_re = re.compile(r'''[-~]''')

# Load a separate pipeline instance; the customised tokenizer is attached to
# this one rather than to `nlp`.
custom_nlp = spacy.load('en_core_web_sm')

# Compile the language's stock prefix and suffix rules into regexes.
tokenizer_defaults = custom_nlp.Defaults
prefix_re = spacy.util.compile_prefix_regex(tokenizer_defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(tokenizer_defaults.suffixes)
def customize_tokenizer(nlp):
    """Build a tokenizer for ``nlp`` that also splits tokens on ``-`` and ``~``.

    Prefix, suffix and infix handling come from the module-level ``prefix_re``,
    ``suffix_re`` and ``infix_re`` patterns. The language's tokenizer
    exceptions are passed through as ``rules`` so that special cases keep
    their default tokenization; without them the replacement tokenizer would
    silently drop those exceptions instead of only *adding* the new delimiter.

    Args:
        nlp: A loaded spaCy pipeline; its ``vocab`` and
            ``Defaults.tokenizer_exceptions`` are used.

    Returns:
        A ``spacy.tokenizer.Tokenizer`` ready to be assigned to
        ``nlp.tokenizer``.
    """
    return Tokenizer(
        nlp.vocab,
        # Keep the language's special-case rules instead of discarding them.
        rules=nlp.Defaults.tokenizer_exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        # Adds support to use `-` (and `~`) as a delimiter for tokenization.
        infix_finditer=infix_re.finditer,
        token_match=None,
    )


# Swap the custom tokenizer into the second pipeline and re-process the text.
custom_nlp.tokenizer = customize_tokenizer(custom_nlp)
custom_tokenizer_about_doc = custom_nlp(about_text)

# Show the raw text of every token produced by the custom tokenizer.
token_texts = [tok.text for tok in custom_tokenizer_about_doc]
print(token_texts)

['Christmas', 'is', 'celebrated', 'as', 'the', 'birthday', 'of', 'Jesus', '.', 'It', 'is', 'celebrated', 'on', '25', 'December', 'every', 'year', '.', 'It', 'is', 'the', 'most', 'important', 'festival', 'of', 'the', 'Christians', '.', '\n', 'One', 'of', 'the', 'major', 'attractions', 'of', 'the', 'Christmas', 'festival', 'is', 'the', 'Christmas', 'Tree', '.', 'A', 'Christmas', 'Tree', 'is', 'a', 'decorated', 'tree', '.', '\n', 'It', 'is', 'usually', 'an', 'evergreen', 'conifer', 'traditionally', 'associated', 'with', 'the', 'celebration', 'of', 'Christmas', '.', 'Candles', ',', 'toffees', 'and', 'cakes', 'are', 'tied', 'with', 'ribbon', 'and', 'paper', 'napkins', 'to', 'the', 'tree', '.', '\n', 'The', 'Christmas', 'Tree', 'in', 'ancient', 'times', 'was', 'considered', 'a', 'symbol', 'of', 'the', 'continuity', 'of', 'life', '.', '\n', 'It', 'is', 'believed', 'that', 'evil', 'spirits', 'stay', 'away', 'from', 'the', 'house', 'and', 'keep', 'the', 'flow', 'of', 'positive', 'energy', '.', 

# Step 3: Part-of-speech tagging

In [22]:
# Print each token with its fine-grained tag and spaCy's description of it.
for tok in about_doc:
    tag = tok.tag_
    description = spacy.explain(tag)
    print(tok, ":", tag, description)

Christmas : NNP noun, proper singular
is : VBZ verb, 3rd person singular present
celebrated : VBN verb, past participle
as : IN conjunction, subordinating or preposition
the : DT determiner
birthday : NN noun, singular or mass
of : IN conjunction, subordinating or preposition
Jesus : NNP noun, proper singular
. : . punctuation mark, sentence closer
It : PRP pronoun, personal
is : VBZ verb, 3rd person singular present
celebrated : VBN verb, past participle
on : IN conjunction, subordinating or preposition
25 : CD cardinal number
December : NNP noun, proper singular
every : DT determiner
year : NN noun, singular or mass
. : . punctuation mark, sentence closer
It : PRP pronoun, personal
is : VBZ verb, 3rd person singular present
the : DT determiner
most : RBS adverb, superlative
important : JJ adjective
festival : NN noun, singular or mass
of : IN conjunction, subordinating or preposition
the : DT determiner
Christians : NNPS noun, proper plural
. : . punctuation mark, sentence closer

 :

# Step 4: Lemmatization

In [23]:
# Collect the lemma of every token in document order.
lemmatised = [tok.lemma_ for tok in about_doc]

print(lemmatised)

['Christmas', 'be', 'celebrate', 'as', 'the', 'birthday', 'of', 'Jesus', '.', '-PRON-', 'be', 'celebrate', 'on', '25', 'December', 'every', 'year', '.', '-PRON-', 'be', 'the', 'most', 'important', 'festival', 'of', 'the', 'Christians', '.', '\n', 'one', 'of', 'the', 'major', 'attraction', 'of', 'the', 'Christmas', 'festival', 'be', 'the', 'Christmas', 'Tree', '.', 'a', 'Christmas', 'Tree', 'be', 'a', 'decorate', 'tree', '.', '\n', '-PRON-', 'be', 'usually', 'an', 'evergreen', 'conifer', 'traditionally', 'associate', 'with', 'the', 'celebration', 'of', 'Christmas', '.', 'candle', ',', 'toffee', 'and', 'cake', 'be', 'tie', 'with', 'ribbon', 'and', 'paper', 'napkin', 'to', 'the', 'tree', '.', '\n', 'the', 'Christmas', 'Tree', 'in', 'ancient', 'time', 'be', 'consider', 'a', 'symbol', 'of', 'the', 'continuity', 'of', 'life', '.', '\n', '-PRON-', 'be', 'believe', 'that', 'evil', 'spirit', 'stay', 'away', 'from', 'the', 'house', 'and', 'keep', 'the', 'flow', 'of', 'positive', 'energy', '.', '

# Step 5: Stopwords


In [24]:
# The English stop-word set; the loop below does not read it and relies on
# each token's `is_stop` flag instead.
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

# Print every non-stop-word token next to its lemma.
for tok in about_doc:
    if tok.is_stop:
        continue
    print(tok, ":", tok.lemma_)

Christmas : Christmas
celebrated : celebrate
birthday : birthday
Jesus : Jesus
. : .
celebrated : celebrate
25 : 25
December : December
year : year
. : .
important : important
festival : festival
Christians : Christians
. : .

 : 

major : major
attractions : attraction
Christmas : Christmas
festival : festival
Christmas : Christmas
Tree : Tree
. : .
Christmas : Christmas
Tree : Tree
decorated : decorate
tree : tree
. : .

 : 

usually : usually
evergreen : evergreen
conifer : conifer
traditionally : traditionally
associated : associate
celebration : celebration
Christmas : Christmas
. : .
Candles : candle
, : ,
toffees : toffee
cakes : cake
tied : tie
ribbon : ribbon
paper : paper
napkins : napkin
tree : tree
. : .

 : 

Christmas : Christmas
Tree : Tree
ancient : ancient
times : time
considered : consider
symbol : symbol
continuity : continuity
life : life
. : .

 : 

believed : believe
evil : evil
spirits : spirit
stay : stay
away : away
house : house
flow : flow
positive : positive

# Step 6: Dependency parsing


In [25]:
from spacy import displacy

# Re-collect the sentence spans so this cell does not depend on earlier state.
sentences = [*about_doc.sents]

# Draw one dependency diagram per sentence inside the notebook.
for sent in sentences:
    displacy.render(sent, style="dep", jupyter=True)

In [26]:
# For each token print: text, fine-grained tag, head token text, dependency label.
for tok in about_doc:
    fields = (tok.text, tok.tag_, tok.head.text, tok.dep_)
    print(*fields)

Christmas NNP celebrated nsubjpass
is VBZ celebrated auxpass
celebrated VBN celebrated ROOT
as IN celebrated prep
the DT birthday det
birthday NN as pobj
of IN birthday prep
Jesus NNP of pobj
. . celebrated punct
It PRP celebrated nsubjpass
is VBZ celebrated auxpass
celebrated VBN celebrated ROOT
on IN celebrated prep
25 CD December nummod
December NNP on pobj
every DT year det
year NN December npadvmod
. . celebrated punct
It PRP is nsubj
is VBZ is ROOT
the DT festival det
most RBS important advmod
important JJ festival amod
festival NN is attr
of IN festival prep
the DT Christians det
Christians NNPS of pobj
. . is punct

 _SP . 
One CD is nsubj
of IN One prep
the DT attractions det
major JJ attractions amod
attractions NNS of pobj
of IN attractions prep
the DT festival det
Christmas NNP festival compound
festival NN of pobj
is VBZ is ROOT
the DT Tree det
Christmas NNP Tree compound
Tree NNP is attr
. . is punct
A DT Tree det
Christmas NNP Tree compound
Tree NNP is nsubj
is VBZ is RO

# Step 7: Noun phrases


In [27]:
# Print every noun chunk found in the parsed document.
noun_phrases = about_doc.noun_chunks
for phrase in noun_phrases:
    print(phrase)

Christmas
the birthday
Jesus
It
25 December
It
the most important festival
the Christians
the major attractions
the Christmas festival
the Christmas Tree
A Christmas Tree
a decorated tree
It
an evergreen conifer
the celebration
Christmas
Candles
toffees
cakes
ribbon and paper napkins
the tree
The Christmas Tree
ancient times
the continuity
life
It
evil spirits
the house
the flow
positive energy
They
many sizes
the market
A person
Santa Claus
sweets
the children
Family reunions
the exchange
gifts
a widespread feature
the season
Children
new clothes
Prayers
churches


# Step 8: Named entity recognition


In [28]:
# Print each named entity with its label and spaCy's description of that label.
for entity in about_doc.ents:
    label = entity.label_
    print(entity.text, ":", label, spacy.explain(label))

Christmas : DATE Absolute or relative dates or periods
Jesus : PERSON People, including fictional
25 December every year : DATE Absolute or relative dates or periods
Christians : NORP Nationalities or religious or political groups
One : CARDINAL Numerals that do not fall under another type
the Christmas festival : EVENT Named hurricanes, battles, wars, sports events, etc.
the Christmas Tree : EVENT Named hurricanes, battles, wars, sports events, etc.
A Christmas Tree : PERSON People, including fictional
Christmas : DATE Absolute or relative dates or periods
Santa Claus : ORG Companies, agencies, institutions, etc.
