In [2]:
import en_core_web_lg
import spacy
from spacy import displacy


In [3]:
# Load the large English model once; every later cell reuses this pipeline object.
nlp = en_core_web_lg.load()

In [4]:
spacy.__version__

'2.0.18'

### Entity Recognition

In [6]:
text="Hitler was born in Austria—then part of Austria-Hungary—and was raised near Linz. He moved to Germany in 1913 and was decorated during his service in the German Army in World War I. In 1919, he joined the German Workers' Party (DAP), the precursor of the NSDAP, and was appointed leader of the NSDAP in 1921. In 1923, he attempted to seize power in a failed coup in Munich and was imprisoned. In jail, he dictated the first volume of his autobiography and political manifesto Mein Kampf . After his release in 1924, Hitler gained popular support by attacking the Treaty of Versailles and promoting Pan-Germanism, anti-semitism and anti-communism with charismatic oratory and Nazi propaganda. He frequently denounced international capitalism and communism as part of a Jewish conspiracy"

In [8]:
# Parse the text with spaCy. This runs the entire NLP pipeline.
doc = nlp(text)

In [9]:
# Print every named entity found in the parsed text, followed by its label in parentheses.
for entity in doc.ents:
    print(entity.text, f"({entity.label_})")

Hitler (PERSON)
Austria (GPE)
Austria (GPE)
Linz (GPE)
Germany (GPE)
1913 (DATE)
the German Army (ORG)
World War I. (EVENT)
1919 (DATE)
the German Workers' Party (ORG)
DAP (ORG)
NSDAP (ORG)
NSDAP (ORG)
1921 (DATE)
1923 (DATE)
Munich (GPE)
first (ORDINAL)
Mein Kampf (PERSON)
1924 (DATE)
Hitler (PERSON)
the Treaty of Versailles (LAW)
Pan-Germanism (ORG)
Nazi (NORP)
Jewish (NORP)


##### Entities present in the text

In [10]:
doc.ents

(Hitler,
 Austria,
 Austria,
 Linz,
 Germany,
 1913,
 the German Army,
 World War I.,
 1919,
 the German Workers' Party,
 DAP,
 NSDAP,
 NSDAP,
 1921,
 1923,
 Munich,
 first,
 Mein Kampf,
 1924,
 Hitler,
 the Treaty of Versailles,
 Pan-Germanism,
 Nazi,
 Jewish)

In [11]:
# Look at the last entity explicitly instead of relying on a loop variable
# left over from a previously executed cell (hidden kernel state).
# `label` is the integer ID of the label, `label_` is its string form.
last_entity = doc.ents[-1]
last_entity.label, last_entity.label_

(379, 'NORP')

##### Get an explanation of entity labels

In [12]:
spacy.explain('EVENT')

'Named hurricanes, battles, wars, sports events, etc.'

In [13]:
text="India is a beautiful country. It got its independence in 1947"

In [14]:
def explain_text(text):
    """
    Print each named entity found in `text` on its own line.

    The text is parsed with the module-level `nlp` pipeline. Every line shows
    the entity, its label, and the description `spacy.explain` gives for that
    label. Nothing is returned.
    """
    for span in nlp(text).ents:
        label = span.label_
        description = spacy.explain(label)
        print(f'{span}, Label: {label}, {description}')

##### output with text and its corresponding entities and explanation

In [15]:
explain_text(text)

India, Label: GPE, Countries, cities, states
1947, Label: DATE, Absolute or relative dates or periods


#### Automatic Question generation

In [14]:
text="Madhubani art (or Mithila painting) is practiced in the Mithila region of the Indian subcontinent. This painting is done with fingers, twigs, brushes, nib-pens, and matchsticks, etc. Using natural dyes and pigments, and is characterised by eye-catching geometrical patterns.[citation needed] There is ritual content for particular occasions, such as birth or marriage, and festivals, such as Holi, Surya Shasti, Kali Puja, Upanayana, Durga Puja."

In [15]:

doc = nlp(text)

- We need noun chunks. A noun chunk is a noun phrase: not a single word, but a short phrase made up of a noun and the words that describe it.

In [16]:
# List the noun chunks of each sentence, prefixed with a 1-based sentence number.
for idx, sentence in enumerate(doc.sents):
    sentence_label = f'sentence{idx+1}'
    for noun in sentence.noun_chunks:
        print(sentence_label, noun)

sentence1 Madhubani art
sentence1 Mithila painting
sentence1 the Mithila region
sentence1 the Indian subcontinent
sentence2 This painting
sentence2 fingers
sentence2 twigs
sentence2 brushes
sentence2 nib-pens
sentence2 matchsticks
sentence3 natural dyes
sentence3 pigments
sentence3 eye-catching geometrical patterns.[citation
sentence4 ritual content
sentence4 particular occasions
sentence4 birth
sentence4 marriage
sentence4 festivals
sentence4 Holi
sentence4 Surya Shasti
sentence4 Kali Puja
sentence4 Upanayana
sentence4 Durga Puja


In [17]:
# Show each token with its coarse part-of-speech (pos_) and fine-grained tag (tag_).
for token in doc:
    print(f"{token} {token.pos_} {token.tag_}")

Madhubani PROPN NNP
art NOUN NN
( PUNCT -LRB-
or CCONJ CC
Mithila PROPN NNP
painting NOUN NN
) PUNCT -RRB-
is VERB VBZ
practiced VERB VBN
in ADP IN
the DET DT
Mithila PROPN NNP
region NOUN NN
of ADP IN
the DET DT
Indian ADJ JJ
subcontinent NOUN NN
. PUNCT .
This DET DT
painting NOUN NN
is VERB VBZ
done VERB VBN
with ADP IN
fingers NOUN NNS
, PUNCT ,
twigs NOUN NNS
, PUNCT ,
brushes NOUN NNS
, PUNCT ,
nib NOUN NN
- PUNCT HYPH
pens NOUN NNS
, PUNCT ,
and CCONJ CC
matchsticks NOUN NNS
, PUNCT ,
etc X FW
. PUNCT .
Using VERB VBG
natural ADJ JJ
dyes NOUN NNS
and CCONJ CC
pigments NOUN NNS
, PUNCT ,
and CCONJ CC
is VERB VBZ
characterised VERB VBN
by ADP IN
eye NOUN NN
- PUNCT HYPH
catching VERB VBG
geometrical ADJ JJ
patterns.[citation PROPN NNP
needed VERB VBN
] PUNCT -RRB-
There ADV EX
is VERB VBZ
ritual NOUN NN
content NOUN NN
for ADP IN
particular ADJ JJ
occasions NOUN NNS
, PUNCT ,
such ADJ JJ
as ADP IN
birth NOUN NN
or CCONJ CC
marriage NOUN NN
, PUNCT ,
and CCONJ CC
festivals NOUN NNS

- https://spacy.io/api/annotation (reference for what the POS tag abbreviations mean)

#### Create rules to ask questions

In [18]:

# Question-generation rules. Each rule has an `id` and the list of fine-grained
# tags (`req_tags`) that must all be present in a sentence for the rule to apply.
# Rules are tried in list order.
ruleset = [
    dict(id=1, req_tags=['NNP', 'VBZ', 'NN']),
    dict(id=2, req_tags=['NNP', 'VBZ']),
]

In [19]:

def get_pos_tag(doc, tag):
    """
    Collect the tokens of `doc` whose `tag_` attribute equals `tag`.

    Tokens are returned as a list in the order they are iterated; the list is
    empty when nothing matches.
    """
    matches = []
    for token in doc:
        if token.tag_ == tag:
            matches.append(token)
    return matches

In [20]:
def sent_to_ques(sent:str)->str:
    """
    Turn a sentence string into a question string using the rules in `ruleset`.

    The sentence is parsed with the module-level `nlp` pipeline and the rules
    are tried in list order. A rule applies when every tag in its `req_tags`
    occurs among the sentence's token tags. For the first rule that applies,
    a message naming the rule is printed and a question is built from the
    first NNP token and the first VBZ token:

    * rule id 1 -> "What <VBZ> <NNP>?"
    * rule id 2 -> "What does <NNP> <lemma of VBZ>?"

    Rules with any other id are ignored. Returns None when no rule applies.
    """
    parsed = nlp(sent)
    tags_in_sentence = {tok.tag_ for tok in parsed}
    for rule in ruleset:
        # Only rule ids 1 and 2 have a question template.
        if rule['id'] not in (1, 2):
            continue
        if not tags_in_sentence.issuperset(rule['req_tags']):
            continue
        print(f"Rule id {rule['id']} matched for sentence: {sent}")
        proper_noun = str(get_pos_tag(parsed, "NNP")[0])
        verb_token = get_pos_tag(parsed, "VBZ")[0]
        if rule['id'] == 1:
            # Rule 1 keeps the verb exactly as it appears in the sentence.
            return f'What {str(verb_token)} {proper_noun}?'
        # Rule 2 uses the verb's lemma after "does".
        return f'What does {proper_noun} {str(verb_token.lemma_)}?'
    return None

In [21]:

# Generate one question per sentence of the parsed text; None means no rule matched.
for sent in doc.sents:
    question = sent_to_ques(str(sent))
    print(f"The generated question is: {question}")

Rule id 1 matched for sentence: Madhubani art (or Mithila painting) is practiced in the Mithila region of the Indian subcontinent.
The generated question is: What is Madhubani?
The generated question is: None
Rule id 1 matched for sentence: Using natural dyes and pigments, and is characterised by eye-catching geometrical patterns.[citation needed]
The generated question is: What is patterns.[citation?
Rule id 1 matched for sentence: There is ritual content for particular occasions, such as birth or marriage, and festivals, such as Holi, Surya Shasti, Kali Puja, Upanayana, Durga Puja.
The generated question is: What is Holi?
