In [None]:
# Install the keyword-extraction dependencies: flashtext (keyword lookup) and pke (keyphrase extraction).
!pip install --quiet flashtext==2.7
!pip install git+https://github.com/boudinfl/pke.git

In [None]:
!pip install scipy==1.8.0
!pip install networkx==2.6

In [None]:
!python -m spacy info

In [12]:
import textwrap

In [1]:
import json
import requests
import string
import re
import nltk
import string
import itertools
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
import pke
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import traceback
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor


In [4]:
def tokenize_sentences(text, min_length=20):
    """Split ``text`` into sentences and discard the short ones.

    Args:
        text: Passage to split; handed unchanged to NLTK's ``sent_tokenize``.
        min_length: A sentence is kept only when its length, measured before
            surrounding whitespace is trimmed, is strictly greater than this
            many characters. Defaults to 20, the cutoff that used to be
            hard-coded here.

    Returns:
        list[str]: The surviving sentences in their original order, each with
        leading and trailing whitespace removed.
    """
    return [
        sentence.strip()
        for sentence in sent_tokenize(text)
        if len(sentence) > min_length
    ]

In [5]:

def get_noun_adj_verb(text):
    """Extract up to 30 keyphrases from ``text`` using pke's MultipartiteRank.

    Candidate phrases are built from nouns, adjectives and verbs and must not
    contain punctuation marks, the bracket placeholder tokens listed below, or
    NLTK's English stopwords.

    Args:
        text: Raw passage to analyse.

    Returns:
        list[str]: Keyphrases in the order ``get_n_best`` returns them. This
        is a best-effort helper: if extraction fails for any reason the
        traceback is printed and an empty list is returned instead of raising.
    """
    out = []
    try:
        # Build the stoplist first so it can be supplied when the document is
        # loaded; previously it was assembled but never given to pke, so the
        # stopword/punctuation filter had no effect.
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')

        extractor = pke.unsupervised.MultipartiteRank()
        # NOTE(review): pke 2.x accepts `stoplist` in load_document; confirm
        # the installed pke version matches.
        extractor.load_document(input=text, language='en', stoplist=stoplist)

        pos = {'VERB', 'ADJ', 'NOUN'}
        extractor.candidate_selection(pos=pos)
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')

        # Each entry is indexed as (keyphrase, ...); only the phrase is kept.
        out = [val[0] for val in extractor.get_n_best(n=30)]
    except Exception:
        # Deliberate best-effort: report the failure but keep the notebook
        # running. `Exception` (not a bare except) lets Ctrl-C through.
        out = []
        traceback.print_exc()

    return out

In [6]:
from pprint import pprint
def get_sentences_for_keyword(keywords, sentences):
    """Group sentences by the keywords they mention, longest sentence first.

    Matching is delegated to a flashtext ``KeywordProcessor`` created with its
    default settings.

    Args:
        keywords: Iterable of keyword strings to look for.
        sentences: Iterable of sentence strings to search.

    Returns:
        dict[str, list[str]]: One entry per keyword (an empty list when no
        sentence mentions it). Each list is sorted by sentence length in
        descending order, and a sentence appears at most once per keyword
        for each time it occurs in ``sentences``.
    """
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)

    for sentence in sentences:
        # extract_keywords yields one hit per occurrence, so a keyword used
        # twice in a sentence would otherwise register that sentence twice.
        for key in set(keyword_processor.extract_keywords(sentence)):
            keyword_sentences[key].append(sentence)

    # Longest sentences first; the sort is stable, so ties keep input order.
    for matched in keyword_sentences.values():
        matched.sort(key=len, reverse=True)
    return keyword_sentences

In [7]:

def get_fill_in_the_blanks(sentence_mapping):
    """Turn a keyword -> sentences mapping into fill-in-the-blank questions.

    For every keyword, only its first listed sentence is considered. A
    question is produced when that sentence has not already been used for an
    earlier keyword and contains the keyword exactly once as a whole word
    (case-insensitive). The keyword is then replaced by a blank.

    Args:
        sentence_mapping: dict mapping each keyword to a list of sentences;
            keywords with an empty list are skipped.

    Returns:
        dict with three entries:
            "title": the fixed instruction line,
            "sentences": up to 10 blanked sentences,
            "keys": the keyword removed from each sentence, in the same order.
    """
    out = {"title": "Fill in the blanks for these sentences with matching words at the top"}
    blank_sentences = []
    keys = []
    used_sentences = set()

    for key, candidates in sentence_mapping.items():
        if not candidates:
            continue
        sent = candidates[0]
        if sent in used_sentences:
            continue

        # Lookarounds instead of a plain substring match, so "art" does not
        # hit "started"; they also work for keys that begin or end with
        # punctuation, where \b would not.
        pattern = re.compile(r'(?<!\w)' + re.escape(key) + r'(?!\w)', re.IGNORECASE)
        line, replacements = pattern.subn(' _________ ', sent)

        # Exactly one blank: zero would give a question with nothing to fill
        # in, two or more would make a single answer ambiguous.
        if replacements != 1:
            continue

        blank_sentences.append(line)
        keys.append(key)
        used_sentences.add(sent)

    out["sentences"] = blank_sentences[:10]
    out["keys"] = keys[:10]
    return out

In [8]:
from IPython.core.display import display, HTML
import xml.etree.ElementTree as et
import random


In [22]:
def fillintheblanks(text):
    """Build a fill-in-the-blank exercise from ``text`` and render it inline.

    Pipeline: split the passage into sentences, extract keyphrases, map each
    keyphrase to the sentences that mention it, blank the keyphrase out of
    one sentence each, then display the result as HTML (a shuffled word list
    followed by the numbered blanked sentences).

    Args:
        text: The passage to generate the exercise from.

    Returns:
        None. The exercise is shown through IPython's ``display``.
    """
    passage_sentences = tokenize_sentences(text)
    noun_verbs_adj = get_noun_adj_verb(text)
    keyword_sentence_mapping = get_sentences_for_keyword(noun_verbs_adj, passage_sentences)
    fill_in_the_blanks = get_fill_in_the_blanks(keyword_sentence_mapping)

    root = et.Element("div")
    heading = et.Element("h2")
    heading.text = fill_in_the_blanks['title']
    root.append(heading)

    # Word list: shuffled so its order does not give away which word
    # belongs to which sentence.
    word_bank = et.Element("ul")
    word_bank.set('style', 'color:white;')
    all_keys = fill_in_the_blanks['keys']
    random.shuffle(all_keys)
    for blank in all_keys:
        item = et.Element("li")
        item.text = blank
        word_bank.append(item)

    # Numbered questions, each followed by a <br> for vertical spacing.
    question_list = et.Element("ol")
    question_list.set('style', 'color:brown;')
    for sentence in fill_in_the_blanks['sentences']:
        item = et.Element("li")
        item.text = sentence
        question_list.append(item)
        question_list.append(et.Element("br"))

    heading_content = et.Element("h4")
    heading_content.append(word_bank)
    heading_content.append(question_list)
    root.append(heading_content)

    # ElementTree escapes the text nodes, so passage content cannot inject
    # markup into the rendered HTML.
    xmlstr = et.tostring(root).decode("utf-8")
    display(HTML(xmlstr))

In [23]:
# Sample passage 1: an earth-science text about volcanoes at divergent and
# convergent plate boundaries, used as demo input for fillintheblanks().
text="""There is a lot of volcanic activity at divergent plate boundaries in the oceans. For example, many undersea volcanoes are found along the Mid-Atlantic Ridge.
This is a divergent plate boundary that runs north-south through the middle of the Atlantic Ocean. As tectonic plates pull away from each other
at a divergent plate boundary, they create deep fissures, or cracks, in the crust. Molten rock, called magma, erupts through these cracks onto Earth’s
surface. At the surface, the molten rock is called lava. It cools and hardens, forming rock. Divergent plate boundaries also occur in the continental
crust. Volcanoes form at these boundaries, but less often than in ocean crust. That’s because continental crust is thicker than oceanic crust.
This makes it more difficult for molten rock to push up through the crust. Many volcanoes form along convergent plate boundaries where one tectonic
plate is pulled down beneath another at a subduction zone. The leading edge of the plate melts as it is pulled into the mantle, forming magma that
erupts as volcanoes. When a line of volcanoes forms along a subduction zone, they make up a volcanic arc. The edges of the Pacific plate are long
subduction zones lined with volcanoes. This is why the Pacific rim is called the “Pacific Ring of Fire.”"""

In [24]:
# Generate and render the exercise for the passage currently bound to `text`.
fillintheblanks(text)

In [25]:
# Sample passage 2: a news report, used as a second demo input. This rebinds
# `text`, replacing whatever passage was assigned to it earlier.
# NOTE(review): the passage describes a suicide and names individuals;
# consider swapping in a neutral sample before sharing this notebook.
text="""Naur Gopal Maloo (21), a Chemistry student at IIT-Delhi, committed suicide in his hostel room on Friday, police said
 Maloo was found hanging from a fan in his room on the fourth floor of Nilgiri Hall of Residence, at 7.15 am. A first-year MSc student,
 he hailed from West Bengal’s Hoogli, police said. According to DCP (southwest) Milind Mahadeo Dumbere, Maloo had tried to commit suicide
 by consuming sleeping pills on April 10 as well. “He was admitted to Safdarjung Hospital and was discharged. His brother, Bachoo Ram, stayed
 with him and he was also counselled. The reason for the suicide is being looked into,” the DCP said.The youngest of three brothers, Maloo was
described by his friends as a bright student, and a “simple and sober” person. “He told me he had got an internship with a big company in Kolkata
 for two months and would be going there in May. He was also preparing for GRE as he wanted to do his PhD from Germany,” said Shamlu, a
 PhD student at IIT-D and Maloo’s senior from Kolkata’s Presidency College."""

In [26]:
# Generate and render the exercise for the passage currently bound to `text`.
fillintheblanks(text)