In [1]:
#We need textacy, which in turn loads the spaCy library. The version is pinned to 0.9.1.
!pip install textacy==0.9.1

Collecting textacy==0.9.1
[?25l  Downloading https://files.pythonhosted.org/packages/3a/5e/3b8391cf6ff39350b73f8421184cf6792002b5c2c17982b7c9fbd5ff36de/textacy-0.9.1-py3-none-any.whl (203kB)
[K     |████████████████████████████████| 204kB 4.1MB/s 
Collecting cytoolz>=0.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/58/67/1c60da8ba831bfefedb64c78b9f6820bdf58972797c95644ee3191daf27a/cytoolz-0.11.0.tar.gz (477kB)
[K     |████████████████████████████████| 481kB 7.0MB/s 
Collecting jellyfish>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/30/a6/4d039bc827a102f62ce7a7910713e38fdfd7c7a40aa39c72fb14938a1473/jellyfish-0.8.2-cp37-cp37m-manylinux2014_x86_64.whl (90kB)
[K     |████████████████████████████████| 92kB 5.8MB/s 
[?25hCollecting pyphen>=0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/7c/5a/5bc036e01389bc6a6667a932bac3e388de6e7fa5777a6ff50e652f60ec79/Pyphen-0.10.0-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████

In [2]:
#Download the spaCy model en_core_web_sm, which a later cell loads via textacy.load_spacy_lang.
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [3]:
#Use spaCy version 2.2.4 for this notebook.
#NOTE(review): this pin runs after the en_core_web_sm download in the previous cell; if it changes the
#installed spaCy version, a kernel restart or a fresh model download may be needed -- confirm.
!pip install spacy==2.2.4



In [4]:
import spacy
import textacy.ke
from textacy import *

In [5]:
#Load a spaCy model through textacy. The resulting `en` object is passed as `lang` when the
#document is built later, so it is the model used for all further processing.
en = textacy.load_spacy_lang("en_core_web_sm")

In [6]:
#Let us use a sample text file, nlphistory.txt, which holds the text of the history section of
#Wikipedia's page on Natural Language Processing:
#https://en.wikipedia.org/wiki/Natural_language_processing
try:
    #On Google Colab, prompt for an upload; the uploaded file is then read from the working directory.
    from google.colab import files
    uploaded = files.upload()
    text_path = 'nlphistory.txt'
except ModuleNotFoundError:
    #Outside Colab the google.colab import fails, so fall back to the local copy.
    text_path = 'Data/nlphistory.txt'

#Read through a context manager so the file handle is always closed, even if read() fails.
with open(text_path) as text_file:
    mytext = text_file.read()

Saving nlphistory.txt to nlphistory.txt


In [7]:
#Convert the raw text into a spaCy document, using the `en` model loaded earlier.
doc = textacy.make_spacy_doc(mytext, lang=en)

In [8]:
#Show the five top-ranked key phrases from TextRank; each entry is a (phrase, score) pair.
textacy.ke.textrank(doc, topn=5)

[('successful natural language processing system', 0.02475549496438359),
 ('statistical machine translation system', 0.024648673368376665),
 ('natural language system', 0.020518708001159278),
 ('statistical natural language processing', 0.01858983530270439),
 ('natural language task', 0.01579726776487791)]

In [9]:
#Print the key phrases found by the TextRank algorithm, as implemented in textacy.
#Each result is a (phrase, score) pair; only the phrase is kept for display.
print("Textrank output: ", [phrase for phrase, score in textacy.ke.textrank(doc, normalize="lemma", topn=5)])
#Print the key words and phrases found by the SGRank algorithm, as implemented in textacy.
print("SGRank output: ", [phrase for phrase, score in textacy.ke.sgrank(doc, topn=5)])


Textrank output:  ['successful natural language processing system', 'statistical machine translation system', 'natural language system', 'statistical natural language processing', 'natural language task']
SGRank output:  ['natural language processing system', 'statistical machine translation', 'research', 'late 1980', 'early']


In [10]:
#Overlapping key phrases can be handled with textacy's aggregate_term_variants function.
#Picking one term from each returned group gives us a list of non-overlapping key phrases!
terms = {phrase for phrase, score in textacy.ke.sgrank(doc)}
term_groups = textacy.ke.utils.aggregate_term_variants(terms)
print(term_groups)

[{'natural language processing system'}, {'statistical machine translation'}, {'statistical model'}, {'late 1980'}, {'research'}, {'example'}, {'early'}, {'world'}, {'ELIZA'}, {'real'}]


In [11]:
#Another way to find key phrases is to treat every noun chunk as a candidate.
#Keep in mind that this yields a lot of phrases, with no way to rank them!

print(list(textacy.extract.noun_chunks(doc)))

[history, natural language processing, 1950s, work, earlier periods, Alan Turing, article, what, criterion, intelligence, Georgetown experiment, fully automatic translation, more than sixty Russian sentences, English, authors, three or five years, machine translation, real progress, ALPAC report, ten-year-long research, expectations, machine translation, Little further research, machine translation, late 1980s, first statistical machine translation systems, notably successful natural language processing systems, SHRDLU, natural language system, restricted "blocks worlds, restricted vocabularies, ELIZA, simulation, Rogerian psychotherapist, Joseph Weizenbaum, almost no information, human thought, emotion, ELIZA, startlingly human-like interaction, "patient, very small knowledge base, ELIZA, generic response, example, head, you, head, 1970s, many programmers, "conceptual ontologies, real-world information, computer-understandable data, Examples, MARGIE, Schank, Cullingford, (Wilensky, Le

Textacy also has a number of other information extraction functions, many of them based on regular expression patterns and heuristics for extracting specific expressions such as acronyms and quotations. Apart from these, we can also extract matches for custom regular expressions, including POS tag patterns, or look for statements involving an entity, subject-verb-object tuples, etc. We will discuss some of these as they come up in this chapter. 

Documentation: https://chartbeat-labs.github.io/textacy/build/html/index.html