In this tutorial, we learn how to:
- load text+metadata records from a dataset
- inspect and preprocess raw texts
- add a collection of documents processed by spaCy into a corpus
- inspect aggregated corpus metadata
- extract different kinds of structured data from one or many documents

In [6]:
%pip install spacy
%pip install textacy

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [8]:
%python -m spacy download en_core_web_sm

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [15]:
# Explore how certain members of the U.S. Congress have spoken about 'workers'
#   - dataset of thousands of speeches sourced from the Congressional Record.

import textacy.datasets
dataset = textacy.datasets.CapitolWords()
dataset.info
{'name': 'capitol_words',
 'site_url': 'http://sunlightlabs.github.io/Capitol-Words/',
 'description': 'Collection of ~11k speeches in the Congressional Record given by notable U.S. politicians between Jan 1996 and Jun 2016.'}
dataset.download()


ModuleNotFoundError: No module named 'en_core_web_sm'

In [4]:
# Each record contains full text of speech and basic metadata

record = next(dataset.records(limit=1))
record


Record(text='Mr. Speaker, 480,000 Federal employees are working without pay, a form of involuntary servitude; 280,000 Federal employees are not working, and they will be paid. Virtually all of these workers have mortgages to pay, children to feed, and financial obligations to meet.\nMr. Speaker, what is happening to these workers is immoral, is wrong, and must be rectified immediately. Newt Gingrich and the Republican leadership must not continue to hold the House and the American people hostage while they push their disastrous 7-year balanced budget plan. The gentleman from Georgia, Mr. Gingrich, and the Republican leadership must join Senator Dole and the entire Senate and pass a continuing resolution now, now to reopen Government.\nMr. Speaker, that is what the American people want, that is what they need, and that is what this body must do.', meta={'date': '1996-01-04', 'congress': 104, 'speaker_name': 'Bernie Sanders', 'speaker_party': 'I', 'title': 'JOIN THE SENATE AND PASS A CON

In [9]:
# Avoid a full read-through and extract only specific parts of interest

from textacy import extract
textacy.set_doc_extensions("extract")  # just setting these now -- we'll use them later!

# As a first step, inspect our keywords in context
list(extract.keyword_in_context(record.text, "work(ing|ers?)", window_width=35))


[('ker, 480,000 Federal employees are ',
  'working',
  ' without pay, a form of involuntary'),
 (' 280,000 Federal employees are not ',
  'working',
  ', and they will be paid. Virtually '),
 ('ll be paid. Virtually all of these ',
  'workers',
  ' have mortgages to pay, children to'),
 ('peaker, what is happening to these ',
  'workers',
  ' is immoral, is wrong, and must be ')]

In [6]:
# preprocess the text to get rid of potential data quality issues and other distractions that may affect our analysis

from textacy import preprocessing as preproc

preprocessor = preproc.make_pipeline(
    preproc.normalize.unicode,
    preproc.normalize.quotation_marks,
    preproc.normalize.whitespace,
)
preproc_text = preprocessor(record.text)
preproc_text[:200]

# changes are “destructive” — can’t reconstruct the original without keeping a copy around or re-loading it from disk


'Mr. Speaker, 480,000 Federal employees are working without pay, a form of involuntary servitude; 280,000 Federal employees are not working, and they will be paid. Virtually all of these workers have m'

In [7]:
# make a spaCy Doc by applying a language-specific model pipeline to the text

doc = textacy.make_spacy_doc((preproc_text, record.meta), lang="en_core_web_sm")
doc._.preview
doc._.meta


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
# make a spaCy Doc by applying a language-specific model pipeline to the text

doc = textacy.make_spacy_doc((preproc_text, record.meta), lang="en_core_web_sm")
doc._.preview
doc._.meta


In [None]:
# get a sense of how 'workers' are described using annotated part-of-speech tags

# extract just the adjectives and determinants immediately preceding our keyword
patterns = [
    {
        "POS": {
            "IN": ["ADJ", "DET"]
            },
        "OP": "+"
    },
    {
        "ORTH": {
            "REGEX": "workers?"
            }
    }
]
token_matches = extract.token_matches(doc, patterns)
list(token_matches)

# examples aren’t very interesting. would like results aggregated over all speeches: skilled workers, American workers, young workers...

[these workers, these workers]

In [None]:
# To accomplish this, load many records into a textacy.Corpus

records = dataset.records(limit=500)
preproc_records = ((preprocessor(text), meta) for text, meta in records)
corpus = textacy.Corpus("en_core_web_sm", data=preproc_records)
print(corpus)

Corpus(500 docs, 291289 tokens)


In [None]:
# get a better sense of what’s in our corpus by leveraging the documents’ metadata
import collections

date = corpus.agg_metadata("date", min), corpus.agg_metadata("date", max)
speaker_name = corpus.agg_metadata("speaker_name", collections.Counter)

print(date)
print(speaker_name)

('1996-01-04', '1997-04-24')
Counter({'Rick Santorum': 147, 'Joseph Biden': 140, 'John Kasich': 99, 'Bernie Sanders': 92, 'Lindsey Graham': 22})


In [None]:
# extract matches from each processed document

import itertools

matches = itertools.chain.from_iterable(extract.token_matches(doc, patterns) for doc in corpus)

# lemmatize their texts for consistency
# inspect the most common descriptions of workers
collections.Counter(match.lemma_ for match in matches).most_common(20)


[('american worker', 38),
 ('those worker', 5),
 ('the worker', 5),
 ('average american worker', 4),
 ('the average american worker', 4),
 ('more worker', 3),
 ('nonunion worker', 3),
 ('these worker', 2),
 ('federal worker', 2),
 ('that worker', 2),
 ('young worker', 2),
 ('skilled worker', 1),
 ('the more worker', 1),
 ('average worker', 1),
 ('young american worker', 1),
 ('most american worker', 1),
 ('any worker', 1),
 ('a worker', 1),
 ('social worker', 1),
 ('the social worker', 1)]

In [None]:
# To better understand the context of these mentions, extract keyterms (the most important or “key” terms)

corpus[0]._.extract_keyterms("textrank", normalize="lemma", window_size=10, edge_weighting="count", topn=10)


[('year balanced budget plan', 0.033721812470386026),
 ('Mr. Speaker', 0.032162715590532916),
 ('Mr. Gingrich', 0.031358819981176664),
 ('american people', 0.02612752273629427),
 ('republican leadership', 0.025418705021243045),
 ('federal employee', 0.021731159162187104),
 ('Newt Gingrich', 0.01988327361247088),
 ('pay', 0.018930131314143193),
 ('involuntary servitude', 0.015559235022115406),
 ('entire Senate', 0.015032623278646105)]

In [None]:
# Now, select the subset of speeches in which “worker(s)” were mentioned
docs_mentioning_workers = corpus.get(lambda doc: any(doc._.extract_regex_matches("workers?")))

# extract the keyterms from each and aggregaate
kt_weights = collections.Counter()

for doc in docs_mentioning_workers:
  keyterms = doc._.extract_keyterms(
      "textrank", normalize="lemma",
      window_size=10,
      edge_weighting="count",
      topn=10
  )
  kt_weights.update(dict(keyterms))

# rank the results
kt_weights.most_common(20)

# we can see from the list that 'workers' are brought up in discussion of jobs, the minimum wage, and trust funds. Makes sense!

[('minimum wage today', 0.15268345523692883),
 ('Mr. Speaker', 0.12629658074837496),
 ('real wage', 0.11170539679079608),
 ('minimum wage', 0.10559792485488079),
 ('wage job', 0.10102361828065554),
 ('american worker', 0.09808577723575501),
 ('family friendly company', 0.07527248179516885),
 ('american people', 0.07230595280822841),
 ('family work strategy', 0.07139211174164181),
 ('new job', 0.07009455277537283),
 ('tax dollar', 0.06415552977734736),
 ('violent crime trust fund', 0.0606067871587139),
 ('crime bill trust fund', 0.060534358199475835),
 ('crime law trust fund', 0.05916903052361145),
 ('time job', 0.05699067136562007),
 ('russian poultry market', 0.05179865219250223),
 ('temporary job', 0.051032675437375746),
 ('low wage job', 0.05086241618977966),
 ('health care reform bill', 0.05047788075809563),
 ('Kennedy health insurance reform bill', 0.05024956756215013)]