# Textacy documentation: https://chartbeat-labs.github.io/textacy/

In [None]:
import textacy

In [None]:
# Sample passage (three sentences about statistical NLP) used by the
# single-document examples below. Sentences are joined with one space.
text = ' '.join((
    'Since the so-called "statistical revolution" in the late 1980s and '
    'mid 1990s, much Natural Language Processing research has relied '
    'heavily on machine learning.',
    'Formerly, many language-processing tasks typically involved the '
    'direct hand coding of rules, which is not in general robust to '
    'natural language variation.',
    'The machine-learning paradigm calls instead for using statistical '
    'inference to automatically learn such rules through the analysis '
    'of large corpora of typical real-world examples.',
))

In [None]:
# Keyword-in-context lookup: search `text` for 'language' with a context
# window of width 35 around each match.
textacy.text_utils.KWIC(text, 'language', window_width=35)

In [None]:
# Normalize `text` with lowercasing and punctuation removal switched on;
# the cell echoes the returned value (`text` itself is not reassigned).
textacy.preprocess_text(text, lowercase=True, no_punct=True)

In [None]:
# Descriptive metadata attached to the sample document: its title,
# the URL it came from, and the name of the source.
metadata = dict(
    title='Natural-language processing',
    url='https://en.wikipedia.org/wiki/Natural-language_processing',
    source='wikipedia',
)

In [None]:
# Wrap the sample text and its metadata dict in a textacy Doc, with the
# language given as 'en'.
doc = textacy.Doc(text, metadata=metadata, lang='en')

In [None]:
# Bare expression: the notebook cell echoes the Doc's repr.
doc

In [None]:
# Echo the Doc's `spacy_doc` attribute (presumably the underlying spaCy
# document the textacy Doc wraps — confirm against the textacy API docs).
doc.spacy_doc

In [None]:
# Materialize every 3-gram extracted from `doc`, with stop-word and
# punctuation filtering on and number filtering off, then show the first ten.
ngrams = [
    *textacy.extract.ngrams(
        doc,
        3,
        filter_stops=True,
        filter_punct=True,
        filter_nums=False,
    )
]
print(ngrams[:10])

In [None]:
# Import the keyterms submodule explicitly before using it (presumably it is
# not loaded by the bare `import textacy` — confirm), then ask TextRank for
# 10 key terms from `doc`, with normalize='lemma'.
import textacy.keyterms
textacy.keyterms.textrank(doc, normalize='lemma', n_keyterms=10)

In [None]:
# SGRank key terms over 1- to 4-grams, with normalize='lower'.
# NOTE(review): n_keyterms is a float here (0.1), presumably interpreted as a
# fraction rather than an absolute count — confirm in the textacy docs.
textacy.keyterms.sgrank(doc, ngrams=(1, 2, 3, 4), normalize='lower', n_keyterms=0.1)

In [None]:
# Build a TextStats object for `doc` and echo its `basic_counts` attribute.
ts = textacy.TextStats(doc)
ts.basic_counts

In [None]:
# Bag-of-terms view of `doc`: 1- to 3-grams plus named entities, with
# weighting='count' and terms returned as strings. The result is iterated
# with .items() in the next cell, so it behaves as a mapping.
bot = doc.to_bag_of_terms(
    ngrams=(1, 2, 3),
    named_entities=True,
    weighting='count',
    as_strings=True,
)

In [None]:
# Print each entry of the bag of terms on its own line as "<term>: <value>".
for term, count in bot.items():
    print(f"{term}: {count}")

## Working with Many Texts

In [None]:
# Import only the dataset class this notebook uses, so no unrelated names
# from the module end up in the notebook namespace.
from textacy.datasets.capitol_words import CapitolWords

# Create the dataset wrapper and call download() to fetch its data
# (NOTE(review): presumably skipped when the data already exists — confirm).
cw = CapitolWords()
cw.download()

In [None]:
# Echo the dataset's `info` mapping; its 'data_dir' entry is what the
# raw-file read uses to locate the data on disk.
cw.info

In [None]:
# Locate the gzipped JSON file inside the dataset's data directory, then open
# it in text mode with lines=True (presumably one JSON record per line — confirm).
capitol_words_path = f"{cw.info['data_dir']}/capitol-words-py3.json.gz"
records = textacy.io.read_json(capitol_words_path, mode='rt', lines=True)

In [None]:
# Peek at the first record only: build a Doc from it, print it, then stop.
# If `records` yields nothing, the loop body never runs.
for record in records:
    # Doc metadata is a dict in the single-document example of this notebook,
    # so the record's title is stored under a 'title' key to keep that shape.
    doc = textacy.Doc(record['text'], metadata={'title': record['title']})
    print(doc)
    break

In [None]:
# Select the records for two speakers, then split the record stream on the
# 'text' field into a stream of texts and a stream of the remaining metadata.
# NOTE(review): neither stream is consumed in the cells that follow (the
# corpus is built from cw.texts(...) instead) — confirm this is intentional.
records = cw.records(speaker_name={'Hillary Clinton', 'Barack Obama'})
text_stream, metadata_stream = textacy.io.split_records(records, 'text')

### Make a corpus

In [None]:
# Build a corpus on the 'en' spaCy pipeline from at most 100 texts selected
# with speaker_party='R' and chamber='House'.
corpus = textacy.Corpus(
    textacy.load_spacy('en'),
    texts=cw.texts(
        speaker_party='R',
        chamber='House',
        limit=100,
    ),
)

In [None]:
# Bare expression: the notebook cell echoes the Corpus repr.
corpus

In [None]:
# Slice the corpus: echo the documents at positions 10 through 14.
corpus[10:15]

In [None]:
# Echo corpus-level size counters as a (docs, sentences, tokens) tuple.
corpus.n_docs, corpus.n_sents, corpus.n_tokens

#### Create a document-term matrix

In [None]:
# Configure the vectorizer: linear term frequency, smooth IDF weighting and
# L2 normalization, with document-frequency bounds min_df=2 and max_df=0.95.
vectorizer = textacy.Vectorizer(
    tf_type='linear',
    apply_idf=True,
    idf_type='smooth',
    norm='l2',
    min_df=2,
    max_df=0.95,
)

In [None]:
# Fit the vectorizer and build the document-term matrix in one pass, feeding
# it one terms list (unigrams plus named entities, as strings) per corpus doc.
doc_term_matrix = vectorizer.fit_transform(
    corpus_doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
    for corpus_doc in corpus
)

In [None]:
# Print the matrix's repr explicitly instead of relying on the cell echo.
print(repr(doc_term_matrix))

#### Create a topic model

In [None]:
# Create an 'nmf' topic model with 3 topics and fit it on the doc-term matrix.
model = textacy.TopicModel('nmf', n_topics=3)
model.fit(doc_term_matrix)
# Transform the same matrix with the fitted model and echo the result's shape.
doc_topic_matrix = model.transform(doc_term_matrix)
doc_topic_matrix.shape

In [None]:
# For each topic, print its index followed by its top 10 terms, with the
# terms separated by three spaces.
for topic_number, terms in model.top_topic_terms(vectorizer.id_to_term, top_n=10):
    term_list = '   '.join(terms)
    print('topic', topic_number, ':', term_list)

In [None]:
# Draw a termite plot for the fitted model from the doc-term matrix and the
# vectorizer's id-to-term mapping.
# NOTE(review): presumably needs a plotting backend installed — confirm.
model.termite_plot(doc_term_matrix, vectorizer.id_to_term)