In [2]:
import pandas as pd

# Pull the id and text of every document whose is_syllabus score exceeds 0.95.
# document_text and is_syllabus are joined on document_id.
SYLLABUS_QUERY = '''
    SELECT d.document_id AS document_id, d.text AS text
    FROM document_text d
    JOIN is_syllabus i ON d.document_id = i.document_id
    WHERE i.score > 0.95
'''

# Use the "postgresql" scheme: SQLAlchemy 1.4+ no longer recognises the legacy
# "postgres" alias (it raises NoSuchModuleError), while "postgresql" is
# accepted by old and new versions alike. Same local "osp" database as before.
df = pd.read_sql_query(SYLLABUS_QUERY, 'postgresql:///osp')

In [3]:
# Report how many rows (one per selected document) the query returned.
print(f"The corpus has {len(df)} syllabi")

The corpus has 28117 syllabi


In [4]:
import textacy
# Materialise the text column as a plain list, then hand it to textacy.
# 'en' is the first positional argument to Corpus — presumably the language
# (English); confirm against the installed textacy version.
syllabus_texts = df['text'].tolist()
corpus = textacy.corpus.Corpus('en', texts=syllabus_texts)

In [5]:
# Generator expression: each document's term list is produced lazily, only
# when the vectorizer below consumes it.
terms_list = (
    doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
    for doc in corpus
)

# Vectorizer settings gathered in one place so they are easy to scan and tune.
vectorizer_options = dict(
    weighting='tfidf',
    normalize=True,
    smooth_idf=True,
    min_df=3,
    max_df=0.95,
    max_n_terms=200000,
)

# Returns the weighted document-term matrix together with the id -> term
# mapping used later to label topics.
doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
    terms_list, **vectorizer_options
)



In [6]:
# Number of topics to extract from the document-term matrix.
N_TOPICS = 30

# Build an 'nmf' topic model and fit it on the matrix from the previous cell.
model = textacy.tm.TopicModel('nmf', n_topics=N_TOPICS)
model.fit(doc_term_matrix)

In [8]:
# For every topic, print its index, its top_n=10 terms joined by commas, and a
# dashed separator followed by a blank line.
for topic_no, terms in model.top_topic_terms(id2term, top_n=10):
    print(topic_no, ', '.join(terms), '-----\n', sep='\n')

0
dallas, university, texas, rule, grievance, utd, regulation, ’s, resolve, dean
-----

1
point, discussion, week, group, social, university, program, project, blackboard, reading
-----

2
equation, linear, solve, quadratic, rational, graph, function, test, lab, polynomial
-----

3
t, s, r, ection, ion, e, o, er, las, g
-----

4
hcc, counselor, test, withdrawal, college, 713, houston, tuition, ada, and/or
-----

5
lcc, 721, laredo, revise, 956, ion, south, emergency, college, 794
-----

6
lab, laboratory, biology, lecture, report, lin, practical, anatomy, physiology, exercise
-----

7
, module, blackboard, test, o, ~, project, hw, tsi, ecollege
-----

8
speech, persuasive, informative, speeches, point, communication, speak, presentation, audience, speaking
-----

9
210, alamo, 485, colleges, concourse, 486, college, |, curricular, antonio
-----

10
ch, 6, 12, 9, homework, 11, 10, 8, 7, 14
-----

11
â, â, €, gbc, p., webcampus, ada, sexual, uta, â¢â
-----

12
essay, english, draft, 