In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import csv

import itertools as itls

In [3]:
import cytoolz as tlz
from cytoolz import curried as tlzc

from text2math import raw2text as r2t
from text2math import text2tokens as t2t
from text2math import tokens2numbers as t2n
import text2math as txt2m

from gensim import corpora

In [4]:
def head(l):
    """Return the first four items of the iterable ``l`` as a list."""
    return list(tlzc.take(4, l))

---

# Loading Data

In [5]:
def read_student_title():
    """Read ``../student_title.csv`` and return its rows without the first one.

    Every row is parsed into a list while the file is still open. The value
    returned is whatever ``tlz.drop(1, rows)`` produces over that list, i.e.
    the rows after the first.
    """
    with open("../student_title.csv") as csv_file:
        rows = list(csv.reader(csv_file))
    # Skip the first row (presumably the CSV header -- confirm against the file).
    return tlz.drop(1, rows)

In [6]:
# Keep only the second field of every row returned by read_student_title(),
# materialised as a list so it can be measured and re-used by later cells.
STUDENT_TITLES = list(tlz.map(tlz.second, read_student_title()))

In [7]:
head(STUDENT_TITLES)

['No Contest: Participatory Technologies and the Transformation of Urban Authority',
 'Machine Learning',
 'Statistics for Non-Statisticians',
 'Pentland| Alex| Social Physics: How Good Ideas Spread-the Lessons from a New Science| New York| NY: The Penguin Press| 2014. vii + 320 Pages. $27.95 (hardback)']

In [8]:
len(STUDENT_TITLES)

65023

---

# Initial Cleaning and Prepping

In [9]:
def uni_and_bigram_cleanup(txts):
    """Turn raw texts into tuples of unigram tokens followed by bigram tokens.

    Each text is first passed through ``r2t.decode_and_fix``; the result is
    then tokenised with ``t2t.unigram`` and ``t2t.bigram`` and the two token
    streams are concatenated (unigrams first) into a single tuple.

    Returns a list with one tuple per input text, in input order.
    """
    def tokenize(raw_text):
        title = r2t.decode_and_fix(raw_text)
        return tuple(itls.chain(t2t.unigram(title), t2t.bigram(title)))

    return [tokenize(raw_text) for raw_text in txts]

In [10]:
%time CLEAN_TOKENS_SETS = uni_and_bigram_cleanup(STUDENT_TITLES)

CPU times: user 43.2 s, sys: 738 ms, total: 43.9 s
Wall time: 50.7 s


In [11]:
tlz.random_sample?

In [12]:
# Peek at a few cleaned token tuples: random_sample keeps each item with
# probability 0.1, and the fixed random_state makes the draw repeatable.
head(tlz.random_sample(0.1, CLEAN_TOKENS_SETS, random_state=2017))

[(u'climate',
  u'change',
  u'impacts',
  u'adaptation',
  u'cities',
  u'review',
  u'literature',
  u'climate_change',
  u'change_impacts',
  u'impacts_adaptation',
  u'adaptation_cities',
  u'cities_review',
  u'review_literature'),
 (u'statistics',
  u'non',
  u'statisticians',
  u'statistics_non',
  u'non_statisticians'),
 (u'unveiling',
  u'intricacies',
  u'bullying',
  u'students',
  u'perspectives',
  u'polytechnic',
  u'singapore',
  u'unveiling_intricacies',
  u'intricacies_bullying',
  u'bullying_students',
  u'students_perspectives',
  u'perspectives_polytechnic',
  u'polytechnic_singapore'),
 (u'emerging',
  u'research',
  u'indigenous',
  u'management',
  u'asia',
  u'emerging_research',
  u'research_indigenous',
  u'indigenous_management',
  u'management_asia')]

---

# Creating Dictionary and Bag of Words Corpus

In [13]:
def make_dict_and_corpus(tokensets, no_below=5, no_above=0.5, keep_n=100000):
    """Build a gensim dictionary and a bag-of-words corpus from token sequences.

    Parameters
    ----------
    tokensets : iterable of token sequences
        One sequence of tokens per document.  It is traversed twice (once to
        build the dictionary, once to build the corpus), so it must be
        re-iterable (e.g. a list), not a one-shot generator.
    no_below, no_above, keep_n
        Forwarded unchanged to ``Dictionary.filter_extremes``; the defaults
        are the values this notebook has always used.

    Returns
    -------
    (dictionary, corpus)
        ``dictionary`` is the filtered ``corpora.Dictionary``.  ``corpus`` is
        a list holding ``dictionary.doc2bow(tokens)`` for every entry of
        ``tokensets``, in the same order.
    """
    dictionary = corpora.Dictionary(tokensets)

    dictionary.filter_extremes(no_below=no_below,
                               no_above=no_above,
                               keep_n=keep_n)
    dictionary.compactify()

    # Materialise the corpus.  A lazy map object can only be walked once, so
    # inspecting it (e.g. ``list(corpus)``) would leave nothing for a later
    # ``MmCorpus.serialize`` call to write.  A list can be iterated repeatedly.
    corpus = [dictionary.doc2bow(tokens) for tokens in tokensets]

    return (dictionary, corpus)

In [14]:
%time DICTIONARY, BOW_CORPUS = make_dict_and_corpus(CLEAN_TOKENS_SETS)

CPU times: user 3.45 s, sys: 193 ms, total: 3.65 s
Wall time: 4.91 s


---

In [15]:
# Materialise the bag-of-words corpus as a list so it can be indexed and
# sliced for inspection in the next cell.
c = list(BOW_CORPUS)

In [18]:
c[2:4]

[[(2809, 1), (6748, 1), (7162, 1), (8810, 1), (14006, 1)],
 [(672, 1),
  (705, 2),
  (2706, 1),
  (6115, 1),
  (8307, 1),
  (9552, 1),
  (10752, 1),
  (11449, 1),
  (12729, 1),
  (13322, 1),
  (13521, 1),
  (16074, 1),
  (17116, 1),
  (17665, 1),
  (19280, 1),
  (20919, 1)]]

---

# Saving Dictionary and Bag of Words Corpus

In [86]:
%time DICTIONARY.save("student_titles_dict.dict")

CPU times: user 57.1 ms, sys: 6.16 ms, total: 63.3 ms
Wall time: 67.2 ms


In [87]:
# NOTE(review): serialize has to iterate BOW_CORPUS from the start. If
# BOW_CORPUS is a one-shot iterator that an earlier cell already consumed,
# there would be nothing left to write -- confirm on a fresh Run All.
%time corpora.MmCorpus.serialize("student_titles_bow_corpus.mm", BOW_CORPUS)

CPU times: user 4.66 s, sys: 249 ms, total: 4.91 s
Wall time: 4.99 s
