# Topic Analysis of Python's PEPs

You can get the data on [https://github.com/python/peps](https://github.com/python/peps)

In [1]:
import os
import string
import gensim
import nltk

In [2]:
folder = "peps"
all_files = os.listdir(folder)

In [3]:
all_files = [f for f in all_files if ".txt" in f or ".rst" in f]

In [4]:
all_texts = []
for filename in all_files:
    with open(os.path.join(folder, filename)) as f:
        data = f.readlines()
        data = " ".join(data)
        data = data.replace("\n", " ")
        data = data.translate(str.maketrans('', '', string.punctuation))
        data = data.lower()
    all_texts.append(data.split(" "))

In [5]:
len(all_texts)

499

In [6]:
dic = gensim.corpora.Dictionary(all_texts)
doc_term = [dic.doc2bow(text) for text in all_texts]

In [7]:
model_vw = gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit('/usr/bin/vw', doc_term, num_topics=3, id2word=dic, passes=50)

In [8]:
model_vw.show_topics(formatted=False)

[[(0.3775682, ''),
  (0.035730477, 'the'),
  (0.017290713, 'to'),
  (0.01528061, 'a'),
  (0.012601474, 'of'),
  (0.011599175, 'is'),
  (0.010923134, 'and'),
  (0.009913662, 'in'),
  (0.008121213, 'for'),
  (0.0073990175, 'be')],
 [(0.063974544, 'git'),
  (0.03795259, 'branch'),
  (0.022772124, 'commit'),
  (0.019131832, 'branches'),
  (0.018818825, 'repository'),
  (0.017196584, 'merge'),
  (0.015387925, 'pull'),
  (0.014990298, 'mercurial'),
  (0.012090781, 'repositories'),
  (0.011485127, 'commits')],
 [(0.0063857413, 'translations'),
  (0.0034663824, 'switcher'),
  (0.0025820502, 'peaches'),
  (0.0023689077, 'pears'),
  (0.0021920898, 'pi'),
  (0.0017569656, 'docsbuildscripts'),
  (0.0017283072, 'tau'),
  (0.0017215981, 'coordinator'),
  (0.0015099157, 'japanese'),
  (0.0015050643, 'translators')]]

In [9]:
all_texts = []
for filename in all_files:
    with open(os.path.join(folder, filename)) as f:
        data = f.readlines()
        data = " ".join(data)
        data = data.replace("\n", " ")
        data = data.translate(str.maketrans('', '', string.punctuation))
        data = data.lower()
        data = [word for word, tag in nltk.pos_tag(nltk.tokenize.word_tokenize(data)) if tag[:2] in {"JJ", "VB", "NN"} and len(word) > 2]
    all_texts.append(data)

In [10]:
all_words = []
for text in all_texts:
    all_words += text

In [11]:
import collections

In [12]:
len(all_texts)

499

In [13]:
word_count = collections.Counter(all_words)

In [14]:
words_to_keep = set([word for word, count in word_count.items() if count > 10 and count < len(all_texts)/2])

In [15]:
all_texts_filtered = [[word for word in text if word in words_to_keep] for text in all_texts]

In [16]:
dic_filtered = gensim.corpora.Dictionary(all_texts_filtered)
doc_term_filtered = [dic_filtered.doc2bow(text) for text in all_texts_filtered]

In [17]:
model_vw_2 = gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit('/usr/bin/vw', doc_term_filtered, num_topics=3, id2word=dic_filtered, passes=50)

In [18]:
model_vw_2.show_topics(formatted=False)

[[(0.0021211521, 'council'),
  (0.0020209826, 'typing'),
  (0.001967179, 'vote'),
  (0.0019637041, 'email'),
  (0.0017696245, 'imports'),
  (0.0017605977, 'docstring'),
  (0.0017311072, 'metaclass'),
  (0.0017160302, 'dependencies'),
  (0.0017121191, 'tool'),
  (0.0016449234, 'branches')],
 [(0.0042784135, 'http'),
  (0.00423048, 'asynchronous'),
  (0.0041640424, 'coroutine'),
  (0.003738793, 'servers'),
  (0.0036994428, 'snapshot'),
  (0.0036574178, 'stream'),
  (0.003479043, 'await'),
  (0.003413014, 'role'),
  (0.0034056753, 'tls'),
  (0.003208759, 'middleware')],
 [(0.003375465, 'float'),
  (0.0033292538, 'expr'),
  (0.0030802137, 'decimal'),
  (0.0029939243, 'numeric'),
  (0.0028386556, 'pointer'),
  (0.0027712095, 'comparison'),
  (0.0025899224, 'range'),
  (0.0024018735, 'iteration'),
  (0.0023699212, 'lambda'),
  (0.0023177918, 'integers')]]