In [1]:
!pip install pandas --user
!pip install glob --user
!pip install toolz --user

Collecting glob
[31m  Could not find a version that satisfies the requirement glob (from versions: )[0m
[31mNo matching distribution found for glob[0m


In [2]:
import os
import glob
from collections import defaultdict
import pandas as pd

### Find all repositories

In [3]:
# Every entry matched directly under ../data is collected as one repository path.
repo_glob = '../data/*'
data_repos = glob.glob(repo_glob)
print(data_repos)

['../data/ParlAI', '../data/keras', '../data/tensor2tensor', '../data/fastText', '../data/Horizon', '../data/pytext', '../data/Detectron']


### Collect the Python (`.py`) file paths of each repo in a dictionary

In [4]:
# Map each repository path to the list of .py files found beneath it.
repo_files_dict = defaultdict(list)
for repo in data_repos:
    # '**' together with recursive=True also matches files in nested sub-directories.
    py_pattern = repo + '/**/*.py'
    repo_files_dict[repo] = glob.glob(py_pattern, recursive=True)
print(len(repo_files_dict))

7


### Create a pandas DataFrame holding each repo's name and the concatenated contents of its Python files

In [5]:
# Build one row per repository: its path and a single string that holds every
# Python file of that repository, each file preceded by a line with its filename.
repo_names = []
repo_codes = []
for repo, filenames in repo_files_dict.items():
    # Collect the pieces in a list and join once, instead of repeatedly
    # extending an ever-growing string.
    parts = []
    for filename in filenames:
        parts.append(filename + '\n')
        # The context manager closes the handle as soon as the file has been read.
        # The encoding is explicit so the result does not depend on the locale default.
        with open(filename, 'r', encoding='utf-8') as source_file:
            parts.append(source_file.read())
    repo_names.append(repo)
    repo_codes.append(''.join(parts))

# Naming the columns explicitly keeps 'repo_name' and 'code' present even when
# no repository was found.
repos_df = pd.DataFrame({'repo_name': repo_names, 'code': repo_codes})


## Create vocabulary for Encoder
### Reference: Sennrich et al., "Neural Machine Translation of Rare Words with Subword Units" (https://arxiv.org/pdf/1508.07909.pdf)

In [6]:
from encoder import *
import pickle

In [22]:
# Join the per-repository code strings into one corpus. Repositories are
# separated by the marker " <\eol>" followed by a newline. The backslash is
# escaped explicitly because "\e" is not a valid escape sequence; the resulting
# string is unchanged.
corpus = ' <\\eol>\n'.join(repos_df['code'])
# Preview the first 1000 characters only; the full corpus is far too large to display.
corpus[:1000]

'../data/ParlAI/setup.py\n#!/usr/bin/env python3\n\n# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n# This source code is licensed under the BSD-style license found in the\n# LICENSE file in the root directory of this source tree. An additional grant\n# of patent rights can be found in the PATENTS file in the same directory.\n\n\nfrom setuptools import setup, find_packages\nimport sys\n\nif sys.version_info < (3,):\n    sys.exit(\'Sorry, Python 3 is required for ParlAI.\')\n\nwith open(\'README.md\', encoding="utf8") as f:\n    readme = f.read()\n\nwith open(\'LICENSE\') as f:\n    license = f.read()\n\nwith open(\'requirements.txt\') as f:\n    reqs = f.read()\n\nsetup(\n    name=\'parlai\',\n    version=\'0.1.0\',\n    description=\'Unified API for accessing dialog datasets.\',\n    long_description=readme,\n    url=\'http://parl.ai/\',\n    license=license,\n    packages=find_packages(exclude=(\n        \'data\', \'docs\', \'downloads\', \'examples\', \'logs\',

In [27]:
%%bash
# Start from an empty output directory: remove any previous one, then recreate it.
out_dir=data/lang_gen
rm -rf "$out_dir"
mkdir -p "$out_dir/"

In [31]:
# Write the corpus to disk. The context manager flushes and closes the file, so
# the complete text is on disk before any later cell inspects it.
with open('data/lang_gen/raw.txt', 'w', encoding='utf-8') as ofp:
    chars_written = ofp.write(corpus)
# Display the number of characters written, as the bare write() call did.
chars_written

12039974

In [32]:
# Sanity check: count the lines of every .txt file in data/lang_gen.
!wc -l data/lang_gen/*.txt

325501 data/lang_gen/raw.txt


## Encoding

In [13]:
# NOTE(review): the first positional argument (40000) is presumably the target
# vocabulary size, and pct_bpe / ngram_min / ngram_max tune the sub-word part;
# confirm against the Encoder definition in the local `encoder` module.
encoder = Encoder(40000, pct_bpe=0.88, ngram_min=1, ngram_max=5)
# Fit on the corpus split at newlines.
corpus_lines = corpus.split('\n')
encoder.fit(corpus_lines)

In [14]:
# A small code snippet used to exercise the fitted encoder. Adjacent string
# literals are concatenated into one string.
example = (
    "def trim_vocab(n, vocab):\n"
    "# type: (int, Dict[str, int]) -> None\n"
    "''' Deletes all pairs below 10 * vocab size to prevent memory problems '''\n"
    "pair_counts = sorted(vocab.items(), key=lambda p: -p[1])\n"
    "pairs_to_trim = [pair for pair, count in pair_counts[n:]]\n"
    "for pair in pairs_to_trim:\n"
    "del vocab[pair]"
)

# 1) Tokens produced for the snippet.
example_tokens = encoder.tokenize(example)
print(example_tokens)

# 2) First item yielded by transform() for the snippet.
example_ids = next(encoder.transform([example]))
print(example_ids)

# 3) Round trip: feed transform()'s output straight into inverse_transform().
round_trip = next(encoder.inverse_transform(encoder.transform([example])))
print(round_trip)

['def', '__sow', 'tri', 'm_v', 'ocab', '__eow', '(', 'n', ',', 'vocab', '):', '#', 'type', ':', '(', 'int', ',', 'dict', '[', 'str', ',', 'int', '])', '->', 'none', "'''", '__sow', 'delet', 'es', '__eow', 'all', 'pairs', 'below', '10', '*', 'vocab', 'size', 'to', 'prevent', 'memory', '__sow', 'probl', 'ems', '__eow', "'''", '__sow', 'pair_', 'count', 's', '__eow', '=', 'sorted', '(', 'vocab', '.', 'items', '(),', 'key', '=', 'lambda', 'p', ':', '-', 'p', '[', '1', '])', '__sow', 'pairs', '_to_t', 'rim', '__eow', '=', '[', 'pair', 'for', 'pair', ',', 'count', 'in', '__sow', 'pair_', 'count', 's', '__eow', '[', 'n', '__sow', ':]]', '__eow', 'for', 'pair', 'in', '__sow', 'pairs', '_to_t', 'rim', '__eow', ':', 'del', 'vocab', '[', 'pair', ']']
[21, 4800, 5056, 15476, 6526, 4801, 3, 96, 4, 458, 16, 10, 74, 8, 3, 104, 4, 108, 14, 109, 4, 104, 124, 237, 35, 405, 4800, 10464, 4833, 4801, 71, 1652, 861, 354, 80, 458, 136, 30, 2815, 557, 4800, 27696, 10064, 4801, 405, 4800, 18518, 724, 91, 4801,

In [23]:
# Show the encoder's BPE vocabulary, then persist it.
bpe_vocab = encoder.bpe_vocab
print(bpe_vocab)
# NOTE(review): the printed value looks like a plain dict, which has no .save();
# confirm that bpe_vocab really provides save(), or whether encoder.save(...) was intended.
bpe_vocab.save('data/vocab.txt')

{'__sow': 4800, '__eow': 4801, 'e': 4802, 't': 4803, '_': 4804, 'a': 4805, 's': 4806, 'r': 4807, 'i': 4808, 'n': 4809, 'o': 4810, 'l': 4811, 'd': 4812, 'c': 4813, 'p': 4814, 'm': 4815, 'u': 4816, 'g': 4817, 'b': 4818, 'f': 4819, 'h': 4820, 'v': 4821, 'k': 4822, 'w': 4823, 'x': 4824, 'y': 4825, 't_': 4826, 'in': 4827, 're': 4828, 'at': 4829, 'te': 4830, 'er': 4831, 'on': 4832, 'es': 4833, 'or': 4834, 'en': 4835, 'e_': 4836, 'ti': 4837, 'st': 4838, 'et': 4839, 'al': 4840, 'co': 4841, 'd_': 4842, 'ed': 4843, 'nt': 4844, 'le': 4845, 'ta': 4846, 'de': 4847, 'se': 4848, 'q': 4849, '_t': 4850, 'ar': 4851, 'ge': 4852, '_s': 4853, 'ra': 4854, 'me': 4855, 'ct': 4856, '-': 4857, 's_': 4858, 'n_': 4859, "'": 4860, 'io': 4861, 'ac': 4862, '0': 4863, '2': 4864, 'ng': 4865, 'ne': 4866, 'to': 4867, '1': 4868, 'ion': 4869, 'an': 4870, 'di': 4871, 'as': 4872, 'z': 4873, 'tr': 4874, 'lo': 4875, '_p': 4876, 'ea': 4877, '_i': 4878, 'ma': 4879, 'tio': 4880, 'ro': 4881, 'tion': 4882, '_f': 4883, '_a': 4884, 