In [1]:
import os
import glob
from collections import defaultdict
import pandas as pd

### Find all repositories

In [2]:
# Every entry directly under ../data is treated as one repository checkout.
DATA_GLOB = '../data/*'
data_repos = glob.glob(DATA_GLOB)
print(data_repos)

['../data/Horizon', '../data/TensorFlow-Examples', '../data/text_classification', '../data/Detectron', '../data/neural-style']


### Save all filenames of each repo in a dictionary

In [3]:
# Map each repository path to the list of .py files found anywhere beneath it
# ('**' with recursive=True descends into sub-directories).
repo_files_dict = defaultdict(list)
repo_files_dict.update(
    (repo_dir, glob.glob(repo_dir + '/**/*.py', recursive=True))
    for repo_dir in data_repos
)
print(len(repo_files_dict))

5


### Create a pandas DataFrame storing each repo's name and the contents of its Python files

In [4]:
# Build one row per repository: the repo path plus all of its Python sources
# concatenated into a single string. Each file contributes its own path on one
# line, immediately followed by the file's raw contents.
#
# The columns are declared up front so the resulting frame always has
# 'repo_name' and 'code', even when no repositories were found.
repo_records = {'repo_name': [], 'code': []}
for repo, filenames in repo_files_dict.items():
    chunks = []
    for filename in filenames:
        # `with` closes each file right after reading; the previous bare
        # open() left one handle open per source file.
        with open(filename, 'r') as source_file:
            chunks.append(filename + '\n' + source_file.read())
    repo_records['repo_name'].append(repo)
    # Join once instead of growing a string with += inside the loop.
    repo_records['code'].append(''.join(chunks))

repos_df = pd.DataFrame.from_dict(repo_records)


## Create vocabulary for Encoder
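### Reference: Sennrich et al., "Neural Machine Translation of Rare Words with Subword Units" (byte-pair encoding) — https://arxiv.org/pdf/1508.07909.pdf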

In [5]:
from encoder import *
import pickle

In [6]:
# Join the per-repo code strings into one corpus, inserting the literal marker
# " <\eol>" plus a newline between consecutive repos. The backslash is written
# as an explicit '\\' because '\e' is not a valid escape sequence and newer
# Python versions warn about it; the resulting string is unchanged.
corpus = ' <\\eol>\n'.join(repos_df['code'])
# Preview the first 1000 characters as a sanity check.
corpus[:1000]

'../data/Horizon/setup.py\n#!/usr/bin/env python3\n# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.\n\nfrom setuptools import find_packages, setup\n\n\ndef readme():\n    with open("README.md") as f:\n        return f.read()\n\n\ndef requirements():\n    with open("requirements.txt") as f:\n        return f.read()\n\n\nsetup(\n    name="horizon",\n    version="0.1",\n    author="Facebook",\n    description=("Facebook RL"),\n    long_description=readme(),\n    url="https://github.com/facebookresearch/Horizon",\n    license="BSD",\n    packages=find_packages(),\n    install_requires=[],\n    dependency_links=[],\n    test_suite="ml.rl.test",\n)\n../data/Horizon/__init__.py\n../data/Horizon/ml/__init__.py\n../data/Horizon/ml/rl/__init__.py\n#!/usr/bin/env python3\n# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.\n../data/Horizon/ml/rl/tensorboardX.py\n#!/usr/bin/env python3\n# Copyright (c) Facebook, Inc. and its affiliates. All rights reserve

In [7]:
# Fit the encoder on the corpus, one training example per source line.
# NOTE(review): 40000 is presumably the total vocabulary size and pct_bpe the
# fraction of it reserved for BPE tokens -- confirm against the Encoder class
# pulled in by `from encoder import *`.
encoder = Encoder(40000, pct_bpe=0.88, ngram_min=1, ngram_max=5)
corpus_lines = corpus.split('\n')
encoder.fit(corpus_lines)

In [8]:
# Round-trip a small code snippet through the fitted encoder as a smoke test:
# show its tokens, its integer ids, and the text recovered from those ids.
example = "def trim_vocab(n, vocab):\n# type: (int, Dict[str, int]) -> None\n''' Deletes all pairs below 10 * vocab size to prevent memory problems '''\npair_counts = sorted(vocab.items(), key=lambda p: -p[1])\npairs_to_trim = [pair for pair, count in pair_counts[n:]]\nfor pair in pairs_to_trim:\ndel vocab[pair]"

example_tokens = encoder.tokenize(example)
print(example_tokens)

# transform() is consumed with next(), so take the first (only) encoded item.
example_ids = next(encoder.transform([example]))
print(example_ids)

# Encode again and feed the result straight into inverse_transform.
example_decoded = next(encoder.inverse_transform(encoder.transform([example])))
print(example_decoded)

['def', '__sow', 'tri', 'm_v', 'ocab', '__eow', '(', 'n', ',', 'vocab', '):', '#', 'type', ':', '(', 'int', ',', 'dict', '[', 'str', ',', 'int', '])', '->', 'none', "'''", '__sow', 'delet', 'es', '__eow', 'all', '__sow', 'pairs', '__eow', 'below', '10', '*', 'vocab', 'size', 'to', 'prevent', 'memory', '__sow', 'probl', 'ems', '__eow', "'''", '__sow', 'pair', '_coun', 'ts', '__eow', '=', 'sorted', '(', 'vocab', '.', 'items', '(),', 'key', '=', 'lambda', 'p', ':', '-', 'p', '[', '1', '])', '__sow', 'pairs', '_to_t', 'rim', '__eow', '=', '[', 'pair', 'for', 'pair', ',', 'count', 'in', '__sow', 'pair', '_coun', 'ts', '__eow', '[', 'n', '__sow', ':]]', '__eow', 'for', 'pair', 'in', '__sow', 'pairs', '_to_t', 'rim', '__eow', ':', 'del', 'vocab', '[', 'pair', ']']
[28, 4801, 5162, 15714, 9626, 4800, 5, 190, 3, 673, 20, 7, 135, 9, 5, 115, 3, 454, 12, 164, 3, 115, 73, 270, 45, 794, 4801, 37959, 4838, 4800, 114, 4801, 16681, 4800, 763, 222, 43, 673, 161, 27, 2988, 758, 4801, 36755, 15416, 4800, 

In [9]:
# Report how many BPE entries the fitted encoder holds, then persist it.
print(len(encoder.bpe_vocab))
# Ensure the output directory exists so the save also works on a fresh
# checkout (assumes Encoder.save does not create missing directories itself
# -- TODO confirm). exist_ok=True makes this a no-op when it already exists.
os.makedirs('../model', exist_ok=True)
encoder.save('../model/bpe.json')

35200
