In [16]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

# EN = spacy.load('en_core_web_sm')
import en_core_web_sm
import pandas as pd
from sklearn.model_selection import train_test_split

from general_utils import apply_parallel, flattenlist
# Load the model through its installed package; this replaces the
# commented-out spacy.load('en_core_web_sm') call above.
EN = en_core_web_sm.load()

from ktext.preprocess import processor
import pandas as pd

In [5]:
from general_utils import get_step2_prerequisite_files, read_training_files

In [10]:
# Load the full function-level dataset from a local pickle.
# NOTE: unpickling can execute arbitrary code, so only load a df.pkl you trust.
df = pd.read_pickle('df.pkl')

In [12]:
def listlen(x):
    """Return the number of items in ``x`` when it is a list, otherwise 0.

    Anything that is not a ``list`` instance (for example the NaN left behind
    by ``str.split()`` on a missing value) counts as length 0.
    """
    return len(x) if isinstance(x, list) else 0

# Separate functions with a usable docstring from those without one.
# A docstring is considered valid only if it has at least 3 whitespace-separated tokens.

# Count the docstring tokens once and reuse the result: the split/apply pass
# over the whole frame is expensive, and deriving both partitions from the
# same mask guarantees they are exact complements of each other.
# listlen always returns an int (0 for missing docstrings), so `~(count >= 3)`
# selects exactly the rows where `count < 3`.
docstring_token_counts = df.docstring_tokens.str.split().apply(listlen)
has_valid_docstring = docstring_token_counts >= 3

with_docstrings = df[has_valid_docstring]
without_docstrings = df[~has_valid_docstring]

## Partition code by repository to minimize leakage between train, valid & test sets. 
This relies on the rough assumption that each repository has its own style. We want to avoid having code from the same repository in the training set as well as in the validation or holdout set.

In [14]:
# Group the documented functions by the 'nwo' column so that whole groups can
# be assigned to a split. Presumably 'nwo' is the repository's owner/name
# (e.g. 'fnl/libfnl' in the preview output) -- confirm against the data source.
grouped = with_docstrings.groupby('nwo')

In [17]:
# Train / test split performed on groups, not rows: list(grouped) yields one
# (group key, DataFrame) pair per 'nwo' value, so all rows sharing a key land
# in the same partition. train_size therefore applies to the number of groups.
# The validation split below is currently disabled.
train, test = train_test_split(list(grouped), train_size=0.87, shuffle=True, random_state=8081)
# train, valid = train_test_split(train, train_size=0.82, random_state=8081)



In [18]:
# Collapse each partition's list of (group key, frame) pairs back into one
# DataFrame with a fresh 0..n-1 index.
train = pd.concat([group_frame for _key, group_frame in train], ignore_index=True)
# valid = pd.concat([d for _, d in valid]).reset_index(drop=True)
test = pd.concat([group_frame for _key, group_frame in test], ignore_index=True)

In [19]:
# Report the size of every partition, using thousands separators.
n_train_rows = len(train)
n_test_rows = len(test)
n_without_docstring_rows = len(without_docstrings)

print(f'train set num rows {n_train_rows:,}')
# print(f'valid set num rows {valid.shape[0]:,}')
print(f'test set num rows {n_test_rows:,}')
print(f'without docstring rows {n_without_docstring_rows:,}')

train set num rows 1,222,687
test set num rows 179,249
without docstring rows 4,001,960


Preview what the training set looks like. You can start to see how the data looks: the function tokens and docstring tokens are what will be fed downstream into the models, while the other information is important for diagnostics and bookkeeping.

In [20]:
# Quick check: there should be one api_sequence entry per training row.
train['api_sequence'].shape

(1222687,)

In [64]:
# NOTE(review): this previews the full, unsplit `df` (rows without docstrings
# included), not `train` -- confirm which of the two was intended here.
df.head()

Unnamed: 0,nwo,path,function_name,lineno,original_function,function_tokens,docstring_tokens,api_sequence,tokenized_function_name
0,fnl/libfnl,src/fnl/nlp/dictionary.py,__init__,19,"def __init__(self, *leafs, **edges):\n self...",def __init__ self leafs edges self edges edges...,,self edges edges self leafs sorted leafs,init
1,fnl/libfnl,src/fnl/nlp/dictionary.py,__eq__,23,"def __eq__(self, other):\n if isinstance(ot...",def __eq__ self other if isinstance other Node...,,if isinstance other node return id self id oth...,eq
2,fnl/libfnl,src/fnl/nlp/dictionary.py,__repr__,29,def __repr__(self):\n return 'Node<leafs={}...,def __repr__ self return Node leafs edges form...,,"return node<leafs={}, edges={}> format self le...",repr
3,fnl/libfnl,src/fnl/nlp/dictionary.py,create_or_get,32,"def createOrGet(self, token):\n """"""\n\t\tCr...",def createOrGet self token if token in self ed...,create or get the node pointed to by ` token `...,if token self edges node self edges token else...,create or get
4,fnl/libfnl,src/fnl/nlp/dictionary.py,set_leaf,47,"def setLeaf(self, key, order):\n """"""\n\t\tS...",def setLeaf self key order self leafs append o...,store the ` key ` as a leaf of this node at po...,self leafs append order key self leafs sorted ...,set leaf


In [71]:
from ktext.preprocess import processor
keep_n = 10000

# Settings shared by all four processors; only `append_indicators` differs.
shared_settings = dict(
    heuristic_pct_padding=0.7,
    keep_n=keep_n,
    padding='post',
    truncating='post',
)

# Function tokens: the only processor built without append_indicators.
# todo, probably tokens should also be seq to seq
function_token_processor = processor(**shared_settings)
train_token_v = function_token_processor.fit_transform(train['function_tokens'])

# Docstrings.
docstring_processor = processor(append_indicators=True, **shared_settings)
train_docstring_v = docstring_processor.fit_transform(train['docstring_tokens'])

# Tokenized method names.
methname_processor = processor(append_indicators=True, **shared_settings)
train_methname_v = methname_processor.fit_transform(train['tokenized_function_name'])

# API call sequences.
api_seq_processor = processor(append_indicators=True, **shared_settings)
train_api_seq_v = api_seq_processor.fit_transform(train['api_sequence'])


 See full histogram by insepecting the `document_length_stats` attribute.
 See full histogram by insepecting the `document_length_stats` attribute.
 See full histogram by insepecting the `document_length_stats` attribute.
 See full histogram by insepecting the `document_length_stats` attribute.


In [72]:
# Shape of the vectorized function tokens: (training rows, sequence length).
print(train_token_v.shape)

(1222687, 55)


In [73]:
# Shape of the vectorized docstrings: (training rows, sequence length).
print(train_docstring_v.shape)

(1222687, 15)


In [74]:
# Shape of the vectorized API sequences: (training rows, sequence length).
print(train_api_seq_v.shape)

(1222687, 45)


In [75]:
# Shape of the vectorized method names: (training rows, sequence length).
print(train_methname_v.shape)

(1222687, 5)


In [76]:
OUTPUT_PATH = Path('./data/vectors/processors/')
import dill as dpickle
import numpy as np

# Create the output directory (and any missing parents) up front; without it
# the open(..., 'wb') calls below raise FileNotFoundError on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Save the fitted preprocessors, one dill file per processor.
fitted_processors = {
    'function_token_processor.dpkl': function_token_processor,
    'docstring_processor.dpkl': docstring_processor,
    'methname_processor.dpkl': methname_processor,
    'api_seq_processor.dpkl': api_seq_processor,
}
for filename, fitted_processor in fitted_processors.items():
    with open(OUTPUT_PATH/filename, 'wb') as f:
        dpickle.dump(fitted_processor, f)

# # Save the processed data
# np.save(OUTPUT_PATH/'py_t_code_vecs_v2.npy', t_code)
# np.save(OUTPUT_PATH/'py_t_comment_vecs_v2.npy', t_comment)

In [77]:
import tables

def save_vecs(vecs, fout):
    """Write the array ``vecs`` to ``fout`` using ``np.save``.

    Args:
        vecs: array data to persist.
        fout: destination handed straight to ``np.save`` (path or file object).
    """
    np.save(file=fout, arr=vecs)

In [78]:
# Persist the training function-token vectors.
save_vecs(train_token_v, './data/vectors/train.tokens.npy')

In [79]:
# Persist the training API-sequence vectors.
save_vecs(train_api_seq_v, './data/vectors/train.apiseq.npy')

In [80]:
# Persist the training method-name vectors.
save_vecs(train_methname_v, './data/vectors/train.methname.npy')

In [81]:
# Persist the training docstring vectors; they are stored under the 'desc' file name.
save_vecs(train_docstring_v, './data/vectors/train.desc.npy')

# Generating Test vectors

In [82]:
# Vectorize the test partition with the processors that were fitted on the
# training data (transform only, no re-fitting). The order mirrors the
# training cell: tokens, docstrings, method names, API sequences.
test_token_v = function_token_processor.transform_parallel(test['function_tokens'])
test_docstring_v = docstring_processor.transform_parallel(test['docstring_tokens'])
test_methname_v = methname_processor.transform_parallel(test['tokenized_function_name'])
test_api_seq_v = api_seq_processor.transform_parallel(test['api_sequence'])



In [83]:
# Persist every test matrix next to its training counterpart.
test_outputs = (
    ('./data/vectors/test.tokens.npy', test_token_v),
    ('./data/vectors/test.apiseq.npy', test_api_seq_v),
    ('./data/vectors/test.methname.npy', test_methname_v),
    ('./data/vectors/test.desc.npy', test_docstring_v),
)
for out_path, test_vectors in test_outputs:
    save_vecs(test_vectors, out_path)

# Generating Vocab

In [84]:
import pickle

# Dump the `token2id` attribute of each fitted processor to its own pickle file.
vocab_outputs = (
    ('./data/vectors/vocab.apiseq.pkl', api_seq_processor),
    ('./data/vectors/vocab.methname.pkl', methname_processor),
    ('./data/vectors/vocab.desc.pkl', docstring_processor),
    ('./data/vectors/vocab.tokens.pkl', function_token_processor),
)
for vocab_path, fitted_processor in vocab_outputs:
    with open(vocab_path, 'wb') as f:
        pickle.dump(fitted_processor.token2id, f)

In [86]:
# Display-only preview of each function's source flattened onto one line.
# regex=True makes Series.replace substitute every newline inside the strings
# rather than only matching cells whose entire value is a newline.
# The result is not assigned, so `train` itself is left unchanged.
train['original_function'].replace('\n', ' ', regex=True)

0          @indexer(IModule) def SearchableTextIndexer(ob...
1          def tree_depth(obj):     """Determine how deep...
2          def item_depth(item):     """Return the survey...
3          def _get_id(self, orig_id):     """Pick an id ...
4          def update(self):     """ Set view attributes ...
5          def checkDepth(self):     """Check if creating...
6          def checkForRisks(self):     """Check if the c...
7          def allowed(self):     """ A module is allowed...
8          def getToken(field, value, default=None):     ...
9          def exportImage(self, parent, image, caption=N...
10         def exportSurvey(self, parent, survey):     ""...
11         def exportProfileQuestion(self, parent, profil...
12         def exportModule(self, parent, module):     ""...
13         def exportRisk(self, parent, risk):     """ :r...
14         def exportSolution(self, parent, solution):   ...
15         def render(self):     """ :returns: an XML exp...
16         def _canCopy(