# Setup

In [0]:
import os
from google.colab import drive as gdrive

# @markdown Setup output directory for the models
OUTPUT_DIR = 'Colab/varname/' # @param {type:'string'}

SAVE_ON_GDRIVE = False # @param {type:'boolean'}

# Resolve OUT_PATH, the absolute directory the later cells read from and write to.
if not SAVE_ON_GDRIVE:
  # Local run: keep the outputs under the current working directory.
  OUT_PATH = os.path.abspath(OUTPUT_DIR)
else:
  # Drive run: mount Google Drive at ./gdrive and store the outputs under 'My Drive'.
  GDRIVE_ROOT = os.path.abspath('gdrive')
  GDRIVE_OUT = os.path.join(GDRIVE_ROOT, 'My Drive', OUTPUT_DIR)
  print('[INFO] Mounting Google Drive in {}'.format(GDRIVE_ROOT))
  gdrive.mount(GDRIVE_ROOT, force_remount=True)
  OUT_PATH = GDRIVE_OUT

# Create the directory (and any missing parents); does nothing if it already exists.
os.makedirs(OUT_PATH, exist_ok=True)

In [0]:
# @markdown Machine setup

# Install git and java 11 (OpenJDK JDK).
# stdout is sent to /dev/null to keep the cell output short.
!sudo DEBIAN_FRONTEND=noninteractive apt-get install -qq git openjdk-11-jdk > /dev/null

# Install python 3.7 (with dev and venv packages) and pip,
# register python3.7 as an alternative for /usr/bin/python3, then upgrade pip.
!sudo DEBIAN_FRONTEND=noninteractive apt-get install -qq python3.7 python3.7-dev python3.7-venv python3-pip > /dev/null
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 > /dev/null
!python3 -m pip install -q --upgrade pip > /dev/null

# Install pipenv (i.e. a better python package manager).
!pip3 install pipenv -qq > /dev/null
# Configure pipenv through environment variables.
# NOTE(review): going by the variable names these request quiet output, a
# virtualenv inside the project directory and no lockfile generation;
# confirm against the pipenv documentation.
%env PIPENV_QUIET 1
%env PIPENV_VENV_IN_PROJECT 1
%env PIPENV_SKIP_LOCK 1

# Wipe whatever the commands above printed in this cell.
# NOTE(review): this also hides any error message they displayed.
from IPython.display import clear_output
clear_output()

In [0]:
# @markdown Download code

# Clone the master branch of the project into ./code and cd into it.
# %cd changes the kernel's working directory, so the following cells
# run from inside the clone.
!git clone --branch master https://github.com/simonepri/varname-seq2seq code
%cd -q code

# Install the project's dependencies with pipenv (stdout is discarded)
!pipenv install > /dev/null

# Dataset sources

In [4]:
# @markdown Define the git repos (one URL per line) from which to generate the dataset
%%writefile datasets.txt
https://github.com/Bukkit/Bukkit
https://github.com/clojure/clojure
https://github.com/apache/dubbo
https://github.com/google/error-prone
https://github.com/grails/grails-core
https://github.com/google/guice
https://github.com/hibernate/hibernate-orm
https://github.com/jhy/jsoup
https://github.com/junit-team/junit4
https://github.com/apache/kafka
https://github.com/libgdx/libgdx
https://github.com/dropwizard/metrics
https://github.com/square/okhttp
https://github.com/spring-projects/spring-framework
https://github.com/apache/tomcat
https://github.com/apache/cassandra

Writing datasets.txt


In [0]:
# @markdown Download the repos
import os

# Absolute path of the repo list: the shell command below changes directory
# before reading it, so a relative path would no longer resolve.
DATASET_PATH = os.path.realpath('datasets.txt')

# The paths are quoted because OUT_PATH contains a space ('My Drive') when
# SAVE_ON_GDRIVE is enabled; unquoted, the shell would split it into two words.
!mkdir -p "$OUT_PATH/corpora"
# Run one `git clone` per URL listed in the file, inside the corpora directory.
!cd "$OUT_PATH/corpora" && xargs < "$DATASET_PATH" -n 1 git clone

# Dataset generation

In [0]:
# Source language of the corpora; passed as --language to the generation cells below.
LANGUAGE = 'java' # @param ['java']

In [0]:
# @markdown Generate AST cache if needed
# Runs generate_cache.py over the cloned repositories in OUT_PATH/corpora.
# The path is quoted because OUT_PATH contains a space ('My Drive') when
# SAVE_ON_GDRIVE is enabled.
!pipenv run bin src/bin/generate_cache.py \
  --language $LANGUAGE \
  --data-path "$OUT_PATH/corpora"

In [0]:
# @markdown Generate examples
# Reads OUT_PATH/corpora and writes to OUT_PATH/examples.
# The paths are quoted because OUT_PATH contains a space ('My Drive') when
# SAVE_ON_GDRIVE is enabled.
!pipenv run bin src/bin/generate_examples.py \
    --language $LANGUAGE \
    --cache-only True \
    --input-path "$OUT_PATH/corpora" \
    --output-path "$OUT_PATH/examples"

In [0]:
# @markdown Generate masked examples
OBFUSCATE = True # @param {type:'boolean'}

# Reads OUT_PATH/examples and writes to OUT_PATH/masked; OBFUSCATE is passed
# through as the value of --obfuscate.
# The paths are quoted because OUT_PATH contains a space ('My Drive') when
# SAVE_ON_GDRIVE is enabled.
!pipenv run bin src/bin/generate_masked.py \
    --obfuscate $OBFUSCATE \
    --input-path "$OUT_PATH/examples" \
    --output-path "$OUT_PATH/masked"

In [0]:
# @markdown Group masked examples by number of tokens, length of the target variable and number of masked variables
# Reads OUT_PATH/masked and writes to OUT_PATH/groups.
# The paths are quoted because OUT_PATH contains a space ('My Drive') when
# SAVE_ON_GDRIVE is enabled.
!pipenv run bin src/bin/generate_groups.py \
    --input-path "$OUT_PATH/masked" \
    --output-path "$OUT_PATH/groups"

In [0]:
# @markdown Build the dataset
SEED = 42 # @param {type:'number'}
SPLITS = '60,10,30' # @param {type:'string'}
EXCLUDED = 'guice,kafka' # @param {type:'string'}

# EXCLUDED lists the projects held out of the train/dev/test splits and used
# for the "unseen" test set instead. The names must match repositories listed
# in datasets.txt: the list contains google/guice, there is no 'juice'.
# NOTE(review): SEED is defined above but is not passed to either command
# below; confirm whether generate_dataset.py accepts a seed option.
# The paths are quoted because OUT_PATH contains a space ('My Drive') when
# SAVE_ON_GDRIVE is enabled.

# Build the train,dev,test splits
!pipenv run bin src/bin/generate_dataset.py \
    --splits $SPLITS \
    --exclude $EXCLUDED \
    --input-path "$OUT_PATH/groups" \
    --output-path "$OUT_PATH/dataset"

# Build the unseen test set
!pipenv run bin src/bin/generate_dataset.py \
    --prefix "unseen" \
    --no-splits \
    --include $EXCLUDED \
    --input-path "$OUT_PATH/groups" \
    --output-path "$OUT_PATH/dataset"