# Setup

In [0]:
import os
from google.colab import drive as gdrive

# @markdown Setup output directory for the models
OUTPUT_DIR = 'Colab/varname/' # @param {type:'string'}

SAVE_ON_GDRIVE = False # @param {type:'boolean'}

# Resolve OUT_PATH, the absolute directory the later cells write into: the
# local OUTPUT_DIR by default, or the same relative folder below the mounted
# Google Drive when SAVE_ON_GDRIVE is enabled.
if not SAVE_ON_GDRIVE:
  OUT_PATH = os.path.abspath(OUTPUT_DIR)
else:
  GDRIVE_ROOT = os.path.abspath('gdrive')
  print('[INFO] Mounting Google Drive in {}'.format(GDRIVE_ROOT))
  gdrive.mount(GDRIVE_ROOT, force_remount=True)
  GDRIVE_OUT = os.path.join(GDRIVE_ROOT, 'My Drive', OUTPUT_DIR)
  OUT_PATH = GDRIVE_OUT

# Create the directory up front; exist_ok keeps a re-run of this cell harmless.
os.makedirs(OUT_PATH, exist_ok=True)

In [0]:
# @markdown Machine setup

# Installs the tools the following cells rely on: git, a Java 11 JDK,
# Python 3.7 and pipenv. Standard output of every command is redirected to
# /dev/null and the cell output is cleared at the end, so a failed install is
# easy to overlook.

# Install java 11
!sudo DEBIAN_FRONTEND=noninteractive apt-get install -qq git openjdk-11-jdk > /dev/null

# Install python 3.7 and pip
!sudo DEBIAN_FRONTEND=noninteractive apt-get install -qq python3.7 python3.7-dev python3.7-venv python3-pip > /dev/null
# Register python3.7 as an alternative for the system-wide `python3` command.
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 > /dev/null
!python3 -m pip install -q --upgrade pip > /dev/null

# Install pipenv (i.e. a better python package manager).
!pip3 install pipenv -qq > /dev/null
# Pipenv settings exported to the environment of later shell commands:
# quiet output, virtualenv created inside the project, no lock file generation.
%env PIPENV_QUIET 1
%env PIPENV_VENV_IN_PROJECT 1
%env PIPENV_SKIP_LOCK 1

# Wipe whatever the commands above printed in this cell.
from IPython.display import clear_output
clear_output()

In [0]:
# @markdown Download code

# Clone the project and cd into it
!git clone --branch master https://github.com/simonepri/varname-seq2seq code
%cd -q code

# Install dependencies
!pipenv install > /dev/null

In [0]:
# @markdown Download the dataset
# DATASET picks which archive is requested; the form offers the obfuscated and
# the plain variant of the corpus.
DATASET = "java-corpora-dataset-obfuscated.tgz"  # @param ["java-corpora-dataset-obfuscated.tgz", "java-corpora-dataset.tgz"]
# Run the project's download script through pipenv. "$DATASET" is expanded by
# IPython from the Python variable above; the training and testing cells read
# their TSV files from the same data/dataset directory given here.
!pipenv run bin src/bin/download_data.py \
    --file-name "$DATASET" \
    --data-path "data/dataset"

# Model training

In [0]:
# @markdown Model configs
# Hyper-parameters forwarded as command-line flags by the training and
# testing cells.
BATCH_SIZE = 256 # @param {type:'number'}
RNN_CELL = "lstm" # @param ['lstm', 'gru']
RNN_BIDIRECTIONAL = False # @param {type:'boolean'}
RNN_NUM_LAYERS = 1 # @param {type:'number'}
# Backward-compatible alias: the original, misspelled name is still the one
# the training cell passes to --rnn-num-layers.
RNN_NUL_LAYERS = RNN_NUM_LAYERS
RNN_HIDDEN_SIZE = 256 # @param {type:'number'}
RNN_EMBEDDING_SIZE = 256 # @param {type:'number'}
RNN_TF_RATIO = "auto" # @param {type:'raw'}
INPUT_SEQ_MAX_LEN = 256 # @param {type:'number'}
OUTPUT_SEQ_MAX_LEN = 32 # @param {type:'number'}

In [0]:
# @markdown Run training
# RUN_TRAIN gates the whole step; TRAIN_RUN_ID is passed as --run-id and
# TRAIN_EPOCHS as --epochs.
RUN_TRAIN = True # @param {type:'boolean'}
TRAIN_RUN_ID = "lstm-256-256-dtf-obf" # @param {type:'string'}
TRAIN_EPOCHS = 35 # @param {type:'number'}
if RUN_TRAIN:
  # Launch the project's seq2seq script in training mode through pipenv.
  # Every "$NAME" below is expanded by IPython from the notebook variable of
  # that name; models and cache are placed under OUT_PATH.
  # NOTE(review): RNN_BIDIRECTIONAL reaches the script as the text
  # "True"/"False" - confirm the script parses it as a boolean and does not
  # treat any non-empty string as true.
  !pipenv run bin src/bin/run_seq2seq.py \
    --do-train \
    --run-id "$TRAIN_RUN_ID" \
    --epochs "$TRAIN_EPOCHS" \
    --batch-size "$BATCH_SIZE" \
    --rnn-cell "$RNN_CELL" \
    --rnn-num-layers "$RNN_NUL_LAYERS" \
    --rnn-hidden-size "$RNN_HIDDEN_SIZE" \
    --rnn-embedding-size "$RNN_EMBEDDING_SIZE" \
    --rnn-tf-ratio "$RNN_TF_RATIO" \
    --rnn-bidirectional "$RNN_BIDIRECTIONAL" \
    --input-seq-max-length "$INPUT_SEQ_MAX_LEN" \
    --output-seq-max-length "$OUTPUT_SEQ_MAX_LEN" \
    --output-path "$OUT_PATH"/models \
    --cache-path "$OUT_PATH"/cache \
    --train-file data/dataset/train.mk.tsv \
    --valid-file data/dataset/dev.mk.tsv

# Model testing

In [0]:
# @markdown Print available models
# Recursive, long-format listing (hidden entries included) of the models
# directory under OUT_PATH, the same path given as --output-path to the script.
!ls -Ral "$OUT_PATH"/models

In [0]:
# @markdown Run tests
RUN_TEST = True # @param {type:'boolean'}
TEST_RUN_ID = "lstm-256-256-dtf-obf" # @param {type:'string'}
if RUN_TEST:
  !pipenv run bin src/bin/run_seq2seq.py \
    --do-test \
    --run-id "$TEST_RUN_ID" \
    --batch-size "$BATCH_SIZE" \
    --output-path "$OUT_PATH"/models \
    --cache-path "$OUT_PATH"/cache \
    --test-file data/dataset/test.mk.tsv
  !pipenv run bin src/bin/run_seq2seq.py \
    --do-test \
    --run-id "$TEST_RUN_ID" \
    --batch-size "$BATCH_SIZE" \
    --output-path "$OUT_PATH"/models \
    --cache-path "$OUT_PATH"/cache \
    --test-file data/dataset/unseen.all.mk.tsv