In [None]:
# Set to True to (re)install the SDK packages below; the kernel is restarted afterwards.
INSTALL_REQUIRES = False 
# Key prefix used when building this notebook's S3 base path (code, output, checkpoints).
PREFIX = 'fairseq-custom'
# Local directory that holds the fairseq checkout; the training scripts are written into it.
SRC_DIR = 'fairseq'

import sys
import IPython

if INSTALL_REQUIRES:
    print("installing deps and restarting kernel")
    #     !{sys.executable} -m pip install -U split-folders tqdm albumentations crc32c wget
    # pip is invoked through the kernel's own interpreter so the packages are
    # installed into the environment this kernel actually runs in.
    !{sys.executable} -m pip install 'sagemaker[local]' --upgrade
    !{sys.executable} -m pip install -U smdebug sagemaker-experiments
    !{sys.executable} -m pip install -U sagemaker ipyplot jsonlines
    # !/bin/bash ./local/local_change_setting.sh
    # Shut the kernel down with restart=True so the new package versions get imported.
    IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
import os
import time
import shutil
from time import strftime

import boto3
import sagemaker

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from sagemaker.pytorch import PyTorch

%matplotlib inline

In [None]:
# IAM role that is handed to the estimator for the training job.
role = sagemaker.get_execution_role()
# Region and account id of the current credentials; they are looked up here but
# not referenced by the other cells visible in this notebook.
sess = boto3.Session()
region = sess.region_name
account = boto3.client("sts").get_caller_identity().get("Account")

In [None]:
# Display the installed SageMaker SDK version as the cell output.
sagemaker.__version__

In [None]:
def create_experiment(experiment_name):
    """Load the SageMaker experiment called ``experiment_name``, creating it if loading fails.

    Args:
        experiment_name: Name of the experiment to load or create.

    Returns:
        The loaded or newly created ``Experiment`` object.
    """
    try:
        return Experiment.load(experiment_name)
    except Exception:
        # Only ordinary errors fall back to creation. A bare ``except:`` would
        # also swallow KeyboardInterrupt/SystemExit and turn an interrupted
        # cell into an unintended create call.
        return Experiment.create(experiment_name=experiment_name)

In [None]:
def create_trial(experiment_name):
    """Create a new trial under ``experiment_name`` and return its name.

    The trial is named ``<experiment_name>-<MMDD>-<HHMM>-<epoch seconds>``; the
    returned string is used by the caller as the training job name.

    Args:
        experiment_name: Name of the experiment the trial is attached to.

    Returns:
        The name of the created trial, as a string.
    """
    # Read the clock once so both parts of the suffix describe the same instant.
    now = time.time()
    # strftime's "%s" (epoch seconds) is a platform-specific extension that is
    # not among Python's documented directives, so the epoch part is formatted
    # explicitly instead.
    create_date = f'{strftime("%m%d-%H%M", time.localtime(now))}-{int(now)}'
    sm_trial = Trial.create(trial_name=f'{experiment_name}-{create_date}',
                            experiment_name=experiment_name)

    job_name = f'{sm_trial.trial_name}'
    return job_name

In [None]:
# Default SageMaker session and its default S3 bucket. The session variable is
# reassigned later (LocalSession or a new Session) depending on instance_type.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

In [None]:
# Job artifacts (code, output, checkpoints) are grouped under s3://<bucket>/<PREFIX>/.
s3_base_path = 's3://{}/{}'.format(bucket, PREFIX)
s3_code_path, s3_output_path, s3_checkpoint_path = (
    '{}/{}'.format(s3_base_path, leaf) for leaf in ('code', 'output', 'checkpoints')
)
# The LibriSpeech data is kept under datasets/LibriSpeech in the same bucket, not under PREFIX.
s3_data_path = 's3://{}/datasets/LibriSpeech'.format(bucket)

In [None]:
%%bash -s "$s3_data_path"

# Download the LibriSpeech archives into ./datasets once and sync them to the
# S3 location passed as the first argument ($1).
# Abort on the first failing command so a failed download is reported by the
# cell instead of silently syncing an incomplete dataset.
set -euo pipefail

if [ ! -d datasets ]; then
    # The directory check above is the only "already downloaded" marker, so tell
    # the user to clear it if anything below fails part-way.
    trap 'echo "dataset download failed; delete ./datasets before re-running this cell" >&2' ERR

    mkdir -p datasets/raw-100 datasets/train-100
    # Placeholder file in the train-100 directory (presumably so the otherwise
    # empty train channel exists locally and in S3 - confirm).
    touch datasets/train-100/dummy
    wget -P datasets/raw-100 https://www.openslr.org/resources/12/dev-other.tar.gz -q
    wget -P datasets/raw-100 https://www.openslr.org/resources/12/train-clean-100.tar.gz -q
    
    # mkdir datasets/raw-960
    # mkdir datasets/train-960
    # touch datasets/train-960/dummy
    # wget -P datasets/raw-960 https://www.openslr.org/resources/12/dev-other.tar.gz -q
    # wget -P datasets/raw-960 https://www.openslr.org/resources/12/train-clean-100.tar.gz -q
    # wget -P datasets/raw-960 https://www.openslr.org/resources/12/train-clean-360.tar.gz -q
    # wget -P datasets/raw-960 https://www.openslr.org/resources/12/train-other-500.tar.gz -q
    
    aws s3 sync datasets "$1"
else
    echo "dataset is already downloaded"
fi

In [None]:
!git clone https://github.com/facebookresearch/fairseq.git

In [None]:
%%writefile {SRC_DIR}/train.sh

# Entry point of the training job. It reports the environment, installs the
# dependencies, prepares the data, downloads the pretrained wav2vec checkpoint
# and then runs fairseq-hydra-train with the arguments given to this script.

echo "--------------------------------------------------------"
echo "step 1: check environment "
echo "--------------------------------------------------------"

nvidia-smi
df -h

## Preprocessing
echo ""
echo "--------------------------------------------------------"
echo "step 2: install dependencies "
echo "--------------------------------------------------------"

chmod 1777 /tmp 
apt-get update -y
apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends libsndfile1
# -y is needed here as well: without it apt-get may ask for confirmation, which
# nobody can answer in a non-interactive training job.
apt-get install -y libsndfile1-dev

pip install soundfile tensorboardX editdistance torchsummaryX
pip install --editable ./
# export PYTHONPATH=$(pwd):$PYTHONPATH

echo ""
echo "--------------------------------------------------------"
echo "stage 3: prepare train data and generate manifests"
echo "--------------------------------------------------------"

# Stop early when data preparation fails; training cannot succeed without it.
bash prepare-100.sh || { echo "data preparation failed" >&2; exit 1; }

echo ""
echo "--------------------------------------------------------"
echo "stage 4: download pretrained model"
echo "--------------------------------------------------------"

# -p: do not fail if the directory already exists.
mkdir -p /opt/ml/input/data/pretrained_models

## pretrained data2vec-base model
#wget https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls.pt -P /opt/ml/input/data/pretrained_models -q 

## pretrained hubert-base model
#wget https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt -P /opt/ml/input/data/pretrained_models -q 

# pretrained wav2vec-base model
wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small.pt -P /opt/ml/input/data/pretrained_models -q 

echo ""
echo "--------------------------------------------------------"
echo "stage 5: start finetuning"
echo "--------------------------------------------------------"
# NOTE(review): SM_MODULE_DIR may not be a local directory in every launch mode - confirm this cd is effective.
cd $SM_MODULE_DIR
export HYDRA_FULL_ERROR=1 
# Re-launch when training crashes unexpectedly: up to 10 attempts, stopping as
# soon as one run succeeds. The attempts are listed explicitly because the
# {0..9} brace expansion only works in bash and this script has no shebang.
train_status=1
for attempt in 0 1 2 3 4 5 6 7 8 9
do
    echo ""
    echo "hydra-train start"
    # "$@" forwards the script arguments unchanged, even if one contains spaces.
    fairseq-hydra-train "$@"
    train_status=$?
    # -f: crash.pt only exists after a crash, so do not complain when it is absent.
    rm -f /opt/ml/model/crash.pt
    if [ "$train_status" -eq 0 ]; then
        break
    fi
done

# Propagate the result of the last training attempt as the job's exit status.
exit "$train_status"

In [None]:
%%writefile {SRC_DIR}/prepare-100.sh

# Prepare the LibriSpeech train-clean-100 / dev-other data for fine-tuning.
# Unless $SM_CHANNEL_TRAIN/LibriSpeech already exists, the archives found in
# $SM_CHANNEL_RAW are unpacked into $SM_CHANNEL_TRAIN and the wav2vec manifests
# (tsv) and label files are generated under LibriSpeech/manifests.

if [ -n "${SM_CHANNEL_TRAIN+x}" ] && [ -d "$SM_CHANNEL_TRAIN/LibriSpeech" ]; then
    echo "train data is already unzipped"
else
    if [ -n "${SM_CHANNEL_RAW+x}" ] && [ -d "$SM_CHANNEL_RAW" ]; then
        echo "unzipping raw data"
        tar -zxf "$SM_CHANNEL_RAW/train-clean-100.tar.gz" -C "$SM_CHANNEL_TRAIN"
        tar -zxf "$SM_CHANNEL_RAW/dev-other.tar.gz" -C "$SM_CHANNEL_TRAIN"
    else
        echo "training data error"
        # Missing input data is a failure, so report it with a non-zero status.
        exit 1
    fi
    
    librispeech_dir="$SM_CHANNEL_TRAIN/LibriSpeech"
    manifest_dir="$librispeech_dir/manifests"

    mkdir -p "$manifest_dir"
    cd /opt/ml/code/examples/wav2vec
    
    echo "downloading wav2vec dict "
    wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt -P "$manifest_dir" -q
    
    echo "generating tsv, ltr, wrd file (train-100)"
    # wav2vec_manifest.py always writes train.tsv, so it is renamed after each run.
    python wav2vec_manifest.py "$librispeech_dir/train-clean-100" --dest "$manifest_dir/" --ext flac --valid-percent 0
    mv "$manifest_dir/train.tsv" "$manifest_dir/train-100.tsv"
    python libri_labels.py "$manifest_dir/train-100.tsv" --output-dir "$manifest_dir/" --output-name train-100

    echo "generating tsv, ltr, wrd file (dev-other)"
    python wav2vec_manifest.py "$librispeech_dir/dev-other" --dest "$manifest_dir/" --ext flac --valid-percent 0
    mv "$manifest_dir/train.tsv" "$manifest_dir/dev-other.tsv"
    python libri_labels.py "$manifest_dir/dev-other.tsv" --output-dir "$manifest_dir/" --output-name dev-other
fi

In [None]:
%%writefile {SRC_DIR}/prepare-960.sh

# Prepare the full LibriSpeech 960h training set plus dev-other for fine-tuning.
# Unless $SM_CHANNEL_TRAIN/LibriSpeech already exists, the archives found in
# $SM_CHANNEL_RAW are unpacked into $SM_CHANNEL_TRAIN, the three training
# subsets are gathered under LibriSpeech/train-960, and the wav2vec manifests
# (tsv) and label files are generated under LibriSpeech/manifests.

if [ -n "${SM_CHANNEL_TRAIN+x}" ] && [ -d "$SM_CHANNEL_TRAIN/LibriSpeech" ]; then
    echo "train data is already unzipped"
else
    librispeech_dir="$SM_CHANNEL_TRAIN/LibriSpeech"

    if [ -n "${SM_CHANNEL_RAW+x}" ] && [ -d "$SM_CHANNEL_RAW" ]; then
        echo "unzipping raw data"
        tar -zxf "$SM_CHANNEL_RAW/train-clean-100.tar.gz" -C "$SM_CHANNEL_TRAIN"
        tar -zxf "$SM_CHANNEL_RAW/train-clean-360.tar.gz" -C "$SM_CHANNEL_TRAIN"
        tar -zxf "$SM_CHANNEL_RAW/train-other-500.tar.gz" -C "$SM_CHANNEL_TRAIN"
        tar -zxf "$SM_CHANNEL_RAW/dev-other.tar.gz" -C "$SM_CHANNEL_TRAIN"
        # -p: do not fail if the directory already exists.
        mkdir -p "$librispeech_dir/train-960"
        mv "$librispeech_dir/train-clean-100" "$librispeech_dir/train-960/"
        mv "$librispeech_dir/train-clean-360" "$librispeech_dir/train-960/"
        mv "$librispeech_dir/train-other-500" "$librispeech_dir/train-960/"
    else
        echo "training data error"
        # Missing input data is a failure, so report it with a non-zero status.
        exit 1
    fi
    
    manifest_dir="$librispeech_dir/manifests"

    mkdir -p "$manifest_dir"
    cd /opt/ml/code/examples/wav2vec
    
    echo "downloading wav2vec dict "
    wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt -P "$manifest_dir" -q
    
    echo "generating tsv, ltr, wrd file (train-960)"
    # wav2vec_manifest.py always writes train.tsv, so it is renamed after each run.
    python wav2vec_manifest.py "$librispeech_dir/train-960" --dest "$manifest_dir/" --ext flac --valid-percent 0
    mv "$manifest_dir/train.tsv" "$manifest_dir/train-960.tsv"
    python libri_labels.py "$manifest_dir/train-960.tsv" --output-dir "$manifest_dir/" --output-name train-960

    echo "generating tsv, ltr, wrd file (dev-other)"
    python wav2vec_manifest.py "$librispeech_dir/dev-other" --dest "$manifest_dir/" --ext flac --valid-percent 0
    mv "$manifest_dir/train.tsv" "$manifest_dir/dev-other.tsv"
    python libri_labels.py "$manifest_dir/dev-other.tsv" --output-dir "$manifest_dir/" --output-name dev-other
fi



In [None]:
# Settings for the training job launched further down.
experiment_name = "fairseq-vanilla-w2v-exp1"

# 'local_gpu' / 'local' select local mode; switch to e.g. 'ml.g5.12xlarge'
# to run on a SageMaker-managed instance instead.
instance_type, instance_count = "local_gpu", 1

# Spot training is off by default; max_wait is only filled in when it is enabled.
do_spot_training, max_wait = False, None

# Run-time limit of the job: five days, expressed in seconds.
max_run = 5 * 24 * 60 * 60

In [None]:
from pathlib import Path

# Absolute path of the local fairseq checkout that is used as the job's source_dir.
source_dir = f'{Path.cwd()}/{SRC_DIR}'


if instance_type in ['local_gpu', 'local']:
    # Local mode: run with a LocalSession, read the input channels from the
    # local ./datasets directory and do not use an S3 checkpoint location.
    from sagemaker.local import LocalSession
    
    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    s3_raw_data_path = f'file://{Path.cwd()}/datasets/raw-100'
    s3_train_data_path = f'file://{Path.cwd()}/datasets/train-100'
    s3_checkpoint_path = None
else:
    # Managed training: read the input channels from the dataset location in S3.
    sagemaker_session = sagemaker.Session()
    s3_raw_data_path = f'{s3_data_path}/raw-100'
    s3_train_data_path = f'{s3_data_path}/train-100'
    # Keep checkpoints under the same PREFIX-based base path as the code and
    # output locations, instead of a separate key derived from the local
    # source directory name.
    s3_checkpoint_path = f'{s3_base_path}/checkpoints'

In [None]:
# Metrics extracted from the training log: each entry pairs a metric name with
# a regex that captures the numeric value logged under the given JSON key.
metric_definitions = [
    {'Name': metric_name, 'Regex': rf'"{log_key}": "([0-9\.]+)'}
    for metric_name, log_key in (
        ('train:loss', 'train_loss'),
        ('valid:wer', 'dev-other_raw_wer'),
    )
]

In [None]:
# Hyperparameters handed to the estimator: the fairseq config directory inside
# the container and the name of the fine-tuning config to use.
hyperparameters = dict([
    ('config-dir', '/opt/ml/code/examples/wav2vec/config/finetuning'),
    ('config-name', 'base_100h'),
])

In [None]:
# No distributed-training configuration is passed to the estimator.
distribution = None

# With spot training enabled, reuse the max_run limit as max_wait;
# otherwise leave max_wait as it was set above.
max_wait = max_run if do_spot_training else max_wait

In [None]:
# All input configurations, parameters, and metrics specified in the estimator
# definition are automatically tracked.
# The estimator runs train.sh from the local fairseq checkout (source_dir) with
# the session, instance, S3 locations, hyperparameters and metric regexes
# configured in the cells above.
estimator = PyTorch(
    entry_point='train.sh',
    source_dir=source_dir,
    role=role,
    sagemaker_session=sagemaker_session,
    framework_version='1.10',
    py_version='py38',
    instance_count=instance_count,
    instance_type=instance_type,
    code_location = s3_code_path,
    output_path=s3_output_path,
    hyperparameters=hyperparameters,
    distribution=distribution,
    metric_definitions=metric_definitions,
    max_run=max_run,
    use_spot_instances=do_spot_training,
    max_wait=max_wait,
    # None in local mode, an S3 URI otherwise (set in the session-selection cell).
    checkpoint_s3_uri=s3_checkpoint_path,
)

In [None]:
# Make sure the experiment exists, create a fresh trial for this run, and use
# the trial name as the training job name as well.
create_experiment(experiment_name)
job_name = create_trial(experiment_name)
estimator.fit(
    # Input channels: 'raw' holds the downloaded archives, 'train' receives the
    # unpacked data (the prepare script reads them as SM_CHANNEL_RAW / SM_CHANNEL_TRAIN).
    inputs={
        'raw': s3_raw_data_path, 
        'train': s3_train_data_path
    },
    job_name=job_name,
    # Attach the job to the trial created above.
    experiment_config={
      'TrialName': job_name,
      'TrialComponentDisplayName': job_name,
    },
    # Do not wait for the job here; the last cell of the notebook follows its logs.
    wait=False,
)

In [None]:
# Read the job name back from the estimator's most recent training job.
job_name=estimator.latest_training_job.name

In [None]:
# Follow the logs of the job started above; wait=True keeps the cell running while they are followed.
sagemaker_session.logs_for_job(job_name=job_name, wait=True)