# Finetuning

## 1. Mount Google Drive (To save trained checkpoints and to load the dataset)

In [None]:
# Mount Google Drive at /content/gdrive; the dataset paths and the export cells
# further down all live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

## 2. Clone the tortoise repo

In [None]:
# Clone the training code, then make the checkout the working directory so the
# relative paths used by the next cells (experiments/, codes/) resolve.
!git clone https://github.com/shovonjamali/DL-Art-School.git
%cd DL-Art-School

## 3. Download model weights for the VQ-VAE and Autoregressive Model (GPT-2)

In [None]:
# Download the two pretrained weight files into the checkout's experiments/ directory.
!wget https://huggingface.co/Gatozu35/tortoise-tts/resolve/main/dvae.pth -O experiments/dvae.pth
!wget https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth -O experiments/autoregressive.pth

## 4. Install the requirements

In [None]:
# Install the dependencies listed in codes/requirements.laxed.txt (path is relative to the checkout).
!pip install -r codes/requirements.laxed.txt

## 5. (OPEN THIS) Parameters to be able to train on colab

In [None]:
from pathlib import Path
from math import ceil
# Starting batch sizes; the code further down adjusts them to fit the dataset size.
DEFAULT_TRAIN_BS = 64
DEFAULT_VAL_BS = 32
#@markdown # Hyperparameter calculation
#@markdown Run this cell to obtain suggested parameters for training
# Transcript index files; their line counts are used as the sample counts below.
Dataset_Training_Path = "/content/gdrive/MyDrive/tts-dataset/train.txt" #@param {type:"string"}
ValidationDataset_Training_Path = "/content/gdrive/MyDrive/tts-dataset/val.txt" #@param {type:"string"}

#@markdown ### **NOTE**: Dataset must be in the following format.

#@markdown  `dataset/`
#@markdown * ---├── `val.txt`
#@markdown * ---├── `train.txt`
#@markdown * ---├── `wavs/`

#@markdown `wavs/` directory must contain `.wav` files.

#@markdown  Example for `train.txt` and `val.txt`:

#@markdown * `wavs/A.wav|Write the transcribed audio here.`

#@markdown todo: actually check the dataset structure

# Advisory only: reusing the training file for validation is allowed, the user is just warned.
if Dataset_Training_Path == ValidationDataset_Training_Path:
  print("WARNING: training dataset path == validation dataset path!!!")
  print("\tThis is technically okay but will make all of the validation metrics useless. ")
  print("it will also SUBSTANTIALLY slow down the rate of training, because validation datasets are supposed to be much smaller than training ones.")

def txt_file_lines(p: str) -> int:
  """Count the samples listed in the text file at path ``p``.

  One sample is expected per line, so the result is the number of lines that
  contain something other than whitespace. Blank lines anywhere in the file
  (leading, trailing or in the middle) are not counted, and an empty file
  yields 0 rather than 1.
  """
  return sum(1 for line in Path(p).read_text().split('\n') if line.strip())
# Sample counts, taken from the line counts of the two transcript files.
training_samples = txt_file_lines(Dataset_Training_Path)
val_samples = txt_file_lines(ValidationDataset_Training_Path)

# Advisory warnings only; nothing below is blocked by them.
if training_samples < 128: print("WARNING: very small dataset! the smallest dataset tested thus far had ~200 samples.")
if val_samples < 20: print("WARNING: very small validation dataset! val batch size will be scaled down to account")

def div_spillover(n: int, bs: int) -> int: # returns new batch size
  """Adjust batch size ``bs`` so that ``n`` samples leave only a small remainder.

  Args:
    n: number of samples in the dataset.
    bs: preferred batch size.

  Returns:
    ``bs`` unchanged when it already divides ``n`` or when there are many
    steps per epoch; ``n`` when the dataset is smaller than one batch;
    otherwise a nearby batch size that either absorbs the remainder into the
    existing steps or spreads the samples over one extra step.
  """
  epoch_steps,remain = divmod(n,bs)
  if epoch_steps*2 > bs: return bs # don't bother optimising this stuff if epoch_steps are high
  if not remain: return bs # unlikely but still
  if epoch_steps == 0: return n # fewer samples than one batch: use them all (avoids n//0 below)

  if remain*2 < bs: # "easier" to get rid of remainder -- should increase bs
    target_bs = n//epoch_steps
  else: # easier to increase epoch_steps by 1 -- decrease bs
    target_bs = n//(epoch_steps+1)
  assert n%target_bs < epoch_steps+2 # should be very few extra
  return target_bs

# Training batch size: tune the default when at least one full batch fits,
# otherwise warn and put the whole dataset into a single batch.
if training_samples >= DEFAULT_TRAIN_BS:
  train_bs = div_spillover(training_samples, DEFAULT_TRAIN_BS)
else:
  print("WARNING: dataset is smaller than a single batch. This will almost certainly perform poorly. Trying anyway")
  train_bs = training_samples

# Validation batch size follows the same rule, without the warning.
val_bs = val_samples if val_samples < DEFAULT_VAL_BS else div_spillover(val_samples, DEFAULT_VAL_BS)

# Whole batches per pass over the training set (a partial trailing batch is not counted).
steps_per_epoch = training_samples // train_bs
# Learning-rate decay milestones, given in epochs and converted to step counts.
lr_decay_epochs = [20, 40, 56, 72]
lr_decay_steps = [epoch * steps_per_epoch for epoch in lr_decay_epochs]
# Log once per epoch, clamped to between 20 and 100 steps.
print_freq = max(20, steps_per_epoch)
print_freq = min(100, print_freq)
# Validation and checkpointing share one cadence: every third log interval.
save_checkpoint_freq = print_freq * 3
val_freq = save_checkpoint_freq

print("===CALCULATED SETTINGS===")
print(f'{train_bs=} {val_bs=}')
print(f'{val_freq=} {lr_decay_steps=}')
print(f'{print_freq=} {save_checkpoint_freq=}')

In [None]:
#@markdown ##_Settings for normal users:_
# Run name; substituted into the yml config and used to build the resume-state path below.
Experiment_Name = "DA-train-run-1" #@param {type:"string"}
# Dataset labels substituted into the yml config.
Dataset_Training_Name= "training_dataset" #@param {type:"string"}
ValidationDataset_Name = "validation_dataset" # this seems to be useless??? @param {type:"string"}
SaveTrainingStates = True # @param {type:"boolean"}
Keep_Last_N_Checkpoints = 1 #@param {type:"slider", min:0, max:10, step:1}
# When Resume_Training is True, the resume template is downloaded instead of the plain one
# and pointed at training_state/<Training_State>.state inside the experiment folder.
Resume_Training = True # @param {type:"boolean"}
Training_State = 8700 # @param {type:"integer"}

#@markdown * **NOTE**: 0 means "keep all models saved", which could potentially cause out-of-storage issues.
#@markdown * Without training states, each model "only" takes up ~1.6GB. You should have ~50GB of free space to begin with.
#@markdown * With training states, each model (pth+state) takes up ~4.9 GB; Colab will crash around ~10 undeleted checkpoints in this case.

#@markdown ##_Other training parameters_
# Switches written into the yml config by the sed commands further down.
Fp16 = False #@param {type:"boolean"}
Use8bit = True #@param {type:"boolean"}
#@markdown * **NOTE**: for some reason, fp16 does not seem to improve vram use when combined with 8bit [citation needed]. To be verified later...
# Kept as a string because it is spliced into a sed command; "1e-5" is the value already
# present in the template, so that exact string leaves the file untouched.
TrainingRate = "1e-4" #@param {type:"string"}
TortoiseCompat = True #@param {type:"boolean"}

#@markdown * **NOTE**: TortoiseCompat introduces some breaking changes to the training process. **If you want to reproduce older models**, disable this checkbox.

#@markdown ##_Calculated settings_ override
#@markdown #####Blank entries rely on the calculated defaults from the cell above.
#@markdown ######**Leave them blank unless you want to adjust them manually**
# Manual overrides. They are strings so that an empty field can mean "keep the calculated
# value"; non-empty entries are converted to the calculated value's type further down
# (LRDecaySteps is evaluated as a Python expression instead).
TrainBS = "" #@param {type:"string"}
ValBS = "" #@param {type:"string"}
ValFreq = "" #@param {type:"string"}
LRDecaySteps = "" #@param {type:"string"}
PrintFreq = "" #@param {type:"string"}
SaveCheckpointFreq = "" #@param {type:"string"}

def take(orig, override):
  """Return ``orig`` unless ``override`` is non-empty.

  A non-empty ``override`` is converted with the type of ``orig`` (for example
  ``"16"`` becomes ``16`` when ``orig`` is an int) and returned in its place.
  """
  return orig if override == "" else type(orig)(override)

# Layer the manual overrides (if any) on top of the calculated defaults.
train_bs = take(train_bs, TrainBS)
val_bs = take(val_bs, ValBS)
val_freq = take(val_freq, ValFreq)
# NOTE(review): eval() executes whatever was typed into the form field. That is tolerable for
# a value the notebook owner enters personally, but never feed it untrusted text;
# ast.literal_eval would be enough for a plain list such as "[500, 1000, 1400, 1800]".
lr_decay_steps = eval(LRDecaySteps) if LRDecaySteps else lr_decay_steps
print_freq = take(print_freq, PrintFreq)
save_checkpoint_freq = take(save_checkpoint_freq, SaveCheckpointFreq)
# The sed pattern used later replaces a list of exactly four milestones.
assert len(lr_decay_steps) == 4
# Comma-separated form, used as the sed replacement text.
gen_lr_steps = ', '.join(str(v) for v in lr_decay_steps)

#@markdown #Run this cell after you finish editing the settings.


%cd /content/DL-Art-School

# Fetch a fresh config template: the plain one for a new run, or the resume variant when
# continuing from a saved training state. Both are saved as EXAMPLE_gpt.yml so that the
# substitutions below always operate on the same file name.
if Resume_Training != True:
  !wget https://raw.githubusercontent.com/shovonjamali/DL-Art-School/master/experiments/EXAMPLE_gpt.yml -O experiments/EXAMPLE_gpt.yml
else:
  !wget https://raw.githubusercontent.com/shovonjamali/DL-Art-School/master/experiments/EXAMPLE_resume_gpt.yml -O experiments/EXAMPLE_gpt.yml
  # Relative path; presumably resolved from codes/, where train.py is launched -- TODO confirm.
  resume_path = f'../experiments/{Experiment_Name}/training_state/{Training_State}.state'
  # '+' is the sed delimiter here because the replacement contains slashes.
  !sed -i 's+CHANGEME_resume_training_path+'"$resume_path"'+g' ./experiments/EXAMPLE_gpt.yml

#@markdown This will apply the settings defined above to a fresh yml config file.
import os
%cd /content/DL-Art-School
!sed -i 's/batch_size: 128/batch_size: '"$train_bs"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/batch_size: 64/batch_size: '"$val_bs"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/val_freq: 500/val_freq: '"$val_freq"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/500, 1000, 1400, 1800/'"$gen_lr_steps"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/print_freq: 100/print_freq: '"$print_freq"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/save_checkpoint_freq: 500/save_checkpoint_freq: '"$save_checkpoint_freq"'/g' ./experiments/EXAMPLE_gpt.yml

# Validation dataset name and path ('+' delimiter because the path contains slashes).
!sed -i 's+CHANGEME_validation_dataset_name+'"$ValidationDataset_Name"'+g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's+CHANGEME_path_to_validation_dataset+'"$ValidationDataset_Training_Path"'+g' ./experiments/EXAMPLE_gpt.yml
# fp16 is only ever switched on here; when Fp16 is False the template's "fp16: false" is left as is.
if(Fp16==True):
  os.system("sed -i 's+fp16: false+fp16: true+g' ./experiments/EXAMPLE_gpt.yml")
# The Python bool is written as "True"/"False".
!sed -i 's/use_8bit: true/use_8bit: '"$Use8bit"'/g' ./experiments/EXAMPLE_gpt.yml

!sed -i 's/disable_state_saving: true/disable_state_saving: '"$SaveTrainingStates"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/tortoise_compat: True/tortoise_compat: '"$TortoiseCompat"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/number_of_checkpoints_to_save: 0/number_of_checkpoints_to_save: '"$Keep_Last_N_Checkpoints"'/g' ./experiments/EXAMPLE_gpt.yml


# The two names use '/' as the sed delimiter, so they must not contain a slash themselves;
# the dataset path uses '+' for that reason.
!sed -i 's/CHANGEME_training_dataset_name/'"$Dataset_Training_Name"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's/CHANGEME_your_experiment_name/'"$Experiment_Name"'/g' ./experiments/EXAMPLE_gpt.yml
!sed -i 's+CHANGEME_path_to_training_dataset+'"$Dataset_Training_Path"'+g' ./experiments/EXAMPLE_gpt.yml


# The pattern targets the template's "!!float 1e-5" entry, so nothing needs rewriting when
# the requested rate is already the string "1e-5".
if (not TrainingRate=="1e-5"):
  os.system("sed -i 's+!!float 1e-5 # CHANGEME:+!!float '" + TrainingRate + "' #+g' ./experiments/EXAMPLE_gpt.yml")

## 6. Resolve dependencies

To avoid some unwanted errors, we need to downgrade the installed `transformers` package to a specific version.

In [None]:
# Remove the currently installed transformers package (-y skips the confirmation prompt).
!pip uninstall transformers -y

In [None]:
# Install the pinned transformers release.
!pip install transformers==4.29.2

## 7. Training

Execute the cells below to delete the directories of a previous run.

In [None]:
# NOTE(review): the run name is hard-coded and does not follow Experiment_Name -- edit it before running.
!rm -rf /content/DL-Art-School/experiments/Test-3-28-2024

In [None]:
# NOTE(review): hard-coded archived-run folder name -- edit it to match the folder you actually want removed.
!rm -rf /content/DL-Art-School/experiments/Test-3-28-2024_archived_240328-111516

In [None]:
#@markdown Press the stop button for this cell when you are satisfied with the results, and have seen:

#@markdown `INFO:base:Saving models and training states.`

#@markdown If your training run saves many models, you might exceed the storage limits on the colab runtime. To prevent this, try to delete old checkpoints in /content/DL-Art-School/experiments/$Experiment_Name/(models|training_state)/* via the file explorer panel as the training runs. **Resuming training after a crash requires config editing,** so try to not let that happen.

#@markdown TODO: implement code to automatically prune useless checkpoints later && restore training states

# train.py is launched from codes/, so the config is addressed relative to that directory.
%cd /content/DL-Art-School/codes

# Runs until the cell is stopped manually (see the note above).
!python3 train.py -opt ../experiments/EXAMPLE_gpt.yml

## 8. Export to Google Drive

In [None]:
import os

from pathlib import Path
from shutil import copy, copytree

# Stop immediately if Drive is not mounted. Everything below writes under this folder, so
# carrying on after a mere printed hint would only fail on the next line with a less
# helpful error.
gdrive_root = Path("/content/gdrive/MyDrive/")
if not gdrive_root.exists():
  raise FileNotFoundError("Connect your Google Drive and try again.")

# Source: the experiment folder produced by training. Destination: a folder of the same
# name under MyDrive/tortoise.
srcdir = Path('/content/DL-Art-School/experiments/'+Experiment_Name)
outdir = Path('/content/gdrive/MyDrive/tortoise/'+Experiment_Name)
# parents=True also creates the shared "tortoise" folder on first use; exist_ok=True makes
# the cell safe to re-run.
outdir.mkdir(parents=True, exist_ok=True)

#@markdown #Pick what training results you'd like to save:
#deleteZeroStepCheckpoints = True #@param {type:"boolean"}
# Each flag below enables one of the copy steps further down in this cell.
saveLogs = True #@param {type:"boolean"}
saveYml = True #@param {type:"boolean"}
saveInferenceCheckpoints = True #@param {type:"boolean"}
#@markdown These only need to be adjusted if you want to commit further training:
saveEMACheckpoints = True #@param {type:"boolean"}
# Export flag only; distinct from the capitalised SaveTrainingStates in the settings cell.
saveTrainingStates = True #@param {type:"boolean"}

# Any value other than the "minimal" string means every checkpoint is copied.
checkpointSavingStrategy = 'minimal (last ckpt only)' #@param ["Everything (I have infinite storage)", "minimal (last ckpt only)"]
#@markdown Note that each checkpoint takes up **1.6GB of space** -- if you want to save all checkpoints, you probably need more than 15GB of gdrive storage.


outdir.mkdir(exist_ok=True)
# Disabled clean-up of step-0 checkpoints, kept for reference (the string is never executed).
'''
if deleteZeroStepCheckpoints:
    for zero_model in srcdir.glob('models/0_gpt*.pth'):
        zero_model.unlink() # remove all 0 step files; useless
    for zero_model in srcdir.glob('training_state/0.state'):
        zero_model.unlink()
'''
if saveLogs:
    # dirs_exist_ok=True lets the export be repeated (or run again after resuming) when
    # tb_logger already exists on Drive; without it copytree raises FileExistsError.
    copytree(srcdir/'tb_logger', outdir/'tb_logger', dirs_exist_ok=True)
    for log in srcdir.glob('*.log'):
        copy(log, outdir)

if saveYml:
    # Copy the config file(s) found at the top level of the experiment folder.
    for yml in srcdir.glob('*.yml'):
        copy(yml, outdir)

# Materialised as a list because it is walked twice: once for highest_step, once for copying.
infer_models = list((srcdir/'models').glob('*_gpt.pth'))
# The next two are one-shot iterators; each is consumed by a single copy pass below.
training_states = (srcdir/'training_state').iterdir()
ema_models = (srcdir/'models').glob('*_gpt_ema.pth')
# Fail with an explicit message rather than max()'s "arg is an empty sequence".
if not infer_models:
    raise ValueError(f"No '*_gpt.pth' checkpoints found in {srcdir/'models'}; nothing to export.")
# File names start with the step number ("<step>_gpt.pth"); the largest one is the latest.
highest_step = max(int(m.stem.split('_')[0]) for m in infer_models)

def save_model_directory(glob, outsubdir):
    """Copy checkpoint files from the iterable ``glob`` into ``outsubdir``.

    ``outsubdir`` is created if missing. With the "minimal" saving strategy only
    files whose name starts with ``highest_step`` are copied; with any other
    strategy every file is copied. Entries that are not regular files (for
    example a stray ``.ipynb_checkpoints`` folder) and, in minimal mode, files
    whose name does not start with a step number are skipped instead of aborting
    the export.

    Reads the module-level ``checkpointSavingStrategy`` and ``highest_step``.
    """
    outsubdir.mkdir(exist_ok=True)
    for m in glob:
        if not m.is_file():
            continue  # shutil.copy only handles files
        if checkpointSavingStrategy != 'minimal (last ckpt only)':
            copy(m, outsubdir)
            continue
        try:
            step = int(m.stem.split('_')[0])
        except ValueError:
            continue  # not named "<step>...", so it cannot be the latest checkpoint
        if step == highest_step:
            copy(m, outsubdir)

# Export each selected artifact family; inference and EMA weights share the "models" folder.
export_plan = (
    (saveInferenceCheckpoints, infer_models, 'models'),
    (saveEMACheckpoints, ema_models, 'models'),
    (saveTrainingStates, training_states, 'training_state'),
)
for wanted, sources, subdir in export_plan:
    if wanted:
        save_model_directory(sources, outdir/subdir)



In [None]:
#@markdown ###If the above script does not work, use this old cell

#@markdown #This will blindly copy everything to your gdrive folder.

# Recursively copy the whole experiment folder; $Experiment_Name is expanded by IPython.
!cp -r /content/DL-Art-School/experiments/$Experiment_Name /content/gdrive/MyDrive/tortoise/

# 9. Continue/Resume training

***Execute the steps from 1-6 of this notebook.***

### High-level instructions

1. Under experiments create a new folder with the experiment name and create different sub-folders like model, training_state etc. under that. Below commands will take care of this.
2. Upload the `.gpt.yml` file that was saved from the previous run into that directory (**not required, as found later!**).
![picture](https://drive.google.com/uc?id=1t59IvMKEsl04x5Xr-SgEn1o-gkRiLTZ6)
3. After creating the sub-directories, upload/import the required files that were saved after the initial run. These include the model and the training state (step). The required commands are also provided below.
4. Uncomment the resume training path and comment the training path from the Example_gpt.yml file for resume training (**automated this stuff at step 5**).
![picture](https://drive.google.com/uc?id=1snaLw38Sd6KVqX1DkxOVJLGeR_MQvhnt)

## 10. Create desired directories

In [None]:
# Delete a previous run's directory (if required).
# NOTE: {Experiment_Name} is expanded by IPython. If it were empty, this would target the
# whole experiments/ folder, so make sure the settings cell has been run first.
!rm -rf /content/DL-Art-School/experiments/{Experiment_Name}

# Create directories under 'experiments' as resume training requires certain folder structure.
%cd /content/DL-Art-School/experiments

# Create models folder
!mkdir -p /content/DL-Art-School/experiments/{Experiment_Name}/models

# Create tb_logger folder
!mkdir -p /content/DL-Art-School/experiments/{Experiment_Name}/tb_logger

# Create training_state folder
!mkdir -p /content/DL-Art-School/experiments/{Experiment_Name}/training_state

# Create val_images folder
!mkdir -p /content/DL-Art-School/experiments/{Experiment_Name}/val_images

In [None]:
# Unzip model file (not using this one anymore, importing the model from the google drive)
# !unzip -u "/content/DL-Art-School/experiments/{experiment_name}/models/480.zip" -d "/content/DL-Art-School/experiments/{experiment_name}/models"

## 11. Import model and step from the drive

### Instructions

The image below shows how to get the ID of the model or training-state file from Google Drive.

![picture](https://drive.google.com/uc?id=1vHQ-JpTY5CuZMLYS8dSr-EeiWsTS3oKZ)

### Go to root and upgrade gdown

We need to go to the base directory first. Execute the cell below until we are at the root/base of the current runtime.

In [None]:
# Move to /content, then one level up to the filesystem root.
%cd /content
%cd ..
# Upgrade gdown: the current version gives a "permission denied" error while importing the model or step from the drive
!pip install --upgrade --no-cache-dir gdown

### Check we are at the root/base (optional)

In [None]:
# Show the current working directory.
%pwd

In [None]:
# List the contents of the current directory.
!dir

### Import the model and training state

In [None]:
# Google Drive file IDs of the checkpoint and training-state files to download with gdown below.
Model_ID= "1-IlMOAvY4hsb-49MK_6KEzzD-SBmpteu" #@param {type:"string"}
Training_State_ID = "1-Nowq8V2dr61KZpUmUK9CsfQZbPanjGN" #@param {type:"string"}

In [None]:
# Navigate to the models folder to import the desired model saved from previous run
%cd /content/DL-Art-School/experiments/{Experiment_Name}/models
# Import the model
!gdown --id $Model_ID

# Import the state
%cd /content/DL-Art-School/experiments/{Experiment_Name}/training_state
!gdown --id $Training_State_ID

## 12. Reviewing (tried tensorboard, not working)

In [None]:
!pip uninstall tb-nightly tensorboardX tensorboard

In [None]:
# Reinstall a single tensorboard distribution after the uninstall above.
!pip install tensorboard

In [None]:
# Enable the %tensorboard magic used further down.
%load_ext tensorboard

In [None]:
import os

# NOTE(review): hard-coded to the "Test2" run rather than Experiment_Name.
logdir_path = "/content/DL-Art-School/experiments/Test2/tb_logger"  # Replace with your log directory path
# Show what the logger directory contains (raises if the directory does not exist).
print(os.listdir(logdir_path))

In [None]:
import os
import subprocess
import time

def find_latest_logfile(logdir_path):
    """Return the path of the most recently modified event file in ``logdir_path``.

    Only entries whose name starts with ``events.out.tfevents.`` are considered.
    Returns ``None`` when the directory contains no such file.
    """
    candidates = [name for name in os.listdir(logdir_path)
                  if name.startswith('events.out.tfevents.')]
    if not candidates:
        return None  # No event files found

    # Pick the entry with the newest modification time.
    newest = max(candidates, key=lambda name: os.path.getmtime(os.path.join(logdir_path, name)))
    return os.path.join(logdir_path, newest)

# Function to run TensorBoard and keep it pointed at the newest event file
def start_tensorboard(logdir_path, poll_interval=10):
    """Run TensorBoard on the newest event file in ``logdir_path`` and follow new ones.

    This call blocks: after launching TensorBoard it polls the directory every
    ``poll_interval`` seconds and restarts TensorBoard whenever a newer event
    file appears. It returns only when no event file exists at start-up;
    otherwise it runs until interrupted (for example with the notebook's stop
    button), at which point the TensorBoard child process is shut down so that
    it is not left running.

    Args:
        logdir_path: directory containing ``events.out.tfevents.*`` files.
        poll_interval: seconds to wait between directory checks.
    """
    current_logfile = find_latest_logfile(logdir_path)
    if current_logfile is None:
        print("No log files found. TensorBoard cannot be started.")
        return

    tensorboard_proc = subprocess.Popen(["tensorboard", "--logdir", current_logfile])
    try:
        # Continuously monitor the log directory for changes
        while True:
            newest_logfile = find_latest_logfile(logdir_path)
            if newest_logfile is not None and newest_logfile != current_logfile:
                # A newer event file appeared: restart TensorBoard on it.
                tensorboard_proc.terminate()
                tensorboard_proc.wait()
                current_logfile = newest_logfile
                tensorboard_proc = subprocess.Popen(["tensorboard", "--logdir", current_logfile])
            time.sleep(poll_interval)
    finally:
        # Runs on KeyboardInterrupt or any other exit from the loop.
        tensorboard_proc.terminate()
        tensorboard_proc.wait()

# Specify the path to your log directory
# NOTE(review): hard-coded to the "Test2" run rather than Experiment_Name -- adjust before running.
logdir_path = "/content/DL-Art-School/experiments/Test2/tb_logger"

# Launch TensorBoard. This call only returns if no event file exists; otherwise it keeps
# polling the directory until the cell is interrupted.
start_tensorboard(logdir_path)

In [None]:
# NOTE(review): points at one specific, hard-coded event file from the "Test2" run -- replace with your own log path.
%tensorboard --logdir /content/DL-Art-School/experiments/Test2/tb_logger/events.out.tfevents.1710379384.e3e44fc5ce54.64171.0

## 13. Resume training

In [None]:
#@markdown Press the stop button for this cell when you are satisfied with the results, and have seen:

#@markdown `INFO:base:Saving models and training states.`

#@markdown If your training run saves many models, you might exceed the storage limits on the colab runtime. To prevent this, try to delete old checkpoints in /content/DL-Art-School/experiments/$Experiment_Name/(models|training_state)/* via the file explorer panel as the training runs. **Resuming training after a crash requires config editing,** so try to not let that happen.

#@markdown TODO: implement code to automatically prune useless checkpoints later && restore training states

# train.py is launched from codes/, so the config is addressed relative to that directory.
%cd /content/DL-Art-School/codes

# Same command as the initial run; whether it resumes depends on the EXAMPLE_gpt.yml
# generated by the settings cell.
!python3 train.py -opt ../experiments/EXAMPLE_gpt.yml

## 14. Export to Google Drive for saving the resume training's artifacts

In [None]:
import os

from pathlib import Path
from shutil import copy, copytree

# Stop immediately if Drive is not mounted. Everything below writes under this folder, so
# carrying on after a mere printed hint would only fail on the next line with a less
# helpful error.
gdrive_root = Path("/content/gdrive/MyDrive/")
if not gdrive_root.exists():
  raise FileNotFoundError("Connect your Google Drive and try again.")

# Source: the experiment folder produced by training. Destination: a folder of the same
# name under MyDrive/tortoise.
srcdir = Path('/content/DL-Art-School/experiments/'+Experiment_Name)
outdir = Path('/content/gdrive/MyDrive/tortoise/'+Experiment_Name)
# parents=True also creates the shared "tortoise" folder on first use; exist_ok=True makes
# the cell safe to re-run.
outdir.mkdir(parents=True, exist_ok=True)

#@markdown #Pick what training results you'd like to save:
#deleteZeroStepCheckpoints = True #@param {type:"boolean"}
# Each flag below enables one of the copy steps further down in this cell.
saveLogs = True #@param {type:"boolean"}
saveYml = True #@param {type:"boolean"}
saveInferenceCheckpoints = True #@param {type:"boolean"}
#@markdown These only need to be adjusted if you want to commit further training:
saveEMACheckpoints = True #@param {type:"boolean"}
# Export flag only; distinct from the capitalised SaveTrainingStates in the settings cell.
saveTrainingStates = True #@param {type:"boolean"}

# Any value other than the "minimal" string means every checkpoint is copied.
checkpointSavingStrategy = 'minimal (last ckpt only)' #@param ["Everything (I have infinite storage)", "minimal (last ckpt only)"]
#@markdown Note that each checkpoint takes up **1.6GB of space** -- if you want to save all checkpoints, you probably need more than 15GB of gdrive storage.


outdir.mkdir(exist_ok=True)
# Disabled clean-up of step-0 checkpoints, kept for reference (the string is never executed).
'''
if deleteZeroStepCheckpoints:
    for zero_model in srcdir.glob('models/0_gpt*.pth'):
        zero_model.unlink() # remove all 0 step files; useless
    for zero_model in srcdir.glob('training_state/0.state'):
        zero_model.unlink()
'''
if saveLogs:
    # dirs_exist_ok=True matters most here: after an earlier export the Drive folder already
    # holds tb_logger, and without the flag copytree raises FileExistsError.
    copytree(srcdir/'tb_logger', outdir/'tb_logger', dirs_exist_ok=True)
    for log in srcdir.glob('*.log'):
        copy(log, outdir)

if saveYml:
    # Copy the config file(s) found at the top level of the experiment folder.
    for yml in srcdir.glob('*.yml'):
        copy(yml, outdir)

# Materialised as a list because it is walked twice: once for highest_step, once for copying.
infer_models = list((srcdir/'models').glob('*_gpt.pth'))
# The next two are one-shot iterators; each is consumed by a single copy pass below.
training_states = (srcdir/'training_state').iterdir()
ema_models = (srcdir/'models').glob('*_gpt_ema.pth')
# Fail with an explicit message rather than max()'s "arg is an empty sequence".
if not infer_models:
    raise ValueError(f"No '*_gpt.pth' checkpoints found in {srcdir/'models'}; nothing to export.")
# File names start with the step number ("<step>_gpt.pth"); the largest one is the latest.
highest_step = max(int(m.stem.split('_')[0]) for m in infer_models)

def save_model_directory(glob, outsubdir):
    """Copy checkpoint files from the iterable ``glob`` into ``outsubdir``.

    ``outsubdir`` is created if missing. With the "minimal" saving strategy only
    files whose name starts with ``highest_step`` are copied; with any other
    strategy every file is copied. Entries that are not regular files (for
    example a stray ``.ipynb_checkpoints`` folder) and, in minimal mode, files
    whose name does not start with a step number are skipped instead of aborting
    the export.

    Reads the module-level ``checkpointSavingStrategy`` and ``highest_step``.
    """
    outsubdir.mkdir(exist_ok=True)
    for m in glob:
        if not m.is_file():
            continue  # shutil.copy only handles files
        if checkpointSavingStrategy != 'minimal (last ckpt only)':
            copy(m, outsubdir)
            continue
        try:
            step = int(m.stem.split('_')[0])
        except ValueError:
            continue  # not named "<step>...", so it cannot be the latest checkpoint
        if step == highest_step:
            copy(m, outsubdir)

# Export each selected artifact family; inference and EMA weights share the "models" folder.
export_plan = (
    (saveInferenceCheckpoints, infer_models, 'models'),
    (saveEMACheckpoints, ema_models, 'models'),
    (saveTrainingStates, training_states, 'training_state'),
)
for wanted, sources, subdir in export_plan:
    if wanted:
        save_model_directory(sources, outdir/subdir)