# Train Wav2Vec2 model using Miniconda

Rolando Coto-Solano (rolando.a.coto.solano@dartmouth.edu)<br>
Last update: 20260114

## Prepare preliminary data

In [None]:
# Print the current wall-clock time, to timestamp the start of the run.
from datetime import datetime; print(datetime.now().time())

In [None]:
#==================================================
# Which files do you want to process?
#==================================================

# Sub-folder of the installation folder that holds the CSV files.
currentSandbox = "sandbox-user" # Please type sandbox-user or all-wavs
# Folder directly under MyDrive; the dataset path is built from it.
installationFolder = "202506-ood-asr"

# Both values are passed as command-line arguments to the training script.
runId = "01"
desiredTrainEpochs = 21

# CSV files for each split, looked up inside the dataset path.
# The corpus file name is derived from trainFile by replacing "-train.csv".
trainFile = "ood-wav2vec2-train.csv"
validFile = "ood-wav2vec2-valid.csv"
testFile = "ood-wav2vec2-test.csv"

# Language code; used in the KenLM model file name and the output prefix.
asrLang = "ood"

# Order of the KenLM n-gram model (passed to lmplz as -o)
ngrams = 4

In [None]:
# Mount Google Drive under /content/drive/ so the dataset folder is reachable.
# force_remount=True remounts even if a mount is already present.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

In [None]:
# Everything below is derived from the user configuration cell.

# Root folder of the dataset on the mounted Drive (always ends in "/").
datasetPath = f"/content/drive/MyDrive/{installationFolder}/{currentSandbox}/"

# Full paths of the three split CSVs and of the text corpus for the LM.
csvTrain = f"{datasetPath}{trainFile}"
csvValid = f"{datasetPath}{validFile}"
csvTest = f"{datasetPath}{testFile}"
corpusFile = f"{datasetPath}{trainFile.replace('-train.csv', '-corpus.txt')}"

# Raw KenLM output and its post-processed ("-correct") counterpart.
filenameKenlmModel = f"lm-{asrLang}-{ngrams}.arpa"
filenameCorrectKenlmModel = filenameKenlmModel.replace(".arpa", "-correct.arpa")

# Logs go to Drive; model files stay on the Colab local disk.
folderLogFiles = f"{datasetPath}logs-wav2vec2-res/"
folderModelFiles = "/content/wav2vec2-large-xlsr/"

condition = "wav2vec2"

outputPrefix = f"{asrLang}-{condition}"
transferModelPath = ""

In [None]:
# Display the log and model folders so they can be checked before training.
print(folderLogFiles)
print(folderModelFiles)

## Install packages

In [None]:
# Download the training script into the current working directory.
!wget https://rcweb.dartmouth.edu/RCoto/tocc-asr-workshop-202506/train-wav2vec2lm-miniconda-202505.py

In [None]:
# Set PYTHONPATH to an empty value for this session.
# NOTE(review): presumably so Colab's own packages do not leak into the
# Miniconda Python installed below — confirm.
%env PYTHONPATH=

In [None]:
%%bash
# Download the pinned Miniconda installer and run it into /usr/local.
MINICONDA_INSTALLER_SCRIPT=Miniconda3-py310_25.3.1-1-Linux-x86_64.sh
MINICONDA_PREFIX=/usr/local
wget https://repo.continuum.io/miniconda/$MINICONDA_INSTALLER_SCRIPT
chmod +x $MINICONDA_INSTALLER_SCRIPT
# -b: batch mode (no prompts); -f: proceed even though the prefix already
# exists; -p: installation prefix.
./$MINICONDA_INSTALLER_SCRIPT -b -f -p $MINICONDA_PREFIX

In [None]:
!conda --version # expected 25.3.1, matching the Miniconda installer (the old 4.12.0 note is outdated)
!python --version

In [None]:
%%bash
# Pin Python 3.10 from the defaults channel, then update every installed
# package; --yes answers the confirmation prompts.
conda install --channel defaults conda python=3.10 --yes
conda update --channel defaults --all --yes

In [None]:
import sys
# Make the packages installed under the Miniconda prefix importable from
# this kernel. The membership test keeps repeated runs of this cell from
# stacking duplicate entries on sys.path, and nothing is assigned to `_`,
# which IPython uses for the last output.
condaSitePackages = "/usr/local/lib/python3.10/site-packages"
if condaSitePackages not in sys.path:
    sys.path.append(condaSitePackages)

In [None]:
#=====================================================
# Installing packages
# This should take up to 9~13 minutes
#=====================================================

!conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main
!conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r

!conda install -c conda-forge numpy==1.23.5 -y
!conda install -c conda-forge datasets==2.15.0 -y
!conda install -c conda-forge transformers==4.28.0 -y
!conda install -c conda-forge pandas==1.5.3 -y
!conda install -c conda-forge pyctcdecode==0.3.0 -y
!conda install -c conda-forge librosa==0.11.0 -y
!conda install -c conda-forge typing==3.7.4.3 -y
!conda install -c conda-forge statistics==1.0.3.5 -y
!conda install -c conda-forge huggingface_hub==0.21.4 -y
!conda install -c conda-forge kenlm
!pip install https://github.com/kpu/kenlm/archive/master.zip
!pip install torch==1.11.0+cu113 torchaudio==0.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!pip install jiwer==3.1.0

In [None]:
# Install KenLM
# Earlier version of the build steps, kept for reference:
#!apt install libboost-all-dev libeigen3-dev
#!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
#!mkdir -p kenlm/build
#!cd kenlm/build && cmake .. && make -j2

# Build the KenLM command-line tools from source. The kenlm/ paths are
# relative, so this assumes the working directory is /content (the folder
# removed on the next line) — confirm.
!rm -rf /content/kenlm  # Remove old directory completely
!apt-get update
!apt install -y libboost-all-dev libeigen3-dev build-essential
# Download the source tarball and unpack it in place.
!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
!mkdir -p kenlm/build
# NOTE(review): cmake is called by absolute path, presumably to use the
# system cmake rather than another one on PATH — confirm.
!cd kenlm/build && /usr/bin/cmake .. && make -j2

## Make KenLM

In [None]:
#==================================================================
# Create corpus
#==================================================================

import csv

def createCorpus(file1, file2, file3, fileOutput):
    """Collect the 'sentence' column of three CSV files into one text file.

    The three inputs are read in the order given, as UTF-8 CSVs with a header
    row. Every row that has a value for 'sentence' contributes one line; rows
    (or whole files) without that column are ignored. All inputs are read
    before the output is opened. The sentences are written to `fileOutput`
    joined by newlines, with no newline after the last one.
    """
    def _sentences_in(csv_path):
        # Yield the 'sentence' value of each row that has one.
        with open(csv_path, newline='', encoding='utf-8') as handle:
            for record in csv.DictReader(handle):
                text = record.get('sentence')
                if text is not None:
                    yield text

    collected = [
        text
        for csv_path in (file1, file2, file3)
        for text in _sentences_in(csv_path)
    ]

    with open(fileOutput, 'w', encoding='utf-8') as sink:
        sink.write('\n'.join(collected))

# Build the language-model corpus from the sentences of all three splits.
# NOTE(review): validation and test transcripts are included, so the n-gram
# model is trained on the evaluation text — confirm this is intended.
createCorpus(csvTrain, csvValid, csvTest, corpusFile)

In [None]:
# Estimate an n-gram model of order `ngrams` from the corpus and write it in
# ARPA format to the current working directory. The kenlm/ path is relative,
# so this assumes the working directory is where KenLM was built — confirm.
!kenlm/build/bin/lmplz -o {str(ngrams)} <"{corpusFile}" > "{filenameKenlmModel}"

In [None]:
# Write a corrected copy of the ARPA model that also lists the "</s>" token:
#   * the unigram count in the header ("ngram 1=<count>") is raised by one;
#   * the first line containing "<s>" is written twice, the second time with
#     "<s>" replaced by "</s>".
# Every other line is copied unchanged.
# NOTE(review): presumably the decoder used later expects "</s>" among the
# unigrams — confirm.
# Both files are opened as UTF-8 explicitly, matching how the corpus was
# written, so the result does not depend on the runtime's default locale.
with open(filenameKenlmModel, "r", encoding="utf-8") as read_file, open(filenameCorrectKenlmModel, "w", encoding="utf-8") as write_file:
  has_added_eos = False
  for line in read_file:
    if not has_added_eos and "ngram 1=" in line:
      # Rebuild the header line from its two halves instead of using
      # str.replace(), which would also rewrite the order digit when the
      # count happens to be "1".
      label, count = line.rstrip().rsplit("=", 1)
      write_file.write(f"{label}={int(count) + 1}\n")
    elif not has_added_eos and "<s>" in line:
      write_file.write(line)
      write_file.write(line.replace("<s>", "</s>"))
      has_added_eos = True
    else:
      write_file.write(line)

In [None]:
# Show the first 20 lines of the corrected ARPA file to check the header
# counts and the first unigram entries.
!head -20 {filenameCorrectKenlmModel}

## Train model

In [None]:
!python train-wav2vec2lm-miniconda-202505.py $asrLang $csvTrain $csvValid $csvTest $filenameCorrectKenlmModel $folderLogFiles $folderModelFiles $runId $desiredTrainEpochs $outputPrefix

## Visualize Results

In [None]:
# Look for the trained checkpoints

import os

# Immediate sub-directories of the model folder.
sub_checkpoints = [name for name in os.listdir(folderModelFiles) if os.path.isdir(os.path.join(folderModelFiles, name))]

# Keep the "checkpoint-<step>" folders (skipping notebook artefacts such as
# .ipynb_checkpoints) and pair each step number with its full path. Folders
# whose suffix is not a number are ignored instead of crashing int().
numbered = []
for name in sub_checkpoints:
  if "checkpoint" not in name or "ipynb" in name:
    continue
  step = name.split("-")[-1]
  if step.isdecimal():
    numbered.append((int(step), os.path.join(folderModelFiles, name)))

# Sort by step number so both lists are in the same (numeric) order; sorting
# the paths as strings would put checkpoint-1000 before checkpoint-500.
numbered.sort()
checkpointNums = [num for num, path in numbered]
checkpoints = [path for num, path in numbered]

print(checkpointNums)
print(checkpoints)

if not checkpointNums:
  raise ValueError("No checkpoint-<step> folders found in " + folderModelFiles)

lastcheckpoint = max(checkpointNums)
print("Last checkpoint: " + str(lastcheckpoint))

## Save the model onto Google Drive

In [None]:
# Which checkpoints do you want to save?

# You can list specific checkpoints to save, as strings (e.g. ["500"]).
# However, each of them is more than 3GB.
# By default only the last checkpoint found above is saved.
saveCheckpoints = [str(lastcheckpoint)]

In [None]:
# Erase previous models
modelFolder = datasetPath + "wav2vec2-model"
!rm -r {modelFolder}
!mkdir {modelFolder}

# Save new model

for s in saveCheckpoints:

  originFolder = folderModelFiles + "checkpoint-" + s
  destinationFolder = datasetPath + "wav2vec2-model/checkpoint-" + s
  !cp -r $originFolder $destinationFolder

!cp {folderModelFiles}preprocessor_config.json {datasetPath}wav2vec2-model/
!cp {folderModelFiles}special_tokens_map.json {datasetPath}wav2vec2-model/
!cp {folderModelFiles}tokenizer_config.json {datasetPath}wav2vec2-model/
!cp {folderModelFiles}vocab.json {datasetPath}wav2vec2-model/

!cp /content/{filenameCorrectKenlmModel} {datasetPath}wav2vec2-model/
!cp /content/{filenameKenlmModel} {datasetPath}wav2vec2-model/
!cp {corpusFile} {datasetPath}wav2vec2-model/

In [None]:
# Print the current wall-clock time, to timestamp the end of the run.
from datetime import datetime; print(datetime.now().time())

In [None]:
#from google.colab import runtime
#runtime.unassign()