Notebook to push the trained tokenizer, models, checkpoints, and training reports to the Hugging Face Hub.

In [36]:
import os
from pathlib import Path
from huggingface_hub import HfApi
from dotenv import load_dotenv

### Load the environment variables using dotenv, which also reads the `.env` file

In [37]:
# Load variables via python-dotenv (see the heading above: it reads the .env
# file) so that os.getenv("HF_TOKEN") in the next cell can find the token.
load_dotenv()

True

### Set up the API client with `HfApi` and the target repo id

In [38]:
# Target repository on the Hub that every upload/tag call below addresses.
repo_id = "smathad/eng-indic-transformer"

# Client authenticated with the token read from the HF_TOKEN environment
# variable (None is passed through when the variable is not set).
api = HfApi(token=os.environ.get("HF_TOKEN"))

### Set the paths to the tokenizer, models, reports, and checkpoints directories

In [39]:
# Project root: the parent of the current working directory (the notebook is
# expected to be run from a sub-folder of the project).
base_dir = Path.cwd().parent
base_dir

PosixPath('/Users/sameergururajmathad/eng-indic-transformer')

In [40]:
# Artifact folders, all direct children of the project root.
tokenizer_dir, models_dir, reports_dir, checkpoints_dir = (
    base_dir / folder_name
    for folder_name in ('tokenizer', 'models', 'reports', 'checkpoints')
)

### Push tokenizer directory if it is trained and exists.

In [41]:
# Publish the tokenizer folder when it is present locally; otherwise just
# report that there is nothing to upload.
if not tokenizer_dir.exists():
    print("Tokenizer directory doesn't exist..")
else:
    # create_pr=True is passed, so the upload is submitted as a pull request.
    api.upload_folder(
        folder_path=tokenizer_dir,
        repo_id=repo_id,
        repo_type='model',
        path_in_repo='tokenizer',
        create_pr=True,
        commit_message='pushing tokenizer to hf',
        commit_description=(
            'tokenizer folder contains tokenizer.vocab and tokenizer.model '
            'trained using sentence piece.'
        ),
    )

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

### Push the reports directory to hf, so it's easy for others to see the training results.

In [44]:
# Publish the training reports folder when it is present locally; otherwise
# just report that there is nothing to upload.
if not reports_dir.exists():
    print("Reports directory doesn't exist..")
else:
    # create_pr=True is passed, so the upload is submitted as a pull request.
    api.upload_folder(
        folder_path=reports_dir,
        repo_id=repo_id,
        repo_type='model',
        path_in_repo='reports',
        create_pr=True,
        commit_message='pushing training reports to hf',
        commit_description=(
            'The model is trained for 5 epochs for english to hindi '
            'translation and contains a csv file with train, test and '
            'bleu scores.'
        ),
    )

### Push models directory if it is trained and exists

In [42]:
# Publish the models folder when it is present locally; otherwise just
# report that there is nothing to upload.
if not models_dir.exists():
    print("Models directory doesn't exist..")
else:
    # create_pr=True is passed, so the upload is submitted as a pull request.
    api.upload_folder(
        repo_id=repo_id,
        repo_type='model',
        folder_path=models_dir,
        path_in_repo='models',
        commit_message='Pushing model into hf',
        commit_description='Models folder contains base(initialized) models.',
        create_pr=True,
    )

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

### Push checkpoints directory if it is trained and exists

In [43]:
# Publish the checkpoints folder when it is present locally; otherwise just
# report that there is nothing to upload.
if checkpoints_dir.exists():
    # create_pr=True is passed, so the upload is submitted as a pull request.
    api.upload_folder(
        folder_path=checkpoints_dir,
        repo_id=repo_id,
        repo_type='model',
        path_in_repo='checkpoints',
        create_pr=True,
        commit_message='Pushing optimizer state into hf',
        # The description previously said "Models folder" (copied from the
        # models cell). NOTE(review): confirm the wording matches what the
        # checkpoints folder actually holds.
        commit_description='Checkpoints folder contains base(initialized) model state.'
    )
else:
    # Name the directory that is actually missing (was "Models directory").
    print('Checkpoints directory doesn\'t exist..')

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

### Create a version tag so this release can be referenced later.

In [45]:
# Fetch the refs that already exist on the repo; the tag names are pulled
# out of this result in the next cell.
refs = api.list_repo_refs(repo_id=repo_id, repo_type='model')


In [46]:
# Collect the name of every tag found in the fetched refs.
tags = [tag_ref.name for tag_ref in refs.tags]

### Check the existing tag versions before creating a new tag

In [47]:
# Bare expression: displays the tag names collected above as the cell output.
tags

[]

In [48]:
# Version label for the trained model published by this notebook.
latest_tag = 'v1.0'
# Append so the new version sits last; the tagging cell reads tags[-1].
# NOTE: each run of this cell appends the label again (not idempotent).
tags.append(latest_tag)
tags

['v1.0']

In [49]:
# This cell is live (not commented out): run it only once the upload pull
# requests have been merged and you are sure you want to publish a new tag.
# No `revision` is passed, so the tag lands on whatever revision create_tag
# uses by default — TODO confirm that is the merged state you intend to tag.
if models_dir.exists():
    api.create_tag(
        repo_id=repo_id,
        repo_type='model',  # explicit, matching the list_repo_refs call
        tag=tags[-1],
        # Python f-strings interpolate with {...}; the former `${...}` left a
        # stray '$' in the message ("... tag of $v1.0.").
        tag_message=f'tagging the model with a tag of {tags[-1]}.'
    )