# Run this file if there is already a tokenizer present in the hf repository

In [1]:
import os
from pathlib import Path
from huggingface_hub import HfApi,hf_hub_download
from dotenv import load_dotenv

### load the envrionment variables using dotenv that loads .env file as well

In [2]:
# Load key/value pairs from the .env file into the process environment so
# that later cells can read secrets such as HF_TOKEN with os.getenv.
load_dotenv()

True

### Set up the api with HF_Api and other details

In [3]:
# Access token for the Hugging Face Hub, taken from the environment
# (None when HF_TOKEN is not set).
hf_token = os.environ.get("HF_TOKEN")

# Hub client used for the repository queries below.
api = HfApi(token=hf_token)

# Hugging Face repository that stores the tokenizer and model files.
repo_id = "smathad/eng-indic-transformer"

### Set the directories to tokenizer and model directories

In [4]:
# Project root: the parent of the current working directory (the notebook is
# expected to be run from a sub-directory of the project).
base_dir = Path.cwd().parent
base_dir

PosixPath('/root/eng-indic-transformer')

In [5]:
# Local folders, directly under the project root, that hold the tokenizer
# files and the model files respectively.
tokenizer_dir = base_dir.joinpath('tokenizer')
models_dir = base_dir.joinpath('models')

### List all the files present in hf-repo

In [6]:
# Ask the Hub for every file path stored in the repository, then split the
# listing by path prefix. A path cannot start with both prefixes, so the two
# filters below are mutually exclusive; repo order is preserved in each list.
files = api.list_repo_files(repo_id)

tokenizer_files = [path for path in files if path.startswith('tokenizer')]
models_files = [path for path in files if path.startswith('models')]

In [7]:
# Display the repository paths that were classified as tokenizer files.
tokenizer_files

['tokenizer/tokenizer.model', 'tokenizer/tokenizer.vocab']

In [8]:
# Display the repository paths that were classified as model files.
models_files

['models/transformer.pt']

In [9]:
# Download every tokenizer file that is not yet present locally.
#
# Each file is checked individually instead of only testing whether the
# tokenizer directory exists: an empty or partially populated directory
# (e.g. left behind by an interrupted run) would otherwise cause the missing
# files to be skipped silently.
missing_tokenizer_files = [
    file for file in tokenizer_files if not (base_dir / file).exists()
]

for file in missing_tokenizer_files:
    # local_dir=base_dir keeps the repo-relative path, so a repo file such as
    # 'tokenizer/tokenizer.model' is written to base_dir / 'tokenizer' / ...
    # (not to the default Hugging Face cache location).
    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=file,
        repo_type='model',
        local_dir=base_dir,
    )

# Same message as before when the directory is there and nothing is missing.
if not missing_tokenizer_files and tokenizer_dir.exists():
    print('Tokenizer directory already present')

tokenizer/tokenizer.model:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

tokenizer.vocab:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

In [None]:
# Download every model file that is not yet present locally.
#
# Each file is checked individually instead of only testing whether the
# models directory exists: an empty or partially populated directory
# (e.g. left behind by an interrupted run) would otherwise cause the missing
# files to be skipped silently.
missing_models_files = [
    file for file in models_files if not (base_dir / file).exists()
]

for file in missing_models_files:
    # local_dir=base_dir keeps the repo-relative path, so a repo file such as
    # 'models/transformer.pt' is written to base_dir / 'models' / ...
    # (not to the default Hugging Face cache location).
    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=file,
        repo_type='model',
        local_dir=base_dir,
    )

# Same message as before when the directory is there and nothing is missing.
if not missing_models_files and models_dir.exists():
    print('Models directory already present')

models/transformer.pt:   0%|          | 0.00/432M [00:00<?, ?B/s]