# Run this notebook if a tokenizer is already present in the Hugging Face (HF) repository

In [1]:
import os
from pathlib import Path
from huggingface_hub import HfApi,hf_hub_download
from dotenv import load_dotenv

### Load the environment variables with dotenv, which also reads the .env file

In [2]:
# Load environment variables via dotenv before HF_TOKEN is read with
# os.getenv when the API client is created.
load_dotenv()

True

### Set up the API client with `HfApi` and the repository details

In [3]:
# Hugging Face repository id that the rest of the notebook reads from.
repo_id = "smathad/eng-indic-transformer"

# API client; the access token comes from the HF_TOKEN environment variable
# (os.getenv returns None when it is not set).
api = HfApi(token=os.getenv("HF_TOKEN"))

### Set the paths to the tokenizer, models, and reports directories

In [4]:
# Base directory = parent of the current working directory, as an absolute
# path. All other local paths in this notebook are built from it.
base_dir = Path.cwd().parent
base_dir

PosixPath('/Users/sameergururajmathad/eng-indic-transformer')

In [5]:
# Paths to the tokenizer, models and reports folders, each a direct child
# of base_dir.
tokenizer_dir, models_dir, reports_dir = (
    base_dir / folder for folder in ('tokenizer', 'models', 'reports')
)

### List all the files present in the HF repo and keep only the ones to download

In [6]:
# Every file path in the remote repository, as returned by the API client.
files = api.list_repo_files(repo_id)

# Only download files that are needed: keep the repo paths that start with
# one of these names (tokenizer, models, reports and checkpoints).
needed_prefixes = ('tokenizer', 'models', 'reports', 'checkpoints')

# str.startswith accepts a tuple of prefixes, so a single call replaces the
# chain of `or` checks.
filtered_files = [
    repo_file for repo_file in files if repo_file.startswith(needed_prefixes)
]

In [7]:
filtered_files

['checkpoints/optimizer.pt',
 'models/model.pt',
 'reports/en_hindi_train_report.csv',
 'tokenizer/tokenizer.model',
 'tokenizer/tokenizer.vocab']

In [8]:
for repo_file in filtered_files:
    # Download each selected file into base_dir. Passing local_dir places
    # the file under that directory instead of leaving it only in the
    # default Hugging Face cache location.
    # local_path holds the path returned for the most recent download.
    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=repo_file,
        repo_type='model',
        local_dir=base_dir,
    )

checkpoints/optimizer.pt:   0%|          | 0.00/740M [00:00<?, ?B/s]

models/model.pt:   0%|          | 0.00/377M [00:00<?, ?B/s]

en_hindi_train_report.csv:   0%|          | 0.00/296 [00:00<?, ?B/s]

tokenizer/tokenizer.model:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

tokenizer.vocab:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

In [9]:
# Confirmation message printed after the download cell.
# NOTE(review): the message text misspells "successfully"; left unchanged
# here so the emitted output stays the same.
print('Downloaded sucessfully!!!')

Downloaded sucessfully!!!
