# Download a Fine-Tuned Model
This notebook demonstrates how to download a fine-tuned model that you've created using LLM Engine and upload it to the Hugging Face Hub!

**This notebook is an extension of the previous finetuning notebook on ScienceQA**

# Packages Required
For this demo, we'll be using the `scale-llm-engine` package, the `datasets` package for downloading our fine-tuning dataset, `aiohttp` for downloading the model files, and `transformers` and `huggingface_hub` for uploading our model to Hugging Face.


In [None]:
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install scale-llm-engine
%pip install transformers
%pip install huggingface_hub
%pip install datasets
%pip install aiohttp
# `smart_open` is imported by the data-preparation cell; the s3 extra is needed
# for the optional `s3://` upload path.
%pip install "smart_open[s3]"

# Data Preparation
Let's load in the dataset using Huggingface and view the features.

In [None]:
from datasets import load_dataset
from smart_open import smart_open
import pandas as pd

# Load ScienceQA through the `datasets` library.
dataset = load_dataset(path='derek-thomas/ScienceQA')

# Left as the final expression so the notebook renders the training-split schema.
dataset['train'].features

Now, let's format the dataset into what's acceptable for LLM Engine - a CSV file with 'prompt' and 'response' columns.

In [None]:
# Labels 'A' through 'Z', indexed by answer-option position.
choice_prefixes = [chr(code) for code in range(ord('A'), ord('A') + 26)]  # A-Z
def format_options(options, choice_prefixes):
    """Render answer options on one line as ``(A) first (B) second ...``.

    Each option is paired with the prefix at the same position; pairing stops
    at the shorter of the two sequences. Entries are joined by a single space.
    """
    labelled = []
    for prefix, option in zip(choice_prefixes, options):
        labelled.append(f'({prefix}) {option}')
    return ' '.join(labelled)

def format_prompt(r, choice_prefixes):
    """Build the prompt text for a single record.

    Reads ``r['choices']``, ``r['hint']`` and ``r['question']`` and lays them
    out as ``Context`` / ``Question`` / ``Options`` lines followed by a
    trailing ``Answer:`` for the model to complete.
    """
    rendered_choices = format_options(r['choices'], choice_prefixes)
    context, question = r["hint"], r["question"]
    return f'''Context: {context}\nQuestion: {question}\nOptions:{rendered_choices}\nAnswer:'''

def format_label(r, choice_prefixes):
    """Return the prefix found at index ``r['answer']`` of ``choice_prefixes``."""
    answer_index = r['answer']
    return choice_prefixes[answer_index]

def convert_dataset(ds):
    """Turn an iterable of records into a ``prompt`` / ``response`` DataFrame.

    Records whose ``'hint'`` is the empty string are skipped. Each remaining
    record contributes one row: the prompt from ``format_prompt`` and the
    answer letter from ``format_label``.

    Args:
        ds: iterable of dict-like records with ``hint``, ``question``,
            ``choices`` and ``answer`` keys.

    Returns:
        pandas.DataFrame with columns ``prompt`` and ``response``.
    """
    prompts, labels = [], []
    # Walk the data once. Two separate filtered passes cost twice as much and
    # silently produce mismatched columns when `ds` is a one-shot iterator.
    for record in ds:
        if record['hint'] == '':
            continue
        prompts.append(format_prompt(record, choice_prefixes))
        labels.append(format_label(record, choice_prefixes))
    return pd.DataFrame.from_dict({'prompt': prompts, 'response': labels})

# Flip to True to regenerate the CSVs and write them to your own S3 locations.
save_to_s3 = False
# Built unconditionally so the frame can be previewed at the end of the cell.
df_train = convert_dataset(dataset['train'])
if save_to_s3:
    train_url = 's3://...'
    val_url = 's3://...'
    # `df_train` already exists; converting the split again would only repeat
    # a full pass over the training data.
    with smart_open(train_url, 'wb') as f:
        df_train.to_csv(f)

    df_val = convert_dataset(dataset['validation'])
    with smart_open(val_url, 'wb') as f:
        df_val.to_csv(f)
else:
    # Gists of the already processed datasets
    train_url = 'https://gist.githubusercontent.com/jihan-yin/43f19a86d35bf22fa3551d2806e478ec/raw/91416c09f09d3fca974f81d1f766dd4cadb29789/scienceqa_train.csv'
    val_url = 'https://gist.githubusercontent.com/jihan-yin/43f19a86d35bf22fa3551d2806e478ec/raw/91416c09f09d3fca974f81d1f766dd4cadb29789/scienceqa_val.csv'

df_train

# Fine-tune
Now, we can fine-tune the model using LLM Engine.

In [None]:
import os

# 'xxx' looks like a placeholder for a real API key — substitute your own.
# The variable is set before importing llmengine, as in the original cell order.
os.environ['SCALE_API_KEY'] = 'xxx'

from llmengine import FineTune

# Arguments for the fine-tuning job: base model, the two CSV URLs chosen in
# the data-preparation cell, the learning rate, and a suffix for the model name.
fine_tune_request = dict(
    model="llama-2-7b",
    training_file=train_url,
    validation_file=val_url,
    hyperparameters={'lr': 2e-4},
    suffix='science-qa-llama',
)

response = FineTune.create(**fine_tune_request)
# The job id is what later cells use to look the fine-tune up again.
run_id = response.id

We can poll the job status once a minute until the job completes.

In [None]:
import time

# Statuses after which the job can no longer reach 'SUCCESS'.
# NOTE(review): names follow LLM Engine's documented job statuses — confirm
# against the installed client version.
failed_statuses = ('FAILURE', 'CANCELLED', 'TIMEOUT')

# Poll once a minute until the job finishes, one way or the other.
while True:
    fine_tune_job = FineTune.get(run_id)
    job_status = fine_tune_job.status
    print(job_status)
    if job_status == 'SUCCESS':
        break
    # Without this check a failed or cancelled job would be polled forever.
    if job_status in failed_statuses:
        raise RuntimeError(f'Fine-tune {run_id} ended with status {job_status}')
    time.sleep(60)

# The response that reported SUCCESS also carries the resulting model name.
fine_tuned_model = fine_tune_job.fine_tuned_model

# Downloading our Finetuned model 
Let's download the weights for the new fine-tuned model using LLM Engine.

In [None]:
from llmengine import Model

# Use the model name captured when the fine-tune finished. The previous
# `FineTune.get(run_id).fine_tune_model` misspelled the attribute
# (`fine_tuned_model`) and raised AttributeError.
# NOTE(review): LLM Engine's docs spell the default format "hugging_face" —
# confirm that "huggingface" is accepted by the server.
response = Model.download(fine_tuned_model, download_format="huggingface")
print(response.urls)

We now have the URL(s) that point to the file(s) where our fine-tuned model lives. The associated files can be downloaded either synchronously or asynchronously; below we download them asynchronously with `aiohttp`.

In [None]:
import os
import aiohttp
import asyncio
from urllib.parse import urlparse

async def download_file(session, url, output_dir, chunk_size=1 << 20):
    """Download ``url`` into ``output_dir``, streaming the body to disk.

    The local file name is the last path component of the URL (query string
    and fragment are ignored).

    Args:
        session: an ``aiohttp.ClientSession``-like object whose ``get(url)``
            is an async context manager yielding the response.
        url: address of the file to fetch.
        output_dir: existing directory to write the file into.
        chunk_size: number of bytes requested per read (default 1 MiB).

    Raises:
        ValueError: if no file name can be derived from ``url``.
        Whatever ``response.raise_for_status()`` raises on an HTTP error.
    """
    # Parse the URL to get the filename
    filename = os.path.basename(urlparse(url).path)
    if not filename:
        raise ValueError(f"Cannot derive a file name from URL: {url}")

    # Download the file
    async with session.get(url) as response:
        response.raise_for_status()

        # Write chunk by chunk: model weight files can be far larger than
        # available RAM, so the whole body must not be buffered in memory.
        with open(os.path.join(output_dir, filename), 'wb') as f:
            async for chunk in response.content.iter_chunked(chunk_size):
                f.write(chunk)

        print(f"Downloaded {filename}")

async def download_files(urls, output_dir):
    """Download every URL in ``urls`` into ``output_dir`` concurrently.

    Args:
        urls: iterable of URLs, or a dict whose *values* are URLs.
            NOTE(review): LLM Engine documents ``Model.download(...).urls`` as
            a file-name -> URL mapping; iterating such a dict directly would
            yield the file names instead of the URLs — confirm against the
            installed client.
        output_dir: destination directory; created if it does not exist.
    """
    if isinstance(urls, dict):
        urls = list(urls.values())

    os.makedirs(output_dir, exist_ok=True)

    # aiohttp's default total timeout is 5 minutes, which multi-gigabyte
    # weight files can easily exceed; disable the overall cap for this session.
    no_total_timeout = aiohttp.ClientTimeout(total=None)
    async with aiohttp.ClientSession(timeout=no_total_timeout) as session:
        tasks = [download_file(session, url, output_dir) for url in urls]
        await asyncio.gather(*tasks)



In [None]:
from concurrent.futures import ThreadPoolExecutor

output_directory = "YOUR_MODEL_DIR"


def _run_coroutine(coro):
    """Run ``coro`` to completion whether or not an event loop is already running."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread (e.g. a plain script): asyncio.run works directly.
        return asyncio.run(coro)
    # A loop is already running (as inside a Jupyter kernel), where calling
    # asyncio.run raises RuntimeError. Drive the coroutine on a fresh loop in
    # a worker thread and block until it finishes.
    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


_run_coroutine(download_files(response.urls, output_directory))

In [None]:
import os
import tarfile
from getpass import getpass
from huggingface_hub import HfApi

def create_tarball(source_dir, output_filename):
    """Write ``source_dir`` into a gzip-compressed tar archive.

    The directory is stored under its own base name, so extracting the
    archive recreates a single top-level folder.

    Args:
        source_dir: directory (or file) to archive.
        output_filename: path of the ``.tar.gz`` file to create.
    """
    # normpath strips a trailing separator; without it basename('model/') is
    # '' and the files would be stored at the archive root instead.
    archive_root = os.path.basename(os.path.normpath(source_dir))
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=archive_root)

def upload_to_huggingface_model_hub(source_dir, model_name, hf_username, hf_password=None):
    """Archive ``source_dir`` and upload the archive to a Hugging Face model repo.

    The repo ``{hf_username}/{model_name}`` is created if it does not exist,
    ``{model_name}.tar.gz`` is built in the current working directory,
    uploaded to the repo root, and then removed locally.

    Args:
        source_dir: directory holding the model files to archive.
        model_name: repo name without namespace; also names the tarball.
        hf_username: user or organisation that owns the repo.
        hf_password: a Hugging Face User Access Token with write permission.
            The parameter keeps its historical name for backward
            compatibility; current ``huggingface_hub`` releases authenticate
            with tokens and no longer provide ``HfApi.login(username,
            password)``. Prompted for interactively when ``None``.
    """
    # Ask for the credential if not provided
    if hf_password is None:
        hf_password = getpass("Enter Hugging Face access token: ")
    token = hf_password

    api = HfApi()
    repo_id = f"{hf_username}/{model_name}"

    # Create a new repository (no-op if it already exists)
    print(f"Creating new model {model_name}.")
    repo_url = api.create_repo(repo_id=repo_id, token=token, exist_ok=True)

    tarball_name = f"{model_name}.tar.gz"
    try:
        # Create a tarball of the source directory
        create_tarball(source_dir, tarball_name)

        # Upload the tarball
        print(f"Uploading {tarball_name} to {repo_url}.")
        api.upload_file(
            path_or_fileobj=tarball_name,
            path_in_repo=tarball_name,  # The name of the file in the repo
            repo_id=repo_id,  # The id of the repo
            token=token,
        )
    finally:
        # Delete the tarball even when archiving or uploading fails, so a
        # multi-gigabyte temporary file is not left behind.
        if os.path.exists(tarball_name):
            os.remove(tarball_name)

In [None]:
# The credential argument is omitted on purpose: the function then prompts for
# it with getpass, so no secret is ever saved in the notebook.
upload_to_huggingface_model_hub('YOUR_MODEL_DIR', 'YOUR_MODEL_NAME', 'YOUR_HUGGINGFACE_USER_NAME')