## Preparing the Data

We will demonstrate fine-tuning an open-source model to classify email-style newsgroup posts into two categories (baseball or hockey), based on their content.

We will prepare 950 examples to fine-tune on, and use 50 examples to test accuracy. 



In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Restrict the 20 Newsgroups training set to two sports groups so the task is binary.
categories = ['rec.sport.baseball', 'rec.sport.hockey']
sports_dataset = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories=categories)

# Label each post with the last component of its newsgroup name ("baseball" / "hockey").
labels = [sports_dataset.target_names[x].split('.')[-1] for x in sports_dataset['target']]
texts = [text.strip() for text in sports_dataset['data']]
df = pd.DataFrame(zip(texts, labels), columns = ['raw_prompt','response'])[:1000]

# Take explicit copies: later cells add and drop columns on these frames, and
# doing that on a plain slice of `df` triggers pandas' SettingWithCopyWarning.
df_train = df[:950].copy()
df_test = df[950:].copy()

In [None]:
# Show the first raw training post to get a feel for the input text.
df_train['raw_prompt'].iloc[0]

In [None]:
# Class balance of the training split (printed explicitly because a notebook
# cell only echoes its last expression).
print(df_train['response'].value_counts())

# Class balance of the held-out test split.
df_test['response'].value_counts()

In [None]:
def build_prompt(text: str):
    """Wrap ``text`` in the prompt template used for fine-tuning and inference.

    The result is ``text`` prefixed with ``"Prompt: "`` and followed by a
    newline and ``"Category: "``.
    """
    template = "Prompt: {}\nCategory: "
    return template.format(text)

def prepare_df(df: pd.DataFrame):
    """Convert ``df`` in place from the raw layout to the fine-tuning layout.

    A ``prompt`` column is added by applying ``build_prompt`` to every value
    of ``raw_prompt``; the ``raw_prompt`` column is then removed. The frame
    is modified in place and nothing is returned.
    """
    formatted_prompts = df['raw_prompt'].map(build_prompt)
    df['prompt'] = formatted_prompts
    # The raw text is no longer needed once the formatted prompt exists.
    df.drop(columns='raw_prompt', inplace=True)

In [None]:
# Convert the training split in place: adds `prompt`, drops `raw_prompt`.
# Only df_train is converted here; df_test keeps its `raw_prompt` column.
prepare_df(df_train)

In [None]:
# Preview the first rows of the training split.
df_train.head()

The data needs to end up in a CSV file that has two columns: `prompt` and `response`, and that is publicly accessible.

In [None]:
# index=False keeps the file to the DataFrame's own columns (`prompt` and
# `response`); the default would also write the row index as an extra,
# unnamed first column.
df_train.to_csv("sports_training_dataset.csv", index=False)

Behind the scenes, we upload our CSV to `s3://scale-demo-datasets/sports/sports_training_dataset.csv`.

In [None]:
# The SCALE_API_KEY environment variable must be set for the cells below.
#   In a notebook it is convenient to load it from a local .env file; outside
#   a notebook, simply set the environment variable yourself.
import dotenv

env_file = ".env"
# override=True lets values from the file replace variables that are already set.
dotenv.load_dotenv(dotenv_path=env_file, override=True)



In [None]:
from llmengine import FineTune, Completion, Model

# Check the API key before submitting any job.
# NOTE(review): presumably this verifies the SCALE_API_KEY environment variable;
# the exact checks are not visible here, so confirm against the llmengine docs.
FineTune.validate_api_key()

In [None]:


# Submit a fine-tuning job for the `llama-7b` model on the uploaded training CSV.
create_fine_tune_response = FineTune.create(
    model="llama-7b",
    training_file="s3://scale-demo-datasets/sports/sports_training_dataset.csv",
    # No validation file is supplied; per the argument notes below, a split is
    # then generated from the training dataset.
    validation_file=None,
    # Empty dict: no hyperparameters are specified here.
    hyperparameters={},
    suffix="my-first-finetune"
)

# Keep the job id; it is used below to poll the job's status.
fine_tune_id = create_fine_tune_response.fine_tune_id


# Argument notes kept for reference. As the last expression of the cell, this
# bare string is also echoed as the cell's output.
# NOTE(review): the notes list `model_name`, `base_model` and
# `fine_tuning_method`, whereas the call above passes `model` and `suffix`;
# confirm against the current FineTune.create signature.
"""
Args:
            training_file (`str`):
                A path to the file containing the training dataset.
                Dataset must be a CSV file with columns 'prompt' and 'response'.
                The value must be the URI of a publicly accessible file on bulk storage, e.g.
                s3://{public_s3_bucket}/{public_s3_key} for a file stored on s3.
            validation_file (`str`):
                A path to the file containing the validation dataset.
                Has the same format as training_file.
                If not provided, we will generate a split from the training dataset.
            model_name (`str`):
                The name of the fine-tuned model
            base_model (`str`):
                Base model to train from
            fine_tuning_method (`str`):
                Fine-tuning method
            hyperparameters (`str`):
                Hyperparameters
"""

In [None]:
# Poll the fine-tuning job until it finishes, printing each status seen.

import time
while True:
    fine_tune_status = FineTune.retrieve(fine_tune_id).status
    print(fine_tune_status)
    # Failure states: stop polling and surface the problem.
    if fine_tune_status in ("FAILURE", "CANCELLED"):
        raise ValueError("Fine Tune failed")
    if fine_tune_status == "SUCCESS":
        break
    # Any other status: wait 10 seconds before asking again.
    time.sleep(10)


In [None]:
# Every model endpoint returned for this account.
all_models = Model.list().model_endpoints

# Keep only the endpoints whose spec is not flagged `public_inference`,
# i.e. just your fine-tuned models.
your_personal_fine_tunes = [
    endpoint for endpoint in all_models if not endpoint.spec.public_inference
]

# NOTE(review): this takes the first non-public endpoint in the listing, which
# is not guaranteed to be the model trained above if the account has several.
# Confirm the ordering, or select by name.
your_fine_tuned_model = your_personal_fine_tunes[0].name

In [None]:
# TODO run through everything in test set
def get_classification(prompt: str):
    """Request a completion from the fine-tuned model for one raw text.

    The text is wrapped with ``build_prompt`` before being sent, so callers
    pass the unformatted post. Generation is capped at 2 new tokens.

    Returns the object produced by ``Completion.create`` unchanged; it is
    also printed.
    NOTE(review): callers that need the predicted label as a string have to
    extract it from this response object; confirm the response schema.
    """
    completion = Completion.create(
        model_name=your_fine_tuned_model,
        prompt=build_prompt(prompt),
        max_new_tokens=2,
    )
    print(completion)
    return completion

In [None]:
# df_test was never passed through prepare_df(), so it has no `prompt` column,
# only `raw_prompt` and `response`. get_classification() applies build_prompt()
# to its argument itself, so the raw text is the right input.
df_test["predicted_response"] = df_test["raw_prompt"].apply(get_classification)

In [None]:
# Preview the first rows of the test split, including the new prediction column.
df_test.head()

In [None]:
# A pandas Series has no `.startswith` method: string methods live under
# `.str`, and `.str.startswith` takes one fixed pattern rather than a per-row
# Series. Compare each prediction with its own label row by row instead.
# Leading whitespace is ignored because a completion may begin with a space.
# NOTE(review): assumes `predicted_response` holds the generated text; if it
# holds raw completion response objects, extract their text before comparing.
num_correct = sum(
    str(predicted).lstrip().startswith(expected)
    for predicted, expected in zip(df_test["predicted_response"], df_test["response"])
)