## Preparing the Data

We will demonstrate fine-tuning open source models in order to classify emails into two categories, based on their content.

We will prepare 950 examples to fine-tune on, and use 50 examples to test accuracy. 



In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

categories = ['rec.sport.baseball', 'rec.sport.hockey']
sports_dataset = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories=categories)

labels = [sports_dataset.target_names[x].split('.')[-1] for x in sports_dataset['target']]
texts = [text.strip() for text in sports_dataset['data']]
df = pd.DataFrame(zip(texts, labels), columns = ['raw_prompt','response'])[:1000]
df_train = df[:950]
df_test = df[950:]

In [None]:
df_train['raw_prompt'].iloc[0]

In [None]:
print(df_train['response'].value_counts())

df_test['response'].value_counts()

In [None]:
def build_prompt(text: str):
    return f"Prompt: {text}\nCategory: "

def prepare_df(df: pd.DataFrame):
    df['prompt'] = df.apply(lambda row: build_prompt(row['raw_prompt']), axis=1)
    df.drop('raw_prompt', axis=1, inplace=True)

In [None]:
prepare_df(df_train)

In [None]:
df_train.head()

The data needs to end up in a CSV file that has two columns: `prompt` and `response`, and that is publicly accessible.

In [None]:
df_train.to_csv("sports_training_dataset.csv")

Behind the scenes, we upload our csv to `s3://scale-demo-datasets/sports/sports_training_dataset.csv`, which maps to a URL of `https://scale-demo-datasets.s3.us-west-2.amazonaws.com/sports/sports_training_dataset.csv`. We make sure this is publicly accessible.

In [None]:
# We need to set an environment variable SCALE_API_KEY. 
#   Since we're in an ipynb, we can do this. Otherwise, you just need to set the environment variable
import dotenv
env_file = ".env"
dotenv.load_dotenv(env_file, override=True)



## Fine-Tuning the Model

In [None]:
from llmengine import FineTune, Completion, Model

FineTune.validate_api_key()

In [None]:


create_fine_tune_response = FineTune.create(
    model="llama-7b",
    training_file="https://scale-demo-datasets.s3.us-west-2.amazonaws.com/sports/sports_training_dataset.csv",
    validation_file=None,
    hyperparameters={},
    suffix="my-first-finetune"
)

fine_tune_id = create_fine_tune_response.fine_tune_id




In [None]:
# Wait for fine tune to complete

import time
while True:
    fine_tune_status = FineTune.retrieve(fine_tune_id).status
    print(fine_tune_status)
    if fine_tune_status == "SUCCESS":
        break
    elif fine_tune_status in ["FAILURE", "CANCELLED"]:
        raise ValueError("Fine Tune failed")
    time.sleep(10)


In [None]:
all_models = Model.list().model_endpoints

# We want to get just your fine-tuned models.
your_personal_fine_tunes = [model for model in all_models if not model.spec.public_inference]

your_fine_tuned_model = your_personal_fine_tunes[0].name

In [15]:
# hard-coded value from a previous run of this script
your_fine_tuned_model = "llama-7b.my-first-finetune.2023-07-17-19-44-20"

## Test the Fine-Tune

In [36]:
def get_classification(prompt: str):
    for _ in range(5):
        try:
            response = Completion.create(model_name=your_fine_tuned_model, prompt=build_prompt(prompt), max_new_tokens=2)
            # print(response.outputs[0].text)
        
            return response.outputs[0].text.rstrip("\n")
        except Exception as e:
            print(e)
    return "Error"

In [37]:
df_test["predicted_response"] = df_test["raw_prompt"].apply(get_classification)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["predicted_response"] = df_test["raw_prompt"].apply(get_classification)


In [32]:
df_test.head()

Unnamed: 0,raw_prompt,response,predicted_response
950,From: tedward@cs.cornell.edu (Edward [Ted] Fis...,baseball,baseball
951,From: smorris@venus.lerc.nasa.gov (Ron Morris ...,hockey,hockey
952,From: shah@pitt.edu (Ravindra S Shah)\nSubject...,hockey,hockey
953,From: timlin@spot.Colorado.EDU (Michael Timlin...,baseball,baseball
954,From: gp2011@andy.bgsu.edu (George Pavlic)\nSu...,hockey,hockey


In [38]:
num_correct = len(df_test[df_test["predicted_response"] == (df_test["response"])])

In [39]:
num_correct / len(df_test)

0.94

In [40]:
df_test[df_test["predicted_response"] != df_test["response"]]

Unnamed: 0,raw_prompt,response,predicted_response
974,From: maX <maX@maxim.rinaco.msk.su>\nSubject: ...,hockey,
988,From: jca2@cec1.wustl.edu (Joseph Charles Achk...,hockey,NHL
997,From: apland@mala.bc.ca (Ron Apland)\nSubject:...,hockey,
