In [2]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import openai

In [3]:
# Restrict the 20 Newsgroups corpus to the two sports groups used for this binary classifier.
categories = ['rec.sport.baseball', 'rec.sport.hockey']
sports_dataset = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,  # fixed seed so the shuffled order is reproducible
)

In [4]:
# Show the first raw post (headers included) to get a feel for the input text.
print(sports_dataset.data[0])

From: dougb@comm.mot.com (Doug Bank)
Subject: Re: Info needed for Cleveland tickets
Reply-To: dougb@ecs.comm.mot.com
Organization: Motorola Land Mobile Products Sector
Distribution: usa
Nntp-Posting-Host: 145.1.146.35
Lines: 17

In article <1993Apr1.234031.4950@leland.Stanford.EDU>, bohnert@leland.Stanford.EDU (matthew bohnert) writes:

|> I'm going to be in Cleveland Thursday, April 15 to Sunday, April 18.
|> Does anybody know if the Tribe will be in town on those dates, and
|> if so, who're they playing and if tickets are available?

The tribe will be in town from April 16 to the 19th.
There are ALWAYS tickets available! (Though they are playing Toronto,
and many Toronto fans make the trip to Cleveland as it is easier to
get tickets in Cleveland than in Toronto.  Either way, I seriously
doubt they will sell out until the end of the season.)

-- 
Doug Bank                       Private Systems Division
dougb@ecs.comm.mot.com          Motorola Communications Sector
dougb@nwu.edu       

In [5]:
# Map the first sample's integer target to its newsgroup name.
first_target = sports_dataset.target[0]
sports_dataset.target_names[first_target]

'rec.sport.baseball'

In [6]:
# Tally the examples per class; target value 0 is counted as baseball and 1 as hockey.
len_all = len(sports_dataset.data)
len_baseball = sum(1 for target in sports_dataset.target if target == 0)
len_hockey = sum(1 for target in sports_dataset.target if target == 1)
print(f"Total examples: {len_all}, Baseball examples: {len_baseball}, Hockey examples: {len_hockey}")

Total examples: 1197, Baseball examples: 597, Hockey examples: 600


In [7]:
# Reduce each full newsgroup name (e.g. 'rec.sport.hockey') to its last dot-separated part,
# then look up that short name for every sample's integer target.
short_names = [name.rsplit('.', 1)[-1] for name in sports_dataset.target_names]
labels = [short_names[idx] for idx in sports_dataset.target]
print(len(labels))

1197


In [8]:
# Trim leading/trailing whitespace from every post.
texts = list(map(str.strip, sports_dataset.data))

In [18]:
# Pair each post with its label; the column names follow the prompt/completion
# schema that the fine-tuning data file uses.
column_names = ['prompt', 'completion']
rows = list(zip(texts, labels))
df = pd.DataFrame(rows, columns=column_names)
print(len(df))

1197


In [19]:
# Write the dataset as JSON Lines: one {"prompt": ..., "completion": ...} record per line.
df.to_json("sport.jsonl", orient='records', lines=True)

# Preview the frame. This must be the cell's last expression: a notebook only
# renders the value of the final expression, so a bare `df.head()` placed
# before another statement is evaluated and silently discarded.
df.head()

In [21]:
# Run the OpenAI CLI data-preparation tool on the exported JSONL file.
# NOTE(review): `-q` presumably auto-accepts the tool's suggestions, and the
# sport_prepared_train.jsonl / sport_prepared_valid.jsonl files used later are
# presumably written by this step — confirm against the CLI help.
!openai tools fine_tunes.prepare_data -f sport.jsonl -q

Analyzing...

- Your file contains 1197 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- There are 11 examples that are very long. These are rows: [134, 200, 281, 320, 404, 595, 704, 838, 1113, 1139, 1174]
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts e

In [22]:
# Launch a fine-tune of the `ada` base model with the prepared training file (-t)
# and validation file (-v); the suffix is embedded in the resulting model name.
# This uploads data and starts a billed job, so avoid re-running it needlessly.
!openai api fine_tunes.create \
-t "sport_prepared_train.jsonl" \
-v "sport_prepared_valid.jsonl" \
-m ada \
--suffix "sport_classification_demo"

Upload progress: 100%|█████████████████████| 1.52M/1.52M [00:00<00:00, 768Mit/s]
Uploaded file from sport_prepared_train.jsonl: file-ccSwWXlOmCYqImRWIGq5sODy
Upload progress: 100%|███████████████████████| 387k/387k [00:00<00:00, 499Mit/s]
Uploaded file from sport_prepared_valid.jsonl: file-9Cn3XbnkHvy5DxWu6pWKYzYg
Created fine-tune: ft-OCZAvHiqD170W8WkNWI8Kpmr
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2023-03-29 14:47:32] Created fine-tune: ft-OCZAvHiqD170W8WkNWI8Kpmr

Stream interrupted (client disconnected).
To resume the stream, run:

  openai api fine_tunes.follow -i ft-OCZAvHiqD170W8WkNWI8Kpmr



In [36]:
# Re-attach to the event stream of the fine-tune job after the earlier stream was interrupted.
# The job id is specific to the original run; substitute the id printed by your own create call.
!openai api fine_tunes.follow -i ft-OCZAvHiqD170W8WkNWI8Kpmr

[2023-03-29 14:47:32] Created fine-tune: ft-OCZAvHiqD170W8WkNWI8Kpmr
[2023-03-29 14:49:55] Fine-tune costs $0.78
[2023-03-29 14:49:55] Fine-tune enqueued. Queue number: 19
[2023-03-29 14:49:57] Fine-tune is in the queue. Queue number: 18
[2023-03-29 14:50:03] Fine-tune is in the queue. Queue number: 17
[2023-03-29 14:50:04] Fine-tune is in the queue. Queue number: 16
[2023-03-29 14:50:05] Fine-tune is in the queue. Queue number: 15
[2023-03-29 14:50:38] Fine-tune is in the queue. Queue number: 14
[2023-03-29 14:55:24] Fine-tune is in the queue. Queue number: 13
[2023-03-29 14:56:07] Fine-tune is in the queue. Queue number: 12
[2023-03-29 14:56:08] Fine-tune is in the queue. Queue number: 10
[2023-03-29 14:56:08] Fine-tune is in the queue. Queue number: 10
[2023-03-29 14:56:10] Fine-tune is in the queue. Queue number: 9
[2023-03-29 14:57:23] Fine-tune is in the queue. Queue number: 8
[2023-03-29 14:57:47] Fine-tune is in the queue. Queue number: 7
[2023-03-29 14:58:05] Fine-tune is in t

In [38]:
# Export the fine-tune's results to result.csv via shell redirection.
!openai api fine_tunes.results -i ft-OCZAvHiqD170W8WkNWI8Kpmr > result.csv

In [39]:
# Load the exported fine-tune results into a DataFrame for inspection.
results = pd.read_csv('result.csv')

In [42]:
# Validation metrics are missing on some rows; keep only the rows that have them
# and show the most recent one.
has_validation = results['validation_sequence_accuracy'].notna()
results.loc[has_validation].tail(1)

Unnamed: 0,step,elapsed_tokens,elapsed_examples,training_loss,training_sequence_accuracy,training_token_accuracy,validation_loss,validation_sequence_accuracy,validation_token_accuracy
3720,3721,1692625,3721,0.017377,1.0,1.0,0.022844,1.0,1.0


In [44]:
import pandas as pd

In [47]:
# Name of the fine-tuned model that the completion calls in this notebook query.
# It is specific to the original author's account/run; replace it with your own model name.
ft_model = 'ada:ft-personal:sport-classification-demo-2023-03-29-07-20-45'

In [50]:
# Load the validation file (JSON Lines, one record per line) to use as test examples.
test = pd.read_json('sport_prepared_valid.jsonl', lines=True)
test.head()

Unnamed: 0,prompt,completion
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,hockey
1,From: smorris@venus.lerc.nasa.gov (Ron Morris ...,hockey
2,From: golchowy@alchemy.chem.utoronto.ca (Geral...,hockey
3,From: krattige@hpcc01.corp.hp.com (Kim Krattig...,baseball
4,From: warped@cs.montana.edu (Doug Dolven)\nSub...,baseball


In [51]:
# Classify the first validation example. The prompt is terminated with the
# '\n\n###\n\n' separator, and a single token at temperature 0 is requested so
# the completion is just the (deterministic) class label.
# NOTE(review): if the prepared validation prompts already end with this
# separator, it is appended a second time here — confirm against the file.
first_prompt = test['prompt'][0] + '\n\n###\n\n'
res = openai.Completion.create(
    model=ft_model,
    prompt=first_prompt,
    max_tokens=1,
    temperature=0,
)

In [52]:
# Dump the whole API response to inspect its structure (choices, usage, model, ...).
print(res)

{
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": " hockey"
    }
  ],
  "created": 1680075205,
  "id": "cmpl-6zKb7XvGBQHkmMwRbW2REYJEBT895",
  "model": "ada:ft-personal:sport-classification-demo-2023-03-29-07-20-45",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 1,
    "prompt_tokens": 256,
    "total_tokens": 257
  }
}


In [53]:
# The predicted label is the text of the first returned choice.
res['choices'][0]['text']

' hockey'

In [54]:
# Repeat the request for the first validation example, this time asking for the
# two most likely tokens and their log-probabilities at the predicted position.
first_prompt = test['prompt'][0] + '\n\n###\n\n'
res = openai.Completion.create(
    model=ft_model,
    prompt=first_prompt,
    max_tokens=1,
    temperature=0,
    logprobs=2,
)
first_choice = res['choices'][0]
first_choice['logprobs']['top_logprobs'][0]

<OpenAIObject at 0x13b2e4f90> JSON: {
  " baseball": -8.169246,
  " hockey": -0.00049562723
}

In [55]:
# Try the classifier on text that is not a newsgroup post: a hockey-related tweet.
sample_hockey_tweet = """Thank you to the 
@Canes
 and all you amazing Caniacs that have been so supportive! You guys are some of the best fans in the NHL without a doubt! Really excited to start this new chapter in my career with the 
@DetroitRedWings
 !!"""
hockey_prompt = sample_hockey_tweet + '\n\n###\n\n'
res = openai.Completion.create(
    model=ft_model,
    prompt=hockey_prompt,
    max_tokens=1,
    temperature=0,
    logprobs=2,
)
res['choices'][0]['text']

' hockey'

In [56]:
# Log-probabilities of the top candidate tokens at the first generated position
# of the most recent response stored in `res`.
res['choices'][0]['logprobs']['top_logprobs'][0]

<OpenAIObject at 0x13b31b590> JSON: {
  " Hockey": -10.671424,
  " hockey": -7.028876e-05
}

In [60]:
# Same check with a baseball-related tweet.
sample_baseball_tweet = """BREAKING: The Tampa Bay Rays are finalizing a deal to acquire slugger Nelson Cruz from the Minnesota Twins, sources tell ESPN."""
baseball_prompt = sample_baseball_tweet + '\n\n###\n\n'
res = openai.Completion.create(
    model=ft_model,
    prompt=baseball_prompt,
    max_tokens=1,
    temperature=0,
    logprobs=2,
)
res['choices'][0]['text']

' baseball'