In [1]:
import csv
import os 
import json
import random
import openai
import sys
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd 
import tiktoken

pd.set_option('max_colwidth', 200)
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [28]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\roman.y.melnyk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\roman.y.melnyk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
df = pd.read_csv("./dataset/bbc.csv")
df = df.drop_duplicates()
df = df.dropna()
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home theatre systems plasma high-definition tvs and digital video recorders moving into the living room the way people watch tv will be radically differen...
1,business,worldcom boss left books alone former worldcom boss bernie ebbers who is accused of overseeing an $11bn (£5.8bn) fraud never made accounting decisions a witness has told jurors. david myers ...
2,sport,tigers wary of farrell gamble leicester say they will not be rushed into making a bid for andy farrell should the great britain rugby league captain decide to switch codes. we and anybody else...
3,sport,yeading face newcastle in fa cup premiership side newcastle united face a trip to ryman premier league leaders yeading in the fa cup third round. the game - arguably the highlight of the draw - i...
4,entertainment,ocean s twelve raids box office ocean s twelve the crime caper sequel starring george clooney brad pitt and julia roberts has gone straight to number one in the us box office chart. it took $4...


In [20]:
def cal_num_tokens_from_row(string: str, encoding_name: str) -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    token_count = len(encoding.encode(string))
    return token_count

def cal_num_tokens_from_df(df, encoding_name: str) -> int:
    total_tokens = 0
    for text in df['text']:
        total_tokens += cal_num_tokens_from_row (text, encoding_name)
    return total_tokens


### Data preparation

In [21]:
openai_model = 'gpt-3.5-turbo'

total_tokens = cal_num_tokens_from_df(df, openai_model)
print(f"Total numbver of tokens in the dataframe: {total_tokens}")

Total numbver of tokens in the dataframe: 1051754


In [22]:
rows = [{'text': row['text'].strip(),
                    'label': row['category'],
                    } for idx, row in df.iterrows()]
random.seed(42)
random.shuffle(rows)

In [23]:
num_test = 500
splits = {'test':rows[0:num_test], 'train':rows[num_test:]}

In [24]:
for split in ['train', 'test']:
    with open(f'./dataset/{split}.csv', 'w', newline='', encoding="utf-8") as fOut:
        writer = csv.DictWriter(fOut, fieldnames=['text', 'label'])
        writer.writeheader()
        for row in splits[split]:
            writer.writerow(row)

In [25]:
df = pd.read_csv("dataset/test.csv", encoding='unicode_escape')
final_df = df.head(500)
total_tokens = cal_num_tokens_from_df(final_df, "gpt-3.5-turbo")
print(f"Total numbver of tokens in the dataframe: {total_tokens}", "\n")

final_df.info()

Total numbver of tokens in the dataframe: 251377 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    500 non-null    object
 1   label   500 non-null    object
dtypes: object(2)
memory usage: 7.9+ KB


In [28]:
def create_openai_files_format(system_message, final_df, out_file_name):
    with open(out_file_name, "w") as out_file_name:
        for _, row in final_df.iterrows():
            openai_format = {
                "messages": [
                    {"role": "system", "content": system},
                    {"role": "user", "content": row["text"]},
                    {"role": "assistant", "content": row["label"]}
                ]
            }
            json.dump(openai_format, out_file_name)
            out_file_name.write('\n')   

In [32]:
### Train data
train_final_df = pd.read_csv("./dataset/train.csv", encoding='unicode_escape').head(10)
total_tokens = cal_num_tokens_from_df(train_final_df, "gpt-3.5-turbo")
print(f"Total numbver of tokens in the dataframe: {total_tokens}")
print("Price: ", total_tokens/1000*0.008, "$")

system = "You are a intelligent assistant designed to classify news articles into these categories: business, entertainment, politics, sport, tech"
out_file_name = "./dataset/train.jsonl"
create_openai_files_format(system_message=system, final_df=train_final_df, out_file_name=out_file_name)

Total numbver of tokens in the dataframe: 5709
Price:  0.045672 $


In [33]:
### Test data
test_final_df = pd.read_csv("./dataset/test.csv", encoding='unicode_escape').head(10)
total_tokens = cal_num_tokens_from_df(test_final_df, "gpt-3.5-turbo")
print(f"Total numbver of tokens in the dataframe: {total_tokens}")
print("Price: ", total_tokens/1000*0.006, "$")

system = "You are a intelligent assistant designed to classify news articles into these categories: business, entertainment, politics, sport, tech"
out_file_name = "./dataset/test.jsonl"
create_openai_files_format(system_message=system, final_df=test_final_df, out_file_name=out_file_name)

Total numbver of tokens in the dataframe: 5333
Price:  0.031998 $


### LLM Fine Tuning

In [47]:
from openai import OpenAI

In [48]:
client = OpenAI()

In [57]:
client.files.create(file=open("./dataset/train.jsonl","rb"), purpose="fine-tune")

FileObject(id='file-ELqHJFb7Pmuk5TUHanfmVD', bytes=30269, created_at=1733316949, filename='train.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [58]:
client.files.create(file=open("./dataset/test.jsonl","rb"), purpose="fine-tune")

FileObject(id='file-6HParUXBYXgtcpNL9pZL1K', bytes=27789, created_at=1733316950, filename='test.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [51]:
client.files.list()

SyncPage[FileObject](data=[FileObject(id='file-7Pp1PnvJRHp6TyNp6HXiD2', bytes=27789, created_at=1733316728, filename='test.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)], object='list', has_more=False, first_id='file-7Pp1PnvJRHp6TyNp6HXiD2', last_id='file-7Pp1PnvJRHp6TyNp6HXiD2')

In [53]:
client.files.retrieve("file-7Pp1PnvJRHp6TyNp6HXiD2")

FileObject(id='file-7Pp1PnvJRHp6TyNp6HXiD2', bytes=27789, created_at=1733316728, filename='test.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [59]:
client.fine_tuning.jobs.create(model = "gpt-3.5-turbo",
                               training_file="file-ELqHJFb7Pmuk5TUHanfmVD",
                               hyperparameters={
                                   "n_epochs":1
                               },
                               validation_file="file-6HParUXBYXgtcpNL9pZL1K")

FineTuningJob(id='ftjob-yLid4v88mbGyBTAVZhohBh8R', created_at=1733316980, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=1, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-3EtQLVSX02GEHJtlAwCVPJRJ', result_files=[], status='validating_files', trained_tokens=None, training_file='file-ELqHJFb7Pmuk5TUHanfmVD', validation_file='file-6HParUXBYXgtcpNL9pZL1K', user_provided_suffix=None, seed=2143003385, estimated_finish=None, integrations=[])

In [60]:
client.fine_tuning.jobs.list(limit=10)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-yLid4v88mbGyBTAVZhohBh8R', created_at=1733316980, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=1, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-3EtQLVSX02GEHJtlAwCVPJRJ', result_files=[], status='running', trained_tokens=None, training_file='file-ELqHJFb7Pmuk5TUHanfmVD', validation_file='file-6HParUXBYXgtcpNL9pZL1K', user_provided_suffix=None, seed=2143003385, estimated_finish=None, integrations=[])], object='list', has_more=False)

### Evaluation

In [None]:
from openai import OpenAI
import pandas as pd

In [68]:
fine_tuned_model = 'ft:gpt-3.5-turbo-0125:personal::AajHZ3TT'
client = OpenAI()

In [103]:
### Selected very small data set for fine tunning to save costs. 
### If you want to see fine tunning influence more significant on the result, please increse train!!! and test data set size
df = pd.read_csv('./dataset/test.csv', encoding='unicode_escape')
labels = df.iloc[:,1].tolist()
texts = df.iloc[:,0].tolist()

texts = texts[480:]
labels = labels[480:]
total_classifications = len(labels)
print(len(texts))

20


In [104]:
def inference_for_eval(text, m):
    completion = client.chat.completions.create(
    model=m,
    messages=[
        {"role": "system", "content": "You are a intelligent assistant designed to classify news articles into these categories: business, entertainment, politics, sport, tech. Plaese, return only one word from categories"},
        {"role": "user", "content": text}
        ]
    )
    return completion.choices[0].message.content


In [105]:
### Base gpt-3.5-turbo model
output_base = [inference_for_eval(text, "gpt-3.5-turbo") for text in texts]

In [106]:
### Fine tuned gpt-3.5-turbo model
output = [inference_for_eval(text, fine_tuned_model) for text in texts]

In [110]:
### For base model
correct_classifications = sum(classification ==label for classification, label in zip(output_base,labels))

accuracy_percentage = (correct_classifications/total_classifications)*100
print(f"Accuracy base model:{accuracy_percentage:.2f}%")

Accuracy base model:80.00%


In [111]:
### For fine tuned model
correct_classifications = sum(classification ==label for classification, label in zip(output,labels))

accuracy_percentage = (correct_classifications/total_classifications)*100
print(f"Accuracy fine tunned model:{accuracy_percentage:.2f}%")

Accuracy fine tunned model:85.00%


In [109]:
val_df = pd.DataFrame()
val_df['Correct'] = labels
val_df['Classification'] = output
val_df['Classification_base'] = output_base
display(val_df)

Unnamed: 0,Correct,Classification,Classification_base
0,entertainment,entertainment,entertainment
1,entertainment,entertainment,entertainment
2,politics,politics,business
3,sport,sport,sport
4,politics,politics,politics
5,politics,politics,politics
6,sport,sport,sport
7,sport,sport,sport
8,business,politics,politics
9,politics,politics,politics
