# Preparing Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
os.chdir('/content/drive/MyDrive/xian')

In [None]:
import pandas as pd
df = pd.read_csv('sentences_with_sentiment.csv')
df = df[['text','sentiment']]
df.head()

Unnamed: 0,text,sentiment
0,Yes.,0
1,Our solutions address those challenges by rapi...,1
2,"On the film side, we are seeing an unprecedent...",1
3,"And with that, I'd like to turn the call over ...",0
4,Welcome to our fourth quarter and full year 20...,0


In [None]:
sentiment_mapping = {0: 'Neutral', 1: 'Positive', -1: 'Negative'}
df['sentiment'] = df['sentiment'].map(sentiment_mapping)
df

Unnamed: 0,text,sentiment
0,Yes.,Neutral
1,Our solutions address those challenges by rapi...,Positive
2,"On the film side, we are seeing an unprecedent...",Positive
3,"And with that, I'd like to turn the call over ...",Neutral
4,Welcome to our fourth quarter and full year 20...,Neutral
...,...,...
514,We'll work with our customers on their require...,Positive
515,I think that's fair.,Neutral
516,"Our customers have chemical plants, typically ...",Neutral
517,I should probably also point out that we had o...,Positive


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split into training+validation and testing sets
X_temp, X_test, y_temp, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.2, random_state=42)

# Split the training+validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Print the sizes of each set
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Training set size: 311
Validation set size: 104
Testing set size: 104


In [None]:
df_train = pd.DataFrame({'text': X_train, 'sentiment': y_train})
df_val = pd.DataFrame({'text': X_val, 'sentiment': y_val})
df_test = pd.DataFrame({'text': X_test, 'sentiment': y_test})

In [None]:
df_val

Unnamed: 0,text,sentiment
284,Got it.,Neutral
36,"Finally, our third objective was to work with ...",Positive
183,"So in Q4, for example, that can go down by as ...",Negative
23,I'm looking forward to working closely with Cy...,Positive
263,"A selling responsibility, there are two things...",Neutral
...,...,...
450,I'd like to thank everyone for joining in on t...,Neutral
196,"We've incorporated for our next flight, a new ...",Positive
406,Dividend that was decided yesterday by the gen...,Positive
256,"Financial inclusion, the best part is that the...",Positive


In [None]:
df_train.value_counts('sentiment')

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Neutral,126
Positive,124
Negative,61


In [None]:
neutral = df_val[df_val['sentiment'] == 'Neutral']
positive = df_val[df_val['sentiment'] == 'Positive']
negative = df_val[df_val['sentiment'] == 'Negative']
df_train = pd.concat([neutral, positive, negative]).sample(frac=1).reset_index(drop=True)
df_train.head(5)

Unnamed: 0,text,sentiment
0,"Yes, sir.",Neutral
1,"Thank you, Bo.",Neutral
2,Are you actually seeing an impact yet?,Neutral
3,Welcome to our fourth quarter and full year 20...,Neutral
4,"Can I ask two questions, unfortunately come ba...",Negative


In [None]:
neutral = df_val[df_val['sentiment'] == 'Neutral']
positive = df_val[df_val['sentiment'] == 'Positive']
negative = df_val[df_val['sentiment'] == 'Negative']
# neutral = neutral.sample(n=180, random_state=42)
df_val = pd.concat([neutral, positive, negative]).sample(frac=1).reset_index(drop=True)
df_val.head(5)

Unnamed: 0,text,sentiment
0,Just curious as to what you are seeing with re...,Neutral
1,"Again, I'm doing sort of quick and dirty math ...",Neutral
2,And one of the things that we are trying to do...,Positive
3,"Thank you, Bo.",Neutral
4,I'll pass it over to Wendy.,Neutral


In [None]:
df_val.value_counts('sentiment')

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Neutral,42
Positive,40
Negative,22


In [None]:
df_test.value_counts('sentiment')

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Neutral,38
Positive,38
Negative,28


In [None]:
import json

def convert_to_new_format(old_data):
    # Initialize an empty list to store the newly formatted entries
    new_data = []

    # Iterate over each row in the old_data DataFrame
    for _, row in old_data.iterrows():
        # This is where the transformation from the old format (dataframe) to new (structured JSON-like object) happens
        new_entry = {
            "messages": [
                # System message: This is a predefined question asked in the conversation format
                {"role": "system", "content": 'Would the sentence positively, negatively, or neutrally influence the stock price?'},

                # User message: This contains the actual text from the 'text' column of the DataFrame
                {"role": "user", "content": row["text"]},

                # Assistant message: This contains the sentiment label from the 'sentiment' column of the DataFrame
                {"role": "assistant", "content": row["sentiment"]}
            ]
        }

        # Append the newly formatted entry to the list
        new_data.append(new_entry)

    # Return the fully transformed dataset as a list of formatted entries
    return new_data


In [None]:
# Convert the old data to the new format
converted_data = convert_to_new_format(df_train)
# converted_data = convert_to_new_format(df_val)
# converted_data = convert_to_new_format(df_test)

# The file to write to
file_path = 'train_gpt_trust_issue.jsonl'
# file_path = 'val_gpt_trust_issue.jsonl'
# file_path = 'test_gpt_trust_issue.jsonl'


# Open the file in write mode
with open(file_path, 'w') as file:
    for record in converted_data:
        # Convert the dictionary to a JSON string
        json_record = json.dumps(record)
        # Write the JSON string to the file with a newline character
        file.write(json_record + '\n')

In [None]:
# Copy to new directory for later use
!cp train_gpt_trust_issue.jsonl '/content/drive/MyDrive/xian'
!cp val_gpt_trust_issue.jsonl '/content/drive/MyDrive/xian'
!cp test_gpt_trust_issue.jsonl '/content/drive/MyDrive/xian'

cp: 'train_gpt_trust_issue.jsonl' and '/content/drive/MyDrive/xian/train_gpt_trust_issue.jsonl' are the same file
cp: 'val_gpt_trust_issue.jsonl' and '/content/drive/MyDrive/xian/val_gpt_trust_issue.jsonl' are the same file
cp: 'test_gpt_trust_issue.jsonl' and '/content/drive/MyDrive/xian/test_gpt_trust_issue.jsonl' are the same file


In [None]:
df_train = df_train.rename(columns={'text':'input','sentiment':'output'})
df_train['instruction'] = 'Would the news positively, negatively, or neutrally influence the stock price?'
df_train.to_csv('/content/drive/MyDrive/xian/train_llama2_trust_issue.csv', sep=',', index=False)

df_val = df_val.rename(columns={'text':'input','sentiment':'output'})
df_val['instruction'] = 'Would the news positively, negatively, or neutrally influence the stock price?'
df_val.to_csv('/content/drive/MyDrive/xian/val_llama2_trust_issue.csv', sep=',', index=False)

df_test = df_test.rename(columns={'text':'input','sentiment':'output'})
df_test['instruction'] = 'Would the news positively, negatively, or neutrally influence the stock price?'

df_test.to_csv('/content/drive/MyDrive/xian/test_llama2_trust_issue.csv', sep=',', index=False)

# Finetuning ChatGPT

In [None]:
!pip install -Uq openai

In [None]:
from pathlib import Path
from tqdm import tqdm
import os
from openai import OpenAI
import getpass

if 'OPENAI_API_KEY' not in os.environ:
    os.environ['OPENAI_API_KEY'] = getpass.getpass(prompt='Enter your API key: ')

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)


In [None]:
client.files.create(
    file=Path("/content/drive/MyDrive/xian/train_gpt_trust_issue.jsonl"),
    purpose="fine-tune",
)


FileObject(id='file-rWRwLO5feBpEXQU0P6lWjYYg', bytes=94899, created_at=1729991901, filename='train_gpt_trust_issue.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
client.files.create(
    file=Path("/content/drive/MyDrive/xian/val_gpt_trust_issue.jsonl"),
    purpose="fine-tune",
)

FileObject(id='file-rGLh3s2y3o8WjnNjeVo6mkM6', bytes=31970, created_at=1729991904, filename='val_gpt_trust_issue.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
import openai

try:
    client.fine_tuning.jobs.create(
        model="gpt-3.5-turbo-1106",
        training_file="file-rWRwLO5feBpEXQU0P6lWjYYg",
        validation_file="file-rGLh3s2y3o8WjnNjeVo6mkM6"
    )
except openai.APIConnectionError as e:
    print("The server could not be reached")
    print(e.__cause__)  # an underlying Exception, likely raised within httpx.
except openai.RateLimitError as e:
    print("A 429 status code was received; we should back off a bit.")
except openai.APIStatusError as e:
    print("Another non-200-range status code was received")
    print(e.status_code)
    print(e.response)

In [None]:
from openai import OpenAI
client = OpenAI()

client.files.create(
  file=Path("/content/drive/MyDrive/xian/train_gpt_trust_issue.jsonl"),
  purpose="fine-tune"
)

FileObject(id='file-aRERfM3I2i5PpjsEb5IZyZSo', bytes=32842, created_at=1729994351, filename='train_gpt_trust_issue.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
client.files.create(
    file=Path("/content/drive/MyDrive/xian/val_gpt_trust_issue.jsonl"),
    purpose="fine-tune",
)

FileObject(id='file-OjP0FrFJ8A5sm7ywLILlAanB', bytes=32842, created_at=1729994355, filename='val_gpt_trust_issue.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
client.fine_tuning.jobs.create(
  training_file="file-aRERfM3I2i5PpjsEb5IZyZSo",
  validation_file="file-OjP0FrFJ8A5sm7ywLILlAanB",
  model="gpt-3.5-turbo"
)

FineTuningJob(id='ftjob-XQDqvBjzNJoN2ZX6vCNOjWdO', created_at=1729994406, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-hAAz15NLvvYlssMjq4h9toTO', result_files=[], seed=1400877246, status='validating_files', trained_tokens=None, training_file='file-aRERfM3I2i5PpjsEb5IZyZSo', validation_file='file-OjP0FrFJ8A5sm7ywLILlAanB', estimated_finish=None, integrations=[], user_provided_suffix=None)

In [None]:
# Retrieve the state of a fine-tune
client.fine_tuning.jobs.retrieve("ftjob-XQDqvBjzNJoN2ZX6vCNOjWdO")

FineTuningJob(id='ftjob-XQDqvBjzNJoN2ZX6vCNOjWdO', created_at=1729994406, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal::AMn1BsAN', finished_at=1729995004, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-hAAz15NLvvYlssMjq4h9toTO', result_files=['file-sx6jlYzJySLVb2RXa33lGkMG'], seed=1400877246, status='succeeded', trained_tokens=16140, training_file='file-aRERfM3I2i5PpjsEb5IZyZSo', validation_file='file-OjP0FrFJ8A5sm7ywLILlAanB', estimated_finish=None, integrations=[], user_provided_suffix=None)

In [None]:
completion = client.chat.completions.create(
  model="ft:gpt-3.5-turbo-0125:personal::AMn1BsAN",
  messages=[
    {"role": "system", "content": "Would the sentence positively, negatively, or neutrally influence the stock price?."},
    {"role": "user", "content": "We are donating 100 million dollars to Boston University to name the Graduate School of Art and Science."},
    {"role": "user", "content": "Good back home and drive a beer."},
    {"role": "user", "content": "Good Evening"},
    {"role": "user", "content": "The Chinese firms control the market."},
    {"role": "user", "content": "We sold the player to ensure our financial health."},
  ]
)
print(completion.choices[0].message)


ChatCompletionMessage(content='Negative', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)


In [None]:
tooot = client.chat.completions.create(
    model="ft:gpt-3.5-turbo-0125:personal::AMn1BsAN",
    messages=[
        {"role": "system", "content": "For each sentence below, respond whether it would positively, negatively, or neutrally influence the stock price. Respond to each sentence separately:"},
        {"role": "user", "content": """
1. We are donating 100 million dollars to Boston University to name the Graduate School of Art and Science.
2. Good, go back home and drink a cup of beer.
3. Good morning, good afternoon and good evening, depending on where you are.
4. The Chinese firms control the market.
5. We sold the player to ensure our financial health.
        """}
    ]
)
print(tooot.choices[0].message)

ChatCompletionMessage(content='1. Positive\n2. Neutral\n3. Neutral\n4. Negative\n5. Negative', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)
