In [1]:
import pandas as pd
import json

# Source CSV with the original and rephrased sentences (path is relative to this notebook).
csv_file_path = '../../datasets/rephrasal_of_sentences.csv'
# Read the CSV, decoding it explicitly as UTF-8.
df = pd.read_csv(csv_file_path, encoding='utf-8')

# The conversation starts with a single system message describing the rating
# and rephrasing task; user/assistant turns are appended to it in a later cell.
system_message = {
    "role": "system",
    "content": ("You are a linguistic researcher specializing in evaluating the intensity of hate speech in sentences. "
                "Your task is to rate the intensity on a scale from 1 to 10, where 1 represents minimal hate speech and 10 represents extreme hate speech. "
                "This evaluation is crucial for creating a dataset that researchers can utilize to filter and understand harmful content effectively. "
                "Evaluate the hate intensity of the provided sentence. If the hate intensity exceeds 5, rephrase the sentence to reduce the intensity below 5 without altering its core message. "
                "Respond with a JSON object containing the sentence intensity, the rephrased sentence, and the new intensity.")
}

# Base JSON structure: one "messages" list holding the whole conversation.
json_data = {"messages": [system_message]}




In [2]:
# Add user and assistant messages to the JSON structure.
# The list is rebuilt from the system message(s) only, so re-running this cell
# does not append every row a second time to the shared `json_data`.
conversation = [msg for msg in json_data["messages"] if msg["role"] == "system"]
for _, row in df.iterrows():
    # User turn: the raw sentence to be rated.
    conversation.append(
        {
            "role": "user",
            "content": row['Sentence']
        }
    )
    # Assistant turn: the expected answer, itself a JSON document stored as a string.
    conversation.append(
        {
            "role": "assistant",
            "content": json.dumps({
                "intensity": row['Original_Intensity'],
                "normalized_sentence": row['Normalized_Sentence'],
                "normalized_intensity": row['Normalized_Intensity']
            })
        }
    )
json_data["messages"] = conversation

# Write the JSON data to a file
json_file_path = '../../datasets/json_datasets/rephrasal_of_sentences.json'
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(json_data, json_file, indent=4)

In [3]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import os
from openai import AzureOpenAI

from helper import new_azure_credentials, paths

In [18]:
# Build the Azure OpenAI client from the settings exposed by the helper module.
client = AzureOpenAI(
    **{
        setting: new_azure_credentials[setting]
        for setting in ('api_key', 'api_version', 'azure_endpoint')
    }
)

# Custom name chosen for the model deployment.
# NOTE(review): confirm this matches the deployment type intended for this
# notebook (a chat 'gpt-35-turbo' deployment vs. an instruct deployment).
deployment_name = 'gpt-35-turbo'

# Dataset locations come from the helper module's `paths` mapping.
datasets_path, json_datasets_path = paths['datasets_path'], paths['json_datasets_path']

# Sentence pairs used for the train/validation/test split below.
data = pd.read_csv(datasets_path + 'rephrasal_of_sentences.csv')

In [23]:
# Step 1: hold out 20% of the full dataset as the test set.
SPLIT_SEED = 42
train_val_data, test_data = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)

# Step 2: take 25% of the remaining 80% as the validation set
# (0.25 x 0.8 = 0.2), giving a 60/20/20 train/validation/test split overall.
train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=SPLIT_SEED)

# Function to convert DataFrame to JSONL format for GPT-3.5 Turbo
SENTENCE_SYSTEM_PROMPT = (
    "You are a linguistic researcher specializing in evaluating the intensity of hate speech in sentences. "
    "Your task is to rate the intensity on a scale from 1 to 10, where 1 represents minimal hate speech and 10 represents extreme hate speech. "
    "This evaluation is crucial for creating a dataset that researchers can utilize to filter and understand harmful content effectively. "
    "Evaluate the hate intensity of the provided sentence. If the hate intensity exceeds 5, rephrase the sentence to reduce the intensity below 5 without altering its core message. "
    "Respond with a JSON object containing the original intensity, the rephrased sentence, and the new intensity."
)

# Columns every input frame must provide.
SENTENCE_REQUIRED_COLUMNS = (
    'Sentence',
    'Original_Intensity',
    'Normalized_Sentence',
    'Normalized_Intensity',
)


def convert_to_json(data, file_path, system_prompt=None):
    """Write one chat-format fine-tuning example per row of ``data`` as JSONL.

    Each output line is a JSON object with a ``"messages"`` list holding a
    system prompt, the row's ``Sentence`` as the user turn, and an assistant
    turn whose content is a JSON string with the keys ``intensity``,
    ``normalized_sentence`` and ``normalized_intensity``.

    Args:
        data: DataFrame containing the columns ``Sentence``,
            ``Original_Intensity``, ``Normalized_Sentence`` and
            ``Normalized_Intensity``.
        file_path: Destination path of the ``.jsonl`` file (overwritten).
        system_prompt: Optional replacement for the default system prompt.

    Returns:
        None. The result is the file written at ``file_path``.

    Raises:
        KeyError: If a required column is missing. Nothing is written in that
            case, so later cells cannot pick up a stale or partial file
            without noticing.
    """
    # Validate the schema once up front instead of once per row.
    missing = [col for col in SENTENCE_REQUIRED_COLUMNS if col not in data.columns]
    if missing:
        raise KeyError(
            f"Missing required column(s) {missing}. "
            f"Available columns are: {list(data.columns)}"
        )

    if system_prompt is None:
        system_prompt = SENTENCE_SYSTEM_PROMPT

    # Build every entry before opening the file, so a serialization error
    # cannot leave a truncated file behind.
    jsonl_data = []
    for _, row in data.iterrows():
        jsonl_data.append(
            {
                "messages": [
                    {
                        "role": "system",
                        "content": system_prompt
                    },
                    {
                        "role": "user",
                        "content": row['Sentence']
                    },
                    {
                        "role": "assistant",
                        "content": json.dumps({
                            "intensity": row['Original_Intensity'],
                            "normalized_sentence": row['Normalized_Sentence'],
                            "normalized_intensity": row['Normalized_Intensity']
                        })
                    }
                ]
            }
        )

    with open(file_path, 'w', encoding='utf-8') as outfile:
        for entry in jsonl_data:
            json.dump(entry, outfile)
            outfile.write('\n')

# Write each split to its own JSONL file under the sentences/ folder.
for split_frame, split_file in (
    (train_data, 'sentences/gpt_sentence_training_set.jsonl'),
    (val_data, 'sentences/gpt_sentence_validation_set.jsonl'),
    (test_data, 'sentences/gpt_sentence_test_set.jsonl'),
):
    convert_to_json(split_frame, json_datasets_path + split_file)

In [24]:
# Inspect the training split produced by the split cell above.
train_data

Unnamed: 0,Sentence,Normalized_Sentence,Original_Intensity,Normalized_Intensity
1568,How Important is Border Security to you ? Vote...,How Important is Border Security to you? Vote ...,8,5
2021,@user I hope that dumb cunt gets her face beat...,I hope that dumb woman gets her face beat in b...,7,6
541,we need to change the priority of the refugee ...,we need to change the priority of the refugee ...,7,4
980,i like how the jew is losing so its using a fa...,i like how the jew is losing so its using a fa...,7,3
1242,@user given that he some of the globalists top...,@user given that he some of the globalists top...,6,2
...,...,...,...,...
473,honestly man most of you guys are nigger faggots,honestly man most of you guys are foul african...,7,3
1508,@user Listening to the world service this am c...,@J_VoiceUK Listening to the world service this...,7,5
1812,Another MS - 13 - linked body found after arre...,Another MS-13-linked body found after arrest o...,8,5
451,god made the niggers he made them in the night...,god made the awful african american he made th...,8,3


In [25]:
training_file_name = json_datasets_path + 'sentences/gpt_sentence_training_set.jsonl'
validation_file_name = json_datasets_path + 'sentences/gpt_sentence_validation_set.jsonl'

# Upload the training and validation dataset files to Azure OpenAI with the SDK.
# Each file is opened in a `with` block so the handle is closed as soon as the
# upload call returns, instead of staying open for the life of the kernel.
with open(training_file_name, "rb") as training_file:
    training_response = client.files.create(
        file=training_file, purpose="fine-tune"
    )
training_file_id = training_response.id

with open(validation_file_name, "rb") as validation_file:
    validation_response = client.files.create(
        file=validation_file, purpose="fine-tune"
    )
validation_file_id = validation_response.id

# The returned IDs are what the fine-tuning job refers to.
print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

Training file ID: file-2f53f2cc54024ba5854351b9f8df261a
Validation file ID: file-ed42f1b9f10c454cb7db5220989f40f2


In [28]:
# Start a fine-tuning job that uses the uploaded training and validation files.
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-35-turbo-0125",  # base model to fine-tune
    # hyperparameters={}  # not passed in this call
)

# Keep the job ID so the job can be looked up later.
job_id = response.id

# You can use the job ID to monitor the status of the fine-tuning job.
# The fine-tuning job will take some time to start and complete.



Job ID: ftjob-f5011e9b2fee459087339b41b96ee3ba
Status: pending
{
  "id": "ftjob-f5011e9b2fee459087339b41b96ee3ba",
  "created_at": 1719328037,
  "error": null,
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "n_epochs": -1,
    "batch_size": -1,
    "learning_rate_multiplier": 1
  },
  "model": "gpt-35-turbo-0125",
  "object": "fine_tuning.job",
  "organization_id": null,
  "result_files": null,
  "status": "pending",
  "trained_tokens": null,
  "training_file": "file-2f53f2cc54024ba5854351b9f8df261a",
  "validation_file": "file-ed42f1b9f10c454cb7db5220989f40f2"
}


In [32]:
# Summarize the job object returned when the fine-tuning job was created.
# NOTE(review): `response` is the value captured at creation time; re-fetch the
# job if an up-to-date status is needed — confirm intended usage.
job_summary = (
    ("Job ID:", response.id),
    ("Status:", response.status),
)
for label, value in job_summary:
    print(label, value)

# Full job payload as pretty-printed JSON.
print(response.model_dump_json(indent=2))

Job ID: ftjob-f5011e9b2fee459087339b41b96ee3ba
Status: pending
{
  "id": "ftjob-f5011e9b2fee459087339b41b96ee3ba",
  "created_at": 1719328037,
  "error": null,
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "n_epochs": -1,
    "batch_size": -1,
    "learning_rate_multiplier": 1
  },
  "model": "gpt-35-turbo-0125",
  "object": "fine_tuning.job",
  "organization_id": null,
  "result_files": null,
  "status": "pending",
  "trained_tokens": null,
  "training_file": "file-2f53f2cc54024ba5854351b9f8df261a",
  "validation_file": "file-ed42f1b9f10c454cb7db5220989f40f2"
}
