### Fine-tuning an Azure OpenAI GPT-3.5 Turbo model to rate the intensity of hate speech on a scale of 1 to 10

Link: https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/fine-tuning?tabs=turbo%2Cpython-new&pivots=programming-language-python

In [3]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import os
from openai import AzureOpenAI

from helper import new_azure_credentials, paths

In [4]:
# Azure OpenAI client configured from the credential mapping exposed by the
# local `helper` module (API key, API version and resource endpoint).
client = AzureOpenAI(
    api_key=new_azure_credentials['api_key'],
    api_version=new_azure_credentials['api_version'],
    azure_endpoint=new_azure_credentials['azure_endpoint'],
)

# Custom name chosen for the deployment when the base model was deployed.
# NOTE(review): the previous comment asked for a gpt-35-turbo-instruct
# deployment, but the value names a 'gpt-35-turbo' deployment, and the variable
# is not referenced by any other visible cell -- confirm which is intended.
deployment_name = 'gpt-35-turbo'

# Directory prefixes for the raw datasets and for the generated JSONL files.
# They are joined to file names with `+`, so they presumably end with a path
# separator -- verify against helper.paths.
datasets_path = paths['datasets_path']
json_datasets_path = paths['json_datasets_path']

# Tab-separated source dataset; later cells read its 'Sentence' and 'Intensity' columns.
data = pd.read_csv(
    datasets_path + 'hate_int_prof_SVO.tsv',
    sep='\t',
)

In [5]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split identical on every run.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# System prompt written into every fine-tuning example unless the caller overrides it.
DEFAULT_SYSTEM_PROMPT = "You are a linguistic researcher specializing in evaluating the intensity of hate speech in sentences. Your task is to rate the intensity on a scale from 1 to 10, where 1 represents minimal hate speech and 10 represents extreme hate speech. This evaluation is crucial for creating a dataset that researchers can utilize to filter and understand harmful content effectively."


def convert_to_jsonl(data, file_path, system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Write a DataFrame as chat-format JSONL, one training example per line.

    Each row becomes ``{"messages": [system, user, assistant]}`` where the user
    message is the row's ``Sentence`` and the assistant message is
    ``str(Intensity)``.

    Args:
        data: DataFrame with ``Sentence`` and ``Intensity`` columns.
        file_path: Destination path; an existing file is overwritten.
        system_prompt: Content of the system message placed in every example.
            Defaults to the hate-speech intensity rating prompt.

    Raises:
        KeyError: If ``data`` lacks a ``Sentence`` or ``Intensity`` column.
            The lookup happens before the file is opened, so nothing is
            written in that case.
    """
    # Resolve both columns up front so a missing column fails before any output exists.
    sentences = data['Sentence']
    intensities = data['Intensity']

    # json.dumps escapes non-ASCII by default; the explicit encoding just removes
    # any dependence on the platform's default text encoding.
    with open(file_path, 'w', encoding='utf-8') as outfile:
        # Iterating the two columns directly avoids the per-row Series that
        # iterrows() builds, and rows are streamed instead of buffered in a list.
        for sentence, intensity in zip(sentences, intensities):
            entry = {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": sentence},
                    {"role": "assistant", "content": str(intensity)},
                ]
            }
            outfile.write(json.dumps(entry) + '\n')

# Export each split to its own JSONL file under json_datasets_path.
for split_frame, split_file_name in (
    (train_data, 'gpt_training_set.jsonl'),
    (val_data, 'gpt_validation_set.jsonl'),
):
    convert_to_jsonl(split_frame, json_datasets_path + split_file_name)

In [6]:
training_file_name = json_datasets_path + 'gpt_training_set.jsonl'
validation_file_name = json_datasets_path + 'gpt_validation_set.jsonl'

# Upload the training and validation dataset files to Azure OpenAI with the SDK.
# Each file is opened in a `with` block so the handle is closed as soon as the
# upload call returns, instead of being left open for the life of the kernel.
with open(training_file_name, "rb") as training_file:
    training_response = client.files.create(file=training_file, purpose="fine-tune")
training_file_id = training_response.id

with open(validation_file_name, "rb") as validation_file:
    validation_response = client.files.create(file=validation_file, purpose="fine-tune")
validation_file_id = validation_response.id

# The IDs are what the fine-tuning job creation call takes as its inputs.
print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

Training file ID: file-44cf10a0f7a94763b16e0573ab410756
Validation file ID: file-ccdaec573fa14af6b06152b222541fbf


In [7]:
# Submit the fine-tuning job for the two uploaded files. No hyperparameters are
# passed explicitly.
# NOTE(review): the base model here is "gpt-4-0613", while the notebook title
# and the deployment cell further down refer to GPT-3.5 Turbo
# ("gpt-35-turbo-0125.ft-...") -- confirm which base model is intended.
response = client.fine_tuning.jobs.create(
    model="gpt-4-0613",
    training_file=training_file_id,
    validation_file=validation_file_id,
)

# Keep the job ID: it is the handle used to look the job up again later.
# The job takes some time to start and to complete.
job_id = response.id

print("Job ID:", job_id)
print("Status:", response.status)
print(response.model_dump_json(indent=2))

Job ID: ftjob-0e61ba8ff38c41aeb885f64928004295
Status: pending
{
  "id": "ftjob-0e61ba8ff38c41aeb885f64928004295",
  "created_at": 1718179628,
  "error": null,
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "n_epochs": -1,
    "batch_size": -1,
    "learning_rate_multiplier": 1
  },
  "model": "gpt-4-0613",
  "object": "fine_tuning.job",
  "organization_id": null,
  "result_files": null,
  "seed": null,
  "status": "pending",
  "trained_tokens": null,
  "training_file": "file-44cf10a0f7a94763b16e0573ab410756",
  "validation_file": "file-ccdaec573fa14af6b06152b222541fbf",
  "estimated_finish": null,
  "integrations": null
}


In [8]:
# Display the job object held in `response` as the cell's output.
response

FineTuningJob(id='ftjob-0e61ba8ff38c41aeb885f64928004295', created_at=1718179628, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=-1, batch_size=-1, learning_rate_multiplier=1), model='gpt-4-0613', object='fine_tuning.job', organization_id=None, result_files=None, seed=None, status='pending', trained_tokens=None, training_file='file-44cf10a0f7a94763b16e0573ab410756', validation_file='file-ccdaec573fa14af6b06152b222541fbf', estimated_finish=None, integrations=None)

In [9]:
# Re-fetch the job before printing: the object returned by jobs.create() is a
# snapshot taken at submission time, so its status never changes.
response = client.fine_tuning.jobs.retrieve(job_id)
print("Status:", response.status)

Status: pending


### To get an access token, go to the Azure portal and, in the CLI, run the following command:

```az account get-access-token```

In [13]:
import json
import os
import requests

# The management-plane bearer token is a short-lived secret and must never be
# pasted into the notebook: anything saved here ends up in version control and
# in shared copies. Read it from the environment instead.
token = os.environ.get("AZURE_MANAGEMENT_TOKEN")
if not token:
    raise RuntimeError(
        "Set the AZURE_MANAGEMENT_TOKEN environment variable to the accessToken "
        "value returned by `az account get-access-token` before running this cell."
    )

subscription = "4e05894f-dae4-4e4c-9213-f2a81f019b24"
resource_group = "research"
resource_name = "research-gpt"
model_deployment_name = "gpt-35-turbo-ft"  # custom deployment name that you will use to reference the model when making inference calls.

deploy_params = {'api-version': "2024-04-01-preview"}
deploy_headers = {'Authorization': 'Bearer {}'.format(token), 'Content-Type': 'application/json'}

deploy_data = {
    "sku": {"name": "standard", "capacity": 1},
    "properties": {
        "model": {
            "format": "OpenAI",
            # Fine-tuned model name reported by the completed fine-tuning job;
            # it looks like gpt-35-turbo-0613.ft-b044a9d3cf9c4228b5d393567f693b83
            "name": "gpt-35-turbo-0125.ft-1cedaf4241954b63bd1b7ebd1cc2370b",
            "version": "1"
        }
    }
}

request_url = f'https://management.azure.com/subscriptions/{subscription}/resourceGroups/{resource_group}/providers/Microsoft.CognitiveServices/accounts/{resource_name}/deployments/{model_deployment_name}'

print('Creating a new deployment...')

# The payload is serialised inline so `deploy_data` stays a dict rather than
# being rebound to a string; the timeout keeps the cell from hanging
# indefinitely if the management endpoint does not answer.
r = requests.put(
    request_url,
    params=deploy_params,
    headers=deploy_headers,
    data=json.dumps(deploy_data),
    timeout=60,
)

print(r)
print(r.reason)
print(r.json())

Creating a new deployment...
<Response [201]>
Created
{'id': '/subscriptions/4e05894f-dae4-4e4c-9213-f2a81f019b24/resourceGroups/research/providers/Microsoft.CognitiveServices/accounts/research-gpt/deployments/gpt-35-turbo-ft', 'type': 'Microsoft.CognitiveServices/accounts/deployments', 'name': 'gpt-35-turbo-ft', 'sku': {'name': 'standard', 'capacity': 1}, 'properties': {'model': {'format': 'OpenAI', 'name': 'gpt-35-turbo-0125.ft-1cedaf4241954b63bd1b7ebd1cc2370b', 'version': '1'}, 'versionUpgradeOption': 'NoAutoUpgrade', 'currentCapacity': 1, 'capabilities': {'chatCompletion': 'true'}, 'provisioningState': 'Creating', 'rateLimits': [{'key': 'request', 'renewalPeriod': 10, 'count': 1}, {'key': 'token', 'renewalPeriod': 60, 'count': 1000}]}, 'systemData': {'createdBy': 'microsoft@efadrin.com', 'createdByType': 'User', 'createdAt': '2024-06-06T15:53:16.6215901Z', 'lastModifiedBy': 'microsoft@efadrin.com', 'lastModifiedByType': 'User', 'lastModifiedAt': '2024-06-06T15:53:16.6215901Z'}, 'et

### Using a custom deployment

In [36]:
# Sample sentence to score with the fine-tuned deployment.
content = "Not good news at all but this happens in all European nations and is no doubt pushed by the likes of Jews like Summer Redstone and MTV ! !"

# Same system prompt that the training examples were written with, followed by
# the sentence as the user turn.
chat_messages = [
    {"role": "system", "content": "You are a linguistic researcher specializing in evaluating the intensity of hate speech in sentences. Your task is to rate the intensity on a scale from 1 to 10, where 1 represents minimal hate speech and 10 represents extreme hate speech. This evaluation is crucial for creating a dataset that researchers can utilize to filter and understand harmful content effectively."},
    {"role": "user", "content": content},
]

# `model` is the custom deployment name chosen for the fine-tuned model.
response = client.chat.completions.create(
    messages=chat_messages,
    model="gpt-35-turbo-ft",
)

# The first choice's message text is the predicted intensity.
print(response.choices[0].message.content)

6.0


In [None]:
# Retrieve the file ID of the first result file from the fine-tuning job
# for the customized model, then download it next to the notebook.
response = client.fine_tuning.jobs.retrieve(job_id)

# Everything that needs result_file_id lives inside this branch: previously a
# job that had not succeeded left the name undefined (NameError) or, worse,
# bound to a stale value from an earlier run.
if response.status == 'succeeded' and response.result_files:
    result_file_id = response.result_files[0]
    retrieve = client.files.retrieve(result_file_id)

    # Download the result file.
    print(f'Downloading result file: {result_file_id}')

    # Fetch the content before opening the destination so a failed download
    # does not leave an empty file behind.
    result = client.files.content(result_file_id).read()
    with open(retrieve.filename, "wb") as file:
        file.write(result)
else:
    print(f'No result file to download yet; job status is {response.status!r}.')