In [0]:
!pip install -r requirements.txt

In [0]:
import pandas as pd
from openai import AzureOpenAI
import yaml
import os

In [0]:
# Read the assessment dataset CSV into a DataFrame and render it in the cell output.
df_path = 'Dataset/normalization_assesment_dataset_10k.csv'
df = pd.read_csv(filepath_or_buffer=df_path)
display(df)

In [0]:
# Trim whitespace from the raw writer text, turn empty strings anywhere in the
# frame into missing values, then discard every row that has a missing value
# in any column.
df = (
    df.assign(raw_comp_writers_text=df['raw_comp_writers_text'].str.strip())
    .replace("", pd.NA)
    .dropna()
)

# Sanity check: count of missing raw-text entries remaining after the cleanup.
print(df['raw_comp_writers_text'].isna().sum())

In [0]:
def read_config(config_file):
    """Load a YAML configuration file and return its parsed contents.

    Args:
        config_file: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Pin the encoding: without it, open() decodes with the platform's locale
    # encoding, so non-ASCII values could load differently per machine.
    with open(config_file, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

# Read the Azure OpenAI connection settings from the local YAML config file.
config = read_config('configuration.yaml')

api_key = config['api_key']
api_base = config['api_base']
api_version = config['api_version']
model = config['model']  # used as the deployment segment of the URL below

# Build the client against the deployment-specific URL. A trailing slash on
# api_base is trimmed first so the URL never contains "//openai/...".
client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    base_url=f"{api_base.rstrip('/')}/openai/deployments/{model}",
)

In [0]:
# Template for the user message sent to the model. The {raw_text} placeholder
# is filled in with str.format by normalize_with_gpt.
prompt = """
Normalize the following raw text by  removing redundant information, and keeping only the writer names in the output.\n
Raw Text: "{raw_text}"
Normalized Text:
"""

def normalize_with_gpt(raw_text, few_shot_prompt):
    """Ask the chat model to reduce a raw writers string to writer names only.

    Uses the module-level ``client`` to send the request and the module-level
    ``model`` (the configured deployment name) as the ``model`` argument.

    Args:
        raw_text: The raw writers text to normalize.
        few_shot_prompt: Template for the user message; it is filled in with
            ``str.format(raw_text=raw_text)``, so it should contain a
            ``{raw_text}`` placeholder. The few-shot examples themselves are
            part of the system message below.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or an empty string when the response carries no content.
    """
    # Named differently from the module-level `prompt` template to avoid shadowing it.
    user_prompt = few_shot_prompt.format(raw_text=raw_text)

    # Call Azure OpenAI API
    response = client.chat.completions.create(
        model=model,  # deployment name loaded from the configuration file
        messages = [
            {"role": "system", "content": """You are a helpful assistant who aids with text normalization on music industry. The goal is to normalize given text, removing redundant information, and keeping only the writer names in the output. \n 
            Here are some examples: \n\n

            Example1:
            RAW TEXT: <Unknown>/Wright, Justyce Kaseem
            Normalized Text: Justyce Kaseem Wright
            Example 2:
            RAW TEXT: Pixouu/Abdou Gambetta/Copyright Control
            Normalized Text: Pixouu/Abdou Gambetta
            Example 3:
            RAW TEXT: Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING (UK) LIMITED
            Normalized Text: JERRY CHESNUT/Mike Hoyer"""},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=100,  # Limit the output tokens
        temperature=0,  # Minimize sampling randomness
        stop=["\n"]  # Stop generation at the first newline
    )

    # The message content may be None (no text returned); treat that as an
    # empty result instead of failing on None.strip().
    content = response.choices[0].message.content
    return (content or "").strip()


In [0]:
# Smoke test: run a few sample writer strings through the normalizer.
raw_texts = [
    "Jordan Riley/Adam Argyle/Copyright Control",
    "Budde Music/Lorenz Brunner",
    "Tony Grace/Rob DeBoer",
]

# Show each input next to the model's output, followed by a separator line.
for raw_text in raw_texts:
    normalized_text = normalize_with_gpt(raw_text, prompt)
    print(
        f"Raw Text: {raw_text}",
        f"Normalized Text: {normalized_text}",
        "-" * 40,
        sep="\n",
    )