In [None]:
import pandas as pd
import numpy as np
import random
import os
import openai

#### GPT-4 configuration

In [None]:
# Azure OpenAI connection settings, applied to the module-level configuration
# attributes of the `openai` package.
azure_endpoint = "https://rumi-gpt4.openai.azure.com/"
# Do not hard-code the credential in the notebook: read it from the
# AZURE_OPENAI_API_KEY environment variable. Falls back to an empty string
# (the previous literal value) when the variable is not set.
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "")
api_version = "2024-02-15-preview"
openai.api_type = "azure"
openai.api_base = azure_endpoint
openai.api_key = api_key
openai.api_version = api_version

### Load Prompt

In [None]:
def load_prompt_fs(
    prompt_path='/Users/innerpiece92/Desktop/NLP_Workspace/AArec/prompt/generating/prompt_for_generating_user_profiles_fs.txt',
    few_shot_path='/Users/innerpiece92/Desktop/NLP_Workspace/AArec/prompt/generating/few_shot_for_generating_user_profiles.txt',
):
    """
    Reads and formats the prompt template with the few-shot examples.

    Args:
        prompt_path: Path of the prompt template. The template must contain a
            ``{few_shot_examples}`` placeholder. Defaults to the original
            hard-coded location.
        few_shot_path: Path of the text file whose content replaces the
            placeholder. Defaults to the original hard-coded location.

    Returns:
        The template text with ``{few_shot_examples}`` substituted.

    Raises:
        OSError: If either file cannot be opened.
        KeyError / IndexError: If the template contains other ``{...}`` fields,
            because ``str.format`` treats every brace pair as a replacement field.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(prompt_path, 'r', encoding='utf-8') as file:
        prompt = file.read()

    # The two reads are independent, so the second file is opened only after
    # the first handle has been closed.
    with open(few_shot_path, 'r', encoding='utf-8') as fewshot_file:
        few_shot_examples = fewshot_file.read()

    return prompt.format(few_shot_examples=few_shot_examples)

def load_prompt_params(sampled_AtA):
    """
    Constructs the query-specific string for the prompt.

    Args:
        sampled_AtA: The sampled atypical aspects. The value is interpolated
            with its ``str()`` form, so a list renders as ``['a', 'b']``.

    Returns:
        The aspects surrounded by the same blank-line / indentation padding
        the original template used.
    """
    # The template previously read `{<sampled_AtA>}' `, which is not a valid
    # f-string expression (SyntaxError). The angle brackets and the stray,
    # unmatched quote are removed; the surrounding whitespace layout is kept.
    prompt_params = f"\n    \n    {sampled_AtA}\n\n    "
    return prompt_params

In [None]:
def generateCreativeBio(prompt, prompt_params, engine="gpt-4", temperature=0, max_tokens=100):
    """
    Sends the prompt and the query-specific parameters to the chat model and
    returns the text of the first completion.

    The conversation consists of a fixed system message followed by two user
    messages: `prompt` and then `prompt_params`. The full conversation is
    printed before the request is made.

    NOTE: this relies on `openai.ChatCompletion`, the interface of the
    pre-1.0 `openai` SDK.

    Args:
        prompt: Instruction text sent as the first user message.
        prompt_params: Query-specific text sent as the second user message.
        engine: Deployment name passed to the API. Defaults to "gpt-4".
        temperature: Sampling temperature. Defaults to 0.
        max_tokens: Upper bound on generated tokens. Defaults to 100; raise it
            if replies come back cut off.

    Returns:
        The `content` of the first choice's message in the response.
    """
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant"},
        {"role": "user", "content": prompt},
        {"role": "user", "content": prompt_params}
    ]

    # Echo exactly what is about to be sent so runs can be inspected afterwards.
    formatted_messages = "\n".join([f"{message['role']}: {message['content']}" for message in messages])
    print("Messages being sent to GPT:\n", formatted_messages)

    response = openai.ChatCompletion.create(
        messages=messages,
        engine=engine,
        temperature=temperature,
        frequency_penalty=0,
        presence_penalty=0,
        max_tokens=max_tokens
    )
    result_content = response['choices'][0]['message']['content']
    return result_content

### Data Extraction

In [None]:
def data_extraction(df):
    """
    Collects the non-missing values of each annotation column into plain lists.

    Args:
        df: DataFrame that has every column named in `wanted_columns` below.

    Returns:
        A dict keyed by column name (in the order listed) whose values are the
        column's entries with `pd.isna` values removed.

    Raises:
        KeyError: If any of the expected columns is absent from `df`.
    """
    wanted_columns = (
        'review',
        'true_strong',
        'true_weak',
        'true_other',
        'true_strong_weak',
        'abs_true_strong',
        'abs_true_weak',
        'abs_true_other',
        'abs_true_strong_weak',
    )

    extracted = {}
    for column in wanted_columns:
        # Keep original order of the surviving entries; only drop missing cells.
        extracted[column] = [value for value in df[column] if not pd.isna(value)]
    return extracted


### Atypical Aspects Sampling

In [None]:
def sampling_atypical_aspects(atypical_aspects):
    """
    Select 1 to 5 atypical aspects according to distribution p(1) = 0.1, p(2) = 0.3, p(3) = 0.3, p(4) = 0.2, p(5) = 0.1.

    Sampling is without replacement over the entries of `atypical_aspects`,
    using NumPy's global random state.

    Args:
        atypical_aspects: Sequence of candidate aspects.

    Returns:
        A list of sampled aspects. When fewer candidates are available than the
        drawn count, all available candidates are returned (in random order);
        an empty input yields an empty list.
    """
    pool_size = len(atypical_aspects)
    if pool_size == 0:
        # Nothing to sample from; np.random.choice would raise on an empty pool.
        return []

    numbers = [1, 2, 3, 4, 5]
    probabilities = [0.1, 0.3, 0.3, 0.2, 0.1]

    sampling_number = np.random.choice(numbers, p=probabilities)
    # Without replacement, asking for more items than the pool holds raises
    # ValueError, so cap the drawn count at the pool size.
    sampling_number = min(int(sampling_number), pool_size)

    sampled_atypical_aspects = np.random.choice(atypical_aspects, size=sampling_number, replace=False).tolist()

    return sampled_atypical_aspects


### Atypical Aspects from 'dev' dataset

In [None]:
# Restaurants / dev split: read the sheet, extract the non-missing column values,
# then flatten the ', '-separated 'true_strong_weak' cells into one list of
# whitespace-trimmed aspects.
restaurants_dev_df = pd.read_excel(
    '/Users/innerpiece92/Desktop/NLP_Workspace/AARec/code/restaurants/dev.xlsx',
    engine='openpyxl',
)
restaurants_dev_data = data_extraction(restaurants_dev_df)
restaurants_AtA = [
    aspect.strip()
    for joined_aspects in restaurants_dev_data['true_strong_weak']
    for aspect in joined_aspects.split(', ')
]

In [None]:
# Hotels / dev split: read the sheet, extract the non-missing column values,
# then flatten the ', '-separated 'true_strong_weak' cells into one list of
# whitespace-trimmed aspects.
hotels_dev_df = pd.read_excel(
    '/Users/innerpiece92/Desktop/NLP_Workspace/AARec/code/hotels/dev.xlsx',
    engine='openpyxl',
)
hotels_dev_data = data_extraction(hotels_dev_df)
hotels_AtA = [
    aspect.strip()
    for joined_aspects in hotels_dev_data['true_strong_weak']
    for aspect in joined_aspects.split(', ')
]

In [None]:
# Hair salons / dev split: read the sheet, extract the non-missing column values,
# then flatten the ', '-separated 'true_strong_weak' cells into one list of
# whitespace-trimmed aspects.
hairsalons_dev_df = pd.read_excel(
    '/Users/innerpiece92/Desktop/NLP_Workspace/AARec/code/hair_salons/dev.xlsx',
    engine='openpyxl',
)
hairsalons_dev_data = data_extraction(hairsalons_dev_df)
hairsalons_AtA = [
    aspect.strip()
    for joined_aspects in hairsalons_dev_data['true_strong_weak']
    for aspect in joined_aspects.split(', ')
]

### Atypical Aspects from 'train-test' dataset

In [None]:
# Restaurants / train-test split: read the sheet, extract the non-missing column
# values, then flatten the ', '-separated 'true_strong_weak' cells into one list
# of whitespace-trimmed aspects.
restaurants_traintest_df = pd.read_excel(
    '/Users/innerpiece92/Desktop/NLP_Workspace/AARec/code/restaurants/train_test.xlsx',
    engine='openpyxl',
)
restaurants_traintest_data = data_extraction(restaurants_traintest_df)
restaurants_traintest_AtA = [
    aspect.strip()
    for joined_aspects in restaurants_traintest_data['true_strong_weak']
    for aspect in joined_aspects.split(', ')
]

In [None]:
# Hotels / train-test split: read the sheet, extract the non-missing column
# values, then flatten the ', '-separated 'true_strong_weak' cells into one list
# of whitespace-trimmed aspects.
hotels_traintest_df = pd.read_excel(
    '/Users/innerpiece92/Desktop/NLP_Workspace/AARec/code/hotels/train_test.xlsx',
    engine='openpyxl',
)
hotels_traintest_data = data_extraction(hotels_traintest_df)
hotels_traintest_AtA = [
    aspect.strip()
    for joined_aspects in hotels_traintest_data['true_strong_weak']
    for aspect in joined_aspects.split(', ')
]

In [None]:
# Hair salons / train-test split: read the sheet, extract the non-missing column
# values, then flatten the ', '-separated 'true_strong_weak' cells into one list
# of whitespace-trimmed aspects.
hairsalons_traintest_df = pd.read_excel(
    '/Users/innerpiece92/Desktop/NLP_Workspace/AARec/code/hair_salons/train_test.xlsx',
    engine='openpyxl',
)
hairsalons_traintest_data = data_extraction(hairsalons_traintest_df)
hairsalons_traintest_AtA = [
    aspect.strip()
    for joined_aspects in hairsalons_traintest_data['true_strong_weak']
    for aspect in joined_aspects.split(', ')
]

### Inference

In [None]:
# Sample atypical aspects from the hair-salon train/test pool, build both parts
# of the prompt, and request a generation. generateCreativeBio takes two
# required arguments (prompt, prompt_params); calling it with only the sampled
# aspects raised TypeError.
sampled_AtA = sampling_atypical_aspects(hairsalons_traintest_AtA)
prompt = load_prompt_fs()
prompt_params = load_prompt_params(sampled_AtA)
generateCreativeBio(prompt, prompt_params)