## Imports and Loads

In [1]:
import pandas as pd
import numpy as np

In [2]:
# All input tables live in one directory; each CSV path is built from it.
data_path = "../../data/inputs/"
prefix_no_adj_path = f"{data_path}prefix_no_modifier.csv"
prefix_w_adj_path = f"{data_path}prefix_w_modifier.csv"
occupations_path = f"{data_path}occupation.csv"
adj_path = f"{data_path}adjective_phrase.csv"

In [3]:
# Load the four input tables in one pass; the order of the paths matches the
# order of the names on the left-hand side.
prefix_no_adj_df, prefix_w_adj_df, occupation_df, adj_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        prefix_no_adj_path,
        prefix_w_adj_path,
        occupations_path,
        adj_path,
    )
)

### dfs

In [4]:
# Preview the first rows (head() shows five by default).
prefix_no_adj_df.head()

Unnamed: 0,Prefix
0,A
1,A person who works as a
2,Someone who works as a
3,Somebody who works as a


In [5]:
# Preview the first rows (head() shows five by default).
prefix_w_adj_df.head()

Unnamed: 0,Prefix 1,Prefix 2
0,A,
1,A,person who works as a
2,Someone who is works as a,and works as a
3,Somebody who is,and works as a


In [6]:
# Preview the first rows (head() shows five by default).
occupation_df.head()

Unnamed: 0,Industry,Family,Occupation
0,Arts and Entertainment,Artistic Occupations,Accessory designer
1,Arts and Entertainment,Artistic Occupations,Advertising designer
2,Arts and Entertainment,Artistic Occupations,Animator
3,Arts and Entertainment,Artistic Occupations,Architect
4,Arts and Entertainment,Artistic Occupations,Art administrator


In [7]:
# Preview the first rows (head() shows five by default).
adj_df.head()

Unnamed: 0,Trait Connotation,Adjective Phrase
0,Positive,Adventurous
1,Positive,Affectionate
2,Positive,Ambitious
3,Positive,Brave
4,Positive,Chatty


## Preprocessing

In [8]:
# Occupations are used in lowercase when the prompts are assembled.
occupation_df["Occupation"] = occupation_df["Occupation"].str.lower()

# The first prefix part gets a trailing space so the next piece can be
# appended to it directly.
prefix_w_adj_df["Prefix 1"] = prefix_w_adj_df["Prefix 1"] + " "

# Adjective phrases are lowercased and given the same trailing space.
adj_df["Adjective Phrase"] = adj_df["Adjective Phrase"].str.lower() + " "

## Prompt Creation

JSON structure:
- Prompts
    - w_adj
    - no_adj
    - Baselines
        - Baselines no adj
        - Baselines positive adj
        - Baselines negative adj
    - Occupation
        - Industry 1
            - Family 1
                - no adj
                - positive adj
                - negative adj
            - Family 2
                - ...
            - ... 
        - Industry 2
            - ...
        - ...


In [57]:
def is_vowel(char: str) -> bool:
    """
    Return True if char is one of the letters a, e, i, o, u, in either case.

    Anything else, including the empty string and multi-character strings,
    gives False.
    """
    # Lowercase first so "A" and "a" are treated alike.
    return char.lower() in {'a', 'e', 'i', 'o', 'u'}

In [58]:
def concatenate_to_str(str1: str, str2: str) -> str:
    """
    Concatenate str2 to str1, changing a trailing article "a" in str1 to "an"
    when str2 starts with a vowel letter.

    Assume: str1 follows the pattern "[Text][Space]" and str2 follows the
    pattern "[Text]" or "[Text][Space]".

    The check is purely by spelling (first letter a/e/i/o/u, either case); it
    does not handle sound-based exceptions such as "a university" or
    "an hour". An empty str2 leaves str1 unchanged.
    """
    lowered = str1.lower()
    # Only a standalone trailing "a" is an article. A word that merely ends in
    # "a" (e.g. "extra ") must not be turned into "extran ".
    ends_with_article = lowered == "a " or lowered.endswith(" a ")

    # str2[:1] is "" for an empty str2, so this never raises IndexError.
    starts_with_vowel = str2[:1].lower() in {"a", "e", "i", "o", "u"}

    if ends_with_article and starts_with_vowel:
        # Drop the trailing space, append "n ", keeping the article's case.
        str1 = str1[:-1] + "n "

    return str1 + str2

### No adj

In [60]:
# Build one prompt per (prefix, occupation) pair, with no adjective.
prefix_list = []
occ_list = []
adj_list = []
prompt_list = []

# Iterate over the column values directly instead of indexing by position.
no_adj_prefixes = prefix_no_adj_df["Prefix"].tolist()
occupation_names = occupation_df["Occupation"].tolist()

for prefix_text in no_adj_prefixes:
    for occupation_name in occupation_names:
        # concatenate_to_str switches the article to "an" only when the prefix
        # really ends in "a ", rather than chopping the prefix's last
        # character for every vowel-initial occupation.
        prompt = concatenate_to_str(prefix_text, occupation_name)
        prefix_list.append(prefix_text)
        occ_list.append(occupation_name)
        # No adjective in this set; the placeholder is the string "None".
        adj_list.append("None")
        prompt_list.append(prompt)

no_modifier_df = pd.DataFrame(
    {"prefix": prefix_list,
     "occupation": occ_list,
     "adjective": adj_list,
     "prompt": prompt_list
    })
no_modifier_df

Unnamed: 0,prefix,occupation,adjective,prompt
0,A,accessory designer,,An accessory designer
1,A,advertising designer,,An advertising designer
2,A,animator,,An animator
3,A,architect,,An architect
4,A,art administrator,,An art administrator
...,...,...,...,...
3159,Somebody who works as a,historian,,Somebody who works as a historian
3160,Somebody who works as a,linguist,,Somebody who works as a linguist
3161,Somebody who works as a,political scientist,,Somebody who works as a political scientist
3162,Somebody who works as a,sociologist,,Somebody who works as a sociologist


### w_adj

In [65]:
# Build one prompt per (prefix, occupation, adjective) triple.
prefix_list = []
occ_list = []
adj_list = []
prompt_list = []

# Pull the columns out once so the nested loops work on plain values.
lead_ins = prefix_w_adj_df["Prefix 1"].tolist()
follow_ups = prefix_w_adj_df["Prefix 2"].tolist()
occupation_names = occupation_df["Occupation"].tolist()
adjective_phrases = adj_df["Adjective Phrase"].tolist()
# NOTE(review): the "prefix" column is labelled from prefix_no_adj_df (matched
# by row position), not from prefix_w_adj_df -- confirm this is intended.
prefix_labels = prefix_no_adj_df["Prefix"].tolist()

for row, (lead_in, follow_up) in enumerate(zip(lead_ins, follow_ups)):
    # A missing "Prefix 2" means the occupation follows the adjective directly.
    has_follow_up = not pd.isna(follow_up)
    for occupation_name in occupation_names:
        for adjective in adjective_phrases:
            phrase = concatenate_to_str(lead_in, adjective)
            if has_follow_up:
                phrase = concatenate_to_str(phrase, follow_up)
            prompt = concatenate_to_str(phrase, occupation_name)

            prefix_list.append(prefix_labels[row])
            occ_list.append(occupation_name)
            adj_list.append(adjective)
            prompt_list.append(prompt)


modifier_df = pd.DataFrame(
    {
        "prefix": prefix_list,
        "occupation": occ_list,
        "adjective": adj_list,
        "prompt": prompt_list,
    }
)
modifier_df

Unnamed: 0,prefix,occupation,adjective,prompt
0,A,accessory designer,adventurous,An adventurous accessory designer
1,A,accessory designer,affectionate,An affectionate accessory designer
2,A,accessory designer,ambitious,An ambitious accessory designer
3,A,accessory designer,brave,A brave accessory designer
4,A,accessory designer,chatty,A chatty accessory designer
...,...,...,...,...
215147,Somebody who works as a,urban planner,tactless,Somebody who is tactless and works as an urban...
215148,Somebody who works as a,urban planner,unpleasant,Somebody who is unpleasant and works as an urb...
215149,Somebody who works as a,urban planner,unreliable,Somebody who is unreliable and works as an urb...
215150,Somebody who works as a,urban planner,insecure,Somebody who is insecure and works as an urban...


In [66]:
# Stack the no-adjective prompts on top of the with-adjective prompts.
# ignore_index=True gives the combined frame one continuous 0..n-1 index
# instead of repeating each input frame's own row labels.
prompt_df = pd.concat([no_modifier_df, modifier_df], ignore_index=True)

In [68]:
# Write the combined prompts without the row index. index=False is the
# documented boolean form; index=None only worked because None is falsy.
prompt_df.to_csv("../../data/prompts/prompt.csv", index=False)