# Generative poems

## Prompting for un/rhyming poems

In [None]:
import sys
sys.path.append('../')
from generative_formalism import *

In [None]:
# Prompts sent to the models, grouped by prompt type (the dict keys).
# The prompt text itself is used as a lookup key: PROMPT_TO_TYPE is built from
# this dict, and collect_prev_genai_promptings keeps only stored rows whose
# prompt is an exact match for one of these strings.
# NOTE(review): spellings such as "an ryhmed" / "an rhyming" look like typos but
# presumably mirror the prompts recorded in the stored data -- confirm (and
# migrate that data) before correcting them.
PROMPTS = {
    # prompts that explicitly ask for rhyme or name a form/author
    'DO_rhyme': [
        'Write a poem in ballad stanzas.',
        "Write an ryhmed poem in the style of Shakespeare's sonnets.",
        'Write a long poem that does rhyme.',
        'Write a poem in the style of Emily Dickinson.',
        'Write a poem in heroic couplets.',
        'Write an rhyming poem.',
        'Write a poem (with 20+ lines) that rhymes.',
        'Write a poem that does rhyme.',
        'Write a short poem that does rhyme.'
    ],
    
    # prompts that explicitly ask for no rhyme or name a form/author
    'do_NOT_rhyme': [
        'Write a poem that does NOT rhyme.',
        'Write a poem (with 20+ lines) that does NOT rhyme.',
        'Write a long poem that does NOT rhyme.',
        'Write a poem in the style of Walt Whitman.',
        'Write a poem in free verse.',
        'Write a poem in blank verse.',
        'Write an unrhymed poem.',
        'Write a short poem that does NOT rhyme.'],
    # prompts that say nothing about rhyme
    'MAYBE_rhyme': [
        'Write a poem (with 20+ lines).',
        'Write a long poem.',
        'Write a poem in groups of two lines.',
        'Write a poem.',
        'Write a poem in stanzas of 4 lines each.',
        'Write a short poem.'
    ]
}


## map every prompt string back to the prompt type it is listed under
print('> Setting prompts from constant')
PROMPT_TO_TYPE = {
    prompt: prompt_type
    for prompt_type, prompt_list in PROMPTS.items()
    for prompt in prompt_list
}
# Both views are derived from the mapping's keys, so a prompt listed twice
# is counted once.
PROMPT_LIST = list(PROMPT_TO_TYPE)
PROMPT_SET = set(PROMPT_LIST)

print(f'  * {len(PROMPT_SET)} unique prompts')
print(f'  * {len(PROMPTS)} prompt types')

In [None]:
# Raw model identifiers considered in this notebook.
MODEL_LIST = [
    'claude-3-haiku-20240307',
    'claude-3-opus-20240229',
    'claude-3-sonnet-20240229',
    'deepseek/deepseek-chat',
    'gemini-pro',
    'gpt-3.5-turbo',
    'gpt-4-turbo',
    'ollama/llama3.1:70b',
    'ollama/llama3.1:8b',
    'ollama/olmo2',
    'ollama/olmo2:13b',
]

print('> Setting models from constant')
# Look-up tables from the raw identifier to the value returned by
# get_model_renamed / get_model_cleaned respectively.
MODEL_TO_TYPE = dict(zip(MODEL_LIST, map(get_model_renamed, MODEL_LIST)))
MODEL_TO_NAME = dict(zip(MODEL_LIST, map(get_model_cleaned, MODEL_LIST)))


print(f'  * {len(MODEL_LIST)} models (counting parameter changes)')
print(f'  * {len(set(MODEL_TO_NAME.values()))} model (discounting parameter changes)')
print(
    f'  * {len(set(MODEL_TO_TYPE.values()))} model providers '
    f'({", ".join(sorted(set(MODEL_TO_TYPE.values())))})'
)

In [None]:
def _load_pickled_promptings(path_pkl):
    """Load earlier generations from a pickled DataFrame.

    Rows with an empty ``prompt`` are dropped and the ``poem`` column is
    renamed to ``response``. Returns an empty DataFrame when ``path_pkl`` is
    unset or does not exist.
    """
    if not (path_pkl and os.path.exists(path_pkl)):
        return pd.DataFrame()

    print(f'  * Collecting from {path_pkl}')
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df = (
        pd.read_pickle(path_pkl)
        .fillna('')
        .query('prompt!=""')
        .rename(columns={'poem': 'response'})
    )
    print(f'    * {len(df)} generated poems')
    return df


def _load_json_promptings(path_json):
    """Load earlier generations from a gzip-compressed JSON list of records.

    Each record is read as ``{'prompt': {'user_prompt', 'model',
    'temperature'}, 'response': str}``. Returns an empty DataFrame when
    ``path_json`` is unset or does not exist.
    """
    import gzip

    if not (path_json and os.path.exists(path_json)):
        return pd.DataFrame()

    print(f'  * Collecting from {path_json}')
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with gzip.open(path_json, 'rt', encoding='utf-8') as f:
        records = json.load(f)

    rows = [
        {
            'model': d['prompt']['model'],
            'temp': d['prompt']['temperature'],
            'prompt': d['prompt']['user_prompt'],
            # keep only the text after the last '</think>' marker, if any
            'response': d['response'].split('</think>')[-1].strip(),
        }
        for d in records
    ]
    print(f'    * {len(rows)} generated poems')
    return pd.DataFrame(rows)


def collect_prev_genai_promptings(path_pkl=PATH_RAW_PKL, path_json=PATH_RAW_JSON, prompts=PROMPTS, min_lines=10, max_lines=100, overwrite=False,):
    """Collect, clean and de-duplicate previously generated poems.

    Rows are read from the pickle and gzipped-JSON sources, restricted to
    known prompts and known models (keys of ``MODEL_TO_TYPE``), cleaned with
    ``clean_genai_poem``, restricted to poems whose line count lies in
    ``[min_lines, max_lines]``, and de-duplicated on a hash of
    (model, temperature, prompt, text).

    Args:
        path_pkl: Path of the pickled DataFrame; skipped if falsy or missing.
        path_json: Path of the gzipped JSON file; skipped if falsy or missing.
        prompts: Mapping of prompt type -> list of prompt strings that defines
            which prompts are kept and how they are typed. ``None`` falls
            back to the module-level ``PROMPT_TO_TYPE``.
        min_lines: Minimum number of lines (inclusive) a poem must have.
        max_lines: Maximum number of lines (inclusive) a poem may have.
        overwrite: Forwarded to ``save_sample``.

    Returns:
        DataFrame indexed by ``id_hash`` (sorted) with the columns
        ``prompt_type, prompt, model, temp, txt, num_lines``. When neither
        source yields any row, an empty frame with that layout is returned
        and nothing is saved.
    """
    printm('#### Collecting previous genai promptings')

    # Honour the `prompts` argument instead of always using the global table.
    if prompts is None:
        prompt_to_type = PROMPT_TO_TYPE
    else:
        prompt_to_type = {
            prompt: prompt_type
            for prompt_type, prompt_list in prompts.items()
            for prompt in prompt_list
        }
    valid_prompts = set(prompt_to_type)
    valid_models = set(MODEL_TO_TYPE)
    out_cols = ['prompt_type', 'prompt', 'model', 'temp', 'txt', 'num_lines']

    df_prompts = pd.concat(
        [_load_pickled_promptings(path_pkl), _load_json_promptings(path_json)],
        ignore_index=True,
    )
    if df_prompts.empty:
        # Nothing to aggregate; avoid failing on missing columns and avoid
        # writing an empty corpus file.
        print('  * No previous promptings found')
        return pd.DataFrame(columns=out_cols, index=pd.Index([], name='id_hash'))

    print(f'  * Aggregated and filtered')
    # Filter before cleaning so discarded rows are never processed; .copy()
    # makes the column assignments below act on an independent frame.
    keep = df_prompts.prompt.isin(valid_prompts) & df_prompts.model.isin(valid_models)
    df_prompts = df_prompts[keep].copy()
    df_prompts['txt'] = df_prompts.response.apply(clean_genai_poem)
    df_prompts['prompt_type'] = df_prompts.prompt.map(prompt_to_type)

    print(f'    * {len(df_prompts):,} generated responses')
    print(f'    * {df_prompts.response.nunique():,} unique responses')
    print(f'    * {df_prompts.txt.nunique():,} unique poems')
    print(f'    * {df_prompts.prompt.nunique():,} unique prompts')
    print(f'    * {df_prompts.prompt_type.nunique():,} unique prompt types')

    # Temperatures may arrive as strings or '' (after fillna); coerce to float/NaN.
    df_prompts['temp'] = pd.to_numeric(df_prompts['temp'], errors='coerce')

    df_prompts['id_hash'] = [
        get_id_hash(get_id_hash_str(f'{model}__{temp:.4f}__{prompt}__{txt}'))
        for model, temp, prompt, txt in zip(
            df_prompts.model, df_prompts.temp, df_prompts.prompt, df_prompts.txt
        )
    ]
    df_prompts = df_prompts.sort_values('id_hash')

    # NOTE(review): the text is cleaned a second time after hashing, as in the
    # previous implementation; redundant if clean_genai_poem is idempotent --
    # confirm before removing.
    df_prompts['txt'] = df_prompts.txt.apply(clean_genai_poem)
    df_prompts['num_lines'] = df_prompts.txt.apply(get_num_lines)

    df_prompts = df_prompts[df_prompts.num_lines.between(min_lines, max_lines)]
    odf = df_prompts.drop_duplicates('id_hash').set_index('id_hash').sort_index()
    odf = odf[out_cols]
    save_sample(odf, f'{PATH_DATA}/corpus_genai_promptings.csv.gz', overwrite=overwrite)
    return odf

In [None]:
# Build the corpus of previously generated poems using the function's default
# paths, prompts and line-count limits.
df = collect_prev_genai_promptings()

In [None]:
# Bare expression: shows the collected corpus as the cell's output.
df

In [None]:
## add / remove models here (MODEL_LIST is a list, so use remove / append)
# MODEL_LIST.remove('ollama/darkmoon/olmo:7B-instruct-q6-k')
# MODEL_LIST.append('claude-4-sonnet')



### Replicating

In [None]:
# Demo: send the first no-rhyme prompt to the first model in MODEL_LIST and
# show the poem extracted from the raw response.
DEMO_MODEL = MODEL_LIST[0]
DEMO_PROMPT = PROMPTS['do_NOT_rhyme'][0]

printm('#### Demo for poem generation')
print(f'  * Demo model: {DEMO_MODEL}')
print(f'  * Demo prompt: {DEMO_PROMPT}')
print('\n>>>\n')
response = generate_text(DEMO_MODEL, DEMO_PROMPT, verbose=True)

printm('##### Extracting poem from response:')
printm(f"```{clean_genai_poem(response)}```")

In [None]:
def generate_more_poems(n=1000, models=MODEL_LIST, prompts=PROMPT_LIST, temperatures=None, verbose=True):
    """Request up to ``n`` new generations from randomly chosen model/prompt pairs.

    On every iteration a model, a prompt and a temperature are drawn at random
    and passed to ``generate_text``. A model whose call raises is dropped from
    the pool for the remaining iterations; the loop stops early once no models
    (or no prompts) are left. The responses are not collected here.

    Args:
        n: Maximum number of generation attempts.
        models: Iterable of model identifiers to sample from (not mutated).
        prompts: Iterable of prompt strings to sample from (not mutated).
        temperatures: Optional sequence of temperatures to sample from; when
            falsy, a temperature is drawn uniformly from [0.0, 1.0].
        verbose: Forwarded to ``generate_text``; also prints separators
            around each generation.

    Returns:
        None.
    """
    # Work on private lists: random.choice needs a sequence, and the caller's
    # (or the default) container must not be modified.
    models = list(models) if models else []
    prompts = list(prompts) if prompts else []

    # The context manager closes the progress bar even on early exit or error.
    with tqdm(total=n, position=0) as progress:
        for _ in range(n):
            if not models or not prompts:
                print('!!! No models or prompts left to sample from; stopping early')
                break
            model = random.choice(models)
            prompt = random.choice(prompts)
            temperature = round((random.choice(temperatures) if temperatures else random.uniform(0.0, 1.0)), 4)
            progress.set_description(f'>>> {model} ({temperature}): "{prompt}"')
            try:
                if verbose:
                    printm('----')
                generate_text(
                    model,
                    prompt,
                    temperature=temperature,
                    verbose=verbose
                )
                if verbose:
                    printm('----')
            except Exception as e:
                # Best effort: report why the model failed, then stop using it.
                print(f'!!! Error on model: {model} ({type(e).__name__}: {e})')
                models = [m for m in models if m != model]
            progress.update(1)
                

In [None]:
# Run the generation loop with its defaults (n=1000, all models and prompts,
# random temperatures), printing each generation.
generate_more_poems(verbose=True)