In [1]:
# imports

from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
import os
import random
import time
import itertools
import pandas as pd
from collections import defaultdict
import re

In [2]:
# load API key

dotenv_path = Path(r"C:\Storage\python_projects\ashvin\.env")
load_dotenv(dotenv_path=dotenv_path)

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [3]:
# main constants

GPT_MODEL_TEXT_ALIAS = "gpt-4-turbo-preview"
GPT_MODEL_VISION_ALIAS = "gpt-4-vision-preview"

# additional constants as at 31/01/24
GPT_MODEL_TEXT = "gpt-4-0125-preview"
IMAGE_MODEL = "dall-e-3"
EMBEDDINGS_MODEL = "text-moderation-latest"

In [4]:
# variations
n = 300
runs = 3

In [17]:
# simple prompt
base_prompt = f"""
Generate {n} trisociations as a list.
"""

instruction_prompt = f"""
Generate {n} trisociations as a list. No additional comments or elaborations.
"""
format_prompt = f"""
Generate {n} trisociations in the format below. No additional comments or elaborations.
1. Word - Word - Word
2. Word - Word - Word
...
"""
language_prompt = f"""
Generate {n} trisociations in the format below. No additional comments or elaborations.
1. Word (language) - Word (language) - Word (language)
2. Word (language) - Word (language) - Word (language)
...
"""
unique_prompt = f"""
Generate {n} trisociations in the format below. No additional comments or elaborations. Make each word unique.
1. Word - Word - Word
2. Word - Word - Word
...
"""
avoid_prompt = f"""
Generate {n} trisociations in the format below. No additional comments or elaborations. avoid duplicate words within and across trisociations.
1. Word - Word - Word
2. Word - Word - Word
...
"""

In [12]:
#instantiate client
client = OpenAI()

In [18]:
final_prompt = avoid_prompt

In [19]:
# Function to perform a single run and collect data
def perform_run(prompt_text):
    start_time = time.time()
    completion = client.chat.completions.create(
        model=GPT_MODEL_TEXT_ALIAS,
        messages=[
            {"role": "system", "content": prompt_text},
        ]
    )
    end_time = time.time()
    
    trisociation_answer = completion.choices[0].message.content
    execution_time = end_time - start_time
    prompt_tokens = completion.usage.prompt_tokens
    completion_tokens = completion.usage.completion_tokens
    total_tokens = completion.usage.total_tokens

    return {
        'execution_time': execution_time,
        'prompt_tokens': prompt_tokens,
        'completion_tokens': completion_tokens,
        'total_tokens': total_tokens,
        'trisociations': trisociation_answer
    }

In [20]:
# List to store results from all runs
results = []

# Perform multiple runs
for _ in range(runs):
    result = perform_run(final_prompt)
    results.append(result)

# Calculate average metrics
avg_execution_time = sum(r['execution_time'] for r in results) / runs
avg_prompt_tokens = sum(r['prompt_tokens'] for r in results) / runs
avg_completion_tokens = sum(r['completion_tokens'] for r in results) / runs
avg_total_tokens = sum(r['total_tokens'] for r in results) / runs

# Print out the results
print(f"Average execution time: {avg_execution_time:.2f} seconds")
print(f"Average prompt tokens: {avg_prompt_tokens:.2f}")
print(f"Average completion tokens: {avg_completion_tokens:.2f}")
print(f"Average total tokens: {avg_total_tokens:.2f}")

# Additionally, print each run's trisociations
for i, r in enumerate(results, 1):
    print(f"Run {i} trisociations:\n{r['trisociations']}\n")

Average execution time: 167.88 seconds
Average prompt tokens: 55.00
Average completion tokens: 3063.00
Average total tokens: 3118.00
Run 1 trisociations:
1. Mountain - River - Valley
2. Chocolate - Vanilla - Strawberry
3. Sun - Moon - Stars
4. Winter - Spring - Summer
5. Pencil - Pen - Marker
6. Lion - Tiger - Bear
7. Bread - Cheese - Wine
8. Triangle - Square - Circle
9. Gold - Silver - Bronze
10. Rain - Snow - Hail
11. Book - Magazine - Newspaper
12. Apple - Banana - Cherry
13. Table - Chair - Sofa
14. Guitar - Piano - Violin
15. Red - Blue - Yellow
16. Hat - Scarf - Gloves
17. Coffee - Tea - Milk
18. Car - Bicycle - Train
19. Clock - Watch - Calendar
20. Toothbrush - Toothpaste - Floss
21. Fork - Knife - Spoon
22. Ocean - Sea - Lake
23. Eagle - Falcon - Hawk
24. Rose - Tulip - Daisy
25. Painting - Sculpture - Drawing
26. Duck - Goose - Swan
27. Onion - Garlic - Ginger
28. Candle - Lamp - Flashlight
29. Cricket - Football - Basketball
30. Cowboy - Knight - Pirate
31. Bee - Butterfly 

In [16]:
def per_run_aggregate_summary(results):
    detailed_data = []
    for run_index, result in enumerate(results, start=1):
        trisociations = result['trisociations'].strip()
        trisociation_list = trisociations.split('\n')

        for trisociation_index, trisociation in enumerate(trisociation_list, start=1):
            words = re.sub(r'^\d+\.\s*', '', trisociation).replace(' - ', ' ').split()
            for word in words:
                detailed_data.append({'run': run_index, 'word': word})

    df = pd.DataFrame(detailed_data)

    summary_stats = []
    unique_runs = df['run'].unique()
    
    for run in sorted(unique_runs):
        run_df = df[df['run'] == run]
        
        total_words = run_df.shape[0]
        total_unique_words = run_df['word'].nunique()
        total_duplicates = run_df[run_df.duplicated('word', keep=False)]['word'].nunique()
        
        if not run_df[run_df.duplicated('word', keep=False)].empty:
            mode_duplicate = run_df[run_df.duplicated('word', keep=False)]['word'].value_counts().idxmax()
            max_dup_freq = run_df[run_df.duplicated('word', keep=False)]['word'].value_counts().max()
            modal_dup_freq = run_df[run_df.duplicated('word', keep=False)]['word'].value_counts().mode()[0]
        else:
            mode_duplicate = 'None'
            max_dup_freq = 0
            modal_dup_freq = 0
        
        min_dup_freq = run_df[run_df.duplicated('word', keep=False)]['word'].value_counts().min() if not run_df[run_df.duplicated('word', keep=False)].empty else 0
        
        summary_stats.append({
            'Total Words': total_words,
            'Total Unique Words': total_unique_words,
            'Total Duplicates': total_duplicates,
            'Max Dup Word': mode_duplicate,
            'Max Dup Freq': max_dup_freq,
            'Modal Dup Freq': modal_dup_freq,
            'Min Dup Freq': min_dup_freq
        })
    
    summary_df = pd.DataFrame(summary_stats, index=[f'Run {run}' for run in sorted(unique_runs)])
    
    # Apply styling
    styled_df = summary_df.style\
        .format({'Total Words': "{:}", 'Total Unique Words': "{:}", 'Total Duplicates': "{:}", 
                 'Max Dup Word': "{}", 'Max Dup Freq': "{:}", 'Modal Dup Freq': "{:}", 'Min Dup Freq': "{:}"})\
        .set_table_styles([{
            'selector': 'th',
            'props': [
                ('background-color', '#f4f4f4'),
                ('color', '#6d6d6d'),
                ('font-weight', 'bold')
            ]}])\
        .set_caption("Summary Statistics per Run")\
        .set_properties(**{'text-align': 'left'})

    return styled_df

# Assuming 'results' is populated from previous code blocks
# Simply call the function to display the styled DataFrame in your Jupyter Notebook
per_run_aggregate_summary_styled = per_run_aggregate_summary(results)
per_run_aggregate_summary_styled


Unnamed: 0,Total Words,Total Unique Words,Total Duplicates,Max Dup Word,Max Dup Freq,Modal Dup Freq,Min Dup Freq
Run 1,900,735,122,Glacier,5,2,2
Run 2,941,620,167,Soda,5,2,2
Run 3,900,693,137,Zenith,7,2,2


In [None]:
# task 1 : display the duplicates in a nicer visual. a table maybe is simple with pandas?
# task 2 : can the duplicates code handle all the prompt type returns
# task 3 : do the mega trisociation prompt and see if any real difference along with language
# task 4 : do cosine similarity
# task 5 : display the base run data in a nicer format
# task 6 : what about a word cloud?
# task 7 : compare it to just generating a word list from a txt file
# task 8 : maybe modify the format prompt to be something like try diverse variations. diverse_prompt.
# task 9 : vary temperature, top p. Maybe that's a separate experiment. maybe that's the main one here.
# task 10 : display each run details before computing the visual
# task 11 : try lousier GPT Models for cheaper inference

In [21]:
def detailed_duplicate_analysis(results):
    data = []
    for run_index, result in enumerate(results, start=1):
        trisociations = result['trisociations'].strip()
        trisociation_list = trisociations.split('\n')

        for trisociation_index, trisociation in enumerate(trisociation_list, start=1):
            words = re.sub(r'^\d+\.\s*', '', trisociation).replace(' - ', ' ').split()
            for word in words:
                data.append({'run': run_index, 'trisociation_index': trisociation_index, 'word': word})
    
    df = pd.DataFrame(data)
    
    # Identifying duplicates
    duplicate_words = df[df.duplicated('word', keep=False)]
    
    # Count and detail aggregation for duplicates
    detailed_duplicates = duplicate_words.groupby('word').agg(
        count=('word', 'size'),  # Use 'size' instead of 'count' for aggregation to get the actual count of rows
        runs=('run', lambda x: sorted(list(set(x)))),
        trisociation_indices=('trisociation_index', lambda x: sorted(list(set(x))))
    ).reset_index()

    # Sorting by 'count' in descending order
    detailed_duplicates = detailed_duplicates.sort_values(by='count', ascending=False).reset_index(drop=True)

    return detailed_duplicates

detailed_df = detailed_duplicate_analysis(results)

styled_df = detailed_df.style\
    .format({'count': '{:.0f}'})\
    .background_gradient(subset='count', cmap='Blues')\
    .set_properties(**{'text-align': 'left'})\
    .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])\
    .highlight_max(subset='count', color='green')\
    .highlight_min(subset='count', color='red')

styled_df

Unnamed: 0,word,count,runs,trisociation_indices
0,Management,20,[2],"[259, 260, 261, 262, 263, 265, 266, 273, 274, 277, 278, 279, 280, 282, 283, 284, 285, 293, 294, 295]"
1,Studies,15,[2],"[159, 163, 164, 165, 166, 245, 246, 247, 252, 253, 254, 255, 256, 257, 258]"
2,Willow,13,"[1, 2, 3]","[18, 56, 126, 157, 184, 203, 209, 219, 235, 237, 251, 273, 287]"
3,Plateau,11,[2],"[39, 53, 75, 108, 122, 148, 174, 200, 226, 252, 278]"
4,Maple,11,"[1, 2, 3]","[14, 46, 53, 67, 82, 111, 132, 199, 203, 218, 277]"
5,Cliff,10,"[1, 2]","[15, 118, 135, 161, 173, 187, 213, 239, 265, 291]"
6,Falcon,10,"[1, 2, 3]","[12, 18, 23, 53, 55, 63, 91, 144, 225, 299]"
7,Ridge,10,"[2, 3]","[72, 120, 139, 165, 179, 191, 217, 243, 269, 295]"
8,Jasmine,9,"[1, 2, 3]","[39, 43, 88, 130, 170, 196, 248, 257, 274]"
9,Elm,9,"[1, 2, 3]","[34, 62, 139, 191, 217, 219, 269, 276, 295]"


In [None]:
# trisociation prompt

TRISOCIATION_PROMPT=f"""

---

**Strict Exclusion List**
_Before embarking on the task, it is imperative to note that the following words are strictly prohibited from use in the trisociations:_

- Hiraeth
- Zephyr
- Ubuntu
- Petrichor
- Yugen
- Saudade
- Komorebi
- Tsundoku
- Cwtch
- Wabi-Sabi
- Mamihlapinatapai
- Fernweh
- Flaneur
- Hygge
- Mangata
- Samadhi
- Apricity

---

**Guidelines for Diverse and Unique Trisociations:**
As an advanced algorithm with access to a vast vector database of semantic embeddings, your role is to facilitate the trisociation process — a creative endeavor that interlaces three semantically distant concepts. Your mission is to construct trisociations that are linguistically diverse and conceptually distinctive, avoiding commonly used words and uncovering hidden linguistic gems.

---

_Please ensure that none of the words in the strict exclusion list are used in the trisociations._

1. **Explore the Uncommon**: Avoid frequently chosen words and seek out less common vocabulary.
2. **Prioritize Lesser-Known Vocabulary**: Focus on words that are less commonly used in creative writing, poetry, and literary contexts, aiming to uncover unique and underrepresented words.
3. **Thematic Diversity**: Incorporate words from themes like technology, mythology, professions, emotions, and weather phenomena, avoiding the obvious choices.
4. **Historical and Cultural Significance**: Choose words with historical or cultural resonance, especially those that are less known or underrepresented.
5. **Vibrant Language Use**: Include regional dialects or slang, with a focus on those that are less frequently used.
6. **Varied Parts of Speech**: Mix nouns, verbs, adjectives, and adverbs, especially those that are less common in trisociation contexts.
7. **Sensory Exploration**: Select sensory words that are vivid yet not the usual go-to options.
8. **Abstract and Tangible Fusion**: Blend abstract concepts with concrete items in unconventional ways.
9. **Language Family Diversity**: Draw from a wide array of language families, especially focusing on those less represented in common discourse.
10. **Creative Figurative Language**: Use metaphors, similes, and symbolic language, avoiding clichés and common expressions.
11. **Wide Emotional Range**: Cover a spectrum of emotions with less typical word choices.
12. **Temporal and Cultural Fusion**: Connect different eras and cultures, going beyond well-trodden paths.
13. **Lesser-Known Language Exploration**: Seek out words from lesser-known languages and dialects.
14. **Balanced Elements**: Ensure that each word contributes equally to the trisociation, without any overshadowing the others.
15. **Semantic Uniqueness**: Ensure that each trisociation stands out in its conceptual composition, distinct from typical combinations.
16. **Language Identification**: In each trisociation, identify the languages of the chosen words. This highlights the linguistic diversity and encourages the exploration of a variety of languages, including lesser-known ones.

---

Task:

Generate {n} Trisociations adhering to the above guidelines. Each trisociation should be a creative blend of three semantically distant nouns, showcasing linguistic innovation and conceptual uniqueness. 
The words in the exclusion list should not be part of any trisociation. No additional comments, explanations or elaborations beyond the trisociation itself.

Output Format :

1. Word (language) - Word (language) - Word (language)
2. Word (language) - Word (language) - Word (language)
...

"""