In [30]:
# imports

from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
import os
import random
import time
import itertools
import pandas as pd
from collections import defaultdict
import re

In [31]:
# Load environment variables from the project's .env file, then read the API key.
# NOTE(review): this is an absolute, machine-specific path — confirm before sharing the notebook.
dotenv_path = Path(r"C:\Storage\python_projects\ashvin\.env")
load_dotenv(dotenv_path=dotenv_path)

# None when the variable is absent from the environment
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [32]:
# main constants

# Model ids used for text and vision requests
GPT_MODEL_TEXT_ALIAS = "gpt-4-turbo-preview"
GPT_MODEL_VISION_ALIAS = "gpt-4-vision-preview"

# additional constants as at 31/01/24
GPT_MODEL_TEXT = "gpt-4-0125-preview"
IMAGE_MODEL = "dall-e-3"
# EMBEDDINGS_MODEL previously held "text-moderation-latest", which is a moderation
# model id and cannot be used to produce embeddings. It now names an embeddings
# model; the moderation id is kept under its own, correctly named constant.
EMBEDDINGS_MODEL = "text-embedding-3-small"
MODERATION_MODEL = "text-moderation-latest"

In [33]:
# variations
# n    : number of trisociations requested; interpolated into the prompt f-strings
# runs : number of times the selected prompt is sent to the model
n = 100
runs = 3

In [34]:
# Prompt variants, from least to most constrained. Each is an f-string, so the
# value of `n` is baked in at the moment this cell runs.

# simple prompt
base_prompt = f"""
Generate {n} trisociations as a list.
"""

# same request, plus an instruction to suppress extra commentary
instruction_prompt = f"""
Generate {n} trisociations as a list. No additional comments or elaborations.
"""
# additionally prescribes a numbered "Word - Word - Word" line format
format_prompt = f"""
Generate {n} trisociations in the format below. No additional comments or elaborations.
1. Word - Word - Word
2. Word - Word - Word
...
"""
# as format_prompt, but every word is followed by its language in parentheses
language_prompt = f"""
Generate {n} trisociations in the format below. No additional comments or elaborations.
1. Word (language) - Word (language) - Word (language)
2. Word (language) - Word (language) - Word (language)
...
"""

In [35]:
# Instantiate the OpenAI client; no api_key argument is passed here.
# NOTE(review): presumably the client reads OPENAI_API_KEY from the environment
# populated by load_dotenv — confirm.
client = OpenAI()

In [36]:
# Select the prompt variant that the runs will send to the model
final_prompt = format_prompt

In [37]:
# Function to perform a single run and collect data
def perform_run(prompt_text, model=None):
    """Send ``prompt_text`` to the chat completions endpoint once and collect metrics.

    Args:
        prompt_text: The prompt, sent as the only message of the request.
        model: Optional model id. Defaults to ``GPT_MODEL_TEXT_ALIAS`` when omitted,
            which is what this function always used before the parameter existed.

    Returns:
        dict with the keys
            'execution_time'    - seconds spent in the API call (float),
            'prompt_tokens'     - taken from ``completion.usage``,
            'completion_tokens' - taken from ``completion.usage``,
            'total_tokens'      - taken from ``completion.usage``,
            'trisociations'     - text of the first choice ("" if the API returned no text).
    """
    chosen_model = GPT_MODEL_TEXT_ALIAS if model is None else model

    # perf_counter() is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    completion = client.chat.completions.create(
        model=chosen_model,
        messages=[
            # NOTE(review): the prompt is sent as a lone "system" message with no
            # "user" message — confirm this is intended.
            {"role": "system", "content": prompt_text},
        ],
    )
    execution_time = time.perf_counter() - start_time

    usage = completion.usage
    # message.content can be None; normalise to "" so later .strip()/.split()
    # calls on the answer do not fail.
    trisociation_answer = completion.choices[0].message.content or ""

    return {
        'execution_time': execution_time,
        'prompt_tokens': usage.prompt_tokens,
        'completion_tokens': usage.completion_tokens,
        'total_tokens': usage.total_tokens,
        'trisociations': trisociation_answer,
    }

In [38]:
# Gather one result dict per run
results = []
for _ in range(runs):
    results.append(perform_run(final_prompt))


def _mean_over_runs(metric):
    """Sum ``metric`` over every collected result and divide by ``runs``."""
    return sum(entry[metric] for entry in results) / runs


# Per-metric averages across the runs
avg_execution_time = _mean_over_runs('execution_time')
avg_prompt_tokens = _mean_over_runs('prompt_tokens')
avg_completion_tokens = _mean_over_runs('completion_tokens')
avg_total_tokens = _mean_over_runs('total_tokens')

# Summary report
print(f"Average execution time: {avg_execution_time:.2f} seconds")
print(f"Average prompt tokens: {avg_prompt_tokens:.2f}")
print(f"Average completion tokens: {avg_completion_tokens:.2f}")
print(f"Average total tokens: {avg_total_tokens:.2f}")

# Raw answer of each run, numbered from 1
for i, r in enumerate(results, start=1):
    print(f"Run {i} trisociations:\n{r['trisociations']}\n")

Average execution time: 35.21 seconds
Average prompt tokens: 44.00
Average completion tokens: 906.67
Average total tokens: 950.67
Run 1 trisociations:
1. Ocean - Sand - Sunlight
2. Mountain - Stars - Solitude
3. Laughter - Family - Tea
4. Thunder - Rain - Shelter
5. Books - Imagination - Journey
6. Paint - Canvas - Inspiration
7. Piano - Melody - Emotion
8. Coffee - Morning - Quiet
9. Forest - Mystery - Green
10. Snow - Silence - Whiteness
11. Bread - Warmth - Home
12. Night - Moon - Dreams
13. Guitar - Notes - Expression
14. Street - Lights - Movement
15. Dance - Rhythm - Freedom
16. Paper - Ink - Thoughts
17. Fire - Warmth - Stories
18. Clock - Time - Memories
19. Clouds - Sky - Flight
20. Gold - Treasure - Quest
21. Garden - Flowers - Bees
22. Wine - Friends - Conversation
23. Ice - Cold - Clarity
24. Train - Tracks - Destination
25. Chocolate - Sweet - Pleasure
26. Keys - Door - Opportunities
27. Wind - Leaves - Dance
28. Tea - Comfort - Ritual
29. Bridge - River - Connection
30. F

In [39]:
# Function to analyze trisociations for duplicate words within each run
def analyze_duplicates_per_run(results):
    """Print, for every run, the words that occur more than once within that run.

    Args:
        results: Sequence of dicts, each with a 'trisociations' entry holding the
            raw text answer, one trisociation per line, e.g.
            "1. Word - Word - Word" or
            "1. Word (language) - Word (language) - Word (language)".

    Returns:
        None. The report is written to stdout.

    Parsing rules:
        * Blank lines are skipped, so reported positions count trisociations
          rather than raw lines.
        * Leading list numbering ("12." or "12)") is removed.
        * Parenthesised annotations such as "(Japanese)" are removed, so language
          tags are not reported as duplicate words.
        * Every remaining whitespace-separated token is a word; matching is
          case-sensitive.
    """
    for i, result in enumerate(results, 1):
        raw_answer = result['trisociations'] or ""  # tolerate a missing (None) answer
        trisociation_list = [
            line.strip() for line in raw_answer.splitlines() if line.strip()
        ]
        word_map = defaultdict(lambda: {'count': 0, 'trisociation_indices': []})

        for index, trisociation in enumerate(trisociation_list, start=1):
            cleaned = re.sub(r'^\d+[.)]\s*', '', trisociation)  # drop "1." style numbering
            cleaned = re.sub(r'\([^)]*\)', ' ', cleaned)         # drop "(language)" tags
            for word in cleaned.replace(' - ', ' ').split():
                entry = word_map[word]
                entry['count'] += 1
                entry['trisociation_indices'].append(index)

        # Keep only the words seen more than once in this run
        duplicates = {word: info for word, info in word_map.items() if info['count'] > 1}

        print(f"Run {i}: Duplicate word analysis")
        if duplicates:
            for word, info in duplicates.items():
                indices = ', '.join(map(str, info['trisociation_indices']))
                print(f"Word '{word}' is used {info['count']} times in trisociations: {indices}")
        else:
            print("No duplicate words found in this run.")
        print("\n")  # Add a newline for better separation between runs

# Report repeated words separately for each run in `results`
analyze_duplicates_per_run(results)


Run 1: Duplicate word analysis
Word 'Ocean' is used 3 times in trisociations: 1, 36, 77
Word 'Sand' is used 2 times in trisociations: 1, 82
Word 'Mountain' is used 2 times in trisociations: 2, 31
Word 'Stars' is used 2 times in trisociations: 2, 66
Word 'Tea' is used 2 times in trisociations: 3, 28
Word 'Rain' is used 2 times in trisociations: 4, 48
Word 'Journey' is used 2 times in trisociations: 5, 37
Word 'Morning' is used 2 times in trisociations: 8, 78
Word 'Mystery' is used 4 times in trisociations: 9, 59, 76, 95
Word 'Snow' is used 2 times in trisociations: 10, 74
Word 'Silence' is used 2 times in trisociations: 10, 59
Word 'Bread' is used 2 times in trisociations: 11, 69
Word 'Warmth' is used 3 times in trisociations: 11, 17, 67
Word 'Night' is used 4 times in trisociations: 12, 56, 63, 76
Word 'Dreams' is used 4 times in trisociations: 12, 52, 76, 100
Word 'Guitar' is used 2 times in trisociations: 13, 39
Word 'Dance' is used 2 times in trisociations: 15, 27
Word 'Freedom' is 

In [40]:
# Function to analyze trisociations for duplicate words across all runs combined
def analyze_duplicates_across_runs(results):
    """Print every word that occurs more than once when all runs are pooled.

    Args:
        results: Sequence of dicts, each with a 'trisociations' entry holding the
            raw text answer, one trisociation per line, e.g.
            "1. Word - Word - Word" or
            "1. Word (language) - Word (language) - Word (language)".

    Returns:
        None. The report is written to stdout; each duplicate word is followed by
        the (run, trisociation) positions where it was seen.

    Parsing rules:
        * Blank lines are skipped, so reported positions count trisociations
          rather than raw lines.
        * Leading list numbering ("12." or "12)") is removed.
        * Parenthesised annotations such as "(Japanese)" are removed, so language
          tags are not reported as duplicate words.
        * Every remaining whitespace-separated token is a word; matching is
          case-sensitive.
    """
    # word -> total count plus the list of (run, trisociation) positions
    all_words_map = defaultdict(lambda: {'count': 0, 'details': []})

    for run_index, result in enumerate(results, start=1):
        raw_answer = result['trisociations'] or ""  # tolerate a missing (None) answer
        trisociation_list = [
            line.strip() for line in raw_answer.splitlines() if line.strip()
        ]

        for trisociation_index, trisociation in enumerate(trisociation_list, start=1):
            cleaned_trisociation = re.sub(r'^\d+[.)]\s*', '', trisociation)  # drop numbering
            cleaned_trisociation = re.sub(r'\([^)]*\)', ' ', cleaned_trisociation)  # drop "(language)"
            for word in cleaned_trisociation.replace(' - ', ' ').split():
                entry = all_words_map[word]
                entry['count'] += 1
                entry['details'].append((run_index, trisociation_index))

    # Keep only the words seen more than once over all runs
    duplicates_across_runs = {word: info for word, info in all_words_map.items() if info['count'] > 1}

    if duplicates_across_runs:
        print("Duplicate words across runs and their occurrences:")
        for word, info in duplicates_across_runs.items():
            print(f"Word '{word}' is used {info['count']} times across runs.")
            for detail in info['details']:
                print(f" - In Run {detail[0]}, Trisociation {detail[1]}")
            print()  # Blank line for readability
    else:
        print("No duplicate words found across all runs.")

# Report words repeated anywhere in the pooled output of all runs in `results`
analyze_duplicates_across_runs(results)


Duplicate words across runs and their occurrences:
Word 'Ocean' is used 6 times across runs.
 - In Run 1, Trisociation 1
 - In Run 1, Trisociation 36
 - In Run 1, Trisociation 77
 - In Run 2, Trisociation 5
 - In Run 2, Trisociation 25
 - In Run 3, Trisociation 1

Word 'Sand' is used 3 times across runs.
 - In Run 1, Trisociation 1
 - In Run 1, Trisociation 82
 - In Run 2, Trisociation 13

Word 'Mountain' is used 6 times across runs.
 - In Run 1, Trisociation 2
 - In Run 1, Trisociation 31
 - In Run 2, Trisociation 18
 - In Run 2, Trisociation 52
 - In Run 2, Trisociation 79
 - In Run 3, Trisociation 3

Word 'Stars' is used 2 times across runs.
 - In Run 1, Trisociation 2
 - In Run 1, Trisociation 66

Word 'Tea' is used 3 times across runs.
 - In Run 1, Trisociation 3
 - In Run 1, Trisociation 28
 - In Run 2, Trisociation 34

Word 'Thunder' is used 6 times across runs.
 - In Run 1, Trisociation 4
 - In Run 2, Trisociation 3
 - In Run 2, Trisociation 43
 - In Run 2, Trisociation 58
 - I

In [None]:
# task 1 : display the duplicates in a nicer visual format
# task 2 : check whether the duplicates code can handle the output of every prompt type
# task 3 : run the mega trisociation prompt and see whether it makes any real difference, including with the language annotations
# task 4 : compute cosine similarity
# task 5 : display the base run data in a nicer format
# task 6 : consider a word cloud

In [None]:
# trisociation prompt
# Long-form prompt made of an exclusion list, sixteen numbered guidelines, the
# task statement and the required output format. It is an f-string: {n} is
# substituted with the current value of `n` when this cell runs.

TRISOCIATION_PROMPT=f"""

---

**Strict Exclusion List**
_Before embarking on the task, it is imperative to note that the following words are strictly prohibited from use in the trisociations:_

- Hiraeth
- Zephyr
- Ubuntu
- Petrichor
- Yugen
- Saudade
- Komorebi
- Tsundoku
- Cwtch
- Wabi-Sabi
- Mamihlapinatapai
- Fernweh
- Flaneur
- Hygge
- Mangata
- Samadhi
- Apricity

---

**Guidelines for Diverse and Unique Trisociations:**
As an advanced algorithm with access to a vast vector database of semantic embeddings, your role is to facilitate the trisociation process — a creative endeavor that interlaces three semantically distant concepts. Your mission is to construct trisociations that are linguistically diverse and conceptually distinctive, avoiding commonly used words and uncovering hidden linguistic gems.

---

_Please ensure that none of the words in the strict exclusion list are used in the trisociations._

1. **Explore the Uncommon**: Avoid frequently chosen words and seek out less common vocabulary.
2. **Prioritize Lesser-Known Vocabulary**: Focus on words that are less commonly used in creative writing, poetry, and literary contexts, aiming to uncover unique and underrepresented words.
3. **Thematic Diversity**: Incorporate words from themes like technology, mythology, professions, emotions, and weather phenomena, avoiding the obvious choices.
4. **Historical and Cultural Significance**: Choose words with historical or cultural resonance, especially those that are less known or underrepresented.
5. **Vibrant Language Use**: Include regional dialects or slang, with a focus on those that are less frequently used.
6. **Varied Parts of Speech**: Mix nouns, verbs, adjectives, and adverbs, especially those that are less common in trisociation contexts.
7. **Sensory Exploration**: Select sensory words that are vivid yet not the usual go-to options.
8. **Abstract and Tangible Fusion**: Blend abstract concepts with concrete items in unconventional ways.
9. **Language Family Diversity**: Draw from a wide array of language families, especially focusing on those less represented in common discourse.
10. **Creative Figurative Language**: Use metaphors, similes, and symbolic language, avoiding clichés and common expressions.
11. **Wide Emotional Range**: Cover a spectrum of emotions with less typical word choices.
12. **Temporal and Cultural Fusion**: Connect different eras and cultures, going beyond well-trodden paths.
13. **Lesser-Known Language Exploration**: Seek out words from lesser-known languages and dialects.
14. **Balanced Elements**: Ensure that each word contributes equally to the trisociation, without any overshadowing the others.
15. **Semantic Uniqueness**: Ensure that each trisociation stands out in its conceptual composition, distinct from typical combinations.
16. **Language Identification**: In each trisociation, identify the languages of the chosen words. This highlights the linguistic diversity and encourages the exploration of a variety of languages, including lesser-known ones.

---

Task:

Generate {n} Trisociations adhering to the above guidelines. Each trisociation should be a creative blend of three semantically distant nouns, showcasing linguistic innovation and conceptual uniqueness. 
The words in the exclusion list should not be part of any trisociation. No additional comments, explanations or elaborations beyond the trisociation itself.

Output Format :

1. Word (language) - Word (language) - Word (language)
2. Word (language) - Word (language) - Word (language)
...

"""