In [1]:
# imports

from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
import os
import time
import pandas as pd
from collections import defaultdict
import re

In [2]:
# load API key

# Location of the .env file that defines OPENAI_API_KEY. Set the DOTENV_PATH
# environment variable to point at a different file on another machine; the
# fallback is the original hard-coded location, so existing setups keep working.
dotenv_path = Path(os.getenv("DOTENV_PATH", r"C:\Storage\python_projects\ashvin\.env"))
load_dotenv(dotenv_path=dotenv_path)

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail here with an actionable message instead of later, when the client is
# created or the first request is sent.
if not OPENAI_API_KEY:
    raise RuntimeError(
        f"OPENAI_API_KEY is not set; define it in the environment or in {dotenv_path}"
    )

In [3]:
# main constants

# Model name sent with every chat-completion request made by perform_run.
GPT_MODEL_TEXT_ALIAS = "gpt-4-turbo-preview" # alias; presumably resolves to the latest GPT-4 Turbo preview model (confirm against the provider's model list)
# Not referenced by any cell visible in this notebook; presumably a pinned
# snapshot name kept for reproducible reruns. TODO confirm or remove.
GPT_MODEL_TEXT = "gpt-4-0125-preview"


In [4]:
# variations
# NOTE: `n` is interpolated into base_prompt at the moment that cell runs, so
# the base-prompt cell must be re-run after changing `n` for it to take effect.
n = 100 # number of trisociations requested in a run
runs = 10 # number of runs (perform_run is called once per run)

In [5]:
# base prompt
# Built once from the current value of `n`. The numbered "Word - Word - Word"
# layout requested here is the layout per_run_aggregate_summary parses
# (leading "<number>." prefix, " - " separators).
base_prompt = f"""
Generate {n} trisociations in the format below. No additional comments or elaborations. Avoid duplicate words across trisociations.
1. Word - Word - Word
2. Word - Word - Word
...
"""

In [6]:
#instantiate client
# No api_key argument is passed, so OPENAI_API_KEY (read in the key-loading cell)
# is not used here directly; the SDK is expected to read the key from the
# environment that load_dotenv populated. Confirm for the SDK version in use.
client = OpenAI()

In [7]:
# Function to perform a single run and collect data
def perform_run(prompt_text):
    """Send one chat-completion request and collect its text, timing and token usage.

    Args:
        prompt_text: Prompt string. It is sent as the single ``system`` message
            of the request; no ``user`` message is added.

    Returns:
        dict with the keys:
            ``execution_time``: elapsed seconds around the API call (float).
            ``prompt_tokens``, ``completion_tokens``, ``total_tokens``: values
                taken from ``completion.usage``.
            ``trisociations``: message content of the first returned choice.
    """
    # perf_counter is monotonic: the measured duration cannot become negative
    # or jump if the system clock is adjusted while the request is in flight.
    start_time = time.perf_counter()
    completion = client.chat.completions.create(
        model=GPT_MODEL_TEXT_ALIAS,
        messages=[
            {"role": "system", "content": prompt_text},
        ]
    )
    execution_time = time.perf_counter() - start_time

    usage = completion.usage

    return {
        'execution_time': execution_time,
        'prompt_tokens': usage.prompt_tokens,
        'completion_tokens': usage.completion_tokens,
        'total_tokens': usage.total_tokens,
        'trisociations': completion.choices[0].message.content
    }

In [8]:
# List to store results from all runs
# (re-running this cell resets the list, discarding any earlier results)
results = []

# Perform multiple runs
# Each iteration calls perform_run once with the same base_prompt; the returned
# dict (timing, token counts, response text) is appended in run order.
for _ in range(runs):
    result = perform_run(base_prompt)
    results.append(result)

# Additionally, print each run's trisociations
# (the raw response text, with runs numbered from 1 in the order they were made)
for i, r in enumerate(results, 1):
    print(f"Run {i} trisociations:\n{r['trisociations']}\n")

Run 1 trisociations:
1. Apple - Book - Wind
2. Mountain - Keyboard - Symphony
3. Ocean - Lamp - Theory
4. Tree - Database - Kettle
5. Cloud - Desk - Violin
6. Moon - Wallet - Puzzle
7. Star - Chair - Equation
8. Sun - Cup - Guitar
9. City - Spoon - Opera
10. River - Shoe - Novel
11. Sky - Phone - Ballet
12. Forest - Pen - Sculpture
13. Grass - Computer - Sonata
14. Beach - Bag - Poem
15. Flower - Glasses - Symphony
16. Bird - Watch - Play
17. Fish - Jacket - Photograph
18. Earth - Hat - Movie
19. Comet - Socks - Dance
20. Galaxy - Belt - Painting
21. Planet - Scarf - Drawing
22. Sunflower - Boots - Album
23. Rain - Tie - Fiction
24. Snow - Necklace - Lyric
25. Ice - Earrings - Script
26. Thunder - Bracelet - Comedy
27. Lightning - Ring - Documentary
28. Wind - Anklet - Fantasy
29. Fog - Brooch - Thriller
30. Hail - Gloves - Mystery
31. Volcano - Scarves - Memoir
32. Island - Candle - Essay
33. Desert - Blanket - Biography
34. Canyon - Pillow - Satire
35. Valley - Sheet - Periodical
36.

In [9]:
# function to calculate and display per run time and token metrics
def calculate_and_display_metrics(results):
    # Prepare data for DataFrame
    data = [{
        'Execution Time': result['execution_time'],
        'Prompt Tokens': result['prompt_tokens'],
        'Completion Tokens': result['completion_tokens'],
        'Total Tokens': result['total_tokens']
    } for result in results]
    
    # Create DataFrame
    df_metrics = pd.DataFrame(data)
    
    # Set custom index names (Run 1, Run 2, ...)
    run_indices = [f'Run {i+1}' for i in range(len(results))]
    df_metrics.index = run_indices

    # Apply styling for better readability
    styled_df = df_metrics.style.format({
        'Execution Time': "{:.2f} seconds",
        'Prompt Tokens': "{:.0f}",
        'Completion Tokens': "{:.0f}",
        'Total Tokens': "{:.0f}"
    }).background_gradient(cmap='viridis', subset=['Total Tokens'])

    return styled_df



# Note: despite its name, metrics_df holds the Styler returned by
# calculate_and_display_metrics, not a plain DataFrame.
metrics_df = calculate_and_display_metrics(results)
metrics_df

Unnamed: 0,Execution Time,Prompt Tokens,Completion Tokens,Total Tokens
Run 1,37.90 seconds,53,961,1014
Run 2,36.05 seconds,53,1006,1059
Run 3,28.71 seconds,53,965,1018
Run 4,64.10 seconds,53,1144,1197
Run 5,34.80 seconds,53,1032,1085
Run 6,40.20 seconds,53,997,1050
Run 7,32.64 seconds,53,839,892
Run 8,48.54 seconds,53,1035,1088
Run 9,44.17 seconds,53,1003,1056
Run 10,37.76 seconds,53,973,1026


In [10]:
# function to get duplicate stats per run
def per_run_aggregate_summary(results):
    detailed_data = []
    for run_index, result in enumerate(results, start=1):
        trisociations = result['trisociations'].strip()
        trisociation_list = trisociations.split('\n')

        for trisociation_index, trisociation in enumerate(trisociation_list, start=1):
            words = re.sub(r'^\d+\.\s*', '', trisociation).replace(' - ', ' ').split()
            for word in words:
                detailed_data.append({'run': run_index, 'word': word})

    df = pd.DataFrame(detailed_data)

    summary_stats = []
    unique_runs = df['run'].unique()
    
    for run in sorted(unique_runs):
        run_df = df[df['run'] == run]
        
        total_words = run_df.shape[0]
        total_unique_words = run_df['word'].nunique()
        total_duplicates = run_df[run_df.duplicated('word', keep=False)]['word'].nunique()
        
        if not run_df[run_df.duplicated('word', keep=False)].empty:
            mode_duplicate = run_df[run_df.duplicated('word', keep=False)]['word'].value_counts().idxmax()
            max_dup_freq = run_df[run_df.duplicated('word', keep=False)]['word'].value_counts().max()
            modal_dup_freq = run_df[run_df.duplicated('word', keep=False)]['word'].value_counts().mode()[0]
        else:
            mode_duplicate = 'None'
            max_dup_freq = 0
            modal_dup_freq = 0
        
        min_dup_freq = run_df[run_df.duplicated('word', keep=False)]['word'].value_counts().min() if not run_df[run_df.duplicated('word', keep=False)].empty else 0
        
        summary_stats.append({
            'Total Words': total_words,
            'Total Unique Words': total_unique_words,
            'Total Duplicates': total_duplicates,
            'Max Dup Word': mode_duplicate,
            'Max Dup Freq': max_dup_freq,
            'Modal Dup Freq': modal_dup_freq,
            'Min Dup Freq': min_dup_freq
        })
    
    summary_df = pd.DataFrame(summary_stats, index=[f'Run {run}' for run in sorted(unique_runs)])
    
    # Apply styling
    styled_df = summary_df.style\
        .format({'Total Words': "{:}", 'Total Unique Words': "{:}", 'Total Duplicates': "{:}", 
                 'Max Dup Word': "{}", 'Max Dup Freq': "{:}", 'Modal Dup Freq': "{:}", 'Min Dup Freq': "{:}"})\
        .set_table_styles([{
            'selector': 'th',
            'props': [
                ('background-color', '#f4f4f4'),
                ('color', '#6d6d6d'),
                ('font-weight', 'bold')
            ]}])\
        .set_caption("Summary Statistics per Run")\
        .set_properties(**{'text-align': 'left'})

    return styled_df

# display styled DataFrame
# (per_run_aggregate_summary returns a pandas Styler; leaving it as the cell's
# last expression is what renders the table)
per_run_aggregate_summary_styled = per_run_aggregate_summary(results)
per_run_aggregate_summary_styled


Unnamed: 0,Total Words,Total Unique Words,Total Duplicates,Max Dup Word,Max Dup Freq,Modal Dup Freq,Min Dup Freq
Run 1,337,304,23,Fiction,5,2,2
Run 2,304,297,7,Telescope,2,2,2
Run 3,313,265,38,Keyboard,5,2,2
Run 4,300,152,88,Chimera,5,3,2
Run 5,342,308,29,Star,3,2,2
Run 6,332,309,19,Desk,4,2,2
Run 7,300,265,30,Fish,3,2,2
Run 8,301,280,21,Symphony,2,2,2
Run 9,301,271,27,Lantern,3,2,2
Run 10,302,288,14,Ice,2,2,2
