In [145]:
import json

from openai import OpenAI
import pandas as pd
import pronouncing
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

client = OpenAI()
sentiment_pipeline = pipeline("sentiment-analysis")
embeddings_model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [87]:
llama3_8b_quant = pd.read_csv('data/poem_responses_llama3:8b-instruct-q6_K.csv')
llama3_8b = pd.read_csv('data/poem_responses_meta-llama-3-8b-instruct.csv')
llama3_70b = pd.read_csv('data/poem_responses_meta-llama-3-70b-instruct.csv')
gpt_35 = pd.read_csv('data/poem_responses_gpt-3.5-turbo.csv')
gpt_4o = pd.read_csv('data/poem_responses_gpt-4o.csv')

In [88]:
# Average Length Heuristic
def calculate_avg_length(df):
    return int(df['poem'].str.len().mean())

print(f'Llama3 8B Quantized: {calculate_avg_length(llama3_8b_quant)}')
print(f'Llama3 8B: {calculate_avg_length(llama3_8b)}')
print(f'Llama3 70B: {calculate_avg_length(llama3_70b)}')
print(f'GPT-3.5-Turbo: {calculate_avg_length(gpt_35)}')
print(f'GPT-4o: {calculate_avg_length(gpt_4o)}')

Llama3 8B Quantized: 948
Llama3 8B: 873
Llama3 70B: 954
GPT-3.5-Turbo: 657
GPT-4-Turbo: 1006


In [116]:
# Pct Rhyming Heuristic
def calculate_rhyming_fct(poem):
    stanzas = poem.split('\n\n')
    stanzas = [stanza for stanza in stanzas if len(stanza.split('\n')) > 1]
    
    num_rhyming_stanzas = 0
    for stanza in stanzas:
        lines = stanza.split('\n')
        end_words = [line.split(' ')[-1].strip('.?!"\',') for line in lines]
        found_rhyme = False
        for i in range(len(end_words)):
            for j in range(i + 1, len(end_words)):
                found_rhyme = True if found_rhyme or (end_words[j] in pronouncing.rhymes(end_words[i])) else False
                
        if found_rhyme:
            num_rhyming_stanzas += 1
            
    return num_rhyming_stanzas / len(stanzas)

print(f"Llama3 8B Quantized: {int(100 * llama3_8b_quant['poem'].apply(calculate_rhyming_fct).mean())}%")
print(f"Llama3 8B : {int(100 * llama3_8b['poem'].apply(calculate_rhyming_fct).mean())}%")
print(f"Llama3 70B: {int(100 *llama3_70b['poem'].apply(calculate_rhyming_fct).mean())}%")
print(f"GPT-3.5-Turbo: {int(100 * gpt_35['poem'].apply(calculate_rhyming_fct).mean())}%")
print(f"GPT-4o: {int(100 * gpt_4o['poem'].apply(calculate_rhyming_fct).mean())}%")

Llama3 8B Quantized: 96%
Llama3 8B : 97%
Llama3 70B: 97%
GPT-3.5-Turbo: 89%
GPT-4o: 94%


In [97]:
# Sentiment Analysis Heuristic
def has_positive_sentiment(poem):
    sentiment = sentiment_pipeline(poem)[0]
    return True if sentiment['label'] == 'POSITIVE' else False

print(f"Llama3 8B Quantized: {int(100 * llama3_8b_quant['poem'].apply(has_positive_sentiment).mean())}%")
print(f"Llama3 8B: {int(100 * llama3_8b['poem'].apply(has_positive_sentiment).mean())}%")
print(f"Llama3 70B: {int(100 * llama3_70b['poem'].apply(has_positive_sentiment).mean())}%")
print(f"GPT-3.5-Turbo: {int(100 * gpt_35['poem'].apply(has_positive_sentiment).mean())}%")
print(f"GPT-4o: {int(100 * gpt_4o['poem'].apply(has_positive_sentiment).mean())}%")

Llama3 8B Quantized: 85%
Llama3 8B: 84%
Llama3 70B: 90%
GPT-3.5-Turbo: 94%
GPT-4-Turbo: 96%


In [132]:
# Diversity Heuristic
def calculate_avg_cos_sim(poems):
    similarities = list()
    for i in range(len(poems)):
        for j in range(i + 1, len(poems)):
            embeddings1 = embeddings_model.encode(poems[i])
            embeddings2 = embeddings_model.encode(poems[j])
            similarity = util.pytorch_cos_sim(embeddings1, embeddings2).numpy()[0][0]
            similarities.append(similarity)
            
    return sum(similarities) / len(similarities)

print(f"Llama3 8B Quantized: {round(calculate_avg_cos_sim(llama3_8b_quant['poem']), 2)}")
print(f"Llama3 8B: {round(calculate_avg_cos_sim(llama3_8b['poem']), 2)}")
print(f"Llama3 70B: {round(calculate_avg_cos_sim(llama3_70b['poem']), 2)}")
print(f"GPT-3.5-Turbo: {round(calculate_avg_cos_sim(gpt_35['poem']), 2)}")
print(f"GPT-4o: {round(calculate_avg_cos_sim(gpt_4o['poem']), 2)}")

Llama3 8B Quantized: 0.49
Llama3 8B: 0.49
Llama3 70B: 0.48
GPT-3.5-Turbo: 0.47
GPT-4-Turbo: 0.48


In [152]:
# LLM evaluation
system_message = '''You are professional poet responsible for assessing the quality of AI generated poems.

Score each poem on a scale of 0 to 10, where 10 represents the best possible poem.

Scoring Guidelines:
- Is the poem original?
- Does the poem contain beauty, power, education or entertainment?
- is the message of the poem clear? Is it a good message, or is it of little value to anyone?
- Is the poem clear in its expression? Does it maintain coherence throughout?
- If the poem is written in rhyming verse, then it should be rated according to how well the rhymes fit, not only with each other, but with the flow and the intended nuance of meaning the verse demands.
- What form does the poem take? Is it a sonnet, free verse, haiku, etc.? How does the form contribute to the poem's impact?
- Does the poet us the best possible choice of words in the poem? A person can ball, cry, sob, whimper, and shed tears, but which term would best fit the mood the poet is trying to convey?

Think through your reasoning step-by-step and explain your reasoning. Steps for Judging a Poem:
1. Read the Poem Multiple Times: Read it aloud and silently to capture both the meaning and the sound.
2. Take Notes: Jot down initial impressions, notable phrases, and any questions that arise.
3. Analyze the Elements: Break down the poem into its components (content, structure, language, sound).
4. Reflect on Your Experience: Consider your emotional response and personal connection to the poem.

The last line in your response MUST be a json object {"score": XXX}, where XXX is the score you are giving the response.'''

def evaluate_poems(poems):
    scores = list()
    for poem in poems:
        poem_evaluated = False
        seed=1
        while not poem_evaluated:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": poem}
                ],
                temperature=0,
                seed=seed,
            )
            try:
                response = response.choices[0].message.content
                score = int(json.loads(response.split('\n')[-1])['score'])
                if score < 0 or score > 10:
                    seed += 1
                    continue
                    
                scores.append(score)
                poem_evaluated = True
            except json.JSONDecodeError:
                seed += 1

    return sum(scores) / len(scores)

print(f"Llama3 8B Quantized: {round(evaluate_poems(llama3_8b_quant['poem']), 2)}")
print(f"Llama3 8B: {round(evaluate_poems(llama3_8b['poem']), 2)}")
print(f"Llama3 70B: {round(evaluate_poems(llama3_70b['poem']), 2)}")
print(f"GPT-3.5-Turbo: {round(evaluate_poems(gpt_35['poem']), 2)}")
print(f"GPT-4o: {round(evaluate_poems(gpt_4o['poem']), 2)}")

Llama3 8B Quantized: 8.62
Llama3 8B: 8.63
Llama3 70B: 8.71
GPT-3.5-Turbo: 8.36
GPT-4o: 8.84
