In [None]:
# Import necessary libraries
from datasets import load_dataset
import numpy as np
from sklearn.metrics import f1_score


from iesta.data.iesta_data import IESTAData, LABELS
from iesta.data.huggingface_loader import IESTAHuggingFace

import os

os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"

In [None]:
def get_data(ideology: str):
    data_object = IESTAData(
        ideology=ideology, keep_labels=LABELS.EFF_INEFF
    )
    huggingface_dataset = IESTAHuggingFace(data_object)

    dataset_name = huggingface_dataset.get_dataset_name(
        is_for_style_classifier=True
    )
    assert f"notaphoenix/debateorg_w_effect_for_{ideology}_subset" == dataset_name
    return load_dataset(dataset_name, split='test')

In [None]:
# Load the test dataset
# Replace 'your_dataset_name' with the actual dataset name



In [None]:
def get_random(ideology):
    dataset = load_dataset(f"notaphoenix/debateorg_w_effect_for_{ideology}_subset", use_auth_token=True, split="test")
    seeds = [93213, 60507, 1080, 87035, 69296,86631, 1764, 96728, 19548, 5594]
    random_results = []
    for seed in seeds:
        np.random.seed(seed)
        # Generate random predictions
        # Modify the range of np.random.randint if your dataset has more than two classes
        random_predictions = np.random.randint(2, size=len(dataset), )

        # Extract the true labels
        # Ensure 'label' matches the label field in your dataset
        true_labels = [example['label'] for example in dataset]

        # Calculate and print the Macro F1 Score
        macro_f1 = f1_score(true_labels, random_predictions, average='macro')
        print("Macro F1 Score:", round(macro_f1, 2))
        random_results.append(macro_f1)
    return random_results

In [None]:
np.array(random_results).mean()

In [None]:
liberal_random = get_random("liberal")
conservative_random = get_random("conservative")

In [None]:
len(conservative_random)

In [None]:
from scipy import stats
liberal_lf = [0.49, 0.59, 0.60, 0.59, 0.60, 0.53, 0.57, 0.57, 0.58, 0.57]
conservative_lf = [0.5, 0.56, 0.58, 0.57, 0.59, 0.55, 0.52, 0.54,0.53, 0.58]
# Example data
random_scores = np.array(conservative_random)  # Replace with your random model scores
fine_tuned_scores = np.array(conservative_lf)  # Replace with your fine-tuned model scores

# Wilcoxon signed-rank test
stat, p = stats.wilcoxon(random_scores, fine_tuned_scores)
print(f"Wilcoxon signed-rank test: Statistic={stat}, P-value={p}")

# Effect Size (Cohen's d)
mean_difference = np.mean(fine_tuned_scores - random_scores)
pooled_std_dev = np.sqrt((np.std(random_scores, ddof=1)**2 + np.std(fine_tuned_scores, ddof=1)**2) / 2)
cohen_d = mean_difference / pooled_std_dev
print(f"Effect Size (Cohen's d): {cohen_d}")

In [7]:

import openai
from datasets import load_dataset
from sklearn.metrics import f1_score
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
# Load the test dataset
from langchain import PromptTemplate, LLMChain


# OpenAI API Key
openai.api_key = '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'

def classify_text( text, ideology):
    
    # Formulate the prompt for zero-shot classification
    # prompt = f"Classify the following text into these categories {labels}: {text}"
    system_message_prompt = SystemMessagePromptTemplate.from_template(
            f"""You are a helfpful text classifier. 
            You will be provided with an argument from a debate. 
            Pretend you are an American {ideology} and answer based on this ideology.
            
            Note that the arguments provided are written by non-{ideology} writers.
            
            """
        )
    human_message_prompt = HumanMessagePromptTemplate.from_template(
        """Based on your political ideology, score the following argument from 0 (ineffective) to 1 (effective). 
        
        Return a json with two keys: 
        
        - 'effectiveness': 0 (ineffective) to 1 (effective),
        - 'binary': a label with a value 1 for 'effective'  and 0 for 'ineffective'.
        - 'reason' stating your reasoning for the score:
        
        {text}
        """
    )

    prompt = ChatPromptTemplate.from_messages(
        [system_message_prompt, human_message_prompt]
    )
    llm = ChatOpenAI(
        model_name="gpt-3.5-turbo", temperature=0, max_tokens=1024
        )
    
    chain = LLMChain(llm=llm, prompt= prompt)
    return chain.run(text=text)
    # Send the request to OpenAI API
import ast 


def get_chatgpt_predictions(ideology):
    dataset = load_dataset(f"notaphoenix/debateorg_w_effect_for_{ideology}_subset",  split='test')
    results = []
    predictions =  []
    true_labels = []
    for item in dataset:
        res = ast.literal_eval(classify_text( item['text'], ideology))
        res["idx"]  = item["idx"]
        res["reference"] = item["label"]
        results.append(res)
        predictions.append(res["binary"])
        true_labels.append(item["label"])
        print(res)


    # Calculate Macro F1 Score
    macro_f1 = f1_score(true_labels, predictions, average='macro')
    print("Macro F1 Score:", macro_f1)
    return results, macro_f1#, predictions, true_labels


In [8]:
res

NameError: name 'res' is not defined

In [9]:
liberal_chatgpt_res, liberal_macrof1 =get_chatgpt_predictions("liberal") 

Found cached dataset parquet (C:/Users/elba_ro/.cache/huggingface/datasets/notaphoenix___parquet/notaphoenix--debateorg_w_effect_for_liberal_subset-59e495372902f47c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


{'effectiveness': 0.8, 'binary': 1, 'reason': 'The argument effectively highlights the limitations of one-round debates and emphasizes the importance of allowing both sides to present and refute arguments. However, it could be strengthened by providing specific examples or evidence to support the claim that one-round debates are not effective.', 'idx': 110, 'reference': 0}
{'effectiveness': 0.2, 'binary': 0, 'reason': 'The argument is ineffective because it lacks coherence and fails to provide any substantive reasoning or evidence to support the claim that one-round debates should be allowed. Additionally, the use of derogatory language and personal anecdotes detract from the credibility of the argument.', 'idx': 111, 'reference': 0}
{'effectiveness': 0, 'binary': 0, 'reason': 'The argument is not effective because it is based on false premises and lacks any logical reasoning or evidence to support the claim that 1=2. Additionally, the argument does not provide any context or backgroun

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


{'effectiveness': 0.9, 'binary': 1, 'reason': "The argument effectively highlights the importance of a national minimum wage in ensuring fair wages for workers who may not have a choice in accepting low-paying jobs. It emphasizes the need to support individuals and families below the poverty line and addresses the potential exploitation by some employers. The argument effectively counters the opponent's points and concludes with a strong statement in favor of a national minimum wage.", 'idx': 35766, 'reference': 0}
{'effectiveness': 0.2, 'binary': 0, 'reason': 'The argument is not effective because it does not provide any substantive points or evidence to support its position. It simply states the rules for the debate without presenting any persuasive arguments or addressing the topic at hand.', 'idx': 35959, 'reference': 0}
{'effectiveness': 0.2, 'binary': 0, 'reason': "The argument is emotionally charged and appeals to the value of protecting innocent life. However, it lacks logical 

InvalidRequestError: Sorry! We've encountered an issue with repetitive patterns in your prompt. Please try again with a different prompt.

In [None]:
cons_chatgpt_res, cons_macrof1 =get_chatgpt_predictions("conservative") 