In [1]:
import pandas as pd
import nltk
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import openai
import json
import time
import tiktoken

# Load data

In [2]:
# Define dataset file paths
datasets = {
    "LIAR Train": "train.tsv",
    "LIAR Test": "test.tsv",
    "LIAR Valid": "valid.tsv",
    "Gossip Fake": "gossipcop_fake.csv",
    "Gossip Real": "gossipcop_real.csv",
    "Political Fake": "politifact_fake.csv",
    "Political Real": "politifact_real.csv"
}

# Define LIAR dataset column names
liar_columns = [
    "id", "label", "text", "subjects", "speaker", "job_title", "state",
    "party_affiliation", "barely_true_count", "false_count", "half_true_count",
    "mostly_true_count", "pants_fire_count", "context"
]

# Load datasets into a dictionary 
dataframes = {}
for name, path in datasets.items():
    sep = "\t" if path.endswith(".tsv") else ","  # Detect separator
    columns = liar_columns if "LIAR" in name else None  # Assign columns only for LIAR datasets
    dataframes[name] = pd.read_csv(path, sep=sep, header=None if columns else "infer", names=columns)

# Print first few rows of each dataset
for name, df in dataframes.items():
    print(f"\n {name} Data:")
    print(df.columns)



 LIAR Train Data:
Index(['id', 'label', 'text', 'subjects', 'speaker', 'job_title', 'state',
       'party_affiliation', 'barely_true_count', 'false_count',
       'half_true_count', 'mostly_true_count', 'pants_fire_count', 'context'],
      dtype='object')

 LIAR Test Data:
Index(['id', 'label', 'text', 'subjects', 'speaker', 'job_title', 'state',
       'party_affiliation', 'barely_true_count', 'false_count',
       'half_true_count', 'mostly_true_count', 'pants_fire_count', 'context'],
      dtype='object')

 LIAR Valid Data:
Index(['id', 'label', 'text', 'subjects', 'speaker', 'job_title', 'state',
       'party_affiliation', 'barely_true_count', 'false_count',
       'half_true_count', 'mostly_true_count', 'pants_fire_count', 'context'],
      dtype='object')

 Gossip Fake Data:
Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')

 Gossip Real Data:
Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')

 Political Fake Data:
Index(['id', 'news_url', 'title',

# Data Preprocessing: Standardizing Dataset Structure

In [3]:

#  retain essential columns for the liar dataset
liar_columns = ["text", "label", "subjects", "context", "speaker", "party_affiliation","barely_true_count", "false_count", "half_true_count",
    "mostly_true_count", "pants_fire_count", "state"]
for key in ["LIAR Train", "LIAR Test", "LIAR Valid"]:
    dataframes[key] = dataframes[key][liar_columns]

#  manually add 'label' column before selecting other columns
label_mapping = {
    "Gossip Fake": "fake", "Gossip Real": "real",
    "Political Fake": "fake", "Political Real": "real"
}

for key, label in label_mapping.items():
    # first, create the label column
    dataframes[key]["label"] = label  
    
    # check if 'title' and 'news_url' exist before renaming
    expected_columns = ["title", "news_url", "label"]
    available_columns = [col for col in expected_columns if col in dataframes[key].columns]
    
    if "title" in available_columns:
        dataframes[key] = dataframes[key][available_columns].rename(columns={"title": "text"})
    else:
        print(f" Warning: Column 'title' not found in {key}. Available columns: {dataframes[key].columns}")

# #  ensure all datasets have a consistent structure
# for name, df in dataframes.items():
#     print(f"\n {name} Data (After Filtering):")
#     print(df.head())


In [4]:
# check the size of the datasets
for name, df in dataframes.items():
    print(f"\n {name} Size of the datasets: {df.shape}")



 LIAR Train Size of the datasets: (10240, 12)

 LIAR Test Size of the datasets: (1267, 12)

 LIAR Valid Size of the datasets: (1284, 12)

 Gossip Fake Size of the datasets: (5323, 3)

 Gossip Real Size of the datasets: (16817, 3)

 Political Fake Size of the datasets: (432, 3)

 Political Real Size of the datasets: (624, 3)


In [5]:
# check missing value
for name, df in dataframes.items():
    missing_values = df.isnull().sum()
    print(f"\n {name} missing value:")
    print(missing_values)


 LIAR Train missing value:
text                    0
label                   0
subjects                2
context               102
speaker                 2
party_affiliation       2
barely_true_count       2
false_count             2
half_true_count         2
mostly_true_count       2
pants_fire_count        2
state                2210
dtype: int64

 LIAR Test missing value:
text                   0
label                  0
subjects               0
context               17
speaker                0
party_affiliation      0
barely_true_count      0
false_count            0
half_true_count        0
mostly_true_count      0
pants_fire_count       0
state                262
dtype: int64

 LIAR Valid missing value:
text                   0
label                  0
subjects               0
context               12
speaker                0
party_affiliation      0
barely_true_count      0
false_count            0
half_true_count        0
mostly_true_count      0
pants_fire_count       0
stat

In [6]:
# calculate the percentage of missing values for each dataset
for name, df in dataframes.items():
    missing_percentage = df.isnull().sum() / len(df) * 100  # Compute the missing value percentage    print(f"\n {name} Missing Value Percentage (%):")
    print(missing_percentage)

text                  0.000000
label                 0.000000
subjects              0.019531
context               0.996094
speaker               0.019531
party_affiliation     0.019531
barely_true_count     0.019531
false_count           0.019531
half_true_count       0.019531
mostly_true_count     0.019531
pants_fire_count      0.019531
state                21.582031
dtype: float64
text                  0.000000
label                 0.000000
subjects              0.000000
context               1.341752
speaker               0.000000
party_affiliation     0.000000
barely_true_count     0.000000
false_count           0.000000
half_true_count       0.000000
mostly_true_count     0.000000
pants_fire_count      0.000000
state                20.678769
dtype: float64
text                  0.000000
label                 0.000000
subjects              0.000000
context               0.934579
speaker               0.000000
party_affiliation     0.000000
barely_true_count     0.000000
false_cou

In [7]:
# Remove rows with missing values in specified columns for liar dataset
for key in ["LIAR Train", "LIAR Test", "LIAR Valid"]:
    initial_rows = dataframes[key].shape[0]
    dataframes[key].dropna(subset=["subjects", "speaker", "party_affiliation", "context"], inplace=True)
    removed_rows = initial_rows - dataframes[key].shape[0]
    print(f"{key} - Rows removed: {removed_rows}")
    dataframes[key]["state"].fillna("Unknown", inplace=True)

# Remove rows with missing values in news_url for gossipcop and political dataset
for key in ["Gossip Fake", "Gossip Real", "Political Fake", "Political Real"]:
    initial_rows = dataframes[key].shape[0]
    dataframes[key].dropna(subset=["news_url"], inplace=True)
    removed_rows = initial_rows - dataframes[key].shape[0]
    print(f"{key} - Rows removed: {removed_rows}")

# Make sure missing values are fixed
for name, df in dataframes.items():
    print(f"\n{name} Missing Values After Fixing:")
    print(df.isnull().sum())


LIAR Train - Rows removed: 102
LIAR Test - Rows removed: 17
LIAR Valid - Rows removed: 12
Gossip Fake - Rows removed: 256
Gossip Real - Rows removed: 13
Political Fake - Rows removed: 4
Political Real - Rows removed: 57

LIAR Train Missing Values After Fixing:
text                 0
label                0
subjects             0
context              0
speaker              0
party_affiliation    0
barely_true_count    0
false_count          0
half_true_count      0
mostly_true_count    0
pants_fire_count     0
state                0
dtype: int64

LIAR Test Missing Values After Fixing:
text                 0
label                0
subjects             0
context              0
speaker              0
party_affiliation    0
barely_true_count    0
false_count          0
half_true_count      0
mostly_true_count    0
pants_fire_count     0
state                0
dtype: int64

LIAR Valid Missing Values After Fixing:
text                 0
label                0
subjects             0
context    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframes[key]["state"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframes[key]["state"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

# Remove punctuation and stopwords

In [8]:
# Define a function to remove punctuation and stopwords
def clean_text(text):
    text = text.lower()
    stop_words = set(stopwords.words("english"))  # Load the stopword list
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    words = text.split()  # Split the sentence into words
    words = [word for word in words if word.lower() not in stop_words]  # Filter out stopwords
    return " ".join(words)  # Reassemble the cleaned words into a sentence

# Apply clean_text() to all datasets
for key in dataframes.keys():
    dataframes[key]["text"] = dataframes[key]["text"].apply(clean_text)


# ChatGPT

In [9]:
client = openai.OpenAI(api_key="YOUR API KEY")  # Securely store API key

### Preparing train/test/validation datasets and standardizing labels

In [10]:
# LIAR dataset
liar_train = dataframes["LIAR Train"]
liar_test = dataframes["LIAR Test"]
liar_val = dataframes["LIAR Valid"]
# FakeNewsNet data
gossipcop_fake = dataframes["Gossip Fake"]
gossipcop_real = dataframes["Gossip Real"]
politifact_fake = dataframes["Political Fake"]
politifact_real = dataframes["Political Real"]

label_mapping_binary = {"fake": 0, "real": 1}
fake_news_df = pd.concat([gossipcop_fake, gossipcop_real, politifact_fake, politifact_real])
fake_news_df['label'] = fake_news_df['label'].map(label_mapping_binary)

# Standardize labels
label_mapping_multi = {"pants-fire": 0, "false": 1, "barely-true": 2, "half-true": 3, "mostly-true": 4, "true": 5}
liar_train['label'] = liar_train['label'].map(label_mapping_multi)
liar_test['label'] = liar_test['label'].map(label_mapping_multi)
liar_val['label'] = liar_val['label'].map(label_mapping_multi)


# Train-test-validation split for Fake News Net dataset
fake_news_train_df, temp_data = train_test_split(
        fake_news_df,
        test_size=0.3,
        random_state=42,
        stratify=fake_news_df['label']
    )

fake_news_val_data, fake_news_test_data = train_test_split(
        temp_data,
        test_size=0.5,
        random_state=42,
        stratify=temp_data['label']
    )


### Helper Functions for Fine Tuning and Token Counting

In [11]:
encoding = tiktoken.get_encoding("cl100k_base")

def count_tokens(data):
    total_tokens = 0
    for entry in data:
        for key, value in entry.items():
            if isinstance(value, str):
                tokens = encoding.encode(value)  
                total_tokens += len(tokens)
            else:
                tokens = encoding.encode(str(value))
                total_tokens += len(tokens)
    return total_tokens

# Function to calculate the cost based on the number of tokens for GPT 3.5 ( Assumed cost)
def calculate_cost(num_tokens, model="gpt-3.5-turbo"):
    cost_per_1000_tokens = 0.03  
    cost = (num_tokens / 1000) * cost_per_1000_tokens
    return cost

# Function to prepare data for fine-tuning
def prepare_fine_tune_data(df, dataset_type):    
    fine_tune_data = []
    if dataset_type == "LIAR":
        label_map = {0: "pants-fire", 1: "false", 2: "barely-true", 3: "half-true", 4: "mostly-true", 5: "true"}
    else:  
        label_map = {0: "fake", 1: "real"}

    for _, row in df.iterrows():
        label = label_map.get(row['label'], "false" if dataset_type == "LIAR" else "fake")
        system_message = "You are a fact-checking assistant that labels news with a single word."
        if dataset_type == "LIAR":
            user_message = f"""Classify as pants-fire/false/barely-true/half-true/mostly-true/true:
            Statement: {row['text']}
            Speaker: {row.get('speaker', 'Unknown')} ({row.get('party_affiliation', 'Unknown')})
            History: PF:{row.get('pants_fire_count', 0)}, F:{row.get('false_count', 0)}, BT:{row.get('barely_true_count', 0)}, HT:{row.get('half_true_count', 0)}, MT:{row.get('mostly_true_count', 0)}"""
        else:  
            
            user_message = f"""Classify as real or fake:
        News: {row['text']}
        URL: {row.get('news_url', 'N/A')}"""
        assistant_message = label
        fine_tune_data.append({
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": assistant_message}
            ]
        })
    return fine_tune_data


# Function to fine tune GPT using a training and validation dataset 
def fine_tune_gpt(training_data, validation_data, dataset_type, base_model="gpt-3.5-turbo", epochs=4):
    """
    Fine-tune with full dataset while ensuring balanced class distribution via oversampling.
    Utilizes a separate validation dataset provided by the user.
    """
    if training_data.empty:
        raise ValueError(f"Empty training data provided for {dataset_type}")
    if validation_data.empty:
        raise ValueError(f"Empty validation data provided for {dataset_type}")

    class_counts = training_data['label'].value_counts()
    max_samples_per_class = class_counts.max()
    
    balanced_data = []
    for class_value, count in class_counts.items():
        class_data = training_data[training_data['label'] == class_value]
        oversampled_data = class_data.sample(
            n=max_samples_per_class, 
            replace=True, 
            random_state=42
        )
        balanced_data.append(oversampled_data)
    
    
    training_data = pd.concat(balanced_data).sample(frac=1, random_state=42)
    
    fine_tune_data = prepare_fine_tune_data(training_data, dataset_type)
    validation_data = prepare_fine_tune_data(validation_data, dataset_type)
    
    # Saving training data into jsonl file
    train_file_name = f"{dataset_type.lower()}_train_fine_tune_data.jsonl"
    with open(train_file_name, "w") as f:
        for entry in fine_tune_data:
            f.write(json.dumps(entry) + "\n")
    
    # Saving validation data into jsonl file
    val_file_name = f"{dataset_type.lower()}_val_fine_tune_data.jsonl"
    with open(val_file_name, "w") as f:
        for entry in validation_data:
            f.write(json.dumps(entry) + "\n")
    
    # Count tokens for cost calculation
    train_token_count = count_tokens(fine_tune_data)
    val_token_count = count_tokens(validation_data)
    total_token_count = train_token_count + val_token_count
    
    # Calculating cost
    total_cost = calculate_cost(total_token_count)
    print(f"Total tokens used: {total_token_count}")
    print(f"Estimated cost for fine-tuning: ${total_cost:.2f}")
    
    # Upload training and validation data
    try:
        with open(train_file_name, "rb") as f:
            train_file_response = client.files.create(file=f, purpose="fine-tune")
        train_file_id = train_file_response.id
        
        with open(val_file_name, "rb") as f:
            val_file_response = client.files.create(file=f, purpose="fine-tune")
        val_file_id = val_file_response.id
        
        wait_counter = 0
        while wait_counter < 60:
            train_status = client.files.retrieve(train_file_id)
            val_status = client.files.retrieve(val_file_id)
            if train_status.status == "processed" and val_status.status == "processed":
                break
            time.sleep(5)
            wait_counter += 1
        
        suffix = f"{dataset_type.lower()}_balanced_e{epochs}"
        fine_tune_response = client.fine_tuning.jobs.create(
            training_file=train_file_id,
            validation_file=val_file_id,
            model=base_model,
            hyperparameters={
                "n_epochs": epochs,
                "batch_size": 32,
                "learning_rate_multiplier": 2e-5
            },
            suffix=suffix
        )
        
        print(f"Fine-tuning job created for {dataset_type}: {fine_tune_response.id}")
        return fine_tune_response.id
    
    except Exception as e:
        print(f"Error during fine-tuning: {e}")
        if hasattr(e, 'response') and hasattr(e.response, 'json'):
            print(f"Error details: {e.response.json()}")
        return None


### Helper functions for Querying GPT

In [12]:
# Function to query GPT with retries 
def query_gpt_batch(prompts, model="gpt-3.5-turbo", temperature=0.3, max_retries=3):
    start_time = time.time()
    responses = []
    errors = 0
    
    for prompt in prompts:
        retry_count = 0
        while retry_count < max_retries:
            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": "You are a fact-checking assistant that labels news as accurately as possible. Respond with exactly one word from the allowed set of labels."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=temperature,
                    max_tokens=50,
                    presence_penalty=0.0,
                    frequency_penalty=0.0
                )
                
                prediction = response.choices[0].message.content.strip().lower()
                
                # Validate prediction against expected label sets
                if "pants-fire" in prompt or "half-true" in prompt:  # LIAR dataset
                    valid_labels = ["pants-fire", "false", "barely-true", "half-true", "mostly-true", "true"]
                    if not any(label in prediction for label in valid_labels):
                        # If invalid response, retry with more explicit instructions
                        if retry_count < max_retries - 1:
                            retry_count += 1
                            continue
                else:  
                    valid_labels = ["fake", "real"]
                    if not any(label in prediction for label in valid_labels):
                        if retry_count < max_retries - 1:
                            retry_count += 1
                            continue
                
                responses.append(prediction)
                break  
                
            except Exception as e:
                print(f"Error in API call (attempt {retry_count+1}/{max_retries}): {e}")
                retry_count += 1
                if retry_count >= max_retries:
                    print(f"All retries failed. Using fallback response.")
                    responses.append("uncertain")
                    errors += 1
                time.sleep(min(2 ** retry_count, 10))
        
    processing_time = time.time() - start_time
    if errors > 0:
        print(f"Encountered {errors} errors in batch processing")
    
    return responses, processing_time

In [13]:
# Function to get few shot examples (Atleast one example per class)
def get_few_shot_examples(df, label_key='label', num_examples=3):
    examples = []
    
    if label_key in df.columns:
        unique_labels = df[label_key].unique()
        
        for label in unique_labels:
            label_df = df[df[label_key] == label]
            if not label_df.empty:
                examples_per_class = max(1, min(1, num_examples // len(unique_labels)))
                examples.append(label_df.sample(n=min(examples_per_class, len(label_df)), random_state=42))
        
    examples_df = pd.concat(examples)
    
    return examples_df


### Helper function for generating efficient prompts

In [22]:
# Generate prompts
examples = get_few_shot_examples(liar_test, num_examples=3)

example_texts = "\n".join([
                f"Statement: '{row['text']}'\nSpeaker: {row['speaker']} ({row['party_affiliation']})\n"
                f"History: PF:{row['pants_fire_count']}, F:{row['false_count']}, BT:{row['barely_true_count']}, "
                f"HT:{row['half_true_count']}, MT:{row['mostly_true_count']} → {['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true'][row['label']]}"
                for _, row in examples.iterrows()
            ])

# Function to generate efficient prompt templates dynamically using GPT
def generate_efficient_prompt_styles(example_texts, dataset_type = "LIAR", model="gpt-4"):
    """Generate research-based efficient prompt templates dynamically using GPT."""

    if dataset_type == "LIAR":
        classification_type = "multi-class (pants-fire, false, barely-true, half-true, mostly-true, true)"
    else:
        classification_type = "binary (real/fake)"
    
    system_prompt = f"""
    You are an expert in prompt engineering and fake news detection. 
    Your task is to generate three highly efficient prompt templates for classifying news articles based on the dataset type. 
    The classification type is {classification_type}. The prompts should be concise, informative, and optimized for token efficiency and include act as.

    Here are some example news articles and their labels:
    {examples[:3]}

    Please generate three distinct prompt templates optimized for {classification_type}.
    """

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": system_prompt}]
    )

    prompts = response.choices[0].message.content.split("\n")
    return prompts



print(generate_efficient_prompt_styles(example_texts,"LIAR","ft:gpt-3.5-turbo-0125:university-edinburgh:liar-balanced-e4:BHXpZ9MA"))

['1. "Classify the news article based on its content: act as a news authenticity auditor and categorize the text as either pants-fire, false, barely-true, half-true, mostly-true, or true."', '   ', '2. "Use your expertise to categorize news articles into one of six classes: pants-fire, false, barely-true, half-true, mostly-true, or true, reflecting the degree of truthfulness in the text."', '   ', '3. "Employ your skills as a news classifier to assess and label articles as pants-fire, false, barely-true, half-true, mostly-true, or true, indicating the accuracy level of each report."']


### Helper function for various prompt templates

In [14]:
# Function to get prompt templates 
def get_prompt_templates(dataset_type, df):
    examples = get_few_shot_examples(df, num_examples=6)

    if dataset_type == "LIAR":
        if not examples.empty:
            example_texts = "\n".join([
                f"Statement: '{row['text']}'\nSpeaker: {row['speaker']} ({row['party_affiliation']})\n"
                f"History: PF:{row['pants_fire_count']}, F:{row['false_count']}, BT:{row['barely_true_count']}, "
                f"HT:{row['half_true_count']}, MT:{row['mostly_true_count']} → {['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true'][row['label']]}"
                for _, row in examples.iterrows()
            ])
            
            # Commented prompts were tried but had lower prediction accuracy
            return [
                # f"""Classify the news article based on its content: act as a news authenticity auditor 
                # and categorize the text as either pants-fire, false, barely-true, half-true, mostly-true, or true.
                # \nStatement: "{{text}}"
                # Speaker: {{speaker}} ({{party}})
                # History: PF:{{pants_fire}}, F:{{false}}, BT:{{barely_true}}, HT:{{half_true}}, MT:{{mostly_true}}
                # \nExamples:
                # {example_texts}
                # \nClassification (one word only):""",
                
                # f"""Use your expertise to categorize news articles into one of six classes: 
                # pants-fire, false, barely-true, half-true, mostly-true, or true, reflecting the degree of truthfulness in the text.
                # \nStatement: "{{text}}"
                # Speaker: {{speaker}} ({{party}})
                # History: PF:{{pants_fire}}, F:{{false}}, BT:{{barely_true}}, HT:{{half_true}}, MT:{{mostly_true}}
                # \nExamples:
                # {example_texts}
                # \nClassification (one word only):""",
                
                # f"""Employ your skills as a news classifier to assess and label articles as 
                # pants-fire, false, barely-true, half-true, mostly-true, or true, indicating the accuracy level of each report.
                # \nStatement: "{{text}}"
                # Speaker: {{speaker}} ({{party}})
                # History: PF:{{pants_fire}}, F:{{false}}, BT:{{barely_true}}, HT:{{half_true}}, MT:{{mostly_true}}
                # \nExamples:
                # {example_texts}
                # \nClassification (one word only):""",
                
                f"""You are a fact-checking assistant that labels news with a single word.
                \nClassify as pants-fire/false/barely-true/half-true/mostly-true/true:
                Statement: "{{text}}"
                Speaker: {{speaker}} ({{party}})
                History: PF:{{pants_fire}}, F:{{false}}, BT:{{barely_true}}, HT:{{half_true}}, MT:{{mostly_true}}
                \nConsider the claim's accuracy based on the speaker's history.
                \nExamples:
                {example_texts}
                \nClassification (one word only):""",
                
                f"""You are a fact-checking assistant that labels news with a single word.
                \nAnalyze the accuracy of this statement based on the speaker's history:
                \nStatement: "{{text}}"
                Speaker: {{speaker}} ({{party}})
                History: PF:{{pants_fire}} (pants-fire), F:{{false}} (false), BT:{{barely_true}} (barely-true), HT:{{half_true}} (half-true), MT:{{mostly_true}} (mostly-true)
                \nExamples:
                {example_texts}
                \n- Does the speaker have a history of making accurate claims?
                - How does this statement compare to their past truthfulness?
                \nBased on this, classify as: pants-fire, false, barely-true, half-true, mostly-true, or true.
                \nClassification (one word only):"""
                # ,


                # f"""You are a fact-checking expert. Analyze this political statement by following these steps:
                
                # Statement: "{{text}}"
                # Speaker: {{speaker}} ({{party}})
                # Speaker's history: PF:{{pants_fire}}, F:{{false}}, BT:{{barely_true}}, HT:{{half_true}}, MT:{{mostly_true}}
                
                # Step 1: Identify the core claims in this statement.
                # Step 2: Consider the speaker's history of truthfulness.
                # Step 3: Analyze linguistic patterns for deception markers.
                # Step 4: Classify as exactly one of: pants-fire, false, barely-true, half-true, mostly-true, true.
                
                # Examples:
                # {example_texts}
                
                # Classification (one word only):"""
            ]
    else:
        if not examples.empty:
            example_texts = "\n".join([
                f"News: '{row['text'][:50]}...'\nURL: {row['news_url']} → {['fake', 'real'][row['label']]}"
                for _, row in examples.iterrows()
            ])

            return [
                f"""You are a fact-checking assistant that labels news with a single word.
                \nClassify as real or fake:
                News: "{{text}}"
                URL: {{news_url}}
                \nClassification (one word only):""",
                
                f"""You are a fact-checking assistant that labels news with a single word.
                \nClassify as real or fake:
                News: "{{text}}"
                URL: {{news_url}}
                \nConsider factors such as credibility of the source, verifiability of claims, and potential biases.
                \nExamples:
                {example_texts}
                \nClassification (one word only):""",
                
                f"""You are a fact-checking assistant that labels news with a single word.
                \nAnalyze this news for authenticity:
                \nNews: "{{text}}"
                Source URL: {{news_url}}
                \n- Is the source known for reliable reporting?
                - Can the claims in the article be independently verified?
                - Does the article contain any signs of misinformation (sensationalism, lack of credible sources, etc.)?
                \nBased on this, classify as: real or fake.
                \nClassification (one word only):"""
            ]



### Main pipeline and helper function to normalize prediction

In [15]:
# Function to change predictions into integer format to match the processed true labels
def normalize_prediction(prediction, dataset_type):
    prediction = prediction.lower().strip()
    if dataset_type == "LIAR":
        if "pants" in prediction or "fire" in prediction:
            return 0
        elif "false" in prediction or "fake" in prediction and not any(x in prediction for x in ["barely", "half", "mostly"]):
            return 1
        elif "barely" in prediction:
            return 2
        elif "half" in prediction:
            return 3
        elif "mostly" in prediction:
            return 4
        elif "true" in prediction or "real" in prediction and not any(x in prediction for x in ["barely", "half", "mostly"]):
            return 5
        else:
            return 1  
    else:
        if "fake" in prediction:
            return 0
        elif "real" in prediction:
            return 1
        else:
            return 0  
        
# Main function to run the pipeline of evaluating the model with test set
def evaluate_model(df, dataset_type, batch_size=5, model="gpt-4", temperature=0.1):
    df = df.copy()
    df = df.dropna(subset=['label'])
    try:
        df['label'] = df['label'].astype(int)
    except:
        print("Warning: Could not convert labels to int. Attempting to use as-is.")
    
    prompt_templates = get_prompt_templates(dataset_type, df)
    print("Prompt Template",prompt_templates)
    results = []
    
    for template_idx, prompt_template in enumerate(prompt_templates):
        print(f"Evaluating with template {template_idx+1}...")
        
        y_true = []
        y_pred = []
        total_tokens = 0
        total_cost = 0
        total_time = 0
        batch_prompts = []
        batch_rows = []  
        
        # Update the prompt template with individual row data
        for i, row in enumerate(tqdm(df.iterrows(), total=len(df), desc=f"Template {template_idx+1}")):
            idx, row_data = row  
            
            try:
                if dataset_type == "LIAR":
                    prompt = prompt_template.format(
                        text=row_data['text'][:200],
                        speaker=row_data.get('speaker', 'Unknown'),
                        party=row_data.get('party_affiliation', 'Unknown'),
                        pants_fire=row_data.get('pants_fire_count', 0),
                        false=row_data.get('false_count', 0),
                        barely_true=row_data.get('barely_true_count', 0),
                        half_true=row_data.get('half_true_count', 0),
                        mostly_true=row_data.get('mostly_true_count', 0)
                    )
                else:  
                    prompt = prompt_template.format(
                        text=row_data['text'][:200],
                        news_url=row_data.get('news_url', 'N/A')
                    )
            except (KeyError, AttributeError) as e:
                print(f"Error formatting prompt: {e}")
                prompt = f"Classify: {row_data.get('text', '')[:200]}"
            
            batch_prompts.append(prompt)
            batch_rows.append(row_data) 

            if len(batch_prompts) == batch_size or i == len(df) - 1:
                if batch_prompts:
                    try:
                        # Query GPT with batch prompts
                        predictions, processing_time = query_gpt_batch(batch_prompts, model=model, temperature=temperature)
                    
                        for j, (row_data, prediction) in enumerate(zip(batch_rows, predictions)):
                            try:
                                true_label = int(row_data['label'])
                                normalized_pred = normalize_prediction(prediction, dataset_type)
                            
                                if dataset_type == "LIAR" and 0 <= true_label <= 5 and 0 <= normalized_pred <= 5:
                                    y_true.append(true_label)
                                    y_pred.append(normalized_pred)
                                elif dataset_type == "FAKENEWS" and true_label in [0, 1] and normalized_pred in [0, 1]:
                                    y_true.append(true_label)
                                    y_pred.append(normalized_pred)
                                else:
                                    print(f"Warning: Skipping example with invalid label or prediction: true={true_label}, pred={normalized_pred}")
                            except (ValueError, KeyError, TypeError) as e:
                                print(f"Error processing prediction: {e}")
                        
                        encoding = tiktoken.get_encoding("cl100k_base")
                        total_time += processing_time
                        prompt_tokens = sum(len(encoding.encode(p)) for p in batch_prompts)  
                        completion_tokens = sum(len(encoding.encode(p)) for p in predictions)
                        total_tokens += prompt_tokens + completion_tokens
                    except Exception as e:
                        print(f"Error in batch processing: {e}")
            
                batch_prompts = []
                batch_rows = []

        print(f"Collected {len(y_true)} valid predictions for evaluation")
        if len(y_true) < 2:
            print("WARNING: Not enough valid predictions collected, cannot calculate metrics")
            results.append({
                'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0,
                'avg_tokens': 0, 'cost': 0, 'avg_cost': 0, 'processing_time': 0
            })
            continue
        
        try:
            accuracy = accuracy_score(y_true, y_pred)
            if dataset_type == "FAKENEWS":
                if len(set(y_true)) == 1 or len(set(y_pred)) == 1:
                    precision = accuracy
                    recall = accuracy
                    f1 = accuracy
                else:
                    precision, recall, f1, _ = precision_recall_fscore_support(
                        y_true, y_pred, average='binary', zero_division=0
                    )
            else:  
                if len(set(y_true)) == 1 or len(set(y_pred)) == 1:
                    precision = accuracy
                    recall = accuracy
                    f1 = accuracy
                else:
                    precision, recall, f1, _ = precision_recall_fscore_support(
                        y_true, y_pred, average='weighted', zero_division=0
                    )
            
            if "gpt-4" in model:
                cost_per_1k_tokens = 0.03
            else:
                cost_per_1k_tokens = 0.002
            
            total_cost = (total_tokens / 1000) * cost_per_1k_tokens
            avg_tokens_per_article = total_tokens / len(df) if len(df) > 0 else 0
            avg_cost_per_classification = total_cost / len(df) if len(df) > 0 else 0
            avg_processing_time = total_time / len(df) if len(df) > 0 else 0
            
            result = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'avg_tokens': avg_tokens_per_article,
                'cost': total_cost,
                'avg_cost': avg_cost_per_classification,
                'processing_time': avg_processing_time
            }
            
            results.append(result)
            
            print(f"\nResults for {dataset_type} - Template {template_idx+1}:")
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Precision: {precision:.4f}")
            print(f"Recall: {recall:.4f}")
            print(f"F1 Score: {f1:.4f}")
            print(f"Average Tokens per Article: {avg_tokens_per_article:.2f}")
            print(f"Total Cost: ${total_cost:.2f}")
            print(f"Average Processing Time: {avg_processing_time:.2f}s")
        
        except Exception as e:
            print(f"Error calculating metrics: {e}")
            import traceback
            traceback.print_exc()
            results.append({
                'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0,
                'avg_tokens': avg_tokens_per_article if 'avg_tokens_per_article' in locals() else 0,
                'cost': total_cost if 'total_cost' in locals() else 0,
                'avg_cost': avg_cost_per_classification if 'avg_cost_per_classification' in locals() else 0,
                'processing_time': avg_processing_time if 'avg_processing_time' in locals() else 0
            })
    
    return results



### Fine Tuning Models

In [16]:
# Fine-tune the model

print("\nFine-tuning LIAR classification model...")
liar_ft_id = fine_tune_gpt(liar_train, liar_val, "LIAR","gpt-3.5-turbo", epochs=4)


print("\nFine-tuning FakeNewsNet classification model...")
fakenews_ft_id = fine_tune_gpt(fake_news_train_df, fake_news_val_data, "FAKENEWS", "gpt-3.5-turbo", epochs=4)


print("\nFine-tuning jobs submitted. Please wait for them to complete.")
print(f"LIAR fine-tune ID: {liar_ft_id}")
print(f"FakeNewsNet fine-tune ID: {fakenews_ft_id}")



Fine-tuning LIAR classification model...
Total tokens used: 1908725
Estimated cost for fine-tuning: $57.26
Fine-tuning job created for LIAR: ftjob-1I3bp8CLp8FoSjUkrP8g2MrZ

Fine-tuning FakeNewsNet classification model...
Total tokens used: 3007829
Estimated cost for fine-tuning: $90.23
Fine-tuning job created for FAKENEWS: ftjob-x0J8sR22sm9QCRkzfDL5gMfR

Fine-tuning jobs submitted. Please wait for them to complete.
LIAR fine-tune ID: ftjob-1I3bp8CLp8FoSjUkrP8g2MrZ
FakeNewsNet fine-tune ID: ftjob-x0J8sR22sm9QCRkzfDL5gMfR


### Evaluation of Fine-tuned GPT model for LIAR dataset

In [20]:
# Evaluate the fine-tuned gpt model that was fine-tuned with Liar Data ( The model id is obtained using fine-tune ID generated above)
liar_model_id = 'ft:gpt-3.5-turbo-0125:university-edinburgh:liar-balanced-e4:BHkIJsSZ'

print("\nEvaluating GPT-3.5-Turbo baseline...")
liar_baseline = evaluate_model(liar_test, "LIAR", batch_size=10, model="gpt-3.5-turbo", temperature=0.1)

print("\n# Evaluating models with fine tuned model:")
liar_final_metrics = evaluate_model(liar_test, 'LIAR', batch_size=10, model=liar_model_id, temperature=0.3)


for i, metrics in enumerate(liar_final_metrics):
    baseline_f1 = liar_baseline[i]['f1']
    improved_f1 = metrics['f1']
    improvement = (improved_f1 - baseline_f1) / baseline_f1 * 100
    print(f'LIAR Template {i+1}: F1 improved by {improvement:.2f}% from {baseline_f1:.4f} to {improved_f1:.4f}')



Evaluating GPT-3.5-Turbo baseline...
Prompt Template ['You are a fact-checking assistant that labels news with a single word.\n                \nClassify as pants-fire/false/barely-true/half-true/mostly-true/true:\n                Statement: "{text}"\n                Speaker: {speaker} ({party})\n                History: PF:{pants_fire}, F:{false}, BT:{barely_true}, HT:{half_true}, MT:{mostly_true}\n                \nConsider the claim\'s accuracy based on the speaker\'s history.\n                \nExamples:\n                Statement: \'pepper kicked jock tax imposing levy sports entertainment industry\'\nSpeaker: dave-yost (republican)\nHistory: PF:0, F:0, BT:1, HT:0, MT:1 → true\nStatement: \'tell certainty capandtradewould devastating impact economy\'\nSpeaker: marco-rubio (republican)\nHistory: PF:5, F:24, BT:33, HT:32, MT:35 → false\nStatement: \'austin school district teachers lowest paid urban texas district lowest paid surrounding school district\'\nSpeaker: gina-hinojosa (no

Template 1: 100%|██████████| 1250/1250 [16:25<00:00,  1.27it/s]


Collected 1250 valid predictions for evaluation

Results for LIAR - Template 1:
Accuracy: 0.2416
Precision: 0.3121
Recall: 0.2416
F1 Score: 0.2171
Average Tokens per Article: 438.82
Total Cost: $1.10
Average Processing Time: 0.79s
Evaluating with template 2...


Template 2: 100%|██████████| 1250/1250 [16:50<00:00,  1.24it/s]


Collected 1250 valid predictions for evaluation

Results for LIAR - Template 2:
Accuracy: 0.2824
Precision: 0.3415
Recall: 0.2824
F1 Score: 0.2368
Average Tokens per Article: 489.30
Total Cost: $1.22
Average Processing Time: 0.81s

# Evaluating models with fine tuned model:
Prompt Template ['You are a fact-checking assistant that labels news with a single word.\n                \nClassify as pants-fire/false/barely-true/half-true/mostly-true/true:\n                Statement: "{text}"\n                Speaker: {speaker} ({party})\n                History: PF:{pants_fire}, F:{false}, BT:{barely_true}, HT:{half_true}, MT:{mostly_true}\n                \nConsider the claim\'s accuracy based on the speaker\'s history.\n                \nExamples:\n                Statement: \'pepper kicked jock tax imposing levy sports entertainment industry\'\nSpeaker: dave-yost (republican)\nHistory: PF:0, F:0, BT:1, HT:0, MT:1 → true\nStatement: \'tell certainty capandtradewould devastating impact econom

Template 1: 100%|██████████| 1250/1250 [33:45<00:00,  1.62s/it]


Collected 1250 valid predictions for evaluation

Results for LIAR - Template 1:
Accuracy: 0.3120
Precision: 0.4748
Recall: 0.3120
F1 Score: 0.2787
Average Tokens per Article: 437.96
Total Cost: $1.09
Average Processing Time: 1.62s
Evaluating with template 2...


Template 2: 100%|██████████| 1250/1250 [25:02<00:00,  1.20s/it] 

Collected 1250 valid predictions for evaluation

Results for LIAR - Template 2:
Accuracy: 0.3728
Precision: 0.4633
Recall: 0.3728
F1 Score: 0.3550
Average Tokens per Article: 489.21
Total Cost: $1.22
Average Processing Time: 1.20s
LIAR Template 1: F1 improved by 28.40% from 0.2171 to 0.2787
LIAR Template 2: F1 improved by 49.93% from 0.2368 to 0.3550





### Evaluation of Fine-tuned GPT model for FakeNewsNet Dataset

In [21]:
# Evaluate the fine-tuned gpt model that was trained with FakeNewsNet( The model id is obtained using fine-tune ID generated above)
fakenews_model_id = 'ft:gpt-3.5-turbo-0125:university-edinburgh:fakenews-balanced-e4:BHlR3dso'

print("\nEvaluating GPT-3.5-Turbo baseline...")
fakenews_baseline = evaluate_model(fake_news_test_data, "FAKENEWS", batch_size=10, model="gpt-3.5-turbo", temperature=0.1)


print("\n# Evaluating models with fine tuned model:")
fakenews_final_metrics = evaluate_model(fake_news_test_data, 'FAKENEWS', batch_size=3, model=fakenews_model_id)

for i, metrics in enumerate(fakenews_final_metrics):
    baseline_f1 = fakenews_baseline[i]['f1']
    improved_f1 = metrics['f1']
    improvement = (improved_f1 - baseline_f1) / baseline_f1 * 100
    print(f'FakeNewsNet Template {i+1}: F1 improved by {improvement:.2f}% from {baseline_f1:.4f} to {improved_f1:.4f}')



Evaluating GPT-3.5-Turbo baseline...
Prompt Template ['You are a fact-checking assistant that labels news with a single word.\n                \nClassify as real or fake:\n                News: "{text}"\n                URL: {news_url}\n                \nClassification (one word only):', 'You are a fact-checking assistant that labels news with a single word.\n                \nClassify as real or fake:\n                News: "{text}"\n                URL: {news_url}\n                \nConsider factors such as credibility of the source, verifiability of claims, and potential biases.\n                \nExamples:\n                News: \'scarlett johansson colin jost american museum gala...\'\nURL: www.popsugar.com/celebrity/Scarlett-Johansson-Colin-Jost-American-Museum-Gala-2017-44319959 → fake\nNews: \'prince michael jackson talks carrying father micha...\'\nURL: https://www.longroom.com/discussion/493103/prince-michael-jackson-talks-carrying-on-father-michael-jacksons-philanthropic-le

Template 1: 100%|██████████| 3430/3430 [40:44<00:00,  1.40it/s] 


Collected 3430 valid predictions for evaluation

Results for FAKENEWS - Template 1:
Accuracy: 0.8085
Precision: 0.8338
Recall: 0.9340
F1 Score: 0.8810
Average Tokens per Article: 82.08
Total Cost: $0.56
Average Processing Time: 0.71s
Evaluating with template 2...


Template 2: 100%|██████████| 3430/3430 [37:04<00:00,  1.54it/s]


Collected 3430 valid predictions for evaluation

Results for FAKENEWS - Template 2:
Accuracy: 0.8163
Precision: 0.8360
Recall: 0.9432
F1 Score: 0.8864
Average Tokens per Article: 224.07
Total Cost: $1.54
Average Processing Time: 0.65s
Evaluating with template 3...


Template 3: 100%|██████████| 3430/3430 [51:25<00:00,  1.11it/s]


Collected 3430 valid predictions for evaluation

Results for FAKENEWS - Template 3:
Accuracy: 0.7787
Precision: 0.8683
Recall: 0.8353
F1 Score: 0.8515
Average Tokens per Article: 141.29
Total Cost: $0.97
Average Processing Time: 0.90s

# Evaluating models with fine tuned model:
Prompt Template ['You are a fact-checking assistant that labels news with a single word.\n                \nClassify as real or fake:\n                News: "{text}"\n                URL: {news_url}\n                \nClassification (one word only):', 'You are a fact-checking assistant that labels news with a single word.\n                \nClassify as real or fake:\n                News: "{text}"\n                URL: {news_url}\n                \nConsider factors such as credibility of the source, verifiability of claims, and potential biases.\n                \nExamples:\n                News: \'scarlett johansson colin jost american museum gala...\'\nURL: www.popsugar.com/celebrity/Scarlett-Johansson-Colin-J

Template 1:  80%|███████▉  | 2730/3430 [50:32<11:41,  1.00s/it]  

Error in API call (attempt 1/3): Error code: 500 - {'error': {'message': 'The server had an error while processing your request. Sorry about that!', 'type': 'server_error', 'param': None, 'code': None}}


Template 1:  86%|████████▌ | 2937/3430 [54:07<07:40,  1.07it/s]

Error in API call (attempt 1/3): <!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>


<title>api.openai.com | 520: Web server is returning an unknown error</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />


</head>
<body>
<div id="cf-wrapper">
    <div id="cf-error-details" class="p-0">
        <header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">
            <h1 class="inline-block sm:block sm:mb-2 font

Template 1: 100%|██████████| 3430/3430 [1:03:07<00:00,  1.10s/it]


Collected 3430 valid predictions for evaluation

Results for FAKENEWS - Template 1:
Accuracy: 0.8009
Precision: 0.8344
Recall: 0.9205
F1 Score: 0.8753
Average Tokens per Article: 82.07
Total Cost: $0.56
Average Processing Time: 1.10s
Evaluating with template 2...


Template 2:  52%|█████▏    | 1779/3430 [30:07<25:53,  1.06it/s]  

Error in API call (attempt 1/3): Error code: 500 - {'error': {'message': 'The server had an error while processing your request. Sorry about that!', 'type': 'server_error', 'param': None, 'code': None}}


Template 2:  55%|█████▌    | 1893/3430 [32:11<33:02,  1.29s/it]

Error in API call (attempt 1/3): Error code: 500 - {'error': {'message': 'The server had an error while processing your request. Sorry about that!', 'type': 'server_error', 'param': None, 'code': None}}


Template 2:  57%|█████▋    | 1938/3430 [32:58<22:31,  1.10it/s]

Error in API call (attempt 1/3): Error code: 500 - {'error': {'message': 'The server had an error while processing your request. Sorry about that!', 'type': 'server_error', 'param': None, 'code': None}}


Template 2: 100%|██████████| 3430/3430 [58:28<00:00,  1.02s/it]


Collected 3430 valid predictions for evaluation

Results for FAKENEWS - Template 2:
Accuracy: 0.8117
Precision: 0.8395
Recall: 0.9298
F1 Score: 0.8823
Average Tokens per Article: 224.07
Total Cost: $1.54
Average Processing Time: 1.02s
Evaluating with template 3...


Template 3:   5%|▌         | 174/3430 [03:35<52:43,  1.03it/s]  

Error in API call (attempt 3/3): Error code: 500 - {'error': {'message': 'The server had an error while processing your request. Sorry about that!', 'type': 'server_error', 'param': None, 'code': None}}
All retries failed. Using fallback response.


Template 3:   5%|▌         | 177/3430 [03:53<2:14:29,  2.48s/it]

Encountered 1 errors in batch processing


Template 3: 100%|██████████| 3430/3430 [1:05:14<00:00,  1.14s/it]

Collected 3430 valid predictions for evaluation

Results for FAKENEWS - Template 3:
Accuracy: 0.7860
Precision: 0.8633
Recall: 0.8534
F1 Score: 0.8583
Average Tokens per Article: 141.22
Total Cost: $0.97
Average Processing Time: 1.14s
FakeNewsNet Template 1: F1 improved by -0.65% from 0.8810 to 0.8753
FakeNewsNet Template 2: F1 improved by -0.45% from 0.8864 to 0.8823
FakeNewsNet Template 3: F1 improved by 0.80% from 0.8515 to 0.8583





### Cross Dataset Evaluation

In [22]:
# Map Fake News Labels to match LIAR labels
def map_fake_news_to_liar(label):
        return 0 if label == 0 else 5  

# Map LIAR Labels to match FakeNewsNet labels
def map_liar_to_binary(label):
    return 0 if label <= 2 else 1


liar_test['label'] = liar_test['label'].apply(map_liar_to_binary)
fake_news_test_data['label'] = fake_news_test_data['label'].apply(map_fake_news_to_liar)

In [26]:
liar_test['news_url'] = "Unknown"

In [28]:
fake_news_test_data['speaker'] = "Unknown"
fake_news_test_data['party_affiliation'] = "Unknown"
fake_news_test_data['pants_fire_count'] = 0
fake_news_test_data['false_count'] = 0
fake_news_test_data['barely_true_count'] = 0
fake_news_test_data['half_true_count'] = 0
fake_news_test_data['mostly_true_count'] = 0

In [29]:
print("\n# Cross dataset evaluation:")
# Two prompt templates of LIAR model being tested on FakeNewsNet data
fake_news_data_using_liar_model = evaluate_model(fake_news_test_data, 'LIAR', batch_size=10, model=liar_model_id, temperature=0.3)

# Three prompt templates of LIAR model being tested on FakeNewsNet data
liar_data_using_fake_news_model = evaluate_model(liar_test, 'FAKENEWS', batch_size=3, model=fakenews_model_id)



# Cross dataset evaluation:
Prompt Template ['You are a fact-checking assistant that labels news with a single word.\n                \nClassify as pants-fire/false/barely-true/half-true/mostly-true/true:\n                Statement: "{text}"\n                Speaker: {speaker} ({party})\n                History: PF:{pants_fire}, F:{false}, BT:{barely_true}, HT:{half_true}, MT:{mostly_true}\n                \nConsider the claim\'s accuracy based on the speaker\'s history.\n                \nExamples:\n                Statement: \'scarlett johansson colin jost american museum gala 2017\'\nSpeaker: Unknown (Unknown)\nHistory: PF:0, F:0, BT:0, HT:0, MT:0 → pants-fire\nStatement: \'prince michael jackson talks carrying father michael jacksons philanthropic legacy 2017 billboard music awards\'\nSpeaker: Unknown (Unknown)\nHistory: PF:0, F:0, BT:0, HT:0, MT:0 → true\n                \nClassification (one word only):', 'You are a fact-checking assistant that labels news with a single word.\n 

Template 1: 100%|██████████| 3430/3430 [51:39<00:00,  1.11it/s] 


Collected 3430 valid predictions for evaluation

Results for LIAR - Template 1:
Accuracy: 0.7166
Precision: 0.7546
Recall: 0.7166
F1 Score: 0.7155
Average Tokens per Article: 214.72
Total Cost: $1.47
Average Processing Time: 0.90s
Evaluating with template 2...


Template 2: 100%|██████████| 3430/3430 [56:07<00:00,  1.02it/s] 


Collected 3430 valid predictions for evaluation

Results for LIAR - Template 2:
Accuracy: 0.6329
Precision: 0.7645
Recall: 0.6329
F1 Score: 0.6806
Average Tokens per Article: 265.72
Total Cost: $1.82
Average Processing Time: 0.98s
Prompt Template ['You are a fact-checking assistant that labels news with a single word.\n                \nClassify as real or fake:\n                News: "{text}"\n                URL: {news_url}\n                \nClassification (one word only):', 'You are a fact-checking assistant that labels news with a single word.\n                \nClassify as real or fake:\n                News: "{text}"\n                URL: {news_url}\n                \nConsider factors such as credibility of the source, verifiability of claims, and potential biases.\n                \nExamples:\n                News: \'says rick scotts record jobs includes florida rank...\'\nURL: Unknown → real\nNews: \'know saddam hussein well killed terrorists...\'\nURL: Unknown → fake\n       

Template 1: 100%|██████████| 1250/1250 [18:03<00:00,  1.15it/s]


Collected 1250 valid predictions for evaluation

Results for FAKENEWS - Template 1:
Accuracy: 0.5384
Precision: 0.6761
Recall: 0.3395
F1 Score: 0.4520
Average Tokens per Article: 53.99
Total Cost: $0.13
Average Processing Time: 0.87s
Evaluating with template 2...


Template 2: 100%|██████████| 1250/1250 [20:37<00:00,  1.01it/s] 


Collected 1250 valid predictions for evaluation

Results for FAKENEWS - Template 2:
Accuracy: 0.5480
Precision: 0.7166
Recall: 0.3210
F1 Score: 0.4433
Average Tokens per Article: 114.97
Total Cost: $0.29
Average Processing Time: 0.99s
Evaluating with template 3...


Template 3: 100%|██████████| 1250/1250 [33:41<00:00,  1.62s/it]

Collected 1250 valid predictions for evaluation

Results for FAKENEWS - Template 3:
Accuracy: 0.4824
Precision: 0.7250
Recall: 0.1241
F1 Score: 0.2119
Average Tokens per Article: 113.38
Total Cost: $0.28
Average Processing Time: 1.62s





In [18]:
jobs = client.fine_tuning.jobs.list()


In [19]:
job_ids = ["ftjob-7sTZSC93NFkGkTYQIPfHcK4C", "ftjob-aa2UJ5qWMWVCiwvB4l2aFl2W"]  

for job in jobs:
    if job.id in job_ids:
        print(f"Job ID: {job.id}, Status: {job.status}, Fine Tune ID: {job.fine_tuned_model}")

Job ID: ftjob-aa2UJ5qWMWVCiwvB4l2aFl2W, Status: succeeded, Fine Tune ID: ft:gpt-3.5-turbo-0125:university-edinburgh:fakenews-balanced-e4:BHlR3dso
Job ID: ftjob-7sTZSC93NFkGkTYQIPfHcK4C, Status: succeeded, Fine Tune ID: ft:gpt-3.5-turbo-0125:university-edinburgh:liar-balanced-e4:BHkIJsSZ
