In [None]:
import pandas as pd

# Load the wide-format sheet; each column is treated as a theme whose
# cells hold that theme's hashtags.
df = pd.read_csv("Theme_and_Hashtag_Data_human_without_examples.csv")

# Flatten to long format: one {"hashtag", "category"} record per non-null
# cell, walking the columns in order and each column from top to bottom.
new_data = [
    {"hashtag": tag, "category": theme}
    for theme in df.columns
    for tag in df[theme].dropna()
]

# Build the long-format frame and write it out without the index column.
new_df = pd.DataFrame(new_data)
new_df.to_csv("human_hashtag_classifications.csv", index=False)

print("Conversion complete. New file saved as 'human_hashtag_classifications.csv'")

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score

# Load the datasets: the long-format human labels and the GPT labels.
human_df = pd.read_csv("human_hashtag_classifications.csv")
gpt_df = pd.read_csv("gpt_hashtag_classifications_ground_truth.csv")

# Normalise category labels in both frames: missing -> 'unknown', then lowercase.
human_df['category'] = human_df['category'].fillna('unknown').str.lower()
gpt_df['category'] = gpt_df['category'].fillna('unknown').str.lower()

# Rename GPT-side labels so they match the spelling used on the human side.
category_mapping = {
    'platform engagement': 'platform',
    'commonly misused substances': 'commonly-misused substances'
}
gpt_df['category'] = gpt_df['category'].replace(category_mapping)

# Left join: every human-labelled row is kept; hashtags that have no GPT row
# end up with a missing GPT category (filled with 'unknown' just below).
# NOTE(review): if a hashtag occurs more than once in gpt_df, the join repeats
# the matching human rows and inflates every count below -- confirm that
# gpt_df['hashtag'] is unique.
# NOTE(review): hashtags are matched exactly (case and whitespace included);
# confirm both files use the same normalisation.
merged_df = pd.merge(human_df, gpt_df, on='hashtag', how='left', suffixes=('_human', '_gpt'))

# Replace any remaining NaN values with 'unknown'
merged_df['category_human'] = merged_df['category_human'].fillna('unknown')
merged_df['category_gpt'] = merged_df['category_gpt'].fillna('unknown')

# Union of the labels used by either side, sorted so the matrix axes are stable.
all_categories = sorted(set(merged_df['category_human'].unique()) | set(merged_df['category_gpt'].unique()))

# Confusion matrix: rows are human labels, columns are GPT labels.
cm = confusion_matrix(
    merged_df['category_human'],
    merged_df['category_gpt'],
    labels=all_categories
)

# Overall accuracy: share of rows where the two labels agree.
overall_accuracy = accuracy_score(merged_df['category_human'], merged_df['category_gpt'])

# Per-category accuracy: among the rows a human assigned to a category, the
# share GPT assigned to the same category. Grouping by the human label means
# only categories that actually have human rows are scored. Labels that occur
# only on the GPT side (typically the 'unknown' placeholder) have no rows to
# score; evaluating them on an empty slice would give NaN, and NaN sort keys
# make the ranking printed below unreliable.
is_match = merged_df['category_human'] == merged_df['category_gpt']
category_accuracies = is_match.groupby(merged_df['category_human']).mean().to_dict()
gpt_only_categories = [label for label in all_categories if label not in category_accuracies]

# Heatmap of the confusion matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=all_categories,
    yticklabels=all_categories,
    ax=ax,
)
ax.set_title('Confusion Matrix of Human vs GPT-4o Category Classifications')
ax.set_xlabel('GPT-4o Categories')
ax.set_ylabel('Human Categories')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()

# Print the confusion matrix
print("Confusion Matrix:")
print(pd.DataFrame(cm, index=all_categories, columns=all_categories))

# Print overall accuracy
print(f"\nOverall Accuracy: {overall_accuracy:.4f}")

# Print accuracy for each human-labelled category, best first.
print("\nAccuracy by Category:")
for category, accuracy in sorted(category_accuracies.items(), key=lambda x: x[1], reverse=True):
    print(f"{category}: {accuracy:.4f}")

# Labels that only GPT produced have no human rows, so no accuracy is defined.
if gpt_only_categories:
    print("\nCategories with no human-labelled rows (accuracy undefined): " + ", ".join(gpt_only_categories))

In [None]:
# Write merged_df (built in the previous cell) to disk without the index
# column, then report the destination path.
output_file = 'merged_hashtag_classifications.csv'
merged_df.to_csv(path_or_buf=output_file, index=False)
print("\nMerged dataset saved to {}".format(output_file))