In [295]:
# pip install openpyxl==3.1.2

In [296]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Model predictions: CSV produced by the model run.
output_file = 'ABA_dataset_2_gemini_2.5_pro_3_shot_1.csv'
df_output = pd.read_csv(output_file)

# Human answer key: Excel workbook.
# NOTE(review): absolute, machine-specific path — it will only resolve on the
# original author's machine; consider a path relative to a data directory.
answer_key_file = '/Users/nonny/Downloads/Remove Disagreement Version(2).xlsx'
df_answer_key = pd.read_excel(answer_key_file)

In [297]:
# df_answer_key.head(30)

In [298]:
# print(df_answer_key.columns.tolist())

In [299]:
# Clean answer key
# Keep only the four columns used for evaluation and give them the same names
# the model output uses (ID / Topics / Text / NegPos).
df_answer_cleaned = (
    df_answer_key
    .loc[:, ['Column1', 'Topic', 'Selected Content', 'Pos/Neg']]
    .rename(columns={
        'Column1': 'ID',
        'Topic': 'Topics',
        'Selected Content': 'Text',
        'Pos/Neg': 'NegPos',
    })
)

# df_answer_cleaned.head(30)

In [300]:
# Spot-check: answer-key rows for ID 22.
df_answer_cleaned.loc[df_answer_cleaned['ID'] == 22]

Unnamed: 0,ID,Topics,Text,NegPos
50,22,Room,"Room was perfect, looked new, clean shiny tile...",Positive
51,22,Room,although might have needed another set of draw...,Negative
52,22,Food,"Ordered pizza one night and was not great, too...",Negative
53,22,Food,The restaurant area is nice,Positive
54,22,Staff,the staff very accomodating.,Positive
55,22,Off,so it was cold. had to have a cold shower one ...,Negative


In [301]:
# Clean Output
def clean_topic(topic):
    """Normalise a model-output topic label so it can be used as a join key.

    The value is converted with ``str()``, surrounding whitespace is removed,
    any trailing run of digits is dropped (e.g. 'Room1' -> 'Room'), and
    whitespace left in front of the removed digits is trimmed as well
    (e.g. 'Room 1' -> 'Room' rather than 'Room ').

    Args:
        topic: The raw topic value; any object accepted by ``str()``.

    Returns:
        str: The cleaned label. Because of the ``str()`` conversion a missing
        value (NaN) comes back as the literal string 'nan', and a value made
        only of digits comes back as an empty string.
    """
    # Strip first so a trailing space/newline cannot hide the numeric suffix
    # from the end-of-string anchor.
    label = str(topic).strip()
    # Trim again: removing the digits may expose a separator space.
    return re.sub(r'\d+$', '', label).rstrip()

# Strip numeric suffixes from the model's topic labels; the result is stored on
# df_output itself in a new 'Cleaned_Topics' column.
df_output['Cleaned_Topics'] = df_output['Topics'].map(clean_topic)

# Keep only the comparison columns, exposing the cleaned label as 'Topics'.
df_output_cleaned = (
    df_output[['ID', 'Cleaned_Topics', 'Text', 'NegPos']]
    .rename(columns={'Cleaned_Topics': 'Topics'})
)

# Discard rows that carry neither text nor sentiment (no output for that instance).
df_output_cleaned = df_output_cleaned[
    df_output_cleaned['Text'].notna() | df_output_cleaned['NegPos'].notna()
]

# Optional check of the first 30 rows:
# df_output_cleaned.head(30)

# Collapse repeated (ID, Topics, NegPos) combinations into a single row whose
# 'Text' is the group's non-missing texts, stripped and joined with ', '.
# NOTE(review): groupby is called with its default key handling, so rows whose
# NegPos is missing are presumably left out of the result — confirm this is intended.
df_output_merged = (
    df_output_cleaned
    .groupby(['ID', 'Topics', 'NegPos'])['Text']
    .agg(lambda texts: ', '.join(texts.dropna().astype(str).str.strip()))
    .reset_index()
)

# Optional check of the first 30 merged rows:
# df_output_merged.head(30)

# Spot-check: merged model output for ID 22.
df_output_merged.loc[df_output_merged['ID'] == 22]


Unnamed: 0,ID,Topics,NegPos,Text
50,22,Facility,Negative,Fridge could have been put on beforehand so it...
51,22,Facility,Positive,When you left it running for around 5 mins it ...
52,22,Food,Negative,"Ordered pizza one night and was not great, too..."
53,22,Room,Negative,although might have needed another set of draw...
54,22,Room,Positive,"Room was perfect, looked new, clean shiny tile..."
55,22,Staff,Positive,the staff very accomodating.


In [302]:
# df_output_merged .tail(30)

In [303]:
# Build the comparison table between the answer key (df_answer_cleaned) and the
# aggregated model output (df_output_merged).
#
# - The answer key is first collapsed to one row per (ID, Topics, NegPos), with
#   the same "strip and join with ', '" rule the model output was aggregated
#   with. Without this, an answer key that lists the same (ID, Topics, NegPos)
#   on several rows would pair each of those rows with the single aggregated
#   output row and be counted as several matches.
#   dropna=False keeps answer rows whose sentiment is missing instead of
#   silently discarding them during grouping.
# - Columns are renamed to show their source:
#     answer key:   'Text' -> 'Text_answer', 'NegPos' -> 'NegPos_answer'
#     model output: 'Text' -> 'Text_output', 'NegPos' -> 'NegPos_output'
# - The join keys are 'ID', 'Topics' and the sentiment label
#   ('NegPos_answer' vs. 'NegPos_output').
# - how='outer' keeps rows found on only one side, which exposes both missing
#   predictions (answer only) and extra outputs (output only).
df_compare = pd.merge(
    (
        df_answer_cleaned
        .groupby(['ID', 'Topics', 'NegPos'], dropna=False)['Text']
        .agg(lambda texts: ', '.join(texts.dropna().astype(str).str.strip()))
        .reset_index()
        # Restore the original answer-side column order.
        .loc[:, ['ID', 'Topics', 'Text', 'NegPos']]
        .rename(columns={'NegPos': 'NegPos_answer', 'Text': 'Text_answer'})
    ),
    df_output_merged.rename(columns={'NegPos': 'NegPos_output', 'Text': 'Text_output'}),
    left_on=['ID', 'Topics', 'NegPos_answer'],
    right_on=['ID', 'Topics', 'NegPos_output'],
    how='outer'
)

# df_compare.head(30)
# Spot-check: comparison rows for ID 8.
df_compare[df_compare['ID'] == 8]

Unnamed: 0,ID,Topics,Text_answer,NegPos_answer,NegPos_output,Text_output
16,8,Facility,,,Positive,renovated
17,8,Location,nice location,Positive,Positive,nice location
18,8,Off,"renovated, nothing",Positive,,
19,8,Staff,lovely personal,Positive,Positive,lovely personal


In [304]:
# Topic match function
def topic_match(row):
    """Classify one comparison row by which side supplied text.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) with the keys
            'Text_answer' and 'Text_output'.

    Returns:
        'drop' when both texts are missing or blank,
        True   when both texts are present,
        False  when exactly one of them is missing or blank.
    """
    def _is_blank(value):
        # Missing (NaN/None) or nothing but whitespace once converted to str.
        return pd.isna(value) or str(value).strip() == ''

    answer_blank = _is_blank(row['Text_answer'])
    output_blank = _is_blank(row['Text_output'])

    # Nothing on either side: the row carries no comparison at all.
    if answer_blank and output_blank:
        return 'drop'

    # At this point at least one side has text; it is a match only if both do.
    return not (answer_blank or output_blank)

# Label every comparison row with topic_match ('drop', True or False) in a new
# 'Topic_Match' column on df_compare.
df_compare['Topic_Match'] = df_compare.apply(topic_match, axis=1)

# Destination of the saved comparison table.
comparison_output_file = 'Topic_Match__gemini_2.5_pro_3_Shot_1.csv'
# comparison_output_file = 'test.csv'

# Everything except the 'drop' rows is written out, without the index column.
df_compare_filtered = df_compare.loc[df_compare['Topic_Match'].ne('drop')].copy()
df_compare_filtered.to_csv(comparison_output_file, index=False)
print(f"Filtered comparison result saved as '{comparison_output_file}'")

# Preview the first 20 rows of the unfiltered table.
df_compare.head(20)




Filtered comparison result saved as 'Topic_Match__gemini_2.5_pro_3_Shot_1.csv'


Unnamed: 0,ID,Topics,Text_answer,NegPos_answer,NegPos_output,Text_output,Topic_Match
0,1,Facility,"New, comfortable apartments",Positive,Positive,"New, apartments, comfortable restaurant for bo...",True
1,1,Food,"Tasty food on the first floor, comfortable res...",Positive,Positive,Tasty food on the first floor,True
2,1,Location,"close to the airport, to very clean beach.",Positive,Positive,"close to the airport, to very clean beach.",True
3,1,Off,,,Negative,Nothing at all.,False
4,1,Off,"on the first floor, to escape the heat in the ...",,,,False
5,1,Room,,,Positive,comfortable apartments,False
6,1,Staff,Staff is extremely helpful and easy to communi...,Positive,Positive,Staff is extremely helpful and easy to communi...,True
7,3,Location,the location is great and near the airport. bu...,Positive,Positive,the location is great and near the airport. bu...,True
8,4,Price,Great Quality/price,Positive,Positive,Great Quality/price,True
9,4,Room,Clean,Positive,Positive,Clean,True


In [305]:
# Spot-check: comparison rows (including the Topic_Match label) for ID 22.
df_compare.loc[df_compare['ID'] == 22]

Unnamed: 0,ID,Topics,Text_answer,NegPos_answer,NegPos_output,Text_output,Topic_Match
61,22,Facility,,,Negative,Fridge could have been put on beforehand so it...,False
62,22,Facility,,,Positive,When you left it running for around 5 mins it ...,False
63,22,Food,"Ordered pizza one night and was not great, too...",Negative,Negative,"Ordered pizza one night and was not great, too...",True
64,22,Food,The restaurant area is nice,Positive,,,False
65,22,Off,so it was cold. had to have a cold shower one ...,Negative,,,False
66,22,Room,although might have needed another set of draw...,Negative,Negative,although might have needed another set of draw...,True
67,22,Room,"Room was perfect, looked new, clean shiny tile...",Positive,Positive,"Room was perfect, looked new, clean shiny tile...",True
68,22,Staff,the staff very accomodating.,Positive,Positive,the staff very accomodating.,True


In [306]:
# Row counts of the two tables that feed the comparison: one row per answer-key
# entry and one row per aggregated model-output entry.
num_answer_topics = df_answer_cleaned.shape[0]
num_output_topics = df_output_merged.shape[0]

print(f"Total topics in Answer Key: {num_answer_topics}")
print(f"Total topics in Model Output: {num_output_topics}")


Total topics in Answer Key: 1923
Total topics in Model Output: 1878


In [307]:
# Topic-level accuracy over the comparison table.
#
# Rows labelled 'drop' have no text on either side, so they are not real
# comparisons; they are excluded here (as they are from the saved CSV) so they
# cannot inflate the denominator.
df_scored = df_compare[df_compare['Topic_Match'] != 'drop']

# Denominator: every scored row — matched pairs plus answer-only and
# output-only rows from the outer merge.
total = len(df_scored)
# Numerator: rows where both the answer key and the model output supplied text.
correct = (df_scored['Topic_Match'] == True).sum()
# Guard against an empty comparison table.
accuracy = correct / total if total > 0 else 0

print(f"Total comparisons: {total}")
print(f"Correct matches: {correct}")
print(f"Topic Classification Accuracy: {accuracy:.2%}")

Total comparisons: 2237
Correct matches: 1566
Topic Classification Accuracy: 70.00%


In [308]:
# df_filtered = df_compare[df_compare['Topic_Match'] != 'drop']

# # Calculate True Positives (TP):
# # - Both human and model extracted text for the same (ID, Topic, NegPos) pair
# TP = (df_filtered['Topic_Match'] == True).sum()

# FP = ((df_filtered['Topic_Match'] == False) & (df_filtered['Text_output'].notna()) & (df_filtered['Text_answer'].isna())).sum()
# FN = ((df_filtered['Topic_Match'] == False) & (df_filtered['Text_output'].isna()) & (df_filtered['Text_answer'].notna())).sum()

# precision = TP / (TP + FP) if (TP + FP) > 0 else 0
# recall = TP / (TP + FN) if (TP + FN) > 0 else 0
# f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

# print(f"Precision: {precision:.2%}")
# print(f"Recall: {recall:.2%}")
# print(f"F1 Score: {f1:.2%}")
