In [155]:
import os
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Input locations. Each default is the path this notebook was originally run
# with; set the matching environment variable to point at another copy of the
# data instead of editing the notebook. The answer-key default is an absolute
# path under one user's home directory and will not exist on other machines.
output_file = os.environ.get(
    'ABA_OUTPUT_FILE',
    'output_formatted_ABA_dataset_2_gpt4o_5_shot_1_1.csv',
)
answer_key_file = os.environ.get(
    'ABA_ANSWER_KEY_FILE',
    '/Users/nonny/Downloads/Answer.xlsx',
)

# Output to be scored (CSV) and the answer key (Excel workbook).
df_output = pd.read_csv(output_file)
df_answer_key = pd.read_excel(answer_key_file)

In [156]:
# Preview the first 30 rows of the answer key to inspect its columns.
df_answer_key.head(30)

Unnamed: 0,Column1,Title,PositiveReview,NegativeReview,Topic,Selected Content,Pos/Neg,Head,Body 1,Body 2,...,Cont. Body 12,Cont. Body 13,Cont. Body 14,Cont. Body 15,Concat,Claim,PositiveReview Relevancy,NegativeReview Relevancy,Column2,Column3
0,1,Wonderful place to stay.,"New, comfortable apartments, close to the airp...",Nothing at all.,Facility,"New, comfortable apartments",Positive,good_facility,new_hotel,comfortable_hotel,...,,,,,"new_hotel , comfortable_hotel , no_evident_not...",,,,,
1,1,Wonderful place to stay.,"New, comfortable apartments, close to the airp...",Nothing at all.,Location,"close to the airport, to very clean beach.",Positive,good_location,close_to_airport,close_to_clean_beach,...,,,,,"close_to_airport , close_to_clean_beach , no_e...",Claim,Yes,Noise Negative Null,0.0,785.0
2,1,Wonderful place to stay.,"New, comfortable apartments, close to the airp...",Nothing at all.,Staff,Staff is extremely helpful and easy to communi...,Positive,good_staff,helpful_staff,easy_to_communicate_with,...,,,,,"helpful_staff , easy_to_communicate_with , no_...",,,,,
3,1,Wonderful place to stay.,"New, comfortable apartments, close to the airp...",Nothing at all.,Food,"Tasty food on the first floor, comfortable res...",Positive,good_food,tasty_food,comfortable_restaurant_for_cozy_evening,...,,,,,"tasty_food , comfortable_restaurant_for_cozy_e...",,,,,
4,1,Wonderful place to stay.,"New, comfortable apartments, close to the airp...",Nothing at all.,Off,"on the first floor, to escape the heat in the ...",,,,,...,,,,,,,,,,
5,3,Very Good,the location is great and near the airport. bu...,,Location,the location is great and near the airport. bu...,Positive,good_location,great_location,near_airport,...,,,,,"great_location , near_airport , close_to_bus_s...",Claim,Yes,Noise Negative Null,,
6,4,Wonderful,Great stuff\n Great Quality/price\n Clean,,Staff,Great stuff,Positive,good_staff,great_staff,,...,,,,,"great_staff , no_evident_not_great_staff",Claim,Yes,Noise Negative Null,,
7,4,Wonderful,Great stuff\n Great Quality/price\n Clean,,Price,Great Quality/price,Positive,good_price,great_quality,great_price,...,,,,,"great_quality , great_price , no_evident_not_g...",,,,,
8,4,Wonderful,Great stuff\n Great Quality/price\n Clean,,Room,Clean,Positive,good_room,clean_room,,...,,,,,"clean_room , no_evident_not_clean_room",,,,,
9,5,"Fantastic value for a new, modern and spotless...","Clean and modern with very comfortable beds, i...",,Room,Clean and modern with very comfortable beds,Positive,good_room,clean_room,modern_room,...,,,,,"clean_room , modern_room , comfortable_bed , n...",Claim,Yes,Noise Negative Null,,


In [157]:
# Answer key: keep only the four columns used for scoring and rename them to
# the names the output table uses (ID, Topics, Text, NegPos).
answer_key_columns = {
    'Column1': 'ID',
    'Topic': 'Topics',
    'Selected Content': 'Text',
    'Pos/Neg': 'NegPos',
}
df_answer_cleaned = df_answer_key[list(answer_key_columns)].rename(columns=answer_key_columns)

# df_answer_cleaned.head(30)

In [158]:
# Clean Output
def clean_topic(topic):
    """Return a topic label without its trailing numeric suffix.

    ``"Room2"`` becomes ``"Room"``. Whitespace around the label, or between
    the label and the number, is removed as well, so ``"Room 2"`` and
    ``"Room2 "`` also become ``"Room"`` rather than keeping a stray space or
    the digit (either of which would stop the label from matching the same
    topic written without it).

    Args:
        topic: Topic value. It is converted with ``str()`` first, so a
            missing value (NaN) is returned as the string ``"nan"``.

    Returns:
        str: The label with the trailing digits and surrounding whitespace
        removed.
    """
    # Remove "<spaces><digits><spaces>" anchored at the end, then trim what is left.
    return re.sub(r'\s*\d+\s*$', '', str(topic)).strip()

# Strip the numeric suffix from every topic label; the result is stored as an
# extra column on df_output.
df_output['Cleaned_Topics'] = df_output['Topics'].apply(clean_topic)

# Keep the scoring columns, expose the cleaned label as 'Topics', and drop the
# rows that carry neither a text nor a sentiment.
has_text_or_sentiment = df_output['Text'].notna() | df_output['NegPos'].notna()
df_output_cleaned = (
    df_output.loc[has_text_or_sentiment, ['ID', 'Cleaned_Topics', 'Text', 'NegPos']]
    .rename(columns={'Cleaned_Topics': 'Topics'})
)


def _join_texts(texts):
    """Join the non-missing, whitespace-stripped texts of one group with ', '."""
    return ', '.join(texts.dropna().astype(str).str.strip())


# One row per (ID, Topics, NegPos): all text fragments of a group are merged
# into a single comma-separated string.
df_output_merged = (
    df_output_cleaned
    .groupby(['ID', 'Topics', 'NegPos'])
    .agg({'Text': _join_texts})
    .reset_index()
)

# Spot-check the aggregated rows of a single review.
df_output_merged.loc[df_output_merged['ID'] == 22]


Unnamed: 0,ID,Topics,NegPos,Text
54,22,Food,Negative,"Ordered pizza one night and was not great, too..."
55,22,Food,Positive,the restaurant area is nice.
56,22,Off,Null,I'm sure other food is better.
57,22,Room,Negative,might have needed another set of drawers if th...
58,22,Room,Positive,"Room was perfect, looked new, clean shiny tile..."
59,22,Staff,Positive,the staff very accommodating.


In [159]:
# Outer-join the answer key with the aggregated output on ID, topic and
# sentiment. The text and sentiment columns get an _answer / _output suffix so
# both sides stay visible; rows found on one side only keep NaN in the other
# side's columns.
answer_side = df_answer_cleaned.rename(
    columns={'Text': 'Text_answer', 'NegPos': 'NegPos_answer'}
)
output_side = df_output_merged.rename(
    columns={'Text': 'Text_output', 'NegPos': 'NegPos_output'}
)

df_compare = answer_side.merge(
    output_side,
    how='outer',
    left_on=['ID', 'Topics', 'NegPos_answer'],
    right_on=['ID', 'Topics', 'NegPos_output'],
)

# Rows whose topic is 'Off' are excluded from the comparison.
df_compare = df_compare.loc[df_compare['Topics'] != 'Off']

df_compare.head(30)
# df_compare[df_compare['ID'] == 22]

Unnamed: 0,ID,Topics,Text_answer,NegPos_answer,NegPos_output,Text_output
0,1,Facility,"New, comfortable apartments",Positive,,
1,1,Food,"Tasty food on the first floor, comfortable res...",Positive,Positive,"Tasty food on the first floor, comfortable res..."
2,1,Location,"close to the airport, to very clean beach.",Positive,Positive,"close to the airport, to very clean beach."
5,1,Room,,,Positive,"New, comfortable apartments."
6,1,Staff,Staff is extremely helpful and easy to communi...,Positive,Positive,Staff is extremely helpful and easy to communi...
7,3,Location,the location is great and near the airport. bu...,Positive,Positive,the location is great and near the airport. bu...
8,4,Price,Great Quality/price,Positive,Positive,Great Quality/price
9,4,Room,Clean,Positive,Positive,Clean
10,4,Staff,Great stuff,Positive,Positive,Great stuff
11,5,Location,in a very convenient location. An easy stroll...,Positive,Positive,in a very convenient location. An easy stroll ...


In [160]:
def sentiment_match(row):
    """Tell whether the answer and output sentiments of one row agree.

    Args:
        row: Mapping-like row providing 'NegPos_answer' and 'NegPos_output'.

    Returns:
        bool: False when either sentiment is missing; otherwise True exactly
        when the two labels are equal after converting to str, trimming
        whitespace and lower-casing.
    """
    sentiment_columns = ('NegPos_answer', 'NegPos_output')

    # A missing label on either side can never count as a match.
    for column in sentiment_columns:
        if pd.isna(row[column]):
            return False

    expected, predicted = (str(row[column]).strip().lower() for column in sentiment_columns)
    return expected == predicted

# Score every comparison row with sentiment_match (applied row by row, axis=1)
# and store the boolean result in a new 'Sentiment_Match' column.
df_compare['Sentiment_Match'] = df_compare.apply(sentiment_match, axis=1)

# Preview the first 30 scored rows.
df_compare.head(30)
# df_compare[df_compare['ID'] == 22]

Unnamed: 0,ID,Topics,Text_answer,NegPos_answer,NegPos_output,Text_output,Sentiment_Match
0,1,Facility,"New, comfortable apartments",Positive,,,False
1,1,Food,"Tasty food on the first floor, comfortable res...",Positive,Positive,"Tasty food on the first floor, comfortable res...",True
2,1,Location,"close to the airport, to very clean beach.",Positive,Positive,"close to the airport, to very clean beach.",True
5,1,Room,,,Positive,"New, comfortable apartments.",False
6,1,Staff,Staff is extremely helpful and easy to communi...,Positive,Positive,Staff is extremely helpful and easy to communi...,True
7,3,Location,the location is great and near the airport. bu...,Positive,Positive,the location is great and near the airport. bu...,True
8,4,Price,Great Quality/price,Positive,Positive,Great Quality/price,True
9,4,Room,Clean,Positive,Positive,Clean,True
10,4,Staff,Great stuff,Positive,Positive,Great stuff,True
11,5,Location,in a very convenient location. An easy stroll...,Positive,Positive,in a very convenient location. An easy stroll ...,True


In [161]:
# Save the scored comparison table as CSV, without the index column.
df_compare.to_csv('sentiment_result_5_shot_1.csv', index=False)

In [162]:
# Accuracy = share of comparison rows flagged True in 'Sentiment_Match'.
# df_compare comes from an outer merge, so the total counts rows found on
# either side (answer key or output), not only rows of the answer key.
match_flags = df_compare['Sentiment_Match']
num_true = match_flags.sum()
num_total = len(match_flags)

accuracy = num_true / num_total

print(
    f"Accuracy: {accuracy * 100:.2f}%",
    f"Correct (True): {num_true}",
    f"Total Cases: {num_total}",
    sep="\n",
)


Accuracy: 80.32%
Correct (True): 1465
Total Cases: 1824
