In [58]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Input locations: the model's formatted review output (CSV) and the
# human-labelled answer key (Excel workbook).
# NOTE(review): the answer-key location is an absolute path on one machine;
# anyone else running this notebook has to edit it first.
output_file, answer_key_file = (
    'output_formatted_reviews_gpt4o_3_v6_3.csv',
    '/Users/nonny/Downloads/ABA Dataset - Label Topic (For Tasks 1 & 2).xlsx',
)

# Read both tables into DataFrames for the cleaning cells that follow.
df_output = pd.read_csv(output_file)
df_answer_key = pd.read_excel(answer_key_file)

In [59]:
# Clean answer key
# Keep only the four columns needed for the comparison and rename them to
# the names the later cells expect. The '_answer' suffix marks the
# human-labelled text and sentiment.
# NOTE(review): 'Unnamed: 0' is used as the review ID; presumably that
# column has no header in the workbook -- confirm against the source file.
answer_column_names = {
    'Unnamed: 0': 'ID',
    'Topic': 'Topics',
    'Selected Content': 'Text_answer',
    'Pos/Neg': 'NegPos_answer',
}
df_answer_cleaned = df_answer_key[list(answer_column_names)].rename(
    columns=answer_column_names
)

df_answer_cleaned.head(30)

Unnamed: 0,ID,Topics,Text_answer,NegPos_answer
0,1,Location,"close to the airport, to very clean beach.",Positive
1,1,Staff,Staff is extremely helpful and easy to communi...,Positive
2,1,Food,"Tasty food on the first floor, comfortable res...",Positive
3,2,Staff,The staff was very nice and helpful!,Positive
4,2,Room,"The room was very clean, well decorated and mo...",Positive
5,2,Food,"Also, the breakfast was amazing, fresh and han...",Positive
6,3,Location,the location is great and near the airport. bu...,Positive
7,4,Staff,Great stuff,Positive
8,4,Price,Great Quality/price,Positive
9,4,Room,Clean,Positive


In [60]:
# Clean Output
def clean_topic(topic):
    """Return the topic label without its trailing numeric suffix.

    The value is converted with ``str()``, surrounding whitespace is removed,
    and any run of digits at the end (together with whitespace separating it
    from the name) is dropped, e.g. ``'Room1'``, ``'Room 2'`` and
    ``' Room3 '`` all become ``'Room'``. Labels without a numeric suffix are
    returned unchanged apart from the whitespace trim.

    Whitespace is normalised because the cleaned label is later used as an
    exact-match key; a stray space would silently prevent a match.

    Note: a missing value (NaN/None) is converted by ``str()`` and therefore
    comes back as the literal string ``'nan'`` / ``'None'``.
    """
    return re.sub(r'\s*\d+$', '', str(topic).strip())

# Normalise every predicted topic label with clean_topic.
df_output['Cleaned_Topics'] = df_output['Topics'].apply(clean_topic)

# Keep the evaluation columns, exposing the cleaned label under 'Topics'.
output_columns = ['ID', 'Cleaned_Topics', 'Text', 'NegPos']
df_output_cleaned = df_output[output_columns].rename(
    columns={'Cleaned_Topics': 'Topics'}
)

# Discard rows that have neither an extracted text nor a sentiment label.
has_content = df_output_cleaned['Text'].notna() | df_output_cleaned['NegPos'].notna()
df_output_cleaned = df_output_cleaned[has_content]


def _join_texts(texts):
    """Join one group's non-missing texts with ', ' after trimming each."""
    return ', '.join(texts.dropna().astype(str).str.strip())


# Collapse to one row per (ID, Topics, NegPos), concatenating the texts.
# groupby skips rows whose key columns are missing (its default behaviour).
df_output_merged = (
    df_output_cleaned
    .groupby(['ID', 'Topics', 'NegPos'])
    .agg({'Text': _join_texts})
    .reset_index()
)

# Spot-check the merged rows of a single review.
df_output_merged[df_output_merged['ID'].eq(22)]


Unnamed: 0,ID,Topics,NegPos,Text
59,22,Facility,Positive,The restaurant area is nice.
60,22,Food,Negative,"Ordered pizza one night and was not great, too..."
61,22,Food,Positive,I'm sure other food is better.
62,22,Room,Negative,might have needed another set of drawers if th...
63,22,Room,Positive,"Room was perfect, looked new, clean shiny tile..."
64,22,Staff,Positive,the staff very accommodating.


In [61]:
# Align the answer key with the model output on (ID, Topics, sentiment).
# The sentiment column is called 'NegPos_answer' on the answer side and
# 'NegPos' on the output side, hence the separate left_on / right_on lists.
# An OUTER join keeps every case:
# - rows present on both sides (candidate matches)
# - answer rows with no prediction (false negatives)
# - prediction rows with no answer (false positives)
# NOTE(review): the output side was collapsed to one row per key, the answer
# side was not. If the answer key holds several rows for the same
# (ID, Topics, sentiment), the single output row is repeated for each of
# them -- confirm whether the answer key should be aggregated the same way.
df_compare = pd.merge(
    df_answer_cleaned,
    df_output_merged,
    left_on=['ID', 'Topics', 'NegPos_answer'],
    right_on=['ID', 'Topics', 'NegPos'],
    how='outer',
    suffixes=('_answer', '_output')
)

# Give the output-side columns explicit '_output' names.
df_compare = df_compare.rename(columns={
    'Text': 'Text_output',
    'NegPos': 'NegPos_output'
})

# Drop rows where BOTH texts are missing or blank: they carry no information.
answer_blank = df_compare['Text_answer'].isna() | df_compare['Text_answer'].str.strip().eq('')
output_blank = df_compare['Text_output'].isna() | df_compare['Text_output'].str.strip().eq('')
# .copy() makes df_compare an independent frame; later cells add columns to
# it, which would otherwise be an assignment on a filtered slice
# (SettingWithCopyWarning).
df_compare = df_compare[~(answer_blank & output_blank)].copy()

# Optional: uncomment to exclude topics labelled 'Off'.
# df_compare = df_compare[df_compare['Topics'] != 'Off']
df_compare.head(30)
# df_compare[df_compare['ID'] == 22]


Unnamed: 0,ID,Topics,Text_answer,NegPos_answer,NegPos_output,Text_output
0,1,Food,"Tasty food on the first floor, comfortable res...",Positive,Positive,"Tasty food on the first floor, comfortable res..."
1,1,Location,"close to the airport, to very clean beach.",Positive,Positive,"close to the airport, to very clean beach."
2,1,Room,,,Positive,"New, comfortable apartments."
3,1,Staff,Staff is extremely helpful and easy to communi...,Positive,Positive,Staff is extremely helpful and easy to communi...
4,2,Facility,,,Positive,"I like hotels with a family atmosphere, cozy a..."
5,2,Food,"Also, the breakfast was amazing, fresh and han...",Positive,Positive,"the breakfast was amazing, fresh and handmade!..."
6,2,Room,,,Negative,although not big.
7,2,Room,"The room was very clean, well decorated and mo...",Positive,Positive,"The room was very clean, well decorated and mo..."
8,2,Staff,The staff was very nice and helpful!,Positive,Positive,The staff was very nice and helpful! all staff...
9,3,Location,the location is great and near the airport. bu...,Positive,Positive,the location is great and near the airport. bu...


In [62]:
# Build the fitting corpus: every answer text followed by every output text.
# Missing values are replaced with '' so the vectorizer only sees strings.
all_texts = [
    text
    for column in ('Text_answer', 'Text_output')
    for text in df_compare[column].fillna('').tolist()
]

# TF-IDF vectorizer shared by all similarity computations.
# - lowercase=True folds case before tokenising.
# - token_pattern=r"(?u)\b\w+\b" accepts tokens of one or more word
#   characters, so single letters and digits are kept as tokens.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", lowercase=True)

# Learn the vocabulary and IDF weights from the combined corpus.
vectorizer.fit(all_texts)

In [63]:
# Define a function to compute cosine similarity between two text strings using TF-IDF
def compute_cosine(text1, text2, vectorizer):
    """Return the TF-IDF cosine similarity of two texts.

    Parameters:
        text1, text2: the texts to compare; each is converted with ``str()``
            and trimmed before vectorising.
        vectorizer: an already-fitted vectorizer exposing ``transform``.

    Returns:
        0.0 when either text is missing (NaN/None) or blank after trimming;
        otherwise the cosine similarity of the two transformed vectors.
    """
    # Missing on either side: nothing to compare.
    if pd.isna(text1) or pd.isna(text2):
        return 0.0

    # Trim so whitespace-only strings are treated as empty.
    cleaned = [str(text1).strip(), str(text2).strip()]
    if cleaned[0] == '' or cleaned[1] == '':
        return 0.0

    # Row 0 is text1, row 1 is text2; slices keep each row two-dimensional.
    vectors = vectorizer.transform(cleaned)
    first, second = vectors[0:1], vectors[1:2]
    return cosine_similarity(first, second)[0][0]

def _row_similarity(row):
    """Similarity between one row's answer text and output text."""
    return compute_cosine(row['Text_answer'], row['Text_output'], vectorizer)


# Score every aligned (answer, output) pair.
similarity_scores = df_compare.apply(_row_similarity, axis=1)
df_compare['Cosine_Similarity'] = similarity_scores

# Preview the texts next to their score.
preview_columns = ['ID', 'Topics', 'Text_answer', 'Text_output', 'Cosine_Similarity']
df_compare[preview_columns].head(30)



Unnamed: 0,ID,Topics,Text_answer,Text_output,Cosine_Similarity
0,1,Food,"Tasty food on the first floor, comfortable res...","Tasty food on the first floor, comfortable res...",1.0
1,1,Location,"close to the airport, to very clean beach.","close to the airport, to very clean beach.",1.0
2,1,Room,,"New, comfortable apartments.",0.0
3,1,Staff,Staff is extremely helpful and easy to communi...,Staff is extremely helpful and easy to communi...,1.0
4,2,Facility,,"I like hotels with a family atmosphere, cozy a...",0.0
5,2,Food,"Also, the breakfast was amazing, fresh and han...","the breakfast was amazing, fresh and handmade!...",0.876323
6,2,Room,,although not big.,0.0
7,2,Room,"The room was very clean, well decorated and mo...","The room was very clean, well decorated and mo...",0.773283
8,2,Staff,The staff was very nice and helpful!,The staff was very nice and helpful! all staff...,0.862393
9,3,Location,the location is great and near the airport. bu...,the location is great and near the airport. bu...,1.0


In [64]:
def isCorrect(row, threshold=0.7):
    """Decide whether one aligned row counts as a correct extraction.

    Parameters:
        row: a mapping/Series with 'Text_answer', 'Text_output' and
            'Cosine_Similarity' entries.
        threshold: minimum cosine similarity (inclusive) for a match.
            Defaults to 0.7, the value previously hard-coded here.

    Returns:
        False when either text or the similarity is missing; otherwise
        whether the similarity is greater than or equal to ``threshold``.
    """
    # A row without both an answer and a prediction cannot be a match.
    if pd.isna(row['Text_answer']) or pd.isna(row['Text_output']):
        return False

    similarity = row['Cosine_Similarity']
    # No similarity score available: treat as incorrect.
    if pd.isna(similarity):
        return False

    # bool() so the result is a plain Python boolean.
    return bool(similarity >= threshold)

# Flag each aligned row as correct / incorrect.
correct_flags = df_compare.apply(isCorrect, axis=1)
df_compare['IsCorrect'] = correct_flags

# Optional review table (first 40 rows):
# - ID                : data point ID
# - Topics            : topic label
# - NegPos_answer     : human-labelled sentiment
# - NegPos_output     : model-predicted sentiment
# - Text_answer       : human-labelled text span
# - Text_output       : model-predicted text span
# - Cosine_Similarity : similarity score between the two texts
# - IsCorrect         : whether the extraction passed the similarity threshold
review_columns = [
    'ID', 'Topics',
    'NegPos_answer', 'NegPos_output',
    'Text_answer', 'Text_output',
    'Cosine_Similarity', 'IsCorrect',
]
df_compare[review_columns].head(40)



Unnamed: 0,ID,Topics,NegPos_answer,NegPos_output,Text_answer,Text_output,Cosine_Similarity,IsCorrect
0,1,Food,Positive,Positive,"Tasty food on the first floor, comfortable res...","Tasty food on the first floor, comfortable res...",1.0,True
1,1,Location,Positive,Positive,"close to the airport, to very clean beach.","close to the airport, to very clean beach.",1.0,True
2,1,Room,,Positive,,"New, comfortable apartments.",0.0,False
3,1,Staff,Positive,Positive,Staff is extremely helpful and easy to communi...,Staff is extremely helpful and easy to communi...,1.0,True
4,2,Facility,,Positive,,"I like hotels with a family atmosphere, cozy a...",0.0,False
5,2,Food,Positive,Positive,"Also, the breakfast was amazing, fresh and han...","the breakfast was amazing, fresh and handmade!...",0.876323,True
6,2,Room,,Negative,,although not big.,0.0,False
7,2,Room,Positive,Positive,"The room was very clean, well decorated and mo...","The room was very clean, well decorated and mo...",0.773283,True
8,2,Staff,Positive,Positive,The staff was very nice and helpful!,The staff was very nice and helpful! all staff...,0.862393,True
9,3,Location,Positive,Positive,the location is great and near the airport. bu...,the location is great and near the airport. bu...,1.0,True


In [65]:
# Persist the row-level comparison (texts, similarity, correctness flag)
# without the DataFrame index.
comparison_csv = 'cosine_3_shot_3.csv'
df_compare.to_csv(comparison_csv, index=False)

print("CSV saved successfully")

CSV saved successfully


In [66]:
# Per-ID precision and recall, collected as one record per ID.
eval_list = []

for id_val in sorted(df_compare['ID'].unique()):
    df_id = df_compare.loc[df_compare['ID'] == id_val]

    true_positives = df_id['IsCorrect'].sum()              # rows flagged correct
    predicted_total = df_id['Text_output'].notna().sum()   # rows with a model output
    labelled_total = df_id['Text_answer'].notna().sum()    # rows with a human answer

    # Precision = correct / predictions; defined as 0 when nothing was predicted.
    if predicted_total > 0:
        precision = true_positives / predicted_total
    else:
        precision = 0

    # Recall = correct / answers; defined as 0 when there are no answers.
    if labelled_total > 0:
        recall = true_positives / labelled_total
    else:
        recall = 0

    # Numerators and denominators are stored so the ratios can be audited.
    eval_list.append(dict(
        ID=id_val,
        pre_nom=true_positives,
        pre_denom=predicted_total,
        Precision=precision,
        re_nom=true_positives,
        re_denom=labelled_total,
        Recall=recall,
    ))


eval_df = pd.DataFrame(eval_list)
eval_df = eval_df.sort_values('ID').reset_index(drop=True)


eval_df.head(30)

Unnamed: 0,ID,pre_nom,pre_denom,Precision,re_nom,re_denom,Recall
0,1,3,4,0.75,3,3,1.0
1,2,3,5,0.6,3,3,1.0
2,3,1,1,1.0,1,1,1.0
3,4,3,3,1.0,3,3,1.0
4,5,2,2,1.0,2,2,1.0
5,6,1,1,1.0,1,1,1.0
6,8,2,3,0.666667,2,2,1.0
7,9,0,5,0.0,0,1,0.0
8,10,2,2,1.0,2,2,1.0
9,11,4,5,0.8,4,4,1.0


In [67]:
# Write the per-ID precision/recall table to disk, without the index column.
evaluation_csv = "evaluation_result_3_shot_3.csv"
eval_df.to_csv(evaluation_csv, index=False)

print("Evaluation results saved to .csv")


Evaluation results saved to .csv


In [68]:
# Macro averages: the unweighted mean of the per-ID scores, so every ID
# contributes equally (including IDs whose score was set to 0 because the
# denominator was 0).
macro_precision, macro_recall = (
    eval_df[metric].mean() for metric in ('Precision', 'Recall')
)

print(f"Macro Precision: {macro_precision:.4f}")
print(f"Macro Recall: {macro_recall:.4f}")


Macro Precision: 0.6029
Macro Recall: 0.8347
