### Step 10.  Accuracy Measurement

#### Import required libraries, load submissions and comments with their summaries

In [1]:
import torch
import bert_score

In [None]:
import pandas as pd

def extract_nan_rows(df, name):
    """Return the rows of ``df`` whose 'summary' is missing, tagged with their origin.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that has a 'summary' column.
    name : str
        Label stored in the 'source' column of every returned row.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows where 'summary' is NaN, with an extra
        'source' column set to ``name``.  ``df`` itself is not modified.
    """
    # assign() builds a new frame, so the 'source' column is never written into
    # a slice of the caller's DataFrame (which triggers SettingWithCopyWarning).
    return df[df['summary'].isna()].assign(source=name)

# Load the submission and top-comment summaries (OpenAI, Pegasus, Cohere)
df = pd.read_csv('tfcc_submissions_top20_with_sentiment_including_comment_sentiment_and_summaries.csv')
comments_df = pd.read_csv('tfcc_top_comments_summarized.csv')
pegasus_df = pd.read_csv('tfcc_submissions_top20_pegasus_summaries.csv')
pegasus_comments_df = pd.read_csv('tfcc_top_comments_pegasus_summarized.csv')
cohere_df = pd.read_csv('tfcc_submissions_top20_cohere_summaries.csv')
cohere_comments_df = pd.read_csv('tfcc_top_comments_cohere_summarized.csv')

# Collect every row with a missing 'summary', tagged with the file it came from.
# A single pd.concat replaces the chain of DataFrame.append calls: append was
# deprecated in pandas 1.4 and removed in pandas 2.0.  The original row index
# of each frame is kept, as it was with append.
nan_rows_df = pd.concat([
    extract_nan_rows(df, "tfcc_submissions_top20_with_sentiment_including_comment_sentiment_and_summaries"),
    extract_nan_rows(comments_df, "tfcc_top_comments_summarized"),
    extract_nan_rows(pegasus_df, "tfcc_submissions_top20_pegasus_summaries"),
    extract_nan_rows(pegasus_comments_df, "tfcc_top_comments_pegasus_summarized"),
    extract_nan_rows(cohere_df, "tfcc_submissions_top20_cohere_summaries"),
    extract_nan_rows(cohere_comments_df, "tfcc_top_comments_cohere_summarized"),
])

# Remove rows with NaN values from the original DataFrames
df = df.dropna(subset=['summary'])
comments_df = comments_df.dropna(subset=['summary'])
pegasus_df = pegasus_df.dropna(subset=['summary'])
pegasus_comments_df = pegasus_comments_df.dropna(subset=['summary'])
cohere_df = cohere_df.dropna(subset=['summary'])
cohere_comments_df = cohere_comments_df.dropna(subset=['summary'])

# Save the rows that had no summary so they can be inspected later
nan_rows_df.to_csv('summary_errors.csv', index=False)



In [3]:
# Show the rows that had no summary (the frame written to summary_errors.csv)
nan_rows_df

Unnamed: 0,id,title,selftext,author,score,num_comments,created_date,selftext_length,topic,pos_sentiment,neg_sentiment,comments_pos_sentiment,comments_neg_sentiment,summary,source
3,e01rb8,One of my agents actually said what everyone t...,this happened a couple of weeks ago and is bot...,wirwarennamenlos,1553.0,121.0,2019-11-22 14:33:25,138.0,0.0,0.498418,0.501582,0.250270,0.749730,,tfcc_submissions_top20_cohere_summaries
12,b4w6n6,Hung up on a customer today,ill give a little bit of background before i g...,forever_a10ne,1128.0,86.0,2019-03-24 13:11:29,395.0,0.0,0.235142,0.764858,0.131666,0.868334,,tfcc_submissions_top20_cohere_summaries
21,cf0t8g,Perv masturbates loudly and the rep documents it,"so, i wasn't sure i wanted to put this one her...",TaraJo,920.0,74.0,2019-07-19 00:48:39,446.0,0.0,0.199540,0.800460,0.392115,0.607885,,tfcc_submissions_top20_cohere_summaries
35,bqz64k,I Love Karma...,this happened to me a few years ago but it sti...,David-Arroyo,756.0,58.0,2019-05-20 18:17:20,221.0,0.0,0.645398,0.354602,0.269660,0.730340,,tfcc_submissions_top20_cohere_summaries
58,k60ejc,Your Son is Seven and He's Getting WHAT?,so last night i had a very bizarre call from s...,olivecornbread,553.0,68.0,2020-12-03 16:25:56,232.0,0.0,0.422916,0.577084,0.206383,0.793617,,tfcc_submissions_top20_cohere_summaries
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,24xldx,Craziest thing you have been called by an angr...,customer calls in and reaches a frontline t1 a...,sadiegirl66,19.0,29.0,2014-05-07 06:22:59,180.0,19.0,0.199234,0.800766,0.002895,0.997105,,tfcc_submissions_top20_cohere_summaries
48,,,You made the best choice for you. Your health ...,,,,,,7.0,,,,,,tfcc_top_comments_cohere_summarized
71,,,I really dont understand why people prefer a m...,,,,,,11.0,,,,,,tfcc_top_comments_cohere_summarized
79,,,I used to work in a call center and I vividly ...,,,,,,13.0,,,,,,tfcc_top_comments_cohere_summarized


### Measure summary accuracy and drop all rows that are below 0.8 F1

In [4]:
# Row counts per model before any accuracy filtering
start_num_rows, pegasus_start_num_rows, cohere_start_num_rows = (
    len(frame) for frame in (df, pegasus_df, cohere_df)
)

In [5]:
# Report how many submission rows will be scored for each model
print(f"OpenAI - Measuring accuracy on {start_num_rows} rows.")
print(f"Pegasus - Measuring accuracy on {pegasus_start_num_rows} rows.")
print(f"COhere - Measuring accuracy on {cohere_start_num_rows} rows.")


OpenAI - Measuring accuracy on 1964 rows.
Pegasus - Measuring accuracy on 1964 rows.
COhere - Measuring accuracy on 1883 rows.


#### Calculate accuracy for OpenAI submissions

In [None]:
import torch

# Score every OpenAI summary against the text in 'selftext' it belongs to.
selftext = df['selftext']
summary = df['summary']

# bert_score.score takes candidates first and references second.  The summary
# is the candidate and the source text is the reference, so both are passed by
# keyword.  Swapping the roles only exchanges P and R; F1 combines the two
# symmetrically, and F1 is the only value kept.
P, R, F1 = bert_score.score(cands=summary.tolist(), refs=selftext.tolist(), lang='en', verbose=True)
df['bert_f1'] = F1.tolist()

# Free up GPU memory
del P, R, F1
torch.cuda.empty_cache()




#### Calculate accuracy for Pegasus submissions

In [None]:
# Score every Pegasus summary against the text in 'selftext' it belongs to.
selftext = pegasus_df['selftext']
summary = pegasus_df['summary']

# bert_score.score takes candidates first and references second.  The summary
# is the candidate and the source text is the reference, so both are passed by
# keyword.  Swapping the roles only exchanges P and R; F1 combines the two
# symmetrically, and F1 is the only value kept.
P, R, F1 = bert_score.score(cands=summary.tolist(), refs=selftext.tolist(), lang='en', verbose=True)
pegasus_df['bert_f1'] = F1.tolist()

# Free up GPU memory
del P, R, F1
torch.cuda.empty_cache()


#### Calculate accuracy for Cohere submissions

In [None]:
# Score every Cohere summary against the text in 'selftext' it belongs to.
selftext = cohere_df['selftext']
summary = cohere_df['summary']

# bert_score.score takes candidates first and references second.  The summary
# is the candidate and the source text is the reference, so both are passed by
# keyword.  Swapping the roles only exchanges P and R; F1 combines the two
# symmetrically, and F1 is the only value kept.
P, R, F1 = bert_score.score(cands=summary.tolist(), refs=selftext.tolist(), lang='en', verbose=True)
cohere_df['bert_f1'] = F1.tolist()

# Free up GPU memory
del P, R, F1
torch.cuda.empty_cache()


#### Display average accuracy score for OpenAI, Pegasus, and Cohere

In [9]:
from IPython.display import display, HTML

# Mean BERTScore F1 over all scored submissions, one value per model
df_avg_f1, pegasus_avg_f1, cohere_avg_f1 = (
    frame['bert_f1'].mean() for frame in (df, pegasus_df, cohere_df)
)

# One table row per model: the dataframe's name next to its mean F1
data = {
    'Dataframe': ['df', 'pegasus_df', 'cohere_df'],
    'Average F1': [df_avg_f1, pegasus_avg_f1, cohere_avg_f1],
}
results_df = pd.DataFrame(data)

# Render the table as HTML, without the index column
display(HTML(results_df.to_html(index=False)))


Dataframe,Average F1
df,0.842497
pegasus_df,0.855814
cohere_df,0.869656


#### Print results for OpenAI test

In [10]:
# Split on the 0.8 F1 cut-off: df2 receives the rejected rows, df keeps the rest
df2 = df.loc[df['bert_f1'].lt(0.8)]
df = df.loc[df['bert_f1'].ge(0.8)]

end_num_rows, removed_num = len(df), len(df2)

print(f"{end_num_rows} rows were >= 0.8 F1 and {removed_num} were below 0.8 F1 and have been removed.")


1943 rows were >= 0.8 F1 and 21 were below 0.8 F1 and have been removed.


In [11]:
# Write the current contents of df (OpenAI submissions) to CSV, without the index column
df.to_csv('tfcc_submissions_with_accuracy.csv', index=False)

#### Print results for Pegasus test

In [12]:
# Split on the 0.8 F1 cut-off: pegasus_df2 receives the rejected rows,
# pegasus_df keeps the rest
pegasus_df2 = pegasus_df.loc[pegasus_df['bert_f1'].lt(0.8)]
pegasus_df = pegasus_df.loc[pegasus_df['bert_f1'].ge(0.8)]

end_num_rows, removed_num = len(pegasus_df), len(pegasus_df2)

print(f"{end_num_rows} rows were >= 0.8 F1 and {removed_num} were below 0.8 F1 and have been removed.")

1950 rows were >= 0.8 F1 and 14 were below 0.8 F1 and have been removed.


In [13]:
# Write the current contents of pegasus_df to CSV, without the index column
pegasus_df.to_csv('tfcc_submissions_pegasus_with_accuracy.csv', index=False)

#### Print results for Cohere test

In [14]:
# Split on the 0.8 F1 cut-off: cohere_df2 receives the rejected rows,
# cohere_df keeps the rest
cohere_df2 = cohere_df.loc[cohere_df['bert_f1'].lt(0.8)]
cohere_df = cohere_df.loc[cohere_df['bert_f1'].ge(0.8)]

end_num_rows, removed_num = len(cohere_df), len(cohere_df2)

print(f"{end_num_rows} rows were >= 0.8 F1 and {removed_num} were below 0.8 F1 and have been removed.")

1849 rows were >= 0.8 F1 and 34 were below 0.8 F1 and have been removed.


In [15]:
# Write the current contents of cohere_df to CSV, without the index column
cohere_df.to_csv('tfcc_submissions_cohere_with_accuracy.csv', index=False)

### Repeat accuracy measurement process for comments.  

In [16]:
# Count and report the comment rows available for each model before filtering
start_num_rows = len(comments_df)
print(f"OpenAI Measuring accuracy on {start_num_rows} rows.")

pegasus_start_num_rows = len(pegasus_comments_df)
print(f"Pegasus Measuring accuracy on {pegasus_start_num_rows} rows.")

cohere_start_num_rows = len(cohere_comments_df)
print(f"Cohere Measuring accuracy on {cohere_start_num_rows} rows.")

OpenAI Measuring accuracy on 118 rows.
Pegasus Measuring accuracy on 118 rows.
Cohere Measuring accuracy on 114 rows.


#### Measure OpenAI comments summaries accuracy

In [None]:
# Select the columns to compare
selftext = comments_df['selftext']
summary = comments_df['summary']

# Compute the BERTScore for each pair of text.  bert_score.score takes
# candidates first and references second.  The summary is the candidate and
# the text in 'selftext' is the reference, so both are passed by keyword.
# Swapping the roles only exchanges P and R; F1 combines the two
# symmetrically, and F1 is the only value kept.
P, R, F1 = bert_score.score(cands=summary.tolist(), refs=selftext.tolist(), lang='en', verbose=True)
comments_df['bert_f1'] = F1.tolist()

# Free up GPU memory
del P, R, F1
torch.cuda.empty_cache()

#### Measure Pegasus comments summaries accuracy

In [None]:
# Select the columns to compare
selftext = pegasus_comments_df['selftext']
summary = pegasus_comments_df['summary']

# Compute the BERTScore for each pair of text.  bert_score.score takes
# candidates first and references second.  The summary is the candidate and
# the text in 'selftext' is the reference, so both are passed by keyword.
# Swapping the roles only exchanges P and R; F1 combines the two
# symmetrically, and F1 is the only value kept.
P, R, F1 = bert_score.score(cands=summary.tolist(), refs=selftext.tolist(), lang='en', verbose=True)
pegasus_comments_df['bert_f1'] = F1.tolist()

# Free up GPU memory
del P, R, F1
torch.cuda.empty_cache()

#### Measure Cohere comments summaries accuracy

In [None]:
# Select the columns to compare
selftext = cohere_comments_df['selftext']
summary = cohere_comments_df['summary']

# Compute the BERTScore for each pair of text.  bert_score.score takes
# candidates first and references second.  The summary is the candidate and
# the text in 'selftext' is the reference, so both are passed by keyword.
# Swapping the roles only exchanges P and R; F1 combines the two
# symmetrically, and F1 is the only value kept.
P, R, F1 = bert_score.score(cands=summary.tolist(), refs=selftext.tolist(), lang='en', verbose=True)
cohere_comments_df['bert_f1'] = F1.tolist()

# Free up GPU memory
del P, R, F1
torch.cuda.empty_cache()

#### Display average accuracy score for OpenAI, Pegasus, and Cohere comment summaries

In [20]:
# Mean BERTScore F1 over all scored comments, one value per model
df_avg_f1, pegasus_avg_f1, cohere_avg_f1 = (
    frame['bert_f1'].mean()
    for frame in (comments_df, pegasus_comments_df, cohere_comments_df)
)

# One table row per model: a dataframe label next to its mean F1
data = {
    'Dataframe': ['df', 'pegasus_df', 'cohere_df'],
    'Average F1': [df_avg_f1, pegasus_avg_f1, cohere_avg_f1],
}
results_df = pd.DataFrame(data)

# Render the table as HTML, without the index column
display(HTML(results_df.to_html(index=False)))

Dataframe,Average F1
df,0.812531
pegasus_df,0.833296
cohere_df,0.84294


#### Print OpenAI results

In [21]:
# Split on the 0.8 F1 cut-off: comments_df2 receives the rejected rows,
# comments_df keeps the rest
comments_df2 = comments_df.loc[comments_df['bert_f1'].lt(0.8)]
comments_df = comments_df.loc[comments_df['bert_f1'].ge(0.8)]

end_num_rows, removed_num = len(comments_df), len(comments_df2)

print(f"{end_num_rows} rows were >= 0.8 F1 and {removed_num} were below 0.8 F1 and have been removed.")


110 rows were >= 0.8 F1 and 8 were below 0.8 F1 and have been removed.


#### Print Pegasus results

In [22]:
# Split on the 0.8 F1 cut-off: pegasus_comments_df2 receives the rejected rows,
# pegasus_comments_df keeps the rest
pegasus_comments_df2 = pegasus_comments_df.loc[pegasus_comments_df['bert_f1'].lt(0.8)]
pegasus_comments_df = pegasus_comments_df.loc[pegasus_comments_df['bert_f1'].ge(0.8)]

end_num_rows, removed_num = len(pegasus_comments_df), len(pegasus_comments_df2)

print(f"{end_num_rows} rows were >= 0.8 F1 and {removed_num} were below 0.8 F1 and have been removed.")


116 rows were >= 0.8 F1 and 2 were below 0.8 F1 and have been removed.


#### Print Cohere results

In [23]:
# Split on the 0.8 F1 cut-off: cohere_comments_df2 receives the rejected rows,
# cohere_comments_df keeps the rest
cohere_comments_df2 = cohere_comments_df.loc[cohere_comments_df['bert_f1'].lt(0.8)]
cohere_comments_df = cohere_comments_df.loc[cohere_comments_df['bert_f1'].ge(0.8)]

end_num_rows, removed_num = len(cohere_comments_df), len(cohere_comments_df2)

print(f"{end_num_rows} rows were >= 0.8 F1 and {removed_num} were below 0.8 F1 and have been removed.")

102 rows were >= 0.8 F1 and 12 were below 0.8 F1 and have been removed.


#### Double check that we still have comments for every topic.  Count should = 20

OpenAI:

In [24]:
# Number of distinct (non-missing) topics left among the OpenAI comment rows
unique_count = comments_df['topic'].nunique(dropna=True)
print(unique_count)

20


Pegasus:

In [25]:
# Number of distinct (non-missing) topics left among the Pegasus comment rows
unique_count = pegasus_comments_df['topic'].nunique(dropna=True)
print(unique_count)

20


Cohere:

In [26]:
# Number of distinct (non-missing) topics left among the Cohere comment rows
unique_count = cohere_comments_df['topic'].nunique(dropna=True)
print(unique_count)

20


#### Export comments with accuracy to CSV

In [27]:
# Write the three comment frames (OpenAI, Pegasus, Cohere) to CSV, without the
# index column.
comments_df.to_csv('tfcc_comments_with_accuracy.csv', index=False)
pegasus_comments_df.to_csv('tfcc_pegasus_comments_with_accuracy.csv', index=False)
cohere_comments_df.to_csv('tfcc_cohere_comments_with_accuracy.csv', index=False)

#### Generate accuracy report

In [28]:
import pandas as pd

In [29]:
# Submission summaries with their BERTScore F1, one file per model
openai_df = pd.read_csv('tfcc_submissions_with_accuracy.csv')
pegasus_df = pd.read_csv('tfcc_submissions_pegasus_with_accuracy.csv')
cohere_df = pd.read_csv('tfcc_submissions_cohere_with_accuracy.csv')

# Comment summaries with their BERTScore F1, one file per model
openai_comments_df = pd.read_csv('tfcc_comments_with_accuracy.csv')
pegasus_comments_df = pd.read_csv('tfcc_pegasus_comments_with_accuracy.csv')
cohere_comments_df = pd.read_csv('tfcc_cohere_comments_with_accuracy.csv')


# Topic table that the per-topic F1 averages are merged onto
topics = pd.read_csv('tfcc_top20_topics_with_sentiment_and_comments_sentiment.csv')

In [30]:
# Lower-case the key column so it matches the 'topic' column of the summary frames
topics = topics.rename(columns={'Topic': 'topic'})


In [38]:
import pandas as pd

# Mean submission F1 per topic for each model, as two-column frames
# ('topic' plus a model-specific F1 column)
openai_average_f1 = (
    openai_df.groupby('topic')['bert_f1'].mean().rename('openai_bert_f1').reset_index()
)
pegasus_average_f1 = (
    pegasus_df.groupby('topic')['bert_f1'].mean().rename('pegasus_bert_f1').reset_index()
)
cohere_average_f1 = (
    cohere_df.groupby('topic')['bert_f1'].mean().rename('cohere_bert_f1').reset_index()
)

# Left-join the three averages onto the topics table so every topic row is kept
topics_with_f1 = (
    topics
    .merge(openai_average_f1, on='topic', how='left')
    .merge(pegasus_average_f1, on='topic', how='left')
    .merge(cohere_average_f1, on='topic', how='left')
)


In [39]:
# Show the topics table with the per-model submission F1 columns attached
topics_with_f1

Unnamed: 0,topic,Count,Name,avg_pos_sentiment,avg_neg_sentiment,avg_comments_pos_sentiment,avg_comments_neg_sentiment,openai_bert_f1,pegasus_bert_f1,cohere_bert_f1
0,0,796,0_like_help_name_say,0.161216,0.838784,0.196144,0.803856,0.840499,0.853035,0.866495
1,1,591,1_card_bank_credit_fraud,0.113983,0.886017,0.148113,0.851887,0.839251,0.849684,0.868786
2,2,457,2_delivery_shipping_store_email,0.122277,0.877723,0.163302,0.836698,0.838268,0.848682,0.86831
3,3,283,3_insurance_car_claims_coverage,0.103466,0.896534,0.11797,0.88203,0.844882,0.856713,0.871323
4,4,243,4_patient_doctor_clinic_medical,0.13952,0.86048,0.159091,0.840909,0.838917,0.851357,0.870041
5,5,201,5_tow_roadside_truck_assistance,0.159511,0.840489,0.139324,0.860676,0.838405,0.851095,0.870274
6,6,188,6_bill_phones_service_data,0.1335,0.8665,0.103897,0.896103,0.839475,0.851504,0.862329
7,7,108,7_job_feel_anxiety_work,0.154886,0.845114,0.290429,0.709571,0.865139,0.88413,0.90337
8,8,106,8_english_spanish_speak_language,0.154176,0.845824,0.2081,0.7919,0.842097,0.857787,0.869481
9,9,101,9_job_interview_role_experience,0.374143,0.625857,0.343623,0.656377,0.864895,0.881741,0.883442


In [40]:
# Remove the count and sentiment columns from the submission F1 table
topics_with_f1 = topics_with_f1.drop(
    columns=[
        'Count',
        'avg_pos_sentiment',
        'avg_neg_sentiment',
        'avg_comments_pos_sentiment',
        'avg_comments_neg_sentiment',
    ]
)


In [41]:
# Show the submission F1 table after the count and sentiment columns were dropped
topics_with_f1

Unnamed: 0,topic,Name,openai_bert_f1,pegasus_bert_f1,cohere_bert_f1
0,0,0_like_help_name_say,0.840499,0.853035,0.866495
1,1,1_card_bank_credit_fraud,0.839251,0.849684,0.868786
2,2,2_delivery_shipping_store_email,0.838268,0.848682,0.86831
3,3,3_insurance_car_claims_coverage,0.844882,0.856713,0.871323
4,4,4_patient_doctor_clinic_medical,0.838917,0.851357,0.870041
5,5,5_tow_roadside_truck_assistance,0.838405,0.851095,0.870274
6,6,6_bill_phones_service_data,0.839475,0.851504,0.862329
7,7,7_job_feel_anxiety_work,0.865139,0.88413,0.90337
8,8,8_english_spanish_speak_language,0.842097,0.857787,0.869481
9,9,9_job_interview_role_experience,0.864895,0.881741,0.883442


In [42]:
# Mean comment F1 per topic for each model, as two-column frames
# ('topic' plus a model-specific F1 column)
openai_average_f1 = (
    openai_comments_df.groupby('topic')['bert_f1'].mean().rename('openai_bert_f1').reset_index()
)
pegasus_average_f1 = (
    pegasus_comments_df.groupby('topic')['bert_f1'].mean().rename('pegasus_bert_f1').reset_index()
)
cohere_average_f1 = (
    cohere_comments_df.groupby('topic')['bert_f1'].mean().rename('cohere_bert_f1').reset_index()
)

# Left-join the three averages onto the topics table so every topic row is kept
comments_topics_with_f1 = (
    topics
    .merge(openai_average_f1, on='topic', how='left')
    .merge(pegasus_average_f1, on='topic', how='left')
    .merge(cohere_average_f1, on='topic', how='left')
)

In [43]:
# Remove the count and sentiment columns from the comment F1 table
comments_topics_with_f1 = comments_topics_with_f1.drop(
    columns=[
        'Count',
        'avg_pos_sentiment',
        'avg_neg_sentiment',
        'avg_comments_pos_sentiment',
        'avg_comments_neg_sentiment',
    ]
)


In [44]:
# Show the comment F1 table after the count and sentiment columns were dropped
comments_topics_with_f1

Unnamed: 0,topic,Name,openai_bert_f1,pegasus_bert_f1,cohere_bert_f1
0,0,0_like_help_name_say,0.819252,0.836024,0.833344
1,1,1_card_bank_credit_fraud,0.809929,0.833655,0.871884
2,2,2_delivery_shipping_store_email,0.806287,0.82981,0.847947
3,3,3_insurance_car_claims_coverage,0.815597,0.845653,0.872815
4,4,4_patient_doctor_clinic_medical,0.818111,0.835969,0.825033
5,5,5_tow_roadside_truck_assistance,0.80836,0.82703,0.83554
6,6,6_bill_phones_service_data,0.810225,0.832656,0.881458
7,7,7_job_feel_anxiety_work,0.829013,0.842602,0.822271
8,8,8_english_spanish_speak_language,0.808781,0.82971,0.850395
9,9,9_job_interview_role_experience,0.815137,0.838199,0.8456
