### Step 10.  Accuracy Measurement

#### Import required libraries, load submissions and comments with summaries

In [1]:
import bert_score
import pandas as pd
import torch
from rouge import Rouge


In [None]:


def extract_nan_rows(df, name):
    """Return the rows of ``df`` that have no summary, tagged with their origin.

    Args:
        df: DataFrame containing a 'summary' column.
        name: Label written to the 'source' column of every returned row.

    Returns:
        A new DataFrame holding only the rows whose 'summary' is NaN/None,
        with an added (or overwritten) 'source' column set to ``name``.
        The original index labels are kept and ``df`` itself is not modified.
    """
    # .assign builds a new frame instead of writing into the filtered slice,
    # which is what triggered pandas' SettingWithCopyWarning before.
    return df[df['summary'].isna()].assign(source=name)

# Load the submission and top-comment summaries produced by each summarizer
df = pd.read_csv('tfcc_submissions_top20_with_sentiment_including_comment_sentiment_and_summaries.csv')
comments_df = pd.read_csv('tfcc_top_comments_summarized.csv')
pegasus_df = pd.read_csv('tfcc_submissions_top20_pegasus_summaries.csv')
pegasus_comments_df = pd.read_csv('tfcc_top_comments_pegasus_summarized.csv')
cohere_df = pd.read_csv('tfcc_submissions_top20_cohere_summaries.csv')
cohere_comments_df = pd.read_csv('tfcc_top_comments_cohere_summarized.csv')

# Collect the rows with NaN in the 'summary' column from every file, each tagged
# with the file it came from. DataFrame.append was removed in pandas 2.0, so the
# pieces are combined with a single pd.concat (original index labels are kept).
nan_rows_df = pd.concat([
    extract_nan_rows(df, "tfcc_submissions_top20_with_sentiment_including_comment_sentiment_and_summaries"),
    extract_nan_rows(comments_df, "tfcc_top_comments_summarized"),
    extract_nan_rows(pegasus_df, "tfcc_submissions_top20_pegasus_summaries"),
    extract_nan_rows(pegasus_comments_df, "tfcc_top_comments_pegasus_summarized"),
    extract_nan_rows(cohere_df, "tfcc_submissions_top20_cohere_summaries"),
    extract_nan_rows(cohere_comments_df, "tfcc_top_comments_cohere_summarized"),
])

# Remove rows with NaN values from the original DataFrames
df = df.dropna(subset=['summary'])
comments_df = comments_df.dropna(subset=['summary'])
pegasus_df = pegasus_df.dropna(subset=['summary'])
pegasus_comments_df = pegasus_comments_df.dropna(subset=['summary'])
cohere_df = cohere_df.dropna(subset=['summary'])
cohere_comments_df = cohere_comments_df.dropna(subset=['summary'])

# Save the rows that had no summary so they can be inspected later
nan_rows_df.to_csv('summary_errors.csv', index=False)



In [3]:
# Show the collected rows that have no summary, each tagged with its source file
nan_rows_df

Unnamed: 0,id,title,selftext,author,score,num_comments,created_date,selftext_length,topic,pos_sentiment,neg_sentiment,comments_pos_sentiment,comments_neg_sentiment,summary,source
1187,2rfobe,Almost 15 years in callcenter,..and i was unemployed for 3 years. last augus...,reddandy73,1.0,4.0,2015-01-05 20:28:47,120.0,9.0,0.194657,0.805343,0.999371,0.000629,,tfcc_submissions_top20_pegasus_summaries
3,e01rb8,One of my agents actually said what everyone t...,this happened a couple of weeks ago and is bot...,wirwarennamenlos,1553.0,121.0,2019-11-22 14:33:25,138.0,0.0,0.498418,0.501582,0.250270,0.749730,,tfcc_submissions_top20_cohere_summaries
12,b4w6n6,Hung up on a customer today,ill give a little bit of background before i g...,forever_a10ne,1128.0,86.0,2019-03-24 13:11:29,395.0,0.0,0.235142,0.764858,0.131666,0.868334,,tfcc_submissions_top20_cohere_summaries
21,cf0t8g,Perv masturbates loudly and the rep documents it,"so, i wasn't sure i wanted to put this one her...",TaraJo,920.0,74.0,2019-07-19 00:48:39,446.0,0.0,0.199540,0.800460,0.392115,0.607885,,tfcc_submissions_top20_cohere_summaries
35,bqz64k,I Love Karma...,this happened to me a few years ago but it sti...,David-Arroyo,756.0,58.0,2019-05-20 18:17:20,221.0,0.0,0.645398,0.354602,0.269660,0.730340,,tfcc_submissions_top20_cohere_summaries
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,,,I get calls like that all of the time....and f...,,,,,,8.0,,,,,,tfcc_top_comments_cohere_summarized
193,,,I also worked for a military affiliated credit...,,,,,,8.0,,,,,,tfcc_top_comments_cohere_summarized
223,,,I could see a legitimate scenario where someon...,,,,,,11.0,,,,,,tfcc_top_comments_cohere_summarized
270,,,I once had an elderly couple calling about the...,,,,,,17.0,,,,,,tfcc_top_comments_cohere_summarized


### Measure rouge

In [4]:
# Record how many submission rows each summarizer's frame holds before scoring
start_num_rows, pegasus_start_num_rows, cohere_start_num_rows = (
    len(frame) for frame in (df, pegasus_df, cohere_df)
)

In [5]:
# Report how many submission rows will be scored for each summarizer
print("OpenAI - Measuring accuracy on " +str(start_num_rows) + " rows.")
print("Pegasus - Measuring accuracy on " +str(pegasus_start_num_rows) + " rows.")
print("Cohere - Measuring accuracy on " +str(cohere_start_num_rows) + " rows.")


OpenAI - Measuring accuracy on 1964 rows.
Pegasus - Measuring accuracy on 1963 rows.
COhere - Measuring accuracy on 1883 rows.


In [9]:
# Create a single Rouge scorer; calculate_rouge1_score reads it as a global
rouge = Rouge()

def calculate_rouge1_score(row):
    """Score one row's 'summary' against its 'selftext' and return the ROUGE-1 entry.

    Uses the module-level ``rouge`` scorer. Any exception raised while scoring
    is printed and the function returns None for that row instead of raising.
    """
    try:
        # get_scores returns a list; keep only the 'rouge-1' part of the first item
        rouge1 = rouge.get_scores(row['summary'], row['selftext'])[0]['rouge-1']
    except Exception as e:
        print(f"Error calculating ROUGE-1 score: {e}")
        return None
    else:
        return rouge1

def extract_f1_score(row):
    """Return the ROUGE-1 F1 value stored under ``row['rouge1_scores']['f']``.

    calculate_rouge1_score returns None when scoring a row fails, so the
    'rouge1_scores' cell is not always a dict. Such rows yield NaN here instead
    of raising TypeError; pandas' ``mean()`` skips NaN by default, so failed
    rows simply drop out of the reported average.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a 'rouge1_scores' entry.

    Returns:
        The value under key 'f', or ``float('nan')`` when no score dict is present.
    """
    scores = row['rouge1_scores']
    if not isinstance(scores, dict):
        return float('nan')
    return scores['f']



#### Calculate rouge for OpenAI submissions

In [11]:
# Score every OpenAI summary against its source text (None where scoring failed)
df['rouge1_scores'] = df.apply(calculate_rouge1_score, axis='columns')

# Pull the F1 component out into its own column, 'rouge1_f1_score'
df['rouge1_f1_score'] = df.apply(extract_f1_score, axis='columns')

# Report the mean ROUGE-1 F1 across the scored rows
average_rouge1_f1 = df['rouge1_f1_score'].mean()
print(f"Average ROUGE-1 F1-score: {average_rouge1_f1}")




Average ROUGE-1 F1-score: 0.29108867246853465


#### Calculate rouge for Pegasus submissions

In [12]:
# Score every Pegasus summary against its source text (None where scoring failed)
pegasus_df['rouge1_scores'] = pegasus_df.apply(calculate_rouge1_score, axis='columns')

# Pull the F1 component out into its own column, 'rouge1_f1_score'
pegasus_df['rouge1_f1_score'] = pegasus_df.apply(extract_f1_score, axis='columns')

# Report the mean ROUGE-1 F1 across the scored rows
average_rouge1_f1 = pegasus_df['rouge1_f1_score'].mean()
print(f"Average ROUGE-1 F1-score: {average_rouge1_f1}")


Average ROUGE-1 F1-score: 0.3557169809484121


#### Calculate rouge for Cohere submissions

In [13]:
# Score every Cohere summary against its source text (None where scoring failed)
cohere_df['rouge1_scores'] = cohere_df.apply(calculate_rouge1_score, axis='columns')

# Pull the F1 component out into its own column, 'rouge1_f1_score'
cohere_df['rouge1_f1_score'] = cohere_df.apply(extract_f1_score, axis='columns')

# Report the mean ROUGE-1 F1 across the scored rows
average_rouge1_f1 = cohere_df['rouge1_f1_score'].mean()
print(f"Average ROUGE-1 F1-score: {average_rouge1_f1}")

Average ROUGE-1 F1-score: 0.41162155878564466


In [8]:
# Display the OpenAI submissions frame
df

Unnamed: 0,id,title,selftext,author,score,num_comments,created_date,selftext_length,topic,pos_sentiment,neg_sentiment,comments_pos_sentiment,comments_neg_sentiment,summary,rouge_scores
0,9odgd4,"If you cuss before you tell me why, I'm hangin...","so. this just happened. i had a caller, we wil...",sleepernick,4183,167,2018-10-15 15:03:20,317,0,0.002438,0.997562,0.167575,0.832425,A rude caller was disconnected by Sleeper Nick...,"{'rouge-1': {'r': 0.10714285714285714, 'p': 0...."
1,acncqg,I swear I’m not usually this dumb: when the ca...,my absolute favorite type of calls are when th...,QuoteTheKitty,2002,63,2019-01-04 22:13:45,173,0,0.360828,0.639172,0.239294,0.760706,A customer telling a customer service represen...,"{'rouge-1': {'r': 0.1588785046728972, 'p': 0.3..."
2,js29up,The mute button is not the customers friend,at a previous call center i worked they wanted...,supersizedlady,1756,115,2020-11-11 05:30:44,240,0,0.003723,0.996277,0.305934,0.694066,The writer worked in a call center and preferr...,"{'rouge-1': {'r': 0.18181818181818182, 'p': 0...."
3,e01rb8,One of my agents actually said what everyone t...,this happened a couple of weeks ago and is bot...,wirwarennamenlos,1553,121,2019-11-22 14:33:25,138,0,0.498418,0.501582,0.250270,0.749730,"An agent was recorded responding with ""how the...","{'rouge-1': {'r': 0.23469387755102042, 'p': 0...."
4,h0gvxc,Customer loves his analogy until I use it agai...,this call was from a while ago so i don't real...,BostonB96,1411,42,2020-06-10 18:28:42,330,0,0.184346,0.815654,0.001718,0.998282,A software provider received a call from a cus...,"{'rouge-1': {'r': 0.19148936170212766, 'p': 0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1959,4oloj6,Shooter threat,"wednesday night, someone called in and threate...",Believeinthis,18,20,2016-06-17 22:08:20,149,19,0.001418,0.998582,0.001219,0.998781,An unknown person called in and threatened to ...,"{'rouge-1': {'r': 0.422680412371134, 'p': 0.57..."
1960,2dbpxp,How come people assume call center employees a...,i've had a few infuriating calls. people just ...,lacquerqueen,15,21,2014-08-12 11:05:09,254,19,0.001012,0.998988,0.150063,0.849937,The author is frustrated with the assumption t...,"{'rouge-1': {'r': 0.1488095238095238, 'p': 0.5..."
1961,4x4pjt,Probably getting a CA because I didn't apologi...,customer complained that i blew her off despi...,evosthunder,14,4,2016-08-10 22:06:34,104,19,0.013253,0.986747,,,A customer complained about being blown off de...,"{'rouge-1': {'r': 0.4074074074074074, 'p': 0.4..."
1962,2ldeab,Don't Feed Stray Cats,"about two years ago, an old lady got upset and...",EveryoneHatesCJ,7,3,2014-11-05 15:11:42,125,19,0.005375,0.994625,,,"An old lady, upset about an issue, requested t...","{'rouge-1': {'r': 0.47560975609756095, 'p': 0...."


#### Calculate accuracy for Pegasus submissions

In [11]:
# Texts to compare: each post and its Pegasus summary
selftext = pegasus_df['selftext']
summary = pegasus_df['summary']

# Score every (selftext, summary) pair with BERTScore; only the F1 values are stored.
# NOTE(review): selftext is passed first and summary second — confirm this matches
# the candidate/reference order bert_score.score expects.
precision, recall, f1 = bert_score.score(
    selftext.tolist(),
    summary.tolist(),
    lang='en',
    verbose=True,
)
pegasus_df['bert_f1'] = f1.tolist()

# Drop the score tensors and release cached GPU memory
del precision, recall, f1
torch.cuda.empty_cache()


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/62 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/31 [00:00<?, ?it/s]

done in 102.52 seconds, 19.15 sentences/sec


#### Calculate accuracy for Cohere submissions

In [12]:
# Texts to compare: each post and its Cohere summary
selftext = cohere_df['selftext']
summary = cohere_df['summary']

# Score every (selftext, summary) pair with BERTScore; only the F1 values are stored.
# NOTE(review): selftext is passed first and summary second — confirm this matches
# the candidate/reference order bert_score.score expects.
precision, recall, f1 = bert_score.score(
    selftext.tolist(),
    summary.tolist(),
    lang='en',
    verbose=True,
)
cohere_df['bert_f1'] = f1.tolist()

# Drop the score tensors and release cached GPU memory
del precision, recall, f1
torch.cuda.empty_cache()


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/59 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/30 [00:00<?, ?it/s]

done in 104.49 seconds, 18.02 sentences/sec


#### Display average accuracy score for OpenAI, Pegasus, and Cohere

In [13]:
from IPython.display import display, HTML

# The OpenAI frame has no BERTScore cell of its own in this notebook, so
# df['bert_f1'] raised KeyError on a fresh Restart & Run All. Score it here
# when the column is missing, using the same call as the Pegasus/Cohere cells.
if 'bert_f1' not in df.columns:
    P, R, F1 = bert_score.score(df['selftext'].tolist(), df['summary'].tolist(), lang='en', verbose=True)
    df['bert_f1'] = F1.tolist()

    # Free up GPU memory
    del P, R, F1
    torch.cuda.empty_cache()

# Calculate the average BERTScore F1 for each summarizer's submissions
df_avg_f1 = df['bert_f1'].mean()
pegasus_avg_f1 = pegasus_df['bert_f1'].mean()
cohere_avg_f1 = cohere_df['bert_f1'].mean()

# Create a dictionary to store the results
data = {
    'Dataframe': ['df', 'pegasus_df', 'cohere_df'],
    'Average F1': [df_avg_f1, pegasus_avg_f1, cohere_avg_f1]
}

# Create a new dataframe to display the results
results_df = pd.DataFrame(data)

# Display the results in a simple table
display(HTML(results_df.to_html(index=False)))


Dataframe,Average F1
df,0.842497
pegasus_df,0.847639
cohere_df,0.869656


#### Print results for OpenAI test

In [14]:
# Split on the 0.8 BERTScore F1 cut-off: rows below it go to df2,
# and df is narrowed to the rows at or above it.
df2 = df.loc[df['bert_f1'] < 0.8]
df = df.loc[df['bert_f1'] >= 0.8]

removed_num = len(df2)
end_num_rows = len(df)

print(str(end_num_rows) + " rows were >= 0.8 F1 and " + str(removed_num) + " were below 0.8 F1 and have been removed.")


1943 rows were >= 0.8 F1 and 21 were below 0.8 F1 and have been removed.


In [16]:
# Write df (as filtered by the preceding cell) to CSV without the index column
df.to_csv('tfcc_submissions_with_accuracy.csv', index=False)

#### Print results for Pegasus test

In [15]:
# Split on the 0.8 BERTScore F1 cut-off: rows below it go to pegasus_df2,
# and pegasus_df is narrowed to the rows at or above it.
pegasus_df2 = pegasus_df.loc[pegasus_df['bert_f1'] < 0.8]
pegasus_df = pegasus_df.loc[pegasus_df['bert_f1'] >= 0.8]

removed_num = len(pegasus_df2)
end_num_rows = len(pegasus_df)

print(str(end_num_rows) + " rows were >= 0.8 F1 and " + str(removed_num) + " were below 0.8 F1 and have been removed.")

1957 rows were >= 0.8 F1 and 6 were below 0.8 F1 and have been removed.


In [18]:
# Write pegasus_df (as filtered by the preceding cell) to CSV without the index column
pegasus_df.to_csv('tfcc_submissions_pegasus_with_accuracy.csv', index=False)

#### Print results for Cohere test

In [19]:
# Split on the 0.8 BERTScore F1 cut-off: rows below it go to cohere_df2,
# and cohere_df is narrowed to the rows at or above it.
cohere_df2 = cohere_df.loc[cohere_df['bert_f1'] < 0.8]
cohere_df = cohere_df.loc[cohere_df['bert_f1'] >= 0.8]

removed_num = len(cohere_df2)
end_num_rows = len(cohere_df)

print(str(end_num_rows) + " rows were >= 0.8 F1 and " + str(removed_num) + " were below 0.8 F1 and have been removed.")

1849 rows were >= 0.8 F1 and 34 were below 0.8 F1 and have been removed.


In [20]:
# Write cohere_df (as filtered by the preceding cell) to CSV without the index column
cohere_df.to_csv('tfcc_submissions_cohere_with_accuracy.csv', index=False)

#### Repeat accuracy measurement process for comments.  

In [21]:
# Count the comment rows available to each summarizer before scoring
start_num_rows = len(comments_df)
pegasus_start_num_rows = len(pegasus_comments_df)
cohere_start_num_rows = len(cohere_comments_df)

print("OpenAI Measuring accuracy on " +str(start_num_rows) + " rows.")
print("Pegasus Measuring accuracy on " +str(pegasus_start_num_rows) + " rows.")
print("Cohere Measuring accuracy on " +str(cohere_start_num_rows) + " rows.")

OpenAI Measuring accuracy on 285 rows.
Pegasus Measuring accuracy on 285 rows.
Cohere Measuring accuracy on 275 rows.


#### Measure OpenAI comments summaries accuracy

In [22]:
# Texts to compare: each comment text and its OpenAI summary
selftext = comments_df['selftext']
summary = comments_df['summary']

# Score every (selftext, summary) pair with BERTScore; only the F1 values are stored.
# NOTE(review): selftext is passed first and summary second — confirm this matches
# the candidate/reference order bert_score.score expects.
precision, recall, f1 = bert_score.score(
    selftext.tolist(),
    summary.tolist(),
    lang='en',
    verbose=True,
)
comments_df['bert_f1'] = f1.tolist()

# Drop the score tensors and release cached GPU memory
del precision, recall, f1
torch.cuda.empty_cache()

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/9 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/5 [00:00<?, ?it/s]

done in 19.07 seconds, 14.95 sentences/sec


#### Measure Pegasus comments summaries accuracy

In [23]:
# Texts to compare: each comment text and its Pegasus summary
selftext = pegasus_comments_df['selftext']
summary = pegasus_comments_df['summary']

# Score every (selftext, summary) pair with BERTScore; only the F1 values are stored.
# NOTE(review): selftext is passed first and summary second — confirm this matches
# the candidate/reference order bert_score.score expects.
precision, recall, f1 = bert_score.score(
    selftext.tolist(),
    summary.tolist(),
    lang='en',
    verbose=True,
)
pegasus_comments_df['bert_f1'] = f1.tolist()

# Drop the score tensors and release cached GPU memory
del precision, recall, f1
torch.cuda.empty_cache()

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/9 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/5 [00:00<?, ?it/s]

done in 28.50 seconds, 10.00 sentences/sec


#### Measure Cohere comments summaries accuracy

In [24]:
# Texts to compare: each comment text and its Cohere summary
selftext = cohere_comments_df['selftext']
summary = cohere_comments_df['summary']

# Score every (selftext, summary) pair with BERTScore; only the F1 values are stored.
# NOTE(review): selftext is passed first and summary second — confirm this matches
# the candidate/reference order bert_score.score expects.
precision, recall, f1 = bert_score.score(
    selftext.tolist(),
    summary.tolist(),
    lang='en',
    verbose=True,
)
cohere_comments_df['bert_f1'] = f1.tolist()

# Drop the score tensors and release cached GPU memory
del precision, recall, f1
torch.cuda.empty_cache()

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/9 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/5 [00:00<?, ?it/s]

done in 18.30 seconds, 15.03 sentences/sec


#### Display average accuracy score for OpenAI, Pegasus, and Cohere

In [25]:
# Calculate the average BERTScore F1 for each set of comment summaries
df_avg_f1 = comments_df['bert_f1'].mean()
pegasus_avg_f1 = pegasus_comments_df['bert_f1'].mean()
cohere_avg_f1 = cohere_comments_df['bert_f1'].mean()

# Create a dictionary to store the results.
# The labels name the comments frames the averages were actually computed from
# (they previously read 'df', 'pegasus_df', 'cohere_df', i.e. the submissions frames).
data = {
    'Dataframe': ['comments_df', 'pegasus_comments_df', 'cohere_comments_df'],
    'Average F1': [df_avg_f1, pegasus_avg_f1, cohere_avg_f1]
}

# Create a new dataframe to display the results
results_df = pd.DataFrame(data)

# Display the results in a simple table
display(HTML(results_df.to_html(index=False)))

Dataframe,Average F1
df,0.815775
pegasus_df,0.828468
cohere_df,0.819117


#### Print OpenAI results

In [26]:
# Split on the 0.8 BERTScore F1 cut-off: rows below it go to comments_df2,
# and comments_df is narrowed to the rows at or above it.
comments_df2 = comments_df.loc[comments_df['bert_f1'] < 0.8]
comments_df = comments_df.loc[comments_df['bert_f1'] >= 0.8]

removed_num = len(comments_df2)
end_num_rows = len(comments_df)

print(str(end_num_rows) + " rows were >= 0.8 F1 and " + str(removed_num) + " were below 0.8 F1 and have been removed.")


279 rows were >= 0.8 F1 and 6 were below 0.8 F1 and have been removed.


#### Print Pegasus results

In [27]:
# Split on the 0.8 BERTScore F1 cut-off: rows below it go to pegasus_comments_df2,
# and pegasus_comments_df is narrowed to the rows at or above it.
pegasus_comments_df2 = pegasus_comments_df.loc[pegasus_comments_df['bert_f1'] < 0.8]
pegasus_comments_df = pegasus_comments_df.loc[pegasus_comments_df['bert_f1'] >= 0.8]

removed_num = len(pegasus_comments_df2)
end_num_rows = len(pegasus_comments_df)

print(str(end_num_rows) + " rows were >= 0.8 F1 and " + str(removed_num) + " were below 0.8 F1 and have been removed.")


285 rows were >= 0.8 F1 and 0 were below 0.8 F1 and have been removed.


#### Print Cohere results

In [28]:
# Split on the 0.8 BERTScore F1 cut-off: rows below it go to cohere_comments_df2,
# and cohere_comments_df is narrowed to the rows at or above it.
cohere_comments_df2 = cohere_comments_df.loc[cohere_comments_df['bert_f1'] < 0.8]
cohere_comments_df = cohere_comments_df.loc[cohere_comments_df['bert_f1'] >= 0.8]

removed_num = len(cohere_comments_df2)
end_num_rows = len(cohere_comments_df)

print(str(end_num_rows) + " rows were >= 0.8 F1 and " + str(removed_num) + " were below 0.8 F1 and have been removed.")

242 rows were >= 0.8 F1 and 33 were below 0.8 F1 and have been removed.


#### Double-check that we still have comments for every topic. The count should equal 20.

OpenAI:

In [None]:
# Number of distinct (non-null) topics still present among the OpenAI comments
unique_count = len(comments_df['topic'].dropna().unique())
print(unique_count)

Pegasus:

In [31]:
# Number of distinct (non-null) topics still present among the Pegasus comments
unique_count = len(pegasus_comments_df['topic'].dropna().unique())
print(unique_count)

20


Cohere:

In [32]:
# Number of distinct (non-null) topics still present among the Cohere comments
unique_count = len(cohere_comments_df['topic'].dropna().unique())
print(unique_count)

20


#### Export comments with accuracy to CSV

In [33]:
# Write each comments frame to its own CSV, leaving out the index column
for frame, path in (
    (comments_df, 'tfcc_comments_with_accuracy.csv'),
    (pegasus_comments_df, 'tfcc_pegasus_comments_with_accuracy.csv'),
    (cohere_comments_df, 'tfcc_cohere_comments_with_accuracy.csv'),
):
    frame.to_csv(path, index=False)