In [None]:
import pandas as pd

from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the report CSV is read from there.
drive.mount('/content/drive')

from transformers import AutoTokenizer, PreTrainedTokenizerFast
# Load the tokenizer through AutoTokenizer so the class matches the checkpoint.
# Calling from_pretrained on the PreTrainedTokenizerFast base class makes
# transformers warn that the checkpoint's tokenizer class ('BertTokenizer')
# differs from the calling class and that tokenization may be unexpected.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
# Location on the mounted Google Drive of the CSV read below; edit this path to load a different file.
# Later cells read the 'generated_report', 'content_findings' and 'Type' columns of the resulting frame.
file_path = '/content/drive/MyDrive/6.7930 <> Rology Final Project/outputs/cheXagent_custom_generated_reports.csv'
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

In [None]:
# Show the raw 'content_findings' text of the row labelled 0 as a quick sanity check.
display(df.loc[0, 'content_findings'])

'   Clear both lung fields. No parenchymal masses, nodules or consolidations\n   No abnormal parahilar shadows.\n   Clear both costo-phrenic angles.\n   Preserved cardio-thoracic ratio.\n   Intact bony thorax.'

In [None]:
# def move_first_word_to_end(s):
#     if not isinstance(s, str):
#       s = str(s)
#     words = s.split()  # Split the string into words
#     if len(words) > 1:  # Check if there is more than one word
#         return ' '.join(words[1:] + [words[0]])  # Join words, moving the first word to the end
#     else:
#         return s  # If there is only one word, return it as is

# # Apply the function to the column and assign the result to a new column
# df['content_findings_modified'] = df['content_findings'].apply(move_first_word_to_end)

In [None]:
def _as_text(value):
    """Return ``value`` as a string, mapping missing values (None/NaN) to ``""``."""
    if isinstance(value, str):
        return value
    # A missing cell would otherwise be stringified to the literal text
    # "nan" / "None" and then tokenized and scored like a real word.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def rogue_n(summary, reference, n, tokenizer):
    """Score the n-gram overlap between a generated summary and a reference.

    Both texts are lower-cased, tokenized with ``tokenizer.tokenize`` and cut
    into n-grams. The score is the number of distinct n-grams shared by both
    texts divided by the number of distinct n-grams found in either text.

    Note: despite the name, this is a set-based Jaccard ratio (intersection
    over union of distinct n-grams), not the count-based recall of the
    standard ROUGE-N definition.

    Args:
        summary: Generated text. Non-string values are converted with
            ``str``; missing values (None/NaN) are treated as empty text.
        reference: Reference text, handled the same way as ``summary``.
        n: Length of the n-grams; must be at least 1.
        tokenizer: Any object exposing ``tokenize(text) -> sequence of tokens``.

    Returns:
        float: Overlap ratio in ``[0.0, 1.0]``; ``0.0`` when neither text
        yields any n-gram.

    Raises:
        ValueError: If ``n`` is smaller than 1 (every text would then yield
            the single empty n-gram and always score 1.0).
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    def ngram_set(text):
        # Distinct n-grams of the tokenized, lower-cased text.
        tokens = tokenizer.tokenize(_as_text(text).lower())
        return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

    summary_ngrams = ngram_set(summary)
    reference_ngrams = ngram_set(reference)

    union_count = len(summary_ngrams | reference_ngrams)
    if union_count == 0:
        # Both texts are too short (or empty) to produce any n-gram.
        return 0.0
    return len(summary_ngrams & reference_ngrams) / union_count

def rogue_scores_row(row, tokenizer):
    """Compute the 1- to 4-gram overlap scores for a single DataFrame row.

    The row's 'generated_report' is scored against its 'content_findings'
    with ``rogue_n`` for n = 1, 2, 3 and 4.

    Args:
        row: Mapping-like row providing 'generated_report' and
            'content_findings'.
        tokenizer: Tokenizer forwarded unchanged to ``rogue_n``.

    Returns:
        pd.Series: Scores keyed 'unigram_score', 'bigram_score',
        'trigram_score' and 'quadrigram_score', in that order.
    """
    candidate = row['generated_report']
    target = row['content_findings']
    # Position in this tuple (starting at 1) is the n-gram order of the score.
    score_names = ('unigram_score', 'bigram_score', 'trigram_score', 'quadrigram_score')
    return pd.Series({
        name: rogue_n(candidate, target, order, tokenizer)
        for order, name in enumerate(score_names, start=1)
    })

# Score every row; the result has one column per n-gram order.
rogue_scores_df = df.apply(lambda row: rogue_scores_row(row, tokenizer), axis=1)

# Attach the score columns to the original frame. Score columns left over from
# a previous run of this cell are dropped first, so re-running the cell
# replaces them instead of appending duplicate column names (which would make
# df['unigram_score'] return a DataFrame rather than a Series).
df = pd.concat(
    [df.drop(columns=rogue_scores_df.columns, errors='ignore'), rogue_scores_df],
    axis=1,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


In [None]:
display(df)
# index=False: the frame already carries 'Unnamed: 0' / 'Unnamed: 0.1' columns,
# presumably row indexes written by earlier saves of this data. Writing the
# index again would add yet another unnamed column the next time the file is read.
df.to_csv('rogue_scores.csv', index=False)

Unnamed: 0.1,Unnamed: 0,Study_id,Report_id,content_impression,content_findings,Type,generated_report,unigram_score,bigram_score,trigram_score,quadrigram_score
0,0,658d21d4575a0800087f196e,658d264eddc8042ba98cb027,Normal chest X-ray.,Clear both lung fields. No parenchymal mass...,Normal,Lung volume is normal. There are no focal lesi...,0.090909,0.023529,0.010870,0.000000
1,1,658d21ec575a0800087f196f,658d25bcddc8042ba98caf88,Normal chest X-ray.,Clear both lung fields. No parenchymal mass...,Normal,Lung volume is normal. No focal lesions are se...,0.115942,0.033333,0.010101,0.000000
2,2,658d222d575a0800087f1970,658d2654ddc8042ba98cb033,Normal chest X-RAY.,Lungs: Normal both lung volumes with fair a...,Normal,Lung volume is normal. There is no focal lesio...,0.408451,0.180952,0.106557,0.060606
3,3,658d225d575a0800087f1971,658d26acddc8042ba98cb0ac,Normal chest X-ray.,Clear both lung fields. No parenchymal mass...,Normal,Lung volume is normal. Focal lesions are prese...,0.071429,0.022727,0.010638,0.000000
4,4,658d22fc575a0800087f1973,658d2663ddc8042ba98cb044,Normal chest X-ray.,Clear both lung fields. No parenchymal mass...,Normal,The lungs are hyperinflated with flattening of...,0.075949,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
5495,5495,657ebef49d7911000829d4dd,657ec428ddc8043ca0e5c0dc,Normal chest X-RAY.,Lungs: Normal both lung volumes with fair a...,Normal,Lung volume is normal. There is no focal lesio...,0.408451,0.180952,0.106557,0.060606
5496,5496,657ebf2d9d7911000829d4df,657ec3e6ddc8043ca0e5c03e,Normal chest X-RAY.,Lungs: Normal both lung volumes with fair a...,Normal,Lung volume is normal. There is no focal lesio...,0.408451,0.180952,0.106557,0.060606
5497,5497,657ebf379d7911000829d4e1,657ec427ddc8043ca0e5c0d9,Normal chest X-RAY.,Lungs: Normal both lung volumes with fair a...,Normal,The lungs are hyperinflated. There is no focal...,0.096774,0.033613,0.007874,0.000000
5498,5498,657ebf7c3497520008734c64,657ec45bddc8043ca0e5c16b,Normal chest X-ray.,Clear both lung fields. No parenchymal mass...,Normal,The lungs are hyperinflated with flattening of...,0.066667,0.020833,0.009709,0.000000


In [None]:
# Average each score column over all rows, then report the four means.
unigram_score, bigram_score, trigram_score, quadrigram_score = (
    df[column].mean()
    for column in ('unigram_score', 'bigram_score', 'trigram_score', 'quadrigram_score')
)
print("Unigram Score:", unigram_score)
print("Bigram Score:", bigram_score)
print("Trigram Score:", trigram_score)
print("Quadrigram Score:", quadrigram_score)



Unigram Score: 0.2400612231052761
Bigram Score: 0.10022509149065309
Trigram Score: 0.05776525730236508
Quadrigram Score: 0.030268612766526277


In [None]:
# Rows whose 'content_findings' value is a float rather than text.
is_float_finding = df['content_findings'].map(lambda value: isinstance(value, float))
float_rows = df.loc[is_float_finding]
display(float_rows)

Unnamed: 0.1,Unnamed: 0,Study_id,Report_id,content_impression,content_findings,Type,generated_report,unigram_score,bigram_score,trigram_score,quadrigram_score
3578,3578,647ae172f9cdb80008f63c39,647ae43addc8043d8a4be3b1,,,Abnormal,Lung volume is normal. No focal lesions are ob...,0.0,0.0,0.0,0.0


In [None]:
# Keep only the studies whose 'Type' is exactly 'Abnormal'.
abnormal_rows = df.loc[df['Type'].eq('Abnormal')]
display(abnormal_rows)

Unnamed: 0.1,Unnamed: 0,Study_id,Report_id,content_impression,content_findings,Type,generated_report,unigram_score,bigram_score,trigram_score,quadrigram_score
250,250,658428c9ddc8043ca0e87424,65844980ddc8043ca0e88e65,As described.,Poorly centralized patient.\n ETT seen in...,Abnormal,Lung volume has decreased. Focal lesions are p...,0.150000,0.019231,0.009174,0.000000
251,251,658428c9ddc8043ca0e87427,658435a6ddc8043ca0e88028,As described.,Right CVL is seen in place.\n Resolved le...,Abnormal,Lung volume has decreased. Focal lesions are p...,0.230000,0.070922,0.045752,0.025157
252,252,658428caddc8043ca0e8742a,658434d0ddc8043ca0e87e80,As described.,_Suboptimal study due to malpositioning shows:...,Abnormal,Enlargement of the cardiac silhouette with pul...,0.040000,0.000000,0.000000,0.000000
253,253,6584333bddc8043ca0e87caf,658440dfddc8043ca0e88823,As described above\n Poor inspiratory ima...,Poor inspiratory image view.\n RCVL in pl...,Abnormal,Lung volume has decreased. Focal lesions are p...,0.117647,0.018519,0.008621,0.000000
254,254,658458fdddc8043ca0e89802,65846d65ddc8043ca0e8a929,Prominent bronchovascular markings in both ...,Prominent bronchovascular markings noted in...,Abnormal,Lung volume is normal. No focal lesions are ob...,0.240964,0.111111,0.060150,0.028169
...,...,...,...,...,...,...,...,...,...,...,...
4495,4495,64bf952dddc80425c63c1797,64bf9cccddc80425c63c1da4,Cardiomegaly.\n Cardiogenic pulmonary edema.,Lungs: _++Bilateral para-hilar alveolar opa...,Abnormal,Lung volume is normal. There are no focal lesi...,0.284404,0.132911,0.083799,0.046154
4496,4496,64bf952eddc80425c63c179a,64bf9da1ddc80425c63c1e68,Cadiomegaly.\n Pulmonary hypertension.\n\...,_++Prominent pulmonary conus.++_\n Heart:...,Abnormal,Lung volumes are low. There is no focal consol...,0.103774,0.035714,0.006579,0.000000
4497,4497,64bfa70772cd1b0008b7472f,64bfbb19ddc80425c63c3467,Left lung upper zone reticular and consolid...,++Left lung upper zone reticular and consol...,Abnormal,Lung volume is normal. Focal lesions are prese...,0.223404,0.093750,0.049296,0.020000
4498,4498,64bfba6ea33a9e0008700148,64bfe5e9ddc80425c63c4e1b,Accentuated bronchovascular marking.,Lungs: Accentuated bronchovascular marking....,Abnormal,Lung volume is normal. No focal lesions are ob...,0.348837,0.152000,0.090909,0.051948
