### Bert2Bert for Summarization
Based on the model card at https://huggingface.co/patrickvonplaten/bert2bert_cnn_daily_mail

In [20]:
# import libraries
import pandas as pd
from transformers import AutoTokenizer, EncoderDecoderModel

In [39]:
# Transcriptions with definition replacement: drop the 'Unnamed: 0' column and
# rename the text column to 'defs'.
defs = (
    pd.read_csv('inputs/transcripts_with_definitions_100.csv')
    .drop(columns='Unnamed: 0')
    .rename(columns={'transcription': 'defs'})
)

# Original transcriptions
transcript = pd.read_csv('inputs/clean_transcriptions.csv')

# Put the first 100 rows of both tables side by side, matched on the row index.
tr = transcript.head(100).join(defs.head(100))

In [40]:
# Load the pre-trained model and its tokenizer from the same checkpoint.
# NOTE(review): 2664 is passed as max_length / model_max_length; the reason for
# this particular value is not visible in the notebook — confirm.
checkpoint = "patrickvonplaten/bert2bert_cnn_daily_mail"
model = EncoderDecoderModel.from_pretrained(checkpoint, max_length=2664)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=2664)

### Functions for running the model

In [41]:
def summarizer(text, max_input_tokens=512):
    """Generate a summary of ``text`` with the pre-trained model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    text : str
        Passage to summarize.
    max_input_tokens : int, default 512
        Upper bound on the number of tokens passed to the model; longer
        inputs are truncated by the tokenizer. The tokenizer is loaded with
        ``model_max_length=2664``, so without an explicit bound nothing keeps
        the input within the encoder's position limit.
        NOTE(review): 512 is the usual BERT limit — confirm against the
        loaded checkpoint's configuration.

    Returns
    -------
    str
        The generated summary, decoded with special tokens removed.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    # No generation arguments are passed here, so the decoding settings are
    # whatever the loaded model is configured with.
    generated_ids = model.generate(encoded.input_ids)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [52]:
def truncate(text, max_chars=1024):
    """Return at most the first ``max_chars`` characters of ``text``.

    The limit is measured in characters, not tokens.

    Parameters
    ----------
    text : str
        Text to shorten.
    max_chars : int, default 1024
        Maximum number of characters to keep.

    Returns
    -------
    str
        ``text`` unchanged if it is short enough, otherwise its prefix of
        length ``max_chars``.

    Raises
    ------
    ValueError
        If ``max_chars`` is negative (a negative slice bound would silently
        cut characters off the end instead of limiting the length).
    """
    if max_chars < 0:
        raise ValueError("max_chars must be non-negative")
    return text[:max_chars]

### Summarize the definition replaced text using the pre-trained model

In [53]:
# Discard rows with missing values and renumber the remaining rows from 0.
tr = tr.dropna(axis=0).reset_index().drop(columns='index')

# Truncate each definition-replaced transcript so that the model can parse it,
# then summarize it.
lst = [summarizer(truncate(text)) for text in tr['defs']]

# Attach the summaries to the transcripts as a 'summary' column.
df = pd.DataFrame(lst)
final_df = tr.join(df).rename(columns={0: 'summary'})


### Determine Reading Level of the original transcription and the new summary
Syllable-counting heuristic adapted from: https://stackoverflow.com/questions/46759492/syllable-count-in-python

In [54]:
# find syllables in sentences
import re

def syllable(text):
    """Estimate the number of syllables in ``text``.

    Each whitespace-separated word contributes one syllable per run of
    vowels (a, e, i, o, u, y), ignoring a final 'e', plus one syllable when
    the word's only vowel is a final 'e' (e.g. "the", "be").

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    int
        Estimated syllable count; 0 for empty or whitespace-only text.
    """
    total = 0
    # split() without an argument splits on any whitespace run and never
    # yields empty strings.
    for token in text.split():
        # Strip punctuation from both ends so that a word such as "make," is
        # still recognised as ending in a silent 'e'.
        word = re.sub(r'^\W+|\W+$', '', token)
        vowel_groups = re.findall('(?!e$)[aeiouy]+', word, re.I)
        lone_final_e = re.findall('^[^aeiouy]*e$', word, re.I)
        total += len(vowel_groups) + len(lone_final_e)
    return total

In [55]:
def word_count(text):
    """Count the whitespace-separated words in ``text``.

    Runs of spaces, tabs and newlines count as a single separator, so
    double spaces do not produce phantom empty "words".

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    int
        Number of words; 0 for empty or whitespace-only text, so callers
        that divide by this value must handle the zero case.
    """
    return len(text.split())

In [56]:
def sent_count(text):
    """Count the sentences in ``text``.

    A sentence is a non-blank stretch of text delimited by '.', '!' or '?'.
    Empty pieces, such as the one after a final full stop or between
    repeated terminators ("..."), are not counted.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    int
        Number of sentences, never less than 1 so that a words-per-sentence
        ratio is always defined (text without any terminator counts as one
        sentence).
    """
    # Treat every terminator alike by mapping '!' and '?' onto '.'.
    normalized = text.replace('!', '.').replace('?', '.')
    sentences = [piece for piece in normalized.split('.') if piece.strip()]
    return max(len(sentences), 1)

In [57]:
def flesch(word_count, sent_count, syllable_count):
    """Compute a Flesch score from word, sentence and syllable totals.

    Parameters
    ----------
    word_count : int
        Number of words in the text.
    sent_count : int
        Number of sentences in the text.
    syllable_count : int
        Number of syllables in the text.

    Returns
    -------
    float
        206.835 - 1.015 * (words per sentence) - 84.6 * (syllables per word).

    Raises
    ------
    ZeroDivisionError
        If ``sent_count`` or ``word_count`` is 0.
    """
    words_per_sentence = word_count / sent_count
    syllables_per_word = syllable_count / word_count
    return 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word

In [58]:
def reading_level(score):
    """Map a Flesch score to a reading-level label.

    Higher scores mean easier text. Each band includes its lower bound, so
    a score that falls exactly on a boundary (90, 80, ...) is assigned to
    the easier band rather than dropping through to 'Professional'. Scores
    of 100 and above fall into the easiest band.

    Parameters
    ----------
    score : float
        Flesch score.

    Returns
    -------
    str
        One of '5th grade', '6th grade', '7th grade', '8th and 9th grade',
        '10th to 12th grade', 'College', 'College Graduate' or
        'Professional' (anything below 10, and NaN).
    """
    # (inclusive lower bound, label), ordered from easiest to hardest.
    bands = (
        (90, '5th grade'),
        (80, '6th grade'),
        (70, '7th grade'),
        (60, '8th and 9th grade'),
        (50, '10th to 12th grade'),
        (30, 'College'),
        (10, 'College Graduate'),
    )
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return 'Professional'
    

In [59]:
# Compute the Flesch score and reading level of every original transcription.
scores = []
level = []
for text in final_df['transcription']:
    score = flesch(word_count(text), sent_count(text), syllable(text))
    scores.append(score)
    level.append(reading_level(score))

# assign() replaces columns that already exist, so this cell can be re-run
# without failing on duplicate 't_scores' / 't_level' column names.
final_df = final_df.assign(t_scores=scores, t_level=level)

In [60]:
# Compute the Flesch score and reading level of every generated summary.
scores = []
level = []
for text in final_df['summary']:
    score = flesch(word_count(text), sent_count(text), syllable(text))
    scores.append(score)
    level.append(reading_level(score))

# assign() replaces columns that already exist, so this cell can be re-run
# without failing on duplicate 's_scores' / 's_level' column names.
final_df = final_df.assign(s_scores=scores, s_level=level)

In [61]:
# Show the combined table: transcripts, definition-replaced text, summaries,
# and the scores and reading levels of transcripts and summaries.
print(final_df)

                                        transcription  \
0   SUBJECTIVE:,  This 23-year-old white female pr...   
1   PAST MEDICAL HISTORY:, He has difficulty climb...   
2   HISTORY OF PRESENT ILLNESS: , I have seen ABC ...   
3   2-D M-MODE: , ,1.  Left atrial enlargement wit...   
4   1.  The left ventricular cavity size and wall ...   
..                                                ...   
93  PREOPERATIVE DIAGNOSIS:,  Left inguinal hernia...   
94  PREOPERATIVE DIAGNOSIS: , Benign prostatic hyp...   
95  PREOPERATIVE DIAGNOSIS: , Inguinal hernia.,POS...   
96  PROCEDURE PERFORMED: , Inguinal herniorrhaphy....   
97  PREOPERATIVE DIAGNOSIS:,  Bilateral inguinal h...   

                                                 defs  \
0   SUBJECTIVE:,  This 23-year-old white female pr...   
1   PAST MEDICAL HISTORY:, He has difficulty climb...   
2   HISTORY OF PRESENT ILLNESS: , I have seen ABC ...   
3   2-D M-MODE: , ,1.  Left atrial enlargement wit...   
4   1.  The left ventricular c

In [62]:
# Save the combined results to 'final_df.csv' in the current working directory.
# No index argument is passed, so pandas' default applies and the row index is
# written as the first, unnamed column.
final_df.to_csv('final_df.csv')