In [1]:
!nvidia-smi

Sun Mar 10 21:10:14 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           Off | 00000000:18:00.0 Off |                    0 |
| N/A   45C    P0              63W / 300W |      0MiB / 32768MiB |      2%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
from transformers import BartForConditionalGeneration, BartTokenizer
import pandas as pd

In [3]:
# Load the model and tokenizer
model = BartForConditionalGeneration.from_pretrained(
    'facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained(
    'facebook/bart-large-cnn')

In [4]:
def summarize(text, maxSummarylength=500):
    # Encode the text and summarize
    inputs = tokenizer.encode("summarize: " +
                              text,
                              return_tensors="pt",
                              max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=maxSummarylength,
                                 min_length=int(maxSummarylength/5),
                                 length_penalty=10.0,
                                 num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [5]:
def split_text_into_pieces(text,
                           max_tokens=900,
                           overlapPercent=10):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)

    # Calculate the overlap in tokens
    overlap_tokens = int(max_tokens * overlapPercent / 100)

    # Split the tokens into chunks of size
    # max_tokens with overlap
    pieces = [tokens[i:i + max_tokens]
              for i in range(0, len(tokens),
                             max_tokens - overlap_tokens)]

    # Convert the token pieces back into text
    text_pieces = [tokenizer.decode(
        tokenizer.convert_tokens_to_ids(piece),
        skip_special_tokens=True) for piece in pieces]

    return text_pieces

In [6]:
def recursive_summarize(text, max_length=1000, recursionLevel=0):
    recursionLevel=recursionLevel+1
    print("######### Recursion level: ",
          recursionLevel,"\n\n######### ")
    tokens = tokenizer.tokenize(text)
    expectedCountOfChunks = len(tokens)/max_length
    max_length=int(len(tokens)/expectedCountOfChunks)+2

    # Break the text into pieces of max_length
    pieces = split_text_into_pieces(text, max_tokens=max_length)

    print("Number of pieces: ", len(pieces))
    # Summarize each piece
    summaries=[]
    k=0
    for k in range(0, len(pieces)):
        piece=pieces[k]
        print("****************************************************")
        print("Piece:",(k+1)," out of ", len(pieces), "pieces")
        print(piece, "\n")
        summary =summarize(piece, maxSummarylength=max_length/3*2)
        print("SUMNMARY: ", summary)
        summaries.append(summary)
        print("****************************************************")

    concatenated_summary = ' '.join(summaries)

    tokens = tokenizer.tokenize(concatenated_summary)

    if len(tokens) > max_length:
        # If the concatenated_summary is too long, repeat the process
        print("############# GOING RECURSIVE ##############")
        return recursive_summarize(concatenated_summary,
                                   max_length=max_length,
                                   recursionLevel=recursionLevel)
    else:
      # Concatenate the summaries and summarize again
        final_summary=concatenated_summary
        if len(pieces)>1:
            final_summary = summarize(concatenated_summary,
                                  maxSummarylength=max_length)
        return final_summary

In [7]:
def recursive_summarize(text, max_length=1000, recursionLevel=0):
    recursionLevel = recursionLevel + 1
    tokens = tokenizer.tokenize(text)
    expectedCountOfChunks = len(tokens) / max_length
    max_length = int(len(tokens) / expectedCountOfChunks) + 2

    pieces = split_text_into_pieces(text, max_tokens=max_length)

    summaries = []
    for k in range(len(pieces)):
        piece = pieces[k]
        summary = summarize(piece, maxSummarylength=max_length / 3 * 2)
        summaries.append(summary)

    concatenated_summary = ' '.join(summaries)
    tokens = tokenizer.tokenize(concatenated_summary)

    if len(tokens) > max_length:
        return recursive_summarize(concatenated_summary, max_length, recursionLevel)
    else:
        final_summary = concatenated_summary
        if len(pieces) > 1:
            final_summary = summarize(concatenated_summary, maxSummarylength=max_length)
        return final_summary


In [2]:
train_path = "Training_data.csv"
validation_path = "Validation_data.csv"
test_path = "/work/LitArt/data/generated_summaries/test_dataset_with_summaries.csv"

In [3]:
train_data = pd.read_csv(train_path)
validate_data = pd.read_csv(validation_path)
test_data = pd.read_csv(test_path)
train_data.head()

train_data = train_data[0:100]

In [4]:
train_data

Unnamed: 0,bid,is_aggregate,source,chapter_path,summary_path,book_id,summary_id,content,summary,chapter,chapter_length,summary_name,summary_url,summary_text,summary_analysis,summary_length,analysis_length,New_Summary,BART_summary
0,27681,True,cliffnotes,all_chapterized_books/27681-chapters/chapters_...,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapters 1-2,chapters 1-2,,"{""name"": ""Chapters 1-2"", ""url"": ""https://web.a...",mine ear is open and my heart prepared the wor...,6471.0,Chapters 1-2,https://web.archive.org/web/20201101053205/htt...,before any characters appear the time and geog...,These two chapters introduce the reader to the...,388.0,473.0,"Before any characters appear, the time and geo...","""Webb"" is about a group of men who set out on ..."
1,27681,False,cliffnotes,all_chapterized_books/27681-chapters/03.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 3,chapter 3,,"{""name"": ""Chapter 3"", ""url"": ""https://web.arch...",before these fields were shorn and tilled full...,3132.0,Chapter 3,https://web.archive.org/web/20201101053205/htt...,in another part of the forest by the river a f...,This chapter introduces the other three main a...,198.0,149.0,In another part of the forest by the river a f...,The book is written in the language of the red...
2,27681,False,cliffnotes,all_chapterized_books/27681-chapters/04.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 4,chapter 4,,"{""name"": ""Chapter 4"", ""url"": ""https://web.arch...",well go thy way thou shalt not from this grove...,3075.0,Chapter 4,https://web.archive.org/web/20201101053205/htt...,when the mounted party from fort howard approa...,Since this chapter is mostly one of surface ac...,319.0,75.0,When the mounted party from Fort Howard approa...,The story is about a group of travellers who f...
3,27681,False,cliffnotes,all_chapterized_books/27681-chapters/05.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 5,chapter 5,,"{""name"": ""Chapter 5"", ""url"": ""https://web.arch...",in such a night did thisbe fearfully o ertrip ...,3268.0,Chapter 5,https://web.archive.org/web/20201101053205/htt...,the pursuit of magua is unsuccessful but hawke...,Here the reader encounters the first bloodshed...,329.0,156.0,"The pursuit of Magua is unsuccessful, but Hawk...","""The Voyage of the Bering Sea"" is a tale of a ..."
4,27681,False,cliffnotes,all_chapterized_books/27681-chapters/06.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 6,chapter 6,,"{""name"": ""Chapter 6"", ""url"": ""https://web.arch...",those strains that once did sweet in zion glid...,3873.0,Chapter 6,https://web.archive.org/web/20201101053205/htt...,heyward and the girls are uneasy and gamut is ...,This chapter shows Cooper in his most inventiv...,321.0,128.0,Heyward and the girls are uneasy and Gamut is ...,"The book is published by Simon & Schuster, a d..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,345,False,shmoop,all_chapterized_books/345-chapters/38.txt,finished_summaries/shmoop/Dracula/section_10_p...,Dracula.chapter 11,chapter 11,,"{""name"": ""Chapter 11"", ""url"": ""https://web.arc...",september --how good they all are to me i quit...,4746.0,Chapter 11,https://web.archive.org/web/20201219141110/htt...,lucy doesn t understand why van helsing was so...,,723.0,1.0,Lucy doesn't understand why Van Helsing was so...,Dr seward's diary describes a visit to the zoo...
96,345,False,shmoop,all_chapterized_books/345-chapters/39.txt,finished_summaries/shmoop/Dracula/section_11_p...,Dracula.chapter 12,chapter 12,,"{""name"": ""Chapter 12"", ""url"": ""https://web.arc...",september --i drove at once to hillingham and ...,6737.0,Chapter 12,https://web.archive.org/web/20201219141110/htt...,dr seward arrives at the same time as van hels...,,661.0,1.0,Dr. Seward arrives at the same time as Van Hel...,Mona harker is the junior partner of the impor...
97,345,False,shmoop,all_chapterized_books/345-chapters/40.txt,finished_summaries/shmoop/Dracula/section_12_p...,Dracula.chapter 13,chapter 13,,"{""name"": ""Chapter 13"", ""url"": ""https://web.arc...",the funeral was arranged for the next succeedi...,6267.0,Chapter 13,https://web.archive.org/web/20201219141110/htt...,dr seward has to arrange a lot of the funeral ...,,362.0,1.0,Dr. Seward has to arrange a lot of the funeral...,"Mona harker's husband, Jonathan, was killed in..."
98,345,False,shmoop,all_chapterized_books/345-chapters/41.txt,finished_summaries/shmoop/Dracula/section_13_p...,Dracula.chapter 14,chapter 14,,"{""name"": ""Chapter 14"", ""url"": ""https://web.arc...",september --jonathan is better after a bad nig...,5963.0,Chapter 14,https://web.archive.org/web/20201219141110/htt...,jonathan isn t sleeping well and mina s worrie...,,353.0,1.0,"Jonathan isn't sleeping well, and Mina's worri...",Dr. van helsing visited Westenra in her last i...


In [10]:
# import pandas as pd  # Import pandas if not already imported

# # Assuming train_data is a pandas DataFrame

# # Create an empty column named "BART_summary"
# train_data["BART_summary"] = pd.Series(dtype=object)

# # Loop through each chapter in the "chapter" column
# for index, row in train_data.iterrows():
#   text = row["chapter"]
#   final_summary = recursive_summarize(text)
#   train_data.at[index, "BART_summary"] = final_summary

# # Print confirmation message (optional)
# print("BART summaries saved to the 'BART_summary' column.")


In [12]:
final_summary = recursive_summarize(text)
print("\n%%%%%%%%%%%%%%%%%%%%%\n")
print("Final summary:", final_summary)

2024-03-10 21:10:44.418632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-10 21:10:50.122175: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /shared/centos7/cuda/11.8/lib64:/shared/centos7/anaconda3/2022.05/lib:/shared/centos7/nodejs/14.15.4/lib
2024-03-10 21:10:50.124361: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /shared/centos7/c


%%%%%%%%%%%%%%%%%%%%%

Final summary: "The Voyagers" is published by Simon & Schuster at a price of $24.99. The book is set in the early 1800s and is set on the shores of what is now known as the Yukon Territory. It is set during the time of the first contact between the U.S. and the indigenous people of the region. "The Voyage of the Youngest Boy" is about the journey of a young boy and his family through the wilderness. "ien" is the story of a fight between two tribes of Native Americans in the American Outback. The story also tells of a battle between a white man and a Native American called "duncan" The story of the death of a savage by a rifle fire is told in the novel "The Hanging Man" The novel is based on a true story about a battleBetween the Hohican and the Indians in theEarly 19th century. It was published in the territory of the present-day New Mexico, in the United States.


In [13]:
from tqdm import tqdm 


train_data["BART_summary"] = pd.Series(dtype=object)

for index, row in tqdm(train_data.iterrows(), total=len(train_data)):
  text = row["chapter"]
  final_summary = recursive_summarize(text)
  train_data.at[index, "BART_summary"] = final_summary


print("BART summaries saved to the 'BART_summary' column.")

100%|██████████| 100/100 [2:31:46<00:00, 91.07s/it] 

BART summaries saved to the 'BART_summary' column.





In [16]:
train_data.to_csv("Training_data.csv", index=False)

In [6]:
train_data = pd.read_csv("Training_data.csv")

train_data["BART_summary"][3]

