#1. Installing required libraries

In [33]:
!pip install -q openai gradio pypdf tiktoken langchain

In [34]:
!pip install langchain_community



In [35]:
pip install datasets



In [36]:
!pip install rouge_score




#2. Importing the necessary libraries

In [37]:
import os
import tiktoken
import gradio as gr
from langchain import OpenAI , PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
from langchain import LLMChain, HuggingFaceHub

# 3. Setup the environment

#4. Count the number of tokens in string

In [38]:
# function to return the number of tokens in the string
def num_tokens_from_string(string:str ,encoding_name:str) -> int :
  encoding = tiktoken.get_encoding(encoding_name)
  num_tokens = len(encoding.encode(string))
  return num_tokens

In [39]:
num_tokens_from_string('This is a summarization project first prototype','cl100k_base')

8

# 5. Download a pdf file

# 6. Load, Read and Extract the PDF

In [40]:

loader = PyPDFLoader('/content/CorporateFinance-IJMSBR.pdf')

In [41]:
docs = loader.load()

In [42]:
docs

[Document(metadata={'source': '/content/CorporateFinance-IJMSBR.pdf', 'page': 0}, page_content='See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/281557147\nUnderstanding Corporate Finance: Experiences of Emerging Economies and\nCase of a High-Income Small Developing Cou ntry\nArticle  · July 2015\nCITATIONS\n0READS\n7,262\n1 author:\nHav en Allahar\nUniv ersity of the West Indies, St . Augustine\n40 PUBLICA TIONS \xa0\xa0\xa0230 CITATIONS \xa0\xa0\xa0\nSEE PROFILE\nAll c ontent f ollo wing this p age was uplo aded b y Hav en Allahar  on 07 Sept ember 2015.\nThe user has r equest ed enhanc ement of the do wnlo aded file.'),
 Document(metadata={'source': '/content/CorporateFinance-IJMSBR.pdf', 'page': 1}, page_content='International Journal of Management Sciences and Business Research, July -2015 ISSN (2226 -8235) Vol -4, Issue 7  \nhttp://www.ijmsbr.com   Page 34 Understanding  Corporate  Finance : Experience of  Emerg

In [43]:
len(docs)

11

In [44]:
type(docs)

list

# Step 7: Split the data into chunks

In [45]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)

In [46]:
texts = text_splitter.split_documents(docs)

In [47]:
len(texts) #to show the number of chunks

73

In [48]:
text_i_str = " ".join([str(doc) for doc in docs])

In [49]:
type(text_i_str)

str

In [50]:
len(text_i_str)

57337

In [51]:
text_i_str

"page_content='See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/281557147\nUnderstanding Corporate Finance: Experiences of Emerging Economies and\nCase of a High-Income Small Developing Cou ntry\nArticle  · July 2015\nCITATIONS\n0READS\n7,262\n1 author:\nHav en Allahar\nUniv ersity of the West Indies, St . Augustine\n40 PUBLICA TIONS \xa0\xa0\xa0230 CITATIONS \xa0\xa0\xa0\nSEE PROFILE\nAll c ontent f ollo wing this p age was uplo aded b y Hav en Allahar  on 07 Sept ember 2015.\nThe user has r equest ed enhanc ement of the do wnlo aded file.' metadata={'source': '/content/CorporateFinance-IJMSBR.pdf', 'page': 0} page_content='International Journal of Management Sciences and Business Research, July -2015 ISSN (2226 -8235) Vol -4, Issue 7  \nhttp://www.ijmsbr.com   Page 34 Understanding  Corporate  Finance : Experience of  Emerging Economies and Case of a High -Income \nSmall Developing Country . \n \nAuthor Details:  \n

In [52]:
import re



# Use regular expressions to extract text between "Abstract" and "Keywords"
# match = re.search(r'Abstract\s(.*?)\sKeywords:', text_i_str, re.DOTALL)
match = re.search(r'(?i)abstract\s(.*?)\s(?i)keywords', text_i_str, re.DOTALL)
if match:
    reference_summary = match.group(1).strip()
else:
    raise ValueError("Could not find 'Abstract' or 'Keywords' in the document")

print(reference_summary)

The purpose of this paper is to provide an overview of the theories, concepts and issues involved in the area of corporate finance 
and the implications for emerging economies. A secondary research appr oach was adopted based on a review of published texts, 
journals, and technical reports.  The results  of the study demonstrate that based on the indicators of capital market development, 
the high -income developing country of Trinidad and Tobago used as a case study, lags behind in critical areas of development o f 
its capital market and, along with related developing countries, can benefit fro m the fundamental principles involved in the 
practice of corporate finance. The practical implications of the study are the potential use by students in tertiary level instit utions 
engaged in MBA or financial management studies, participants pursuing professional qualifications  in accounting and finance, 
employees in financial institutions, and general stakeholders seeking to invest funds

# Step 9: Loading hugging face models and trying to summarize the pdf we have

In [53]:
# from langchain import LLMChain, HuggingFaceHub

In [54]:
docs

[Document(metadata={'source': '/content/CorporateFinance-IJMSBR.pdf', 'page': 0}, page_content='See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/281557147\nUnderstanding Corporate Finance: Experiences of Emerging Economies and\nCase of a High-Income Small Developing Cou ntry\nArticle  · July 2015\nCITATIONS\n0READS\n7,262\n1 author:\nHav en Allahar\nUniv ersity of the West Indies, St . Augustine\n40 PUBLICA TIONS \xa0\xa0\xa0230 CITATIONS \xa0\xa0\xa0\nSEE PROFILE\nAll c ontent f ollo wing this p age was uplo aded b y Hav en Allahar  on 07 Sept ember 2015.\nThe user has r equest ed enhanc ement of the do wnlo aded file.'),
 Document(metadata={'source': '/content/CorporateFinance-IJMSBR.pdf', 'page': 1}, page_content='International Journal of Management Sciences and Business Research, July -2015 ISSN (2226 -8235) Vol -4, Issue 7  \nhttp://www.ijmsbr.com   Page 34 Understanding  Corporate  Finance : Experience of  Emerg

In [55]:
# os.environ['HUGGINGFACEHUB_API_TOKEN'] ='?'


# Generating Summary with Bart with finetune

In [56]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Load the tokenizer and model

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")



# Tokenize the input text
inputs = tokenizer(text_i_str, return_tensors="pt", max_length=1024, truncation=True)

# Generate summary
# here controlling the length of summary in numer of tokens
# summary_ids = model.generate(inputs["input_ids"], max_length=1500, min_length=150, do_sample=False)

summary_ids = model.generate(
    inputs["input_ids"],
    max_length=2500,
    min_length=250,
    do_sample=False,
    length_penalty=1.0,  # Default value
    repetition_penalty=1.0,  # Default value
    early_stopping=False  # Try setting this to True or False
)

# Decode the summary
summary_i = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# print(summary_i)

KeyboardInterrupt: 

In [None]:
print(len(summary_i))

In [None]:
print(summary_i)

In [None]:
from rouge_score import rouge_scorer


In [None]:
def calculate_accuracy(reference_summary, generated_summary, model_name):
    generated_summary = generated_summary.replace('\n', ' ')
    reference_summary = reference_summary.replace('\n', ' ')

    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Calculate ROUGE scores
    scores = scorer.score(reference_summary, generated_summary)

    print(f'Model name: {model_name}')
    print("ROUGE-1:", scores['rouge1'])
    print("ROUGE-2:", scores['rouge2'])
    print("ROUGE-L:", scores['rougeL'])

    # Prepare the scores to return
    return {
        'model_name': model_name,
        'rouge1': scores['rouge1'],
        'rouge2': scores['rouge2'],
        'rougeL': scores['rougeL']
    }

In [None]:
calculate_accuracy(reference_summary,summary_i,'BART_Normal')

#Our Finetuned Model on finance subset

# Using our saved model on finance dataset to see the result

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Path to the directory where the model was saved
model_path = './bart_finance_finetuned'  # Path where the model and tokenizer were saved

# Load the model and tokenizer
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

# Function to summarize text
def summarize_text(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=1024, truncation=True)
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary