<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/tidy_up_preprocessing_notebook/notebooks/processed/ct_nlp_pipeline_v1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Libraries**

In [None]:
!pip install PyPDF2



In [None]:
!pip install openai==0.28



In [None]:
!pip install python-dotenv



In [None]:
import re
import PyPDF2
import openai
import json
import pandas as pd
import time
import os
from dotenv import load_dotenv
from google.colab import userdata

In [None]:
# Load variables from a .env file (if one is present) into the process environment.
load_dotenv()
# Read the OpenAI key from the environment; os.getenv returns None when it is unset.
# NOTE(review): nothing in the visible notebook reads `openai_api_key` — the client
# key is later taken from Colab `userdata` instead. Confirm which source is intended.
openai_api_key = os.getenv("Openai_key")

# **1. Data extraction**

In [None]:
def extract_sections_from_pdf(pdf_path):
    """
    Read a PDF transcript and split its text into two sections.

    The text of every page is concatenated, runs of newlines are collapsed,
    and the result is cut at two case-insensitive heading markers:
    - 'Management Discussion' section (md_section)
    - 'Question and Answer' section (QNA_section)

    Parameters:
        pdf_path (str): The file path to the PDF.

    Returns:
        md_section (str): Text from the Management Discussion marker up to
            (not including) the Question and Answer marker, stripped.
        QNA_section (str): Text from the Question and Answer marker to the
            end of the document, stripped.

    Raises:
        ValueError: If the Management Discussion marker is missing, or if no
            Question and Answer marker follows it.
    """
    # Markers for splitting sections.
    # Adjust these if your PDF uses slightly different headings.
    md_marker = "management discussion"
    qna_marker = "question and answer"

    # Open the PDF file in binary mode and pull the text of every page.
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # A page with no extractable text must not break the concatenation,
        # so a missing result is treated as an empty string.
        page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]

    # One trailing newline per page, then collapse repeated newlines.
    full_text = "".join(page_text + "\n" for page_text in page_texts)
    full_text = re.sub(r'\n+', '\n', full_text)

    # Search the ORIGINAL text case-insensitively. Searching a lower-cased copy
    # and reusing its offsets is unsafe because lower() can change the length
    # of the string for some characters, shifting every later index.
    md_match = re.search(re.escape(md_marker), full_text, flags=re.IGNORECASE)
    if md_match is None:
        raise ValueError("Management Discussion section marker not found in the PDF.")

    # Only accept a Q&A marker located after the Management Discussion marker;
    # an earlier mention (cover page, contents) would otherwise produce an
    # empty Management Discussion slice.
    qna_pattern = re.compile(re.escape(qna_marker), re.IGNORECASE)
    qna_match = qna_pattern.search(full_text, md_match.end())
    if qna_match is None:
        raise ValueError("Question and Answer section marker not found in the PDF.")

    md_start = md_match.start()
    qna_start = qna_match.start()

    # Slice the two sections out of the original-case text.
    md_section = full_text[md_start:qna_start].strip()
    QNA_section = full_text[qna_start:].strip()

    return md_section, QNA_section

In [None]:
if __name__ == "__main__":
    # Location of the earnings-call transcript to process
    pdf_path = "/content/4q24-earnings-transcript.pdf"

    # Split the transcript into its Management Discussion and Q&A parts
    md_section, QNA_section = extract_sections_from_pdf(pdf_path)

    # Break each section into a list of lines for downstream processing
    md_dataset, qna_dataset = (
        section.split("\n") for section in (md_section, QNA_section)
    )

In [None]:
    # Show the first five lines of the Management Discussion section as a
    # quick check that the section split picked up the right heading.
    print("=== Management Discussion Section Sample ===")
    for line in md_dataset[:5]:
        print(line)

=== Management Discussion Section Sample ===
MANAGEMENT DISCUSSION SECTION 
........................................................................................................................................................................................................................................................................................ 
Operator : Good morning, ladies and gentlemen. Welcome to JPMorganChase's Fourth Quarter 2024 Earnings Call. This call is being recorded. 
Your line will be muted for the duration of the call. We will now go live to the presentation. The presentation is available on JPMorganChase's 
website. Please refer to the disclaimer in the back concerning forward-looking statements. Please stand by. 


In [None]:
    # Show the first five lines of the Question and Answer section as a
    # quick check that the section split picked up the right heading.
    print("\n=== Question and Answer Section Sample ===")
    for line in qna_dataset[:5]:
        print(line)


=== Question and Answer Section Sample ===
QUESTION AND ANSWER SECTION 
 
Operator : Thank you. Please stand by. Our first question comes from John McDonald with Truist Securities. You may proceed. 
........................................................................................................................................................................................................................................................................................ 
John McDonald 


In [None]:
# Wrap the Management Discussion lines in a single-column DataFrame and save it.
# The file name carries the same quarter tag as the source transcript (4Q24) and
# the Q&A export (QNA_4Q24.csv); it previously used a stale "1Q23" label.
md_df = pd.DataFrame(md_dataset, columns=['Text'])
md_df.to_csv("MD_4Q24.csv", index=False)

In [None]:
# Preview the first lines only; dumping the whole transcript floods the output.
qna_dataset[:10]

['QUESTION AND ANSWER SECTION ',
 ' ',
 'Operator : Thank you. Please stand by. Our first question comes from John McDonald with Truist Securities. You may proceed. ',
 '........................................................................................................................................................................................................................................................................................ ',
 'John McDonald ',
 'Analyst, Truist Securities, Inc. Q ',
 'Hi. Good morning. Jeremy, I wanted to ask about capital, and I know you get this question a lot about the kind of high-class dilemma of your ',
 "growing capital base and your perspective of that as earnings in store. So, I guess, what's the framework for thinking about the opportunity ",
 'cost of sitting on the growing base of capital and how high you might let that go versus your patience in waiting for more attractive deployment ',
 'opportunities? ',
 '.......................

In [None]:
# Configure the OpenAI client with the key stored under 'Openai_key' in
# Colab's userdata store, so the key itself never appears in the notebook.
openai.api_key = userdata.get('Openai_key')

In [None]:
def convert_qna_dataset_to_text(qna_dataset):
    """
    Collapse a list of strings into one newline-delimited string.

    Parameters:
        qna_dataset (list): Strings to concatenate, in order.

    Returns:
        str: The elements of qna_dataset joined by a single newline
             character (an empty string for an empty list).
    """
    return "\n".join(qna_dataset)

# Example usage:
if __name__ == "__main__":
    # Convert the qna_dataset to a single text string
    qna_text = convert_qna_dataset_to_text(qna_dataset)
    print("Converted Q&A Text:")
    # Print a bounded preview only; the full transcript is kept in qna_text
    # and printing all of it buries the rest of the notebook in output.
    print(qna_text[:1000])

Converted Q&A Text:
QUESTION AND ANSWER SECTION 
 
Operator : Thank you. Please stand by. Our first question comes from John McDonald with Truist Securities. You may proceed. 
........................................................................................................................................................................................................................................................................................ 
John McDonald 
Analyst, Truist Securities, Inc. Q 
Hi. Good morning. Jeremy, I wanted to ask about capital, and I know you get this question a lot about the kind of high-class dilemma of your 
growing capital base and your perspective of that as earnings in store. So, I guess, what's the framework for thinking about the opportunity 
cost of sitting on the growing base of capital and how high you might let that go versus your patience in waiting for more attractive deployment 
opportunities? 
.............................................

In [None]:
def extract_operator_segments(qna_dataset):
    """
    Extract the text lying between consecutive occurrences of the word Operator.

    Steps:
      1. Normalise the input to a single string.
      2. Find all occurrences of Operator.
      3. Extract the (stripped) text between each consecutive pair of occurrences.
      4. Create a DataFrame with two columns:
         - 'Question_Number': a count starting at 1.
         - 'Text': the text between consecutive occurrences.

    Text before the first occurrence and after the last occurrence is not
    part of any segment.

    Parameters:
        qna_dataset (str or iterable of str): The Q&A text. A single string is
            used as is; a list of lines is joined with newlines first, so both
            the line list and the joined text give the same result.

    Returns:
        pd.DataFrame: A DataFrame with the extracted segments. It is empty
        (but has both columns) when fewer than two occurrences are found.
    """
    empty_result = pd.DataFrame(columns=["Question_Number", "Text"])

    # Step 1: A list of lines would make the substring search below meaningless
    # (membership on a list compares whole elements), so join it first.
    if not isinstance(qna_dataset, str):
        qna_dataset = "\n".join(qna_dataset)

    # Step 2: Find all occurrences of 'Operator'.
    matches = list(re.finditer(r"Operator", qna_dataset))

    if not matches:
        print("The word 'Operator' is not found in the dataset.")
        return empty_result

    # At least two occurrences are needed to delimit a segment.
    if len(matches) < 2:
        print("Not enough occurrences of 'Operator' to extract segments.")
        return empty_result

    # Step 3: Each segment runs from the end of one occurrence to the start
    # of the next one.
    segments = [
        qna_dataset[current.end():following.start()].strip()
        for current, following in zip(matches, matches[1:])
    ]

    # Step 4: Create the DataFrame.
    return pd.DataFrame({
        "Question_Number": list(range(1, len(segments) + 1)),
        "Text": segments
    })

In [None]:
if __name__ == "__main__":
    # Split the joined Q&A text into the spans between successive "Operator" markers.
    result_df = extract_operator_segments(qna_text)
    # Plain-text preview of the resulting frame.
    print(result_df)

   Question_Number                                               Text
0                1  : Thank you. Please stand by. Our first questi...
1                2  : Thank you. Next, we will go to the line of M...
2                3  : Thank you. Our next question comes from Jim ...
3                4  : Thank you. Next, we will go to the line of E...
4                5  : Does that conclude your question, Erika? \n....
5                6  : Thank you. Our next question comes from Matt...
6                7  : Thank you. Our next question comes from Bets...
7                8  : Thank you. Next, we will go to the line of E...
8                9  : Thank you. Our final question comes from Ger...
9               10  : Thank you. And we have no further questions ...


In [None]:
# Display the extracted segments as a rendered table.
result_df

Unnamed: 0,Question_Number,Text
0,1,: Thank you. Please stand by. Our first questi...
1,2,": Thank you. Next, we will go to the line of M..."
2,3,: Thank you. Our next question comes from Jim ...
3,4,": Thank you. Next, we will go to the line of E..."
4,5,": Does that conclude your question, Erika? \n...."
5,6,: Thank you. Our next question comes from Matt...
6,7,: Thank you. Our next question comes from Bets...
7,8,": Thank you. Next, we will go to the line of E..."
8,9,: Thank you. Our final question comes from Ger...
9,10,: Thank you. And we have no further questions ...


In [None]:
# Save the Operator-delimited segments before they are sent to the model;
# index=False leaves the DataFrame index out of the file.
result_df.to_csv("QNA_output.csv", index=False)

In [None]:
def extract_info(text, model="gpt-4-turbo"):
    """
    Ask a chat model to extract speaker information from a two-person exchange.

    A prompt containing `text` is sent through openai.ChatCompletion.create
    with JSON output requested, and the reply is parsed into a dictionary with
    the following keys:
    - Name of the first person
    - Role of the first person
    - All text that the first person said
    - Name of the second person
    - Role of the second person
    - All text that the second person said

    Parameters:
        text (str): The conversation text to analyse.
        model (str): Chat model name passed to the API. Defaults to
            "gpt-4-turbo".

    Returns:
        dict: Always contains the six keys above. A key the model left out is
        mapped to None; any additional keys the model returned are kept. If
        the request fails, or the reply is not a JSON object, the error is
        printed and all six values are None.
    """
    # Single source of truth for the output schema.
    expected_keys = (
        "Name of the first person",
        "Role of the first person",
        "All text that the first person said",
        "Name of the second person",
        "Role of the second person",
        "All text that the second person said",
    )
    prompt = f"""
    The text is conversation between two people. Please Extract the following information from the text below:

    - Name of the first person
    - Role of the first person
    - All text that the first person said
    - Name of the second person
    - Role of the second person
    - All text that the second person said


    The output should have all text both the persons said in the text.

    Provide the response in JSON format with keys exactly as:
    "Name of the first person", "Role of the first person", "All text that the first person said", "Name of the second person", "Role of the second person", "All text that the second person said".

    Text: {text}
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts structured information from text."},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"}, # Set output to JSON format
            max_tokens=4000,  # Adjust tokens based on your text size
            temperature=0  # Keep it deterministic
        )
        content = response['choices'][0]['message']['content']
        # Attempt to parse the JSON response
        parsed = json.loads(content)
        # Valid JSON that is not an object (a list, a bare string, ...) cannot
        # be turned into a row, so route it to the error path below.
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
        # Start from an all-None record so every expected key is present even
        # when the model omits one, then overlay what the model returned.
        result = {**dict.fromkeys(expected_keys), **parsed}
    except Exception as e:
        # Deliberately best-effort: one bad segment should not stop the batch.
        print(f"Error processing text: {e}")
        # Return a dictionary with None values in case of error
        result = dict.fromkeys(expected_keys)
    return result

In [None]:
# Accumulates one extracted-info dictionary per Q&A segment
processed_results = []

# Send the text of each segment to the model, one request at a time
for segment_text in result_df['Text']:
    processed_results.append(extract_info(segment_text))
    # Pause between requests to respect rate limits (adjust the delay as needed)
    time.sleep(1)

# One row per segment, one column per extracted field
processed_df = pd.DataFrame(processed_results)

# Preview the first rows of the structured result
processed_df.head()

Unnamed: 0,Name of the first person,Role of the first person,All text that the first person said,Name of the second person,Role of the second person,All text that the second person said
0,John McDonald,"Analyst, Truist Securities, Inc.","Hi. Good morning. Jeremy, I wanted to ask abou...",Jeremy Barnum,"Chief Financial Officer, JPMorganChase","Yeah. Good question, John, and welcome back, b..."
1,Mike Mayo,"Analyst, Wells Fargo Securities LLC","Hi. Simple and then more difficult, I guess. J...",Jamie Dimon,"Chairman & Chief Executive Officer, JPMorganChase",I do love what I do. And answering the second ...
2,Jim Mitchell,"Analyst, Seaport Global Securities LLC","Hey. Good morning. Maybe just on regulation, w...",Jeremy Barnum,"Chief Financial Officer, JPMorganChase","Hey, Jim. I mean, it's obviously something we'..."
3,Erika Najarian,"Analyst, UBS Securities LLC","Yes. Hi, good morning. Wanted to follow up on ...",Jeremy Barnum,"Chief Financial Officer, JPMorganChase","Right, Erika. Okay. You are tempting me with m..."
4,Erika,Unknown,"Does that conclude your question, Erika?",Jeremy Barnum,"Chief Financial Officer, JPMorganChase",Very good. We can go to the next question. Tha...


In [None]:
# Summarise columns, non-null counts and dtypes. extract_info returns all-None
# values for a segment it could not process, so null counts reveal failed rows.
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Name of the first person              10 non-null     object
 1   Role of the first person              10 non-null     object
 2   All text that the first person said   10 non-null     object
 3   Name of the second person             10 non-null     object
 4   Role of the second person             10 non-null     object
 5   All text that the second person said  10 non-null     object
dtypes: object(6)
memory usage: 612.0+ bytes


In [None]:
# Persist the structured Q&A table; index=False leaves the DataFrame index out of the file.
processed_df.to_csv("QNA_4Q24.csv", index=False)