In [1]:
from docx import Document
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk

In [2]:
# Fetch NLTK's "punkt" package; presumably the model sent_tokenize relies on.
# NOTE(review): confirm the resource name required by the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\razan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Relative path to the transcript .docx; it is resolved against the kernel's
# current working directory.
document_path = "../Data/Apple Con Call Transcript.docx"

In [4]:
# Load the transcript once; the cells below iterate over doc.paragraphs.
doc = Document(document_path)

In [5]:
# Find the "Company Participants" heading and capture the paragraph that
# immediately follows it, split into one entry per line.
company_participants = []
company_participants_index = -1  # -1 means the heading has not been seen yet

for position, para in enumerate(doc.paragraphs):
    if para.text == "Company Participants":
        # Remember where the heading sits; the list is in the next paragraph.
        company_participants_index = position
    elif company_participants_index != -1 and position == company_participants_index + 1:
        company_participants.append(para.text.split("\n"))

In [6]:
company_participants[0]

['Suhasini Chandramouli - Director, Investor Relations',
 'Tim Cook - Chief Executive Officer',
 'Kevan Parekh - Chief Financial Officer']

In [7]:
# Find the "Conference Call Participants" heading and capture the paragraph
# that immediately follows it, split into one entry per line.
analysts = []
conference_call_participants_index = -1  # -1 means the heading has not been seen yet

for position, para in enumerate(doc.paragraphs):
    if para.text == "Conference Call Participants":
        # Remember where the heading sits; the list is in the next paragraph.
        conference_call_participants_index = position
    elif (
        conference_call_participants_index != -1
        and position == conference_call_participants_index + 1
    ):
        analysts.append(para.text.split("\n"))

In [8]:
analysts[0]

['Erik Woodring - Morgan Stanley',
 'Ben Reitzes - Melius',
 'Michael Ng - Goldman Sachs',
 'Amit Daryanani - Evercore',
 'Wamsi Mohan - Bank of America',
 'Samik Chatterjee - JPMorgan',
 'David Vogt - UBS',
 'Krish Sankar - TD Cowen',
 'Richard Kramer - Arete Research',
 'Atif Malik - Citi',
 'Ben Bollin - Cleveland Research Company']

In [9]:
def extract_name_role(pairs):
    """Build a ``{name: role}`` mapping from "Name - Role" strings.

    Each entry is split on the FIRST " - " only, so a role that itself
    contains " - " (e.g. "VP - Sales") is kept whole instead of raising.
    Blank or whitespace-only entries are skipped.

    Args:
        pairs: Iterable of strings shaped like "Name - Role".

    Returns:
        dict mapping the stripped name to the stripped role. If a name
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank entry contains no " - " separator.
    """
    name_role_dict = {}
    for pair in pairs:
        if not pair.strip():
            # Stray empty lines in the participants paragraph carry no data.
            continue
        name, separator, role = pair.partition(" - ")
        if not separator:
            raise ValueError(f"Expected 'Name - Role', got: {pair!r}")
        name_role_dict[name.strip()] = role.strip()
    return name_role_dict

In [10]:
# company_participants holds one list per matched heading; use the first one.
company_participants_dictionary = extract_name_role(company_participants[0])

In [11]:
company_participants_dictionary

{'Suhasini Chandramouli': 'Director, Investor Relations',
 'Tim Cook': 'Chief Executive Officer',
 'Kevan Parekh': 'Chief Financial Officer'}

In [12]:
# Iterating a dict yields its keys, i.e. the participant names in insertion order.
company_participants_names = list(company_participants_dictionary)

In [13]:
company_participants_names

['Suhasini Chandramouli', 'Tim Cook', 'Kevan Parekh']

In [14]:
# Keep only the name part of each "Name - Firm" entry. Split on the spaced
# " - " separator (the same one extract_name_role uses) so hyphenated names
# such as "Jean-Luc Picard" are not cut at their internal hyphen.
analyst_names = [entry.partition(" - ")[0].strip() for entry in analysts[0]]

In [15]:
analyst_names

['Erik Woodring',
 'Ben Reitzes',
 'Michael Ng',
 'Amit Daryanani',
 'Wamsi Mohan',
 'Samik Chatterjee',
 'David Vogt',
 'Krish Sankar',
 'Richard Kramer',
 'Atif Malik',
 'Ben Bollin']

In [16]:
# Every label that can open a speaker turn: company staff, analysts, and the operator.
names = [*company_participants_names, *analyst_names, "Operator"]

In [17]:
names

['Suhasini Chandramouli',
 'Tim Cook',
 'Kevan Parekh',
 'Erik Woodring',
 'Ben Reitzes',
 'Michael Ng',
 'Amit Daryanani',
 'Wamsi Mohan',
 'Samik Chatterjee',
 'David Vogt',
 'Krish Sankar',
 'Richard Kramer',
 'Atif Malik',
 'Ben Bollin',
 'Operator']

In [18]:
def extract_dialogues(names, paragraphs=None):
    """Group transcript paragraphs into ``(speaker, statement)`` turns.

    A paragraph whose stripped text exactly matches an entry in ``names``
    starts a new turn. The non-blank paragraphs that follow are joined with
    single spaces to form that speaker's statement. Any text before the first
    speaker line is discarded.

    Args:
        names: Collection of speaker names that mark the start of a turn.
        paragraphs: Optional iterable of objects exposing a ``text``
            attribute. Defaults to ``doc.paragraphs`` from the notebook's
            global ``doc``.

    Returns:
        list of ``(speaker, statement)`` tuples in document order. Turns
        that contain no text are omitted.
    """
    if paragraphs is None:
        paragraphs = doc.paragraphs

    # Set membership keeps the per-paragraph speaker check constant-time.
    speaker_names = set(names)

    dialogues = []
    current_speaker = None
    current_dialogue = []

    for para in paragraphs:
        line = para.text.strip()

        if not line:
            # Blank paragraphs would otherwise add doubled/leading spaces
            # to the joined statement.
            continue

        if line in speaker_names:
            # A new speaker label closes the previous turn, if it has text.
            if current_speaker and current_dialogue:
                dialogues.append((current_speaker, " ".join(current_dialogue)))

            current_speaker = line
            current_dialogue = []
        else:
            current_dialogue.append(line)

    # Flush the final turn, which has no following speaker label to close it.
    if current_speaker and current_dialogue:
        dialogues.append((current_speaker, " ".join(current_dialogue)))

    return dialogues

In [19]:
# List of (speaker, statement) tuples, one per speaker turn.
result = extract_dialogues(names)

In [20]:
result

[('Suhasini Chandramouli',
  "Good afternoon, and welcome to the Apple Q1 Fiscal Year 2025 Earnings Conference Call. My name is Suhasini Chandramouli, Director of Investor Relations. Today's call is being recorded. Speaking first today are Apple CEO, Tim Cook, and he will be followed by CFO, Kevan Parekh. After that, we'll open the call to questions from analysts. Please note that some of the information you'll hear during our discussion today will consist of forward-looking statements, including, without limitation, those regarding revenue, gross margin, operating expenses, other income and expense, taxes, capital allocation, and future business outlook, including the potential impact of macroeconomic conditions on the company's business and results of operations. These statements involve risks and uncertainties that may cause actual results or trends to differ materially from our forecast. For more information, please refer to the risk factors discussed in Apple's most recently filed

In [21]:
# Tabulate the turns: one row per (speaker, statement) tuple.
df = pd.DataFrame(result, columns=["Speaker", "Statement"])

In [22]:
df

Unnamed: 0,Speaker,Statement
0,Suhasini Chandramouli,"Good afternoon, and welcome to the Apple Q1 Fi..."
1,Tim Cook,"Thank you, Suhasini. Good afternoon, everyone,..."
2,Kevan Parekh,"Thanks Tim, and good afternoon everyone. I'm g..."
3,Suhasini Chandramouli,"Thank you, Kevin. We ask that you limit yourse..."
4,Operator,"Certainly, we will go ahead and take our first..."
...,...,...
83,Operator,Our last question is from Ben Bollin from Clev...
84,Ben Bollin,"Good evening, everyone. Thanks for taking the ..."
85,Tim Cook,"Yes. Ben, I think it's different for different..."
86,Ben Bollin,"That’s it from me. Thanks, Tim."


In [23]:
def map_role(name):
    """Return the role label for a speaker name.

    Company participants get their title from
    ``company_participants_dictionary``, names listed in ``analyst_names``
    get "Analyst", and every other name is labelled 'Operator'.
    """
    # Company staff take precedence: they carry a specific title.
    if name in company_participants_dictionary:
        return company_participants_dictionary[name]
    return "Analyst" if name in analyst_names else 'Operator'

In [24]:
# Add a Role column by passing each speaker name through map_role.
df['Role'] = df['Speaker'].apply(map_role)

In [25]:
df

Unnamed: 0,Speaker,Statement,Role
0,Suhasini Chandramouli,"Good afternoon, and welcome to the Apple Q1 Fi...","Director, Investor Relations"
1,Tim Cook,"Thank you, Suhasini. Good afternoon, everyone,...",Chief Executive Officer
2,Kevan Parekh,"Thanks Tim, and good afternoon everyone. I'm g...",Chief Financial Officer
3,Suhasini Chandramouli,"Thank you, Kevin. We ask that you limit yourse...","Director, Investor Relations"
4,Operator,"Certainly, we will go ahead and take our first...",Operator
...,...,...,...
83,Operator,Our last question is from Ben Bollin from Clev...,Operator
84,Ben Bollin,"Good evening, everyone. Thanks for taking the ...",Analyst
85,Tim Cook,"Yes. Ben, I think it's different for different...",Chief Executive Officer
86,Ben Bollin,"That’s it from me. Thanks, Tim.",Analyst


In [26]:
# Explode each turn into one row per sentence, carrying the speaker and role along.
rows = [
    {
        'Speaker': turn['Speaker'],
        'Role': turn['Role'],
        'Statement': sentence,
    }
    for _, turn in df.iterrows()
    for sentence in sent_tokenize(turn['Statement'])
]

df = pd.DataFrame(rows)

In [27]:
df

Unnamed: 0,Speaker,Role,Statement
0,Suhasini Chandramouli,"Director, Investor Relations","Good afternoon, and welcome to the Apple Q1 Fi..."
1,Suhasini Chandramouli,"Director, Investor Relations","My name is Suhasini Chandramouli, Director of ..."
2,Suhasini Chandramouli,"Director, Investor Relations",Today's call is being recorded.
3,Suhasini Chandramouli,"Director, Investor Relations","Speaking first today are Apple CEO, Tim Cook, ..."
4,Suhasini Chandramouli,"Director, Investor Relations","After that, we'll open the call to questions f..."
...,...,...,...
467,Tim Cook,Chief Executive Officer,And most people are between those two points.
468,Tim Cook,Chief Executive Officer,And so I do think there were lots of units tha...
469,Ben Bollin,Analyst,That’s it from me.
470,Ben Bollin,Analyst,"Thanks, Tim."


In [28]:
# Persist the sentence-level table; index=False leaves the row index out of the CSV.
df.to_csv("../Data/Statements_With_Speaker_&_Role.csv", index=False)