In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import re

In [2]:
# Make sure the NLTK "stopwords" corpus is available locally; it is read by
# the stopwords.words("english") calls in later cells.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\razan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# NLTK's English stop-word list.
# NOTE(review): `stop_words` is not referenced by any later cell visible in
# this notebook (custom_stopwords is built from its own stopwords.words call)
# — confirm whether this variable is still needed.
stop_words = stopwords.words('english')

In [4]:
# Load the statement table. The path is relative to the notebook's working
# directory. The rendered preview shows the columns "Unnamed: 0", "Speaker",
# "Role" and "Statement".
df = pd.read_csv("../Data/Statement_With_Speaker_&_Role.csv")

In [5]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,Speaker,Role,Statement
0,Suhasini Chandramouli,"Director, Investor Relations","Good afternoon, and welcome to the Apple Q1 Fi..."
1,Suhasini Chandramouli,"Director, Investor Relations","My name is Suhasini Chandramouli, Director of ..."
2,Suhasini Chandramouli,"Director, Investor Relations",Today's call is being recorded.
3,Suhasini Chandramouli,"Director, Investor Relations","Speaking first today are Apple CEO, Tim Cook, ..."
4,Suhasini Chandramouli,"Director, Investor Relations","After that, we'll open the call to questions f..."
...,...,...,...
467,Tim Cook,Chief Executive Officer,And most people are between those two points.
468,Tim Cook,Chief Executive Officer,And so I do think there were lots of units tha...
469,Ben Bollin,Analyst,That’s it from me.
470,Ben Bollin,Analyst,"Thanks, Tim."


In [6]:
def remove_stopwords(sentence, stopwords=None):
    """Remove stop words and stop phrases from ``sentence``, ignoring case.

    Entries are matched between regex word boundaries (``\\b``), so a stop
    word is only removed when it appears as a whole word, not as part of a
    longer word.

    Args:
        sentence: Text to filter.
        stopwords: Iterable of words or multi-word phrases to remove.
            ``None`` or an empty iterable means nothing is removed.

    Returns:
        The filtered text, with every run of whitespace collapsed to a single
        space and leading/trailing whitespace stripped.
    """
    if stopwords is None:
        stopwords = ()

    # Regex alternation takes the first alternative that matches, so a short
    # word ("as") listed before a phrase that starts with it
    # ("as far as i know") would stop the phrase from ever matching.
    # Sorting longest-first avoids that. The alphabetical tie-break makes the
    # pattern, and so the result, independent of set iteration order.
    ordered = sorted({word for word in stopwords if word}, key=lambda w: (-len(w), w))

    if not ordered:
        # Nothing to remove: still normalise whitespace like the main path.
        return " ".join(sentence.split())

    pattern = r"\b(?:" + "|".join(re.escape(word) for word in ordered) + r")\b"

    cleaned_sentence = re.sub(pattern, "", sentence, flags=re.IGNORECASE)

    return " ".join(cleaned_sentence.split())

In [7]:
def replace_apostrophe(text):
    """Return ``text`` with each curly apostrophe (’) swapped for a straight one (')."""
    return re.sub(pattern="’", repl="'", string=text)

In [8]:
# NLTK's English stop words plus filler words and phrases to strip from the
# statements.
#
# clean_text() converts curly apostrophes (’) in the text to straight ones (')
# before it removes stop words. Phrases written here with a curly apostrophe
# could therefore never match, so each phrase is normalised to the straight
# form as the set is built.
custom_stopwords = set(stopwords.words("english")) | {
    phrase.replace("’", "'")
    for phrase in [
        "um", "uh", "okay", "well", "you know", "i mean", "like", "think", "question", "questions", "so", "actually", "basically",
        "just", "right", "sure", "yeah", "yep", "nope", "great", "thanks", "thank you", "good morning",
        "good afternoon", "good evening", "ladies and gentlemen", "everyone", "folks", "team",
        "welcome", "pleasure", "appreciate", "introduction", "moving on", "next slide", "let’s move on",
        "let’s begin", "let’s get started", "first of all", "secondly", "last but not least", "before we start",
        "before we begin", "without further ado", "i’d like to", "we’d like to", "going forward",
        "as you can see", "as mentioned earlier", "as we discussed", "as previously stated",
        "as i said", "as we said", "again", "also", "furthermore", "moreover", "in addition",
        "of course", "obviously", "clearly", "frankly", "honestly", "to be honest", "to be frank",
        "to be clear", "as far as i know", "as far as we know", "our perspective", "our standpoint",
        "to some extent", "at the end of the day", "bottom line", "high level", "big picture",
        "to wrap up", "to summarize", "in conclusion", "in summary", "before i hand it over",
        "before we wrap up", "any questions", "do you have any questions", "q&a", "let’s take questions",
        "moving to the next question", "let me check", "we will get back to you", "we will follow up",
        "circle back", "touch base", "take this offline", "offline discussion", "follow-up",
        "let’s revisit", "we are looking into it", "we are working on it", "stay tuned", "more details to come",
        "we don’t have that information right now", "i don’t have that data", "good question", "great question",
        "let me clarify", "let me add", "if i may", "if you will", "if you look at", "when you think about it",
        "at this point in time", "right now", "currently", "as of now", "in terms of", "with respect to",
        "regarding", "pertaining to", "relative to", "looking at", "focusing on", "from a standpoint of",
        "from a perspective of", "from an angle of", "talking about", "discussing", "speaking of",
        "with that said", "having said that", "on that note", "before i forget", "one more thing", "please", "operator"
    ]
}

In [9]:
def clean_text(text):
    """Normalise one statement into a lowercase, stop-word-free string.

    Steps, in order:
      1. Lowercase the text.
      2. Fuse selected multi-token terms into single tokens (for example
         "apple intelligence" -> "appleintelligence", "iphone 15" ->
         "iphone15") and expand "u.s." / "u.k." / "i.e.".
      3. Convert curly apostrophes to straight ones.
      4. Remove every entry of ``custom_stopwords``.
      5. Drop every character that is not a word character, whitespace,
         ``%``, ``$`` or ``.``; drop a final trailing period; collapse
         whitespace.

    Args:
        text: The raw statement string.

    Returns:
        The cleaned string, stripped of leading and trailing whitespace.
    """
    # (pattern, replacement) pairs applied in order before stop-word removal.
    term_rewrites = (
        (r"\bapple intelligence\b", "appleintelligence"),
        (r"\byear-over-year\b", "yearoveryear"),
        (r"\ball-time\b", "alltime"),
        (r"\ball-in-one\b", "allinone"),
        (r"(?<!\w)(u\.s\.)(?!\w)", "unitedstates"),
        (r"(?<!\w)(u\.k\.)(?!\w)", "unitedkingdom"),
        (r"(?<!\w)(i\.e\.)(?!\w)", "that is"),
        (r"\biphone (1[0-6]|[1-9])\b", lambda m: f"iphone{m.group(1)}"),
    )
    # (pattern, replacement) pairs applied in order after stop-word removal.
    tidy_rewrites = (
        (r"[^\w\s%$.]", ""),
        (r"\.$", ""),
        (r"\s+", " "),
    )

    result = text.lower()
    for regex, replacement in term_rewrites:
        result = re.sub(regex, replacement, result)

    result = remove_stopwords(replace_apostrophe(result), stopwords = custom_stopwords)

    for regex, replacement in tidy_rewrites:
        result = re.sub(regex, replacement, result)

    return result.strip()

In [10]:
# Add a helper column holding the cleaned form of each statement, computed by
# applying clean_text to every value of "Statement".
df["Cleaned Statement"] = df["Statement"].apply(clean_text)

In [11]:
# Keep only rows whose cleaned statement has at least 3 whitespace-separated
# tokens, then renumber the index from 0 (the old index is discarded).
df = df[df["Cleaned Statement"].str.split().apply(len) >= 3].reset_index(drop = True)

In [12]:
# Remove the helper column now that the length filter has used it.
# This mutates `df` in place, so re-running the cell on the same frame raises
# KeyError because the column is already gone.
df.drop("Cleaned Statement", axis = 1, inplace = True)

In [13]:
# Display the filtered frame as the cell output.
df

Unnamed: 0,Speaker,Role,Statement
0,Suhasini Chandramouli,"Director, Investor Relations","Good afternoon, and welcome to the Apple Q1 Fi..."
1,Suhasini Chandramouli,"Director, Investor Relations","My name is Suhasini Chandramouli, Director of ..."
2,Suhasini Chandramouli,"Director, Investor Relations",Today's call is being recorded.
3,Suhasini Chandramouli,"Director, Investor Relations","Speaking first today are Apple CEO, Tim Cook, ..."
4,Suhasini Chandramouli,"Director, Investor Relations","After that, we'll open the call to questions f..."
...,...,...,...
331,Tim Cook,Chief Executive Officer,"Ben, I think it's different for different type..."
332,Tim Cook,Chief Executive Officer,I mean you have very early adopter kind of use...
333,Tim Cook,Chief Executive Officer,And then you have people that are on the entir...
334,Tim Cook,Chief Executive Officer,And most people are between those two points.


In [14]:
# Write the filtered statements to CSV. index=False leaves the DataFrame index
# out of the file. The path is relative to the notebook's working directory.
df.to_csv("../Data/Cleaned_Statements_With_Speaker_&_Role.csv", index=False) 