In [26]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
import spacy
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch.nn.functional as F
from nltk.corpus import stopwords
import nltk
import re

In [18]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\razan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
stop_words = stopwords.words('english')

In [2]:
df = pd.read_csv("../Data/Cleaned_Statements_With_Speaker_&_Role.csv")

In [3]:
df

Unnamed: 0,Speaker,Role,Statement
0,Suhasini Chandramouli,"Director, Investor Relations","Good afternoon, and welcome to the Apple Q1 Fi..."
1,Suhasini Chandramouli,"Director, Investor Relations","My name is Suhasini Chandramouli, Director of ..."
2,Suhasini Chandramouli,"Director, Investor Relations",Today's call is being recorded.
3,Suhasini Chandramouli,"Director, Investor Relations","Speaking first today are Apple CEO, Tim Cook, ..."
4,Suhasini Chandramouli,"Director, Investor Relations","After that, we'll open the call to questions f..."
...,...,...,...
331,Tim Cook,Chief Executive Officer,"Ben, I think it's different for different type..."
332,Tim Cook,Chief Executive Officer,I mean you have very early adopter kind of use...
333,Tim Cook,Chief Executive Officer,And then you have people that are on the entir...
334,Tim Cook,Chief Executive Officer,And most people are between those two points.


In [4]:
sentiment_model_name = "ProsusAI/finbert"
sentiment_tokenizer = BertTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = BertForSequenceClassification.from_pretrained(sentiment_model_name)
sentiment_model.eval()

emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
emotition_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)
emotition_model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [5]:
sentiments = ["Positive", "Negative", "Neutral"]

emotions = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

In [6]:
def get_sentiments(list_of_dialogues):
    inputs = sentiment_tokenizer(
        list_of_dialogues, return_tensors="pt", padding=True, truncation=True
    )

    with torch.no_grad():
        output = sentiment_model(**inputs)

    logits = output.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    predicted_indices = torch.argmax(probabilities, dim=1)
    sentiment_labels = [sentiments[idx] for idx in predicted_indices.tolist()]

    confidence = probabilities[
        torch.arange(len(predicted_indices)), predicted_indices
    ].tolist()

    sentence_scores = (probabilities[:, 0] - probabilities[:, 1]).tolist()

    return (
        sentiment_labels,
        confidence,
        sentence_scores,
    )

In [7]:
def classify_emotions(statement):
    inputs = emotion_tokenizer(statement, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = emotition_model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)
        predicted_label = emotions[torch.argmax(probs).item()]
        return predicted_label       

In [8]:
df["Sentiment Label"], df["Confidence Rate"], df["Sentiment Score"] = get_sentiments(df["Statement"].tolist())

In [9]:
df["Emotion"] = df["Statement"].apply(classify_emotions)

In [10]:
df

Unnamed: 0,Speaker,Role,Statement,Sentiment Label,Confidence Rate,Sentiment Score,Emotion
0,Suhasini Chandramouli,"Director, Investor Relations","Good afternoon, and welcome to the Apple Q1 Fi...",Neutral,0.665896,0.307354,joy
1,Suhasini Chandramouli,"Director, Investor Relations","My name is Suhasini Chandramouli, Director of ...",Neutral,0.933584,0.000260,neutral
2,Suhasini Chandramouli,"Director, Investor Relations",Today's call is being recorded.,Neutral,0.938438,-0.019006,neutral
3,Suhasini Chandramouli,"Director, Investor Relations","Speaking first today are Apple CEO, Tim Cook, ...",Neutral,0.934187,-0.021369,neutral
4,Suhasini Chandramouli,"Director, Investor Relations","After that, we'll open the call to questions f...",Neutral,0.911465,0.056613,neutral
...,...,...,...,...,...,...,...
331,Tim Cook,Chief Executive Officer,"Ben, I think it's different for different type...",Neutral,0.907566,0.030952,neutral
332,Tim Cook,Chief Executive Officer,I mean you have very early adopter kind of use...,Neutral,0.881435,0.077339,neutral
333,Tim Cook,Chief Executive Officer,And then you have people that are on the entir...,Neutral,0.883099,-0.045996,neutral
334,Tim Cook,Chief Executive Officer,And most people are between those two points.,Neutral,0.716154,-0.148352,neutral


In [11]:
nlp = spacy.load("en_core_web_md")

In [12]:
relevant_entities = [
    "ORG",     
    "MONEY",    
    "PERCENT",  
    "CARDINAL", 
    "GPE",     
    "DATE",    
    "EVENT",   
    "PRODUCT",  
    "LAW"
]

In [23]:
def remove_stopwords(sentence, stopwords=None):
    pattern = r"\b(?:" + "|".join(re.escape(word) for word in stopwords) + r")\b"

    cleaned_sentence = re.sub(pattern, "", sentence, flags=re.IGNORECASE)

    return " ".join(cleaned_sentence.split())

In [24]:
def remove_stop_words(text):
    text = remove_stopwords(text, stopwords = stop_words)
    return text.strip()

def get_strategic_focus_areas(text):
    doc = nlp(text)

    entities = []
    
    for ent in doc.ents:
        if ent.label_ in relevant_entities:
            entities.append(remove_stop_words(ent.text))

    if entities:
        return ", ".join(entities)
    else:
        return "No Strategic Focus Areas"

In [27]:
df["Strategic Focus Areas"] = df["Statement"].apply(get_strategic_focus_areas)

In [28]:
df

Unnamed: 0,Speaker,Role,Statement,Sentiment Label,Confidence Rate,Sentiment Score,Emotion,Strategic Focus Areas
0,Suhasini Chandramouli,"Director, Investor Relations","Good afternoon, and welcome to the Apple Q1 Fi...",Neutral,0.665896,0.307354,joy,Apple Q1 Fiscal Year
1,Suhasini Chandramouli,"Director, Investor Relations","My name is Suhasini Chandramouli, Director of ...",Neutral,0.933584,0.000260,neutral,Investor Relations
2,Suhasini Chandramouli,"Director, Investor Relations",Today's call is being recorded.,Neutral,0.938438,-0.019006,neutral,Today
3,Suhasini Chandramouli,"Director, Investor Relations","Speaking first today are Apple CEO, Tim Cook, ...",Neutral,0.934187,-0.021369,neutral,"today, Apple, CFO"
4,Suhasini Chandramouli,"Director, Investor Relations","After that, we'll open the call to questions f...",Neutral,0.911465,0.056613,neutral,No Strategic Focus Areas
...,...,...,...,...,...,...,...,...
331,Tim Cook,Chief Executive Officer,"Ben, I think it's different for different type...",Neutral,0.907566,0.030952,neutral,No Strategic Focus Areas
332,Tim Cook,Chief Executive Officer,I mean you have very early adopter kind of use...,Neutral,0.881435,0.077339,neutral,No Strategic Focus Areas
333,Tim Cook,Chief Executive Officer,And then you have people that are on the entir...,Neutral,0.883099,-0.045996,neutral,No Strategic Focus Areas
334,Tim Cook,Chief Executive Officer,And most people are between those two points.,Neutral,0.716154,-0.148352,neutral,two


In [31]:
df.to_csv("../Data/Labeled_Statements_With_Speaker_&_Role.csv", index = False)