# Step 1: Create functions to extract relevant sentences 

In this section, we define several functions to extract pertinent sentences along with their context. Additionally, we establish various categories to facilitate a thorough analysis of the text.

In [32]:
# Import modules
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from seaborn import set_style
set_style("whitegrid")

import os
import json
import re

from bs4 import BeautifulSoup

import spacy

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

from textblob import TextBlob

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer

In [33]:
# Load the language library
nlp = spacy.load('en_core_web_lg')

<span style="font-family: Helvetica, sans-serif; font-size: 16px; font-weight: bold;">1a: Named Entity Recognition (NER)</span>

In this subsection, we employ Named Entity Recognition (NER) to identify and extract sentences containing entities such as 'MONEY', 'PERCENT', and 'QUANTITY'.

In [34]:
for label in nlp.get_pipe("ner").labels:
    print(f'{label:{15}} - {spacy.explain(label)}')

CARDINAL        - Numerals that do not fall under another type
DATE            - Absolute or relative dates or periods
EVENT           - Named hurricanes, battles, wars, sports events, etc.
FAC             - Buildings, airports, highways, bridges, etc.
GPE             - Countries, cities, states
LANGUAGE        - Any named language
LAW             - Named documents made into laws.
LOC             - Non-GPE locations, mountain ranges, bodies of water
MONEY           - Monetary values, including unit
NORP            - Nationalities or religious or political groups
ORDINAL         - "first", "second", etc.
ORG             - Companies, agencies, institutions, etc.
PERCENT         - Percentage, including "%"
PERSON          - People, including fictional
PRODUCT         - Objects, vehicles, foods, etc. (not services)
QUANTITY        - Measurements, as of weight or distance
TIME            - Times smaller than a day
WORK_OF_ART     - Titles of books, songs, etc.


In [35]:
# Relevant Entities
RELEVANT_ENTITIES = {'MONEY', 'PERCENT', 'QUANTITY'}

In [36]:
# Extract sentences (with context) that contain one of the relevant entities
def extract_sent_ner(doc, relevant_entities):
    filtered_indices = set()
    sent_lst = list(doc.sents)
    for i, sent in enumerate(sent_lst):
        if any(ent.label_ in relevant_entities for ent in sent.ents):
            filtered_indices.update(range(max(i - 1, 0),min(i + 2, len(sent_lst))))
            
    filtered_sentences = []
    if filtered_indices:
        sorted_indices = sorted(filtered_indices)
        current_para = sent_lst[sorted_indices[0]].text
        for i in range(1, len(sorted_indices)):
            if sorted_indices[i] == sorted_indices[i-1] + 1:
                current_para += ' ' + sent_lst[sorted_indices[i]].text
            else:
                filtered_sentences.append(current_para)
                current_para = sent_lst[sorted_indices[i]].text
        filtered_sentences.append(current_para)  # Append the last accumulated paragraph
    return filtered_sentences

<span style="font-family: Helvetica, sans-serif; font-size: 16px; font-weight: bold;">1b: Key Phrase Extraction</span>

In this subsection, we categorize performance into seven distinct groups and define each group using a specific list of keywords. Additionally, we employ lemmatization techniques to identify and extract all relevant sentences.

In [37]:
# Keyword Lists by Category
financial_performance_keywords = {"revenue", "profit", "loss", "earnings", "margin", "expense", "cost", "dividend", "sales"}

market_position_keywords = {"market", "share", "grow", "growth", "decline", "competitive", "demand", "supply", "expansion", "contraction"}

strategic_direction_keywords = {"strategy", "acquisition", "merger", "investment", "divestiture", "innovation", "product", "launch", "development"}

operational_aspects_keywords = {"production", "capacity", "inventory", "distribution", "facility", "outlet", "store", "operation"}

financial_indicators_keywords = {"EBITDA", "cash flow", "capital", "asset", "liability", "equity", "return", "forecast", "guidance"}

risks_challenges_keywords = {"risk", "challenge", "uncertain", "uncertainty", "regulation", "compliance", "legal", "issue", "problem"}

economic_factors_keywords = {"economic", "recession", "inflation", "interest", "rate", "currency", "exchange"}

categories = {'financial_performance_keywords': financial_performance_keywords, 
                'market_position_keywords': market_position_keywords,
                'strategic_direction_keywords': strategic_direction_keywords, 
                'operational_aspects_keywords': operational_aspects_keywords,
                'financial_indicators_keywords': financial_indicators_keywords, 
                'risks_challenges_keywords': risks_challenges_keywords,
                'economic_factors_keywords': economic_factors_keywords}

In [38]:
# Lemmatization function
def lemmatize_keyword(keyword):
    doc = nlp(keyword)
    return [token.lemma_ for token in doc]

# Use lemmatization to update lists of keywords
for group, keywords in categories.items():
    categories[group] = set()
    for keyword in keywords:
        categories[group].add(lemmatize_keyword(keyword)[0])

In [39]:
for group in categories:
    print(f'{group} :')
    print(f'{categories[group]}')
    print()
    print()

financial_performance_keywords :
{'sale', 'cost', 'profit', 'dividend', 'earning', 'revenue', 'loss', 'margin', 'expense'}


market_position_keywords :
{'supply', 'expansion', 'growth', 'demand', 'grow', 'contraction', 'decline', 'market', 'share', 'competitive'}


strategic_direction_keywords :
{'investment', 'innovation', 'divestiture', 'strategy', 'merger', 'acquisition', 'product', 'launch', 'development'}


operational_aspects_keywords :
{'store', 'facility', 'operation', 'outlet', 'inventory', 'production', 'capacity', 'distribution'}


financial_indicators_keywords :
{'liability', 'ebitda', 'asset', 'return', 'cash', 'forecast', 'guidance', 'capital', 'equity'}


risks_challenges_keywords :
{'challenge', 'problem', 'legal', 'uncertainty', 'uncertain', 'risk', 'regulation', 'compliance', 'issue'}


economic_factors_keywords :
{'exchange', 'recession', 'interest', 'economic', 'rate', 'currency', 'inflation'}




In [40]:
# Extract sentences with context by categories using keywords
def extract_sent_keywords(doc, keywords):
    filtered_indices = set()
    sent_lst = list(doc.sents)
    for i, sent in enumerate(sent_lst):
        if any(token.lemma_.lower() in keywords for token in sent):
            filtered_indices.update(range(max(i - 1, 0),min(i + 2, len(sent_lst))))
            
    filtered_sentences = []
    if filtered_indices:
        sorted_indices = sorted(filtered_indices)
        current_para = sent_lst[sorted_indices[0]].text
        for i in range(1, len(sorted_indices)):
            if sorted_indices[i] == sorted_indices[i-1] + 1:
                current_para += ' ' + sent_lst[sorted_indices[i]].text
            else:
                filtered_sentences.append(current_para)
                current_para = sent_lst[sorted_indices[i]].text
        filtered_sentences.append(current_para)  # Append the last accumulated paragraph
    return filtered_sentences

## Step 2: Sentiment Analysis

In this section, we define functions and perform the sentiment analysis on relevant sentences and record sentiment scores

<span style="font-family: Helvetica, sans-serif; font-size: 16px; font-weight: bold;">2a: finBERT</span>

In [41]:
def finbert(sentences):
    finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
    tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

    nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)
    scores = []
    results = nlp(list(sentences), truncation = True, max_length = 512)
    for i, sent in enumerate(sentences):
        if results[i]['label'] == 'Negative':
            results[i]['score'] *= -1
        elif results[i]['label'] == 'Neutral':
            results[i]['score'] = (results[i]['score'] - 0.5)*0.2       #One can also set it to be zero.
        scores.append(results[i]['score'])
    return sum(scores)/len(scores) if scores else 0

<span style="font-family: Helvetica, sans-serif; font-size: 16px; font-weight: bold;">2b: TextBlob</span>

In [2]:
def textblob(sentences):
    polarity = []
    subjectivity = []
    for sent in sentences:
        blob = TextBlob(sent)
        sentiment = blob.sentiment
        polarity.append(sentiment.polarity)
        subjectivity.append(sentiment.subjectivity)
    if polarity:
        return [sum(polarity) / len(polarity), sum(subjectivity) / len(subjectivity)]
    else:
        return [0, 0]

<span style="font-family: Helvetica, sans-serif; font-size: 16px; font-weight: bold;">2c: VADER</span>

In [43]:
def vader(sentences):
    sia = SentimentIntensityAnalyzer()
    scores = []
    for sent in sentences:
        sentiment = sia.polarity_scores(sent)
        scores.append(sentiment['compound'])
    return sum(scores) / len(scores) if scores else 0

# Step 3: Create Data Frame (Date, File Name, File Text, SentimentScores)

In this section, we create a data frame and record sentiment scores.

In [44]:
ticker_symbols = ['VZ', 'AMZN', 'CAT', 'AAPL', 'PANW', 'PM', 'CMCSA', 'PFE', 'UBER', 'SYK', 'MMC', 'PEP',
                  'PLD', 'GOOG', 'AMT', 'ADI', 'UPS', 'GILD', 'MS', 'WMT', 'CVX', 'TXN', 'KLAC', 'INTC',
                  'GS', 'BLK', 'LIN', 'MA', 'MU', 'HD', 'UNP', 'AMAT', 'LLY', 'REGN', 'LMT', 'CI', 'WFC',
                  'MRK', 'JNJ', 'QCOM', 'BAC', 'TSLA', 'CRM', 'COST', 'DHR', 'TMO', 'MSFT', 'META', 'BSX',
                  'ELV', 'ABBV', 'MCD', 'PGR', 'NFLX', 'ACN', 'T', 'ORCL', 'IBM', 'ADP', 'AMD', 'PG', 'XOM',
                  'LRCX', 'TJX', 'SBUX', 'PH', 'MDLZ', 'MDT', 'ABT', 'NEE', 'NOW', 'RTX', 'HON', 'BA', 'GE',
                  'INTU', 'NVDA', 'AMGN', 'UNH', 'DIS', 'DE', 'CSCO', 'CVS', 'KO', 'AXP', 'FI', 'AVGO', 'ISRG',
                  'ETN', 'BMY', 'NKE', 'BKNG', 'CB', 'ADBE', 'C', 'V', 'VRTX', 'COP', 'JPM']
len(ticker_symbols)

99

In [48]:
# Create data frame
# Basic information

# date = []
f_names = []
# quarter = []
# year = []
# company_symbol = []
# earning_call_text = []

# Sentiment Scores
quant_score_NER = []
financial_performance_score = []
market_position_score = []
strategic_direction_score = []
operational_aspects_score = []
financial_indicators_score = []
risks_challenges_score = []
economic_factors_score = []

# json loading issue
json_prob = []

# year problem issue
year_prob = []

In [49]:
# Traverse files and store sentiment scores
for ticker in ticker_symbols:
    # Define the folder path
    directory = 'Earnings Call Transcripts/' + ticker

    # List all files in the folder
    files = os.listdir(directory)

    # Loop over each file
    for filename in files:
        if filename == '.DS_Store':
            continue  # Skip this file
    
        # Get the full path of the file
        filepath = os.path.join(directory, filename)

        # Check if the path is a regular file
        if os.path.isfile(filepath):    
            with open(os.path.join(directory, filename), 'r') as file:
                try:
                    json_file = json.load(file)
                except:
                    json_prob.append(filename)
                    continue
            try:
                year = int(filename[-4:])
            except:
                year_prob.append(filename)
                continue
            if year < 2017:
                continue
            # Get Filenames, Symbol, quarter and year from filename
            f_names.append(filename)
            # company_symbol.append(re.search(r'([A-Z]+Q)', filename).group()[:-1])
            # year.append(filename[-4:])
            # quarter.append(filename[-6:-4])

            # Get date
            # d = re.search(r'(\d{4}-\d{2}-\d{2}T)', json_file['data']['attributes']['publishOn']).group()[:-1]
            # date.append(d)
            # Get Text
            soup = BeautifulSoup(json_file['data']['attributes']['content'], 'html.parser')
            ect = ''
            for p in soup.find_all('p'):
                ect += p.text
            # earning_call_text.append(ect)

            # Get Sentiment Scores
            doc = nlp(ect)
            quant_score_NER.append(vader(extract_sent_ner(doc, RELEVANT_ENTITIES)))

            fp = categories['financial_performance_keywords']
            mp = categories['market_position_keywords']
            sd = categories['strategic_direction_keywords']
            oa = categories['operational_aspects_keywords']
            fi = categories['financial_indicators_keywords']
            rc = categories['risks_challenges_keywords']
            ef = categories['economic_factors_keywords']

            financial_performance_score.append(vader(extract_sent_keywords(doc, fp)))
            market_position_score.append(vader(extract_sent_keywords(doc, mp)))
            strategic_direction_score.append(vader(extract_sent_keywords(doc, sd)))
            operational_aspects_score.append(vader(extract_sent_keywords(doc, oa)))
            financial_indicators_score.append(vader(extract_sent_keywords(doc, fi)))
            risks_challenges_score.append(vader(extract_sent_keywords(doc, rc)))
            economic_factors_score.append(vader(extract_sent_keywords(doc, ef)))

In [50]:
# Build Dataframe
data_df = pd.DataFrame({'f_names' : f_names, 'quant_score_NER' : quant_score_NER,
                        'financial_performance_score' : financial_performance_score, 
                        'market_position_score' : market_position_score,
                        'strategic_direction_score' : strategic_direction_score,
                        'operational_aspects_score' : operational_aspects_score,
                        'financial_indicators_score' : financial_indicators_score,
                        'risks_challenges_score' : risks_challenges_score,
                        'economic_factors_score' : economic_factors_score})

In [51]:
data_df

Unnamed: 0,f_names,quant_score_NER,financial_performance_score,market_position_score,strategic_direction_score,operational_aspects_score,financial_indicators_score,risks_challenges_score,economic_factors_score
0,VZQ42023,0.641895,0.799723,0.773718,0.684652,0.668230,0.610928,0.533500,0.630167
1,VZQ42022,0.500700,0.656455,0.695640,0.702053,0.595356,0.775747,0.500325,0.601088
2,VZQ12022,0.710535,0.645537,0.797707,0.717752,0.508138,0.654721,0.212983,0.682775
3,VZQ22023,0.663882,0.661021,0.726271,0.646467,0.725710,0.734340,0.289257,0.644245
4,VZQ32023,0.610950,0.677594,0.711575,0.699431,0.588673,0.799629,0.424017,0.605820
...,...,...,...,...,...,...,...,...,...
2700,JPMQ42021,0.602292,0.527262,0.642297,0.633158,0.453682,0.589147,0.016520,0.511783
2701,JPMQ42019,0.521281,0.661630,0.783019,0.639760,0.599325,0.649774,0.570791,0.609950
2702,JPMQ42017,0.711100,0.676751,0.713940,0.656974,0.367900,0.558776,0.669686,0.669714
2703,JPMQ42018,0.526721,0.515767,0.640264,0.508442,0.024400,0.630465,0.248315,0.500603


In [52]:
data_df.to_csv('sentiment_scores.csv', index=False)