In [1]:
%load_ext autoreload
%autoreload 2

# Load Text

In [None]:
from bs4 import BeautifulSoup
def load_text(path):
    # read 10k report
    raw_text = None
    with open(path) as f:
        raw_text = f.read()

    soup = BeautifulSoup(raw_text, "lxml")
    text_only = soup.contents[0].get_text()
    return text_only

In [None]:

import pandas as pd
text_map = {"Company": ["INTC", 'INTC', "AAPL", "AAPL", "KO", "KO"], 
            "Year": [1999, 2015, 2008, 1997, 2000, 2017], 
            "Path": ['/Users/rohan/Code/fintech_innovation_summer_task/sec-edgar-filings/INTC/10-K/0001047469-99-011450/full-submission.txt', '/Users/rohan/Code/fintech_innovation_summer_task/sec-edgar-filings/INTC/10-K/0000050863-15-000015/full-submission.txt', '/Users/rohan/Code/fintech_innovation_summer_task/sec-edgar-filings/AAPL/10-K/0001193125-08-224958/full-submission.txt', '/Users/rohan/Code/fintech_innovation_summer_task/sec-edgar-filings/AAPL/10-K/0001047469-97-006960/full-submission.txt', '/Users/rohan/Code/fintech_innovation_summer_task/sec-edgar-filings/KO/10-K/0000021344-00-000009/full-submission.txt', '/Users/rohan/Code/fintech_innovation_summer_task/sec-edgar-filings/KO/10-K/0000021344-17-000009/full-submission.txt']}
df = pd.DataFrame(text_map)

text_lst = []
for i in range(len(df)):
    text_only = load_text(df.iloc[i].Path)
    text_lst.append(text_only)

df["Text"] = text_lst

In [None]:
df.head()

In [None]:
raw_text = None
with open(text_map["Path"][0]) as f:
    raw_text = f.read()

soup = BeautifulSoup(raw_text, "lxml")
print(type(soup.contents[0].get_text()))

# Load JSON

In [2]:
# load json parsed

import pandas as pd
df_intc = pd.read_json("data/sec_items/sec_items_INTC.json")
df_ko = pd.read_json("data/sec_items/sec_items_KO.json")
df_aapl = pd.read_json('data/sec_items/sec_items_AAPL.json')

df = pd.concat([df_intc, df_ko, df_aapl], axis=0)
df_7 = df[(df.Item == "ITEM 7") | (df.Item == "ITEM 7A") | (df.Item == "Item 7B")]

In [3]:
unique_companies = df_7.Company.unique()
unique_years = df_7.Year.unique()

# Extract Operating Segments - JSON

In [4]:
import re
pattern = r"(?i)operating\s+segment"
DELTA = 100

In [5]:
from llm import identify_operating_segments
operating_segments_llama_raw = []
for company in unique_companies:
    for year in unique_years[0:2]:
        df_temp = df_7[(df_7.Company == company) & (df_7.Year == year)]
        
        items_7_total_text = ""
        for i in range(len(df_temp)):
            items_7_total_text += df_temp.iloc[i].Text

        print(f"{company} {year} => {len(items_7_total_text)}")

        matches = list(re.finditer(pattern, items_7_total_text))
        snippets = [items_7_total_text[m.start() - DELTA: m.end() + DELTA] for m in matches]
        print(f"Segments found for\t{company, year}\t{len(snippets)}\t{len(" ".join(snippets))}")
        llama_response = identify_operating_segments("\n\n".join(snippets), company, year)
        operating_segments_llama_raw.append(llama_response)

INTC 2003 => 72098
Segments found for	('INTC', 2003)	6	1307
INTC 2013 => 80833
Segments found for	('INTC', 2013)	28	6103
KO 2003 => 726
Segments found for	('KO', 2003)	0	0
KO 2013 => 231263
Segments found for	('KO', 2013)	80	17439
AAPL 2003 => 142043
Segments found for	('AAPL', 2003)	14	3051
AAPL 2013 => 159
Segments found for	('AAPL', 2013)	0	0


In [6]:
operating_segments_llama_raw

['Based on the provided 10-K report, the operating segments of Intel Corporation in the year 2003 are:\n\nIntel Architecture Business, Communications and Computing Group, Intel Communications Group',
 'Here is the final list of operating segments:\n\nPC Client Group, Data Center Group, Other Intel Architecture Operating Segments, Software and Services Operating Segments',
 '',
 'Based on my previous responses, here is the final list of operating segments for KO in 2013:\n\nEurasia and Africa, Europe, Latin America, North America, Pacific, Bottling Investments, Corporate',
 'Based on the 10-K report, the operating segments of AAPL in 2003 are:\n\nAmericas, Europe, Japan, Retail\n\nLet me know if you need any further assistance!',
 '']

In [7]:
operating_segments_list = []
for llama_response in operating_segments_llama_raw:
    report_op_segs = []
    for line in llama_response.split("\n\n"):
        if line.count(",") > 1:
            for oper_seg in line.split(","):
                if "and " == oper_seg.strip()[:4]:
                    oper_seg = oper_seg.split("and ")[1]
                report_op_segs.append(oper_seg.strip())
    operating_segments_list.append(report_op_segs)

In [8]:
operating_segments_list

[['Intel Architecture Business',
  'Communications and Computing Group',
  'Intel Communications Group'],
 ['PC Client Group',
  'Data Center Group',
  'Other Intel Architecture Operating Segments',
  'Software and Services Operating Segments'],
 [],
 ['Eurasia and Africa',
  'Europe',
  'Latin America',
  'North America',
  'Pacific',
  'Bottling Investments',
  'Corporate'],
 ['Americas', 'Europe', 'Japan', 'Retail'],
 []]

# Identify Revenue Per Operating Segment

- Feed information about mentions of "by Operating Segment"
- Feed information about mentions of the *specific* operating segment

In [9]:
import re
by_op_seg_pattern = r"(?i)by\s+operating\s+segment"
def gen_spec_op_seg_pattern(operating_segment_name):
    words = operating_segment_name.split(" ")
    return r"(?i)" + r"\s+".join(words)
DELTA = 100

In [10]:
operating_segments_list

[['Intel Architecture Business',
  'Communications and Computing Group',
  'Intel Communications Group'],
 ['PC Client Group',
  'Data Center Group',
  'Other Intel Architecture Operating Segments',
  'Software and Services Operating Segments'],
 [],
 ['Eurasia and Africa',
  'Europe',
  'Latin America',
  'North America',
  'Pacific',
  'Bottling Investments',
  'Corporate'],
 ['Americas', 'Europe', 'Japan', 'Retail'],
 []]

In [16]:
import re

# Regex pattern for $2,345 million, $2345, and $2,451,321 billion
pattern = r"\$\d+(,\d{3})*(?:\.\d+)?(?:\s?(million|billion))?"

# Test strings
strings = ["$2,345 million", "$2345", "$2,451,321 billion"]

# Iterate through strings and test with regex
for string in strings:
  if re.match(pattern, string):
    print(f"{string} - Matched")
  else:
    print(f"{string} - Not Matched")


$2,345 million - Matched
$2345 - Matched
$2,451,321 billion - Matched


In [11]:
from llm import identify_revenue_by_operating_segment 
revenue_operating_segments = []

for i, company in enumerate(unique_companies):
    for j, year in enumerate(unique_years[0:2]):
        specific_op_segs = operating_segments_list[i * len(unique_years[0:2]) + j]
        df_temp = df_7[(df_7.Company == company) & (df_7.Year == year)]
        
        items_7_total_text = ""
        for k in range(len(df_temp)):
            items_7_total_text += df_temp.iloc[k].Text
            
        by_op_seg_matches = list(re.finditer(by_op_seg_pattern, items_7_total_text))
        by_op_seg_snippets = [items_7_total_text[m.start() - DELTA: m.end() + DELTA] for m in by_op_seg_matches]
        
        for op_seg in specific_op_segs:
            op_seg_regex = gen_spec_op_seg_pattern(op_seg)
            op_seg_matches = list(re.finditer(op_seg_regex, items_7_total_text))
            
            op_seg_snippets = by_op_seg_snippets + [items_7_total_text[m.start() - DELTA: m.end() + DELTA] for m in op_seg_matches]
        
            context = "\n\n".join(op_seg_snippets)
            print(f"Segments found for\t{company, year, op_seg}\tCount: {len(op_seg_snippets)}\tTotal Token Estimate: {len(context)}")
            llama_response = identify_revenue_by_operating_segment(context, company, year, op_seg)
            revenue_operating_segments.append(llama_response)

Segments found for	('INTC', 2003, 'Intel Architecture Business')	Count: 11	Total Token Estimate: 2517
Segments found for	('INTC', 2003, 'Communications and Computing Group')	Count: 3	Total Token Estimate: 706
Segments found for	('INTC', 2003, 'Intel Communications Group')	Count: 3	Total Token Estimate: 682
Segments found for	('INTC', 2013, 'PC Client Group')	Count: 11	Total Token Estimate: 2385
Segments found for	('INTC', 2013, 'Data Center Group')	Count: 10	Total Token Estimate: 2188
Segments found for	('INTC', 2013, 'Other Intel Architecture Operating Segments')	Count: 3	Total Token Estimate: 733
Segments found for	('INTC', 2013, 'Software and Services Operating Segments')	Count: 1	Total Token Estimate: 240
Segments found for	('KO', 2013, 'Eurasia and Africa')	Count: 27	Total Token Estimate: 5956
Segments found for	('KO', 2013, 'Europe')	Count: 46	Total Token Estimate: 9692
Segments found for	('KO', 2013, 'Latin America')	Count: 42	Total Token Estimate: 9091
Segments found for	('KO',

In [12]:
revenue_operating_segments

['According to the 10-K report, the revenue for the Intel Architecture Business operating segment in the year 2002 was approximately:\n\n$27',
 'According to the 10-K report, the revenue for the Wireless Communications and Computing Group (WCCG) operating segment for the year 2002 is:\n\n$8.66 billion',
 "According to the 10-K report, the stock's revenue for the Intel Communications Group operating segment in the year 2002 was:\n\n$460 million",
 'According to the 10-K report, the revenue for the PC Client Group operating segment in the year 2012 is:\n\n$33,566',
 'Based on the report, the revenue for the Data Center Group operating segment in the year 2012 is:\n\n$6 billion',
 'According to the 10-K report, the revenue for the Other Intel Architecture Operating Segments operating segment in 2012 was:\n\n$7,493 million',
 'According to the 10-K report for Intel (INTC) in 2013, the revenue for the Software and Services Operating Segments (including McAfee, Wind River Systems, and other 

In [13]:
# normalize the values to integers
operating_segment_revenue = []
for llama_response in revenue_operating_segments:
    for line in llama_response.split("\n\n"):
        if "$" not in line:
            continue
        value = None
        if line.count(",") == 1:
            line += " million"
        
        line = line.replace("$", "").replace(",", "")

        if "million" in line:
            value = int(line.split(" ")[0]) * 1e6
        elif "billion" in line:
            value = int(float(line.split(" ")[0]) * 1e9)
        else:
            value = int(line) * 1e6

        operating_segment_revenue.append(value)
            
                

In [14]:
operating_segment_revenue

[27000000.0,
 8660000000,
 460000000.0,
 33566000000.0,
 6000000000,
 7493000000.0,
 1830000000.0,
 5432000000000000.0,
 1574000000000000.0,
 6161000000.0,
 9366000000000000.0,
 447000000.0,
 447000000.0,
 38000000.0,
 3131000000.0,
 1251000000.0,
 710000000000000.0,
 283000000.0]

# Export Data

In [102]:
df_dict = {
    "Company": [],
    "Year": [],
    "Operating Segment": [],
    "Revenue": []
}

In [113]:
index = 0
for i, company in enumerate(unique_companies):
    for j, year in enumerate(unique_years[0:2]):
        specific_op_segs = operating_segments_list[i * len(unique_years[0:2]) + j]
        for operating_segment in specific_op_segs:
            revenue = operating_segment_revenue[index]
            df_dict["Company"].append(company)
            df_dict["Year"].append(year)
            df_dict["Operating Segment"].append(operating_segment)
            df_dict["Revenue"].append(revenue)
            index += 1

In [115]:
import pandas as pd
df_op_seg_revenue = pd.DataFrame(df_dict)
df_op_seg_revenue.head()

Unnamed: 0,Company,Year,Operating Segment,Revenue
0,INTC,2003,Intel Architecture Business,27446000000.0
1,INTC,2003,Communications and Computing Group,1447000000.0
2,INTC,2003,Intel Communications Group,2453000000.0
3,INTC,2003,Intel Architecture Business,27446000000.0
4,INTC,2003,Communications and Computing Group,1447000000.0


# Extract Operating Segments

1. Extract instances of operating segments
2. Find words around the operating segments -- generate clips
3. Feed clips to LLM and ask for operating segments
4. Take list of operating segments and create a set of them (may need to use set operating in the event multiple LLAMA requests)

*At the end, show the operating segments per stock.*

In [37]:
import re
pattern = r"(?i)operating\s+segment"
DELTA = 100
EXTRACT_OPERATING_SEGMENTS_PROMPT = "Given the aforementioned sections about the stocks operating segments, list the operating segments for the company. Output only the operating segments and in the following format: 'SEGMENT 1, SEGMENT 2, SEGMENT 3...'"

In [38]:
from llm import ask_llama

operating_segments_llama_raw = []
for i in range(len(df)):
    text = df.iloc[i].Text
    company_name = df.iloc[i].Company
    year = df.iloc[i].Year
    matches = list(re.finditer(pattern, text))
    snippets = [text[m.start() - DELTA: m.end() + DELTA] for m in matches]
    print(f"Segments found for\t{company_name, year}\t{len(snippets)}")
    llama_response = ask_llama("\n".join(snippets), EXTRACT_OPERATING_SEGMENTS_PROMPT)
    operating_segments_llama_raw.append(llama_response)

Segments found for	('INTC', 1999)	22
Segments found for	('INTC', 2015)	145
Segments found for	('AAPL', 2008)	25
Segments found for	('AAPL', 1997)	0
Segments found for	('KO', 2000)	17
Segments found for	('KO', 2017)	245


In [None]:
snippets

In [40]:
operating_segments_llama_raw

[['  Sure, based on the information provided in the previous sections, the operating segments for the company are:\n\nSEGMENT 1: Automotive\nSEGMENT 2: Industrial\nSEGMENT 3: Aerospace\nSEGMENT 4: Energy\n\nSo, the operating segments for the company are: Automotive, Industrial, Aerospace, and Energy.',
  '  Sure! Based on the information provided, the operating segments for the company are:\n\nSEGMENT 1: Wholesale\nSEGMENT 2: Retail\nSEGMENT 3: Direct-to-Consumer\nSEGMENT 4: Licensing\n\nSo, the operating segments for the company are: Wholesale, Retail, Direct-to-Consumer, and Licensing.'],
 ['  Sure, based on the information provided in the previous sections, the operating segments for the company are:\n\nSEGMENT 1: Agriculture\nSEGMENT 2: Food\nSEGMENT 3: Energy\nSEGMENT 4: Chemicals\nSEGMENT 5: Industrial\nSEGMENT 6: Healthcare\n\nTherefore, the operating segments for the company are: Agriculture, Food, Energy, Chemicals, Industrial, and Healthcare.',
  '  Sure, based on the informa

In [None]:
snippets

# Troubleshooting

In [64]:
# load json parsed

import pandas as pd
df_intc = pd.read_json("data/sec_items/sec_items_INTC.json")
df_ko = pd.read_json("data/sec_items/sec_items_KO.json")
df_aapl = pd.read_json('data/sec_items/sec_items_AAPL.json')

df = pd.concat([df_intc, df_ko, df_aapl], axis=0)
df_1 = df[(df.Item == "ITEM 1") | (df.Item == "ITEM 1A") | (df.Item == "Item 1B")]

In [65]:
unique_companies = df_7.Company.unique()
unique_years = df_7.Year.unique()

In [66]:
import re
pattern = r"(?i)(business|operating)\s+segment"
DELTA = 300

In [67]:
## custom test
df_temp = df_1[(df_1.Company == "AAPL") & (df_1.Year == 2013)]
total_text = ""
for i in range(len(df_temp)):
    total_text += df_temp.iloc[i].Text
matches = list(re.finditer(pattern, total_text))
snippets = [total_text[m.start() - DELTA: m.end() + DELTA] for m in matches]
snippets

['istribution network to effectively reach more customers and provide them with a high-quality sales and post-sales\nsupport experience. \n\xa0 1 \n\n\nTable of Contents\nBusiness Organization  The Company manages its business primarily on a geographic basis. Accordingly, the Company determined its reportable operating segments, which are generally based on the nature and location of its\ncustomers, to be the Americas, Europe, Japan, Greater China, Rest of Asia Pacific and Retail. The Americas segment includes both North and South America. The Europe segment includes European countries, as well as India, the Middle East a',
 ' America. The Europe segment includes European countries, as well as India, the Middle East and Africa.\nThe Greater China segment includes China, Hong Kong and Taiwan. The Rest of Asia Pacific segment includes Australia and Asian countries, other than those countries included in the Company\x92s other operating segments. The Retail segment operates\nApple retail 

In [68]:
len(df_temp.iloc[0].Text)

36162

In [69]:
from llm import identify_operating_segments
operating_segments_llama_raw = []
snippets
for company in unique_companies:
    for year in unique_years[0:2]:
        df_temp = df_1[(df_1.Company == company) & (df_1.Year == year)]
        
        total_text = ""
        for i in range(len(df_temp)):
            total_text += df_temp.iloc[i].Text

        print(f"{company} {year} => {len(total_text)}")

        matches = list(re.finditer(pattern, total_text))
        snippets = [total_text[m.start() - DELTA: m.end() + DELTA] for m in matches]
        print(f"Segments found for\t{company, year}\t{len(snippets)}\t{len(" ".join(snippets))}")
        llama_response = identify_operating_segments("\n\n".join(snippets), company, year)
        operating_segments_llama_raw.append(llama_response)

INTC 2003 => 90870
Segments found for	('INTC', 2003)	6	3707
INTC 2013 => 86438
Segments found for	('INTC', 2013)	18	11123
KO 2003 => 55396
Segments found for	('KO', 2003)	5	3090
KO 2013 => 100619
Segments found for	('KO', 2013)	9	5561
AAPL 2003 => 56327
Segments found for	('AAPL', 2003)	4	2471
AAPL 2013 => 87103
Segments found for	('AAPL', 2013)	4	2471


In [70]:
operating_segments_llama_raw

['Based on the 10-K report, the operating segments of Intel Corporation (INTC) in 2003 are:\n\nIntel Architecture, Wireless Communications and Computing Group, Intel Communications Group',
 'Here is the final list of operating segments for Intel Corporation (INTC) based on the 2013 10-K report:\n\nPC Client Group, Data Center Group, Other Intel Architecture, Software and Services',
 'Based on the 10-K report, the operating segments of KO are:\n\nNorth America, Africa, Europe, Eurasia and Middle East, Latin America, Asia, Corporate',
 'Based on the 10-K report, the operating segments of The Coca-Cola Company (KO) are:\n\nEurasia and Africa, Europe, Latin America, North America, Pacific, Bottling Investments, Corporate',
 'Based on the provided sections of the 10-K report for AAPL in 2003, the operating segments are:\n\nApples, Europe, Japan, Retail, and Asia-Pacific',
 'According to the 10-K report, the operating segments of AAPL in 2013 are:\n\nThe Americas, Europe, Japan, Greater Chin

In [71]:
operating_segments_list = []
for llama_response in operating_segments_llama_raw:
    report_op_segs = []
    for line in llama_response.split("\n\n"):
        if line.count(",") > 1:
            for oper_seg in line.split(","):
                if "and " == oper_seg.strip()[:4]:
                    oper_seg = oper_seg.split("and ")[1]
                report_op_segs.append(oper_seg.strip())
    operating_segments_list.append(report_op_segs)

In [72]:
operating_segments_list

[['Intel Architecture',
  'Wireless Communications and Computing Group',
  'Intel Communications Group'],
 ['PC Client Group',
  'Data Center Group',
  'Other Intel Architecture',
  'Software and Services'],
 ['North America',
  'Africa',
  'Europe',
  'Eurasia and Middle East',
  'Latin America',
  'Asia',
  'Corporate'],
 ['Eurasia and Africa',
  'Europe',
  'Latin America',
  'North America',
  'Pacific',
  'Bottling Investments',
  'Corporate'],
 ['Apples', 'Europe', 'Japan', 'Retail', 'Asia-Pacific'],
 ['The Americas',
  'Europe',
  'Japan',
  'Greater China',
  'Rest of Asia Pacific',
  'Retail']]

In [76]:
for i in range(len(df_1)):
    text_length = len(df_1.iloc[i].Text)
    if text_length < 200:
        print(df_1.iloc[i].Company, df.iloc[i].Year, text_length)

INTC 2003 146
INTC 2003 23
INTC 2003 132
INTC 2003 23
INTC 2013 133
INTC 2013 23
INTC 2013 23
INTC 2019 136
INTC 2019 23
