In [None]:
!pip install sovai[full]

## SEC 10-K filings

You can run the following commands to retrieve data (`df`) using `sov.data`:

To fetch the **latest data** for a specific query:

```python
df = sov.data("query")
```

To fetch the **full historical data** for a specific query:

```python
df = sov.data("query", full_history=True)
```

To fetch the **full data** for multiple **tickers** or other identifiers such as **cusip** and **openfigi**:

```python
df = sov.data("query", tickers=["9033434", "IB94343", "43432", "AAPL"])
```

To filter **any dataframe**, just write some queries:

```python
df.filter(["cash_short_term > 10m","start with ticker A","negative profits" ])
```


In [15]:
import os

import sovai as sov
import pandas as pd

# Authenticate before requesting data. The token is read from the SOVAI_TOKEN
# environment variable (a name chosen for this notebook) so that a real token
# never has to be pasted into, and saved with, the notebook. When the variable
# is not set, the original placeholder string is passed through unchanged.
sov.token_auth(
    token=os.environ.get("SOVAI_TOKEN", "visit https://sov.ai/profile for your token")
)

# Fetch the "sec/10k" dataset for AAPL; `data` is reused by the cells below.
# NOTE(review): the exact meaning of limit=1 is not visible here - confirm
# against the sovai documentation.
data = sov.data("sec/10k", tickers=["AAPL"], limit=1)

In [16]:
# Write the fetched frame to a Parquet file in the current working directory.
data.to_parquet("park.parquet")

You don't have to download the entire dataset; instead, look at the companies that filed in a given week and analyze just those. Future edits will include sentiment analysis.

In [12]:
import pandas as pd

# Sections of the filing that are included in the word-count summary.
mandatory_sections = [
    "FINANCIAL_STATEMENTS",
    "RISK_FACTORS",
    "MANAGEMENT_DISCUSSION",
    "MANAGEMENT",
    "COMPENSATION",
    "EXHIBITS",
    "BUSINESS",
    "CONTROLS_AND_PROCEDURES",
    "PRINCIPAL_STOCKHOLDERS",
    "ACCOUNTING_FEES"
]

# 1. Keep only rows from the sections of interest. The .copy() ensures the
#    column assignments below do not write into a slice of `data`.
mandatory_data = data[data['section'].isin(mandatory_sections)].copy()

# 2. Extract the filing year. pd.to_datetime leaves a datetime column as it is
#    and also lets the cell run when 'date' arrives as strings.
mandatory_data['year'] = pd.to_datetime(mandatory_data['date']).dt.year

# 3. Count words per row with a vectorized regex count. Rows without text
#    count as zero so the column stays integer-typed.
mandatory_data['word_count'] = (
    mandatory_data['full_text'].str.count(r'\w+').fillna(0).astype('int64')
)

# 4. Total words per (ticker, year, section).
word_count_agg = mandatory_data.groupby(['ticker', 'year', 'section'], as_index=False)['word_count'].sum()

# 5. One row per (ticker, year), one column per section. aggfunc is spelled
#    out because pivot_table defaults to the mean, which hides the intent and
#    turns the counts into floats. Sections absent from a filing year are 0.
#    The result is already indexed by ['ticker', 'year'], so no further
#    re-indexing is needed.
word_count_pivot = word_count_agg.pivot_table(
    index=['ticker', 'year'],
    columns='section',
    values='word_count',
    aggfunc='sum',
    fill_value=0
).astype('int64')


In [13]:
# Display the word counts: one row per (ticker, year), one column per section.
word_count_pivot

Unnamed: 0_level_0,section,ACCOUNTING_FEES,BUSINESS,COMPENSATION,CONTROLS_AND_PROCEDURES,EXHIBITS,FINANCIAL_STATEMENTS,MANAGEMENT,MANAGEMENT_DISCUSSION,PRINCIPAL_STOCKHOLDERS,RISK_FACTORS
ticker,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
META,2013,38.0,107.0,38.0,345.0,13.0,10050.0,10422.0,10422.0,38.0,0.0
META,2014,34.0,107.0,34.0,444.0,13.0,9009.0,8243.0,8243.0,34.0,0.0
META,2015,34.0,173.0,34.0,440.0,13.0,9388.0,7237.0,7237.0,34.0,14522.0
META,2016,34.0,173.0,34.0,439.0,13.0,8453.0,7747.0,7747.0,34.0,15400.0
META,2017,34.0,173.0,34.0,439.0,13.0,9988.0,8545.0,8545.0,34.0,16548.0
META,2018,34.0,2154.0,11633.0,439.0,1261.0,272.0,32.0,8339.0,34.0,17118.0
META,2019,34.0,1853.0,11576.0,439.0,1241.0,422.0,25.0,7929.0,34.0,18148.0
META,2020,34.0,212.0,34.0,439.0,195.0,11312.0,7767.0,7767.0,34.0,21689.0
META,2021,38.0,88.0,38.0,446.0,190.0,12883.0,9594.0,9594.0,38.0,25126.0
META,2022,38.0,33.0,38.0,446.0,0.0,13240.0,9395.0,9395.0,38.0,26299.0


In [10]:
# Display the word-count table again; the saved output below is for AAPL.
word_count_pivot

Unnamed: 0_level_0,section,ACCOUNTING_FEES,BUSINESS,COMPENSATION,CONTROLS_AND_PROCEDURES,EXHIBITS,FINANCIAL_STATEMENTS,MANAGEMENT,MANAGEMENT_DISCUSSION,PRINCIPAL_STOCKHOLDERS,RISK_FACTORS
ticker,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAPL,2011,29.0,4839.0,13454.0,646.0,15685.0,41.0,8740.0,8740.0,0.0,7529.0
AAPL,2012,76.0,4763.0,13181.0,644.0,5485.0,57.0,9074.0,9074.0,51.0,7177.0
AAPL,2013,0.0,563.0,0.0,652.0,0.0,13676.0,8916.0,8916.0,51.0,7532.0
AAPL,2014,0.0,582.0,0.0,653.0,0.0,14159.0,9194.0,9194.0,51.0,7866.0
AAPL,2015,0.0,550.0,0.0,653.0,0.0,13244.0,8300.0,8300.0,51.0,7931.0
AAPL,2016,18.0,537.0,18.0,635.0,0.0,11799.0,8005.0,8005.0,18.0,7769.0
AAPL,2017,18.0,552.0,26.0,635.0,0.0,11927.0,8091.0,8091.0,18.0,7873.0
AAPL,2018,18.0,545.0,26.0,635.0,0.0,10412.0,7470.0,7470.0,18.0,8172.0
AAPL,2019,18.0,345.0,26.0,635.0,0.0,10338.0,4183.0,4183.0,18.0,8114.0
AAPL,2020,0.0,297.0,77.0,642.0,0.0,11290.0,4381.0,4381.0,0.0,9015.0


In [5]:
# Number of rows for each value of the 'category' column, most frequent first.
data["category"].value_counts()

category
NarrativeText    7607
ListItem           98
Name: count, dtype: int64

In [6]:
# Number of rows for each value of the 'section' column, most frequent first.
data["section"].value_counts()

section
FINANCIAL_STATEMENTS                   2679
RISK_FACTORS                           1293
MANAGEMENT_DISCUSSION                  1250
MANAGEMENT                             1250
COMPENSATION                            426
EXHIBITS                                161
BUSINESS                                151
MARKET_FOR_REGISTRANT_COMMON_EQUITY     138
MARKET_RISK_DISCLOSURES                 133
CONTROLS_AND_PROCEDURES                 116
LEGAL_PROCEEDINGS                        41
PROPERTIES                               30
PRINCIPAL_STOCKHOLDERS                   15
RELATED_PARTY_TRANSACTIONS               12
ACCOUNTING_FEES                          10
Name: count, dtype: int64

In [7]:
# Number of rows for each value of the 'tag' column, most frequent first.
data["tag"].value_counts()

tag
p       2345
span    2089
font    1951
td      1223
tr        49
div       47
a          1
Name: count, dtype: int64

In [8]:
# Display the fetched frame; for a long frame the notebook renders only the
# first and last rows.
data

Unnamed: 0,cik,ticker,date,company,accession_number,section,order,category,tag,row_id,full_text
0,320193,AAPL,2011-10-26,APPLE INC,0001193125-11-282113,BUSINESS,0,NarrativeText,p,e19f7471d8fb867c5bc0026bc798ad53,Apple Inc. and its wholly-owned subsidiaries (...
1,320193,AAPL,2011-10-26,APPLE INC,0001193125-11-282113,BUSINESS,1,NarrativeText,p,1ce3d4c53620d2d099ab59da4ebc3b9e,The Company is committed to bringing the best ...
2,320193,AAPL,2011-10-26,APPLE INC,0001193125-11-282113,BUSINESS,2,NarrativeText,p,a8eda7d4be126f5088578bb5b355dc0d,The Company believes a high-quality buying exp...
3,320193,AAPL,2011-10-26,APPLE INC,0001193125-11-282113,BUSINESS,3,NarrativeText,p,834e8b74a0fbb0cdd5fd390f1979bfac,The Company’s retail stores are typically loca...
4,320193,AAPL,2011-10-26,APPLE INC,0001193125-11-282113,BUSINESS,4,NarrativeText,p,1e2460c44fe3255902521a729dabc0ae,"Throughout its history, the Company has been c..."
...,...,...,...,...,...,...,...,...,...,...,...
7700,320193,AAPL,2023-11-03,Apple Inc.,0000320193-23-000106,MANAGEMENT,54,NarrativeText,span,0cedd2f8c148b1fc82f4dc0f5c9f35b8,The Company is subject to income taxes in the ...
7701,320193,AAPL,2023-11-03,Apple Inc.,0000320193-23-000106,MANAGEMENT,55,NarrativeText,span,b771b34af4ca51d17d3fbf42777c02c0,The Company is subject to various legal procee...
7702,320193,AAPL,2023-11-03,Apple Inc.,0000320193-23-000106,COMPENSATION,0,NarrativeText,span,f8825199078db1af16a75e19d38c8eff,The information required by this Item will be ...
7703,320193,AAPL,2023-11-03,Apple Inc.,0000320193-23-000106,PRINCIPAL_STOCKHOLDERS,0,NarrativeText,span,f8825199078db1af16a75e19d38c8eff,The information required by this Item will be ...


In [4]:
# Run the same "sec/10k" query for META. The result is only displayed, not
# assigned, so `data` is left unchanged by this cell.
sov.data("sec/10k", tickers=["META"], limit=1)

Unnamed: 0,cik,ticker,date,company,accession_number,section,order,category,tag,row_id,full_text
0,1326801,META,2013-02-01,Facebook Inc,0001326801-13-000003,BUSINESS,0,NarrativeText,font,a6145cfc83491272af346a0eae2c3336,We allocate the fair value of purchase conside...
1,1326801,META,2013-02-01,Facebook Inc,0001326801-13-000003,PROPERTIES,0,NarrativeText,font,e0e664ee42b78d089f4ac1248e533e86,"As of December 31, 2012, we leased office and ..."
2,1326801,META,2013-02-01,Facebook Inc,0001326801-13-000003,LEGAL_PROCEEDINGS,0,NarrativeText,font,8ae3ba693a1fbd26c64a636d5c0d7aca,Paul D. Ceglia filed suit against us and Mark ...
3,1326801,META,2013-02-01,Facebook Inc,0001326801-13-000003,LEGAL_PROCEEDINGS,1,NarrativeText,font,0dc121ed1c32cf81c4195c359707ec12,"Beginning on May 22, 2012, multiple putative c..."
4,1326801,META,2013-02-01,Facebook Inc,0001326801-13-000003,LEGAL_PROCEEDINGS,2,NarrativeText,font,322930695bbf6c305a377de61245609b,We are also party to various legal proceedings...
...,...,...,...,...,...,...,...,...,...,...,...
10545,1326801,META,2024-02-02,"Meta Platforms, Inc.",0001326801-24-000012,EXHIBITS,3,NarrativeText,td,efc44904ae01a497b5549538d5938573,"Certification of Susan Li, Chief Financial Off..."
10546,1326801,META,2024-02-02,"Meta Platforms, Inc.",0001326801-24-000012,EXHIBITS,4,NarrativeText,td,83e1d06a719c9d49143c8e856969b30c,"Certification of Mark Zuckerberg, Chief Execut..."
10547,1326801,META,2024-02-02,"Meta Platforms, Inc.",0001326801-24-000012,EXHIBITS,5,NarrativeText,td,7a2dae8e4a37cb40603f71c65bb2a9ed,"Certification of Susan Li, Chief Financial Off..."
10548,1326801,META,2024-02-02,"Meta Platforms, Inc.",0001326801-24-000012,EXHIBITS,6,NarrativeText,span,ed1d31dff6a985dfad8bc386b6c565fd,+ Indicates a management contract or compensat...
