## Get the company tickers

In [1]:
import numpy as np
import pandas as pd
import requests
import json

headers = {'User-Agent':'robot.games@gmail.com'}
temp = requests.get("https://www.sec.gov/files/company_tickers.json",headers=headers).json()
company_tickers = pd.DataFrame(temp.values())

In [2]:
print(company_tickers.shape)
company_tickers.head()

(10321, 3)


Unnamed: 0,cik_str,ticker,title
0,789019,MSFT,MICROSOFT CORP
1,320193,AAPL,Apple Inc.
2,1045810,NVDA,NVIDIA CORP
3,1652044,GOOGL,Alphabet Inc.
4,1018724,AMZN,AMAZON COM INC


## Get the Edgar API information

This allows us to search for the keys that let us open the NPORT documents.

In [12]:
from selenium import webdriver

url = 'https://www.sec.gov/Archives/edgar/full-index/2020/QTR1/form.zip'
download_directory = 'C:\\Users\\robot\\Downloads\\'

driver = webdriver.Firefox()
driver.implicitly_wait(20)
# driver.get(url)  # this is hanging on this
driver.close()

NoSuchWindowException: Message: Browsing context has been discarded
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
NoSuchWindowError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:679:5
assert.that/<@chrome://remote/content/shared/webdriver/Assert.sys.mjs:485:13
assert.open@chrome://remote/content/shared/webdriver/Assert.sys.mjs:147:4
GeckoDriver.prototype.close@chrome://remote/content/marionette/driver.sys.mjs:2316:15
despatch@chrome://remote/content/marionette/server.sys.mjs:318:40
execute@chrome://remote/content/marionette/server.sys.mjs:289:16
onPacket/<@chrome://remote/content/marionette/server.sys.mjs:262:20
onPacket@chrome://remote/content/marionette/server.sys.mjs:263:9
_onJSONObjectReady/<@chrome://remote/content/marionette/transport.sys.mjs:494:20


Move the zipfile into the data directory of the project folder.  Then extract it.

In [18]:
import zipfile
with zipfile.ZipFile('data/form.zip') as zip_object:
    zip_object.extractall('data')

Open the index file.

In [4]:
import pandas as pd
with open('data/form.idx','r') as f:
    temp = pd.DataFrame({'line':f.readlines()})
temp = temp[10:]

In [5]:
temp.head()

Unnamed: 0,line
10,"1-A Acacia Diversified Holdings, Inc. ..."
11,"1-A Amarantus Bioscience Holdings, Inc..."
12,"1-A Ani Capital Partners, Inc ..."
13,"1-A BEYOND WELLNESS INTERNATIONAL, INC..."
14,1-A Bioquest Corp ...


Now grab the URLs for the NPORT forms wherever mentioned in the index file.

In [7]:
nport_lines = temp[ temp.line.str.find('NPORT-')>=0 ]
nport_split = nport_lines.line.str.replace('[\s\t\n]{2,}',' ',regex=True).str.split(' ')
nport_url = [nport_split.iloc[z][-2] for z in range(nport_split.shape[0])]

For example,

In [11]:
nport_url[0]

'edgar/data/1605941/0001752724-20-059501.txt'

## Step through and extract information from each NPORT form

In [12]:
url_stem = 'https://www.sec.gov/Archives/'
nport_txt_doc_url = [url_stem + x for x in nport_url]

In [13]:
nport_txt_doc_url[0]

'https://www.sec.gov/Archives/edgar/data/1605941/0001752724-20-059501.txt'

Get all of the xml files and store them locally.

In [17]:
import requests
import time
N = len(nport_txt_doc_url)
s = requests.Session()
for i in range(N):
    try:
        #print('Working on file '+str(i)+'/'+str(N))
        headers = {'User-Agent':'robot.games@gmail.com'}
        temp = s.get(nport_txt_doc_url[i],headers=headers,timeout=20)
    except:
        print('Timeout on item '+str(i))
    time.sleep(0.1)
    with open('data/xml/'+str(i)+'.txt','wb') as f:
        f.write(temp.content)

Working on file 0/11260
Working on file 1/11260
Working on file 2/11260
Working on file 3/11260
Working on file 4/11260
Working on file 5/11260
Working on file 6/11260
Working on file 7/11260
Working on file 8/11260
Working on file 9/11260
Working on file 10/11260
Working on file 11/11260
Working on file 12/11260
Working on file 13/11260
Working on file 14/11260
Working on file 15/11260
Working on file 16/11260
Working on file 17/11260
Working on file 18/11260
Working on file 19/11260
Working on file 20/11260
Working on file 21/11260
Working on file 22/11260
Working on file 23/11260
Working on file 24/11260
Working on file 25/11260
Working on file 26/11260
Working on file 27/11260
Working on file 28/11260
Working on file 29/11260
Working on file 30/11260
Working on file 31/11260
Working on file 32/11260
Working on file 33/11260
Working on file 34/11260
Working on file 35/11260
Working on file 36/11260
Working on file 37/11260
Working on file 38/11260
Working on file 39/11260
Working on

Parse each xml document (each NPORT form) one by one.  Gather the results into two data frames and store them on disk as csv files.

Total execution time: ~ 450 minutes.

In [27]:
filer = {
    'name':[],
    'cik':[],
    'series':[],
    'series_id':[],
    'total_assets':[],
    'total_liabilities':[]
}

investment = {
    'cik':[],
    'investment_name':[],
    'value_USD':[],
    'percentage_investment':[]
}

In [28]:
from bs4 import BeautifulSoup
import pandas as pd
import os

file_names = ['data/xml/'+z for z in os.listdir('data/xml')]

start = 0
end = len(file_names)

file_names = file_names[start:end]

for i,file_name in enumerate(file_names):
    print('Working on',file_name,'which is',i,'/',end)
    with open(file_name,'r') as handle:
        #  Add to the filer database
        soup = BeautifulSoup(handle,'lxml')
        regname = soup.select('regName')
        if len(regname)>0:
            filer['name'].append(regname[0].text.replace(',',''))
        else:
            print('NPORT skipped')
            continue
        cik_value = soup.select('regCik')
        filer['cik'].append(cik_value[0].text)
        cik = cik_value[0].text # will need this a bit later on
        series = soup.select('seriesName')
        filer['series'].append(series[0].text)
        series_id = soup.select('seriesId')
        if len(series_id)>0:
            filer['series_id'].append(series_id[0].text)
        else:
            filer['series_id'].append(None)
        assets = soup.select('totAssets')
        filer['total_assets'].append(float(assets[0].text))
        liabilities = soup.select('totLiabs')
        filer['total_liabilities'].append(float(liabilities[0].text))
        # now add to the investment database
        # This is tricky because there are some missing values in some files
        investment_names = soup.select('name')
        for node in investment_names:
            try:
                vnode = node.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling
                pnode = vnode.next_sibling.next_sibling
                value_USD = float(vnode.text)
                percent = float(pnode.text)    
            except:
                print('one skipped')
                continue
            investment['cik'] += [cik]
            investment['investment_name'] += [node.text]
            investment['value_USD'] += [value_USD]
            investment['percentage_investment'] += [percent]
        
filer = pd.DataFrame(filer)
investment = pd.DataFrame(investment)
filer.replace(',','',regex=True,inplace=True)
investment.replace(',','',regex=True,inplace=True)
filer.to_csv('filer.csv',index=False)
investment.to_csv('investment.csv',index=False)

Working on data/xml/0.txt which is 0 / 11260
Working on data/xml/1.txt which is 1 / 11260
Working on data/xml/10.txt which is 2 / 11260
Working on data/xml/100.txt which is 3 / 11260
one skipped
Working on data/xml/1000.txt which is 4 / 11260
Working on data/xml/10000.txt which is 5 / 11260
one skipped
Working on data/xml/10001.txt which is 6 / 11260
Working on data/xml/10002.txt which is 7 / 11260
Working on data/xml/10003.txt which is 8 / 11260
Working on data/xml/10004.txt which is 9 / 11260
one skipped
Working on data/xml/10005.txt which is 10 / 11260
Working on data/xml/10006.txt which is 11 / 11260
Working on data/xml/10007.txt which is 12 / 11260
Working on data/xml/10008.txt which is 13 / 11260
Working on data/xml/10009.txt which is 14 / 11260
Working on data/xml/1001.txt which is 15 / 11260
one skipped
one skipped
one skipped
one skipped
one skipped
one skipped
one skipped
one skipped
one skipped
one skipped
one skipped
one skipped
one skipped
one skipped
Working on data/xml/1

## Explore and validate the data

Load the csv files.

In [1]:
import pandas as pd
filer = pd.read_csv('filer.csv',dtype={'cik':str})
investment = pd.read_csv('investment.csv',dtype={'cik':str})

The `filer` data contains data about the entity filing the NPORT form.

In [21]:
filer.head()

Unnamed: 0,name,cik,series,series_id,total_assets,total_liabilities
0,1290 Funds,1605941,1290 DoubleLine Dynamic Allocation Fund,S000052766,62750360.0,190927.56
1,1290 Funds,1605941,1290 Retirement 2060 Fund,S000052767,3750518.0,22631.18
2,1290 Funds,1605941,1290 Retirement 2020 Fund,S000052768,11352930.0,29830.84
3,AB VARIABLE PRODUCTS SERIES FUND INC.,825316,AB Global Risk Allocation-Moderate Portfolio,S000049082,96558420.0,1401996.55
4,Allianz Funds Multi-Strategy Trust,1423227,AllianzGI Short Duration High Income Fund,S000033713,1559062000.0,5873909.77


There are a few missing values in the series and series ID columns.  However, these are not crucial to our project.  We only need the `cik` number and the `name` of the filer.

In [22]:
filer.isna().sum()

name                   0
cik                    0
series                29
series_id            597
total_assets           0
total_liabilities      0
dtype: int64

Now let's look at the investment information.  The `cik` column contains the CIK number of the entity filing the NPORT form.

In [23]:
investment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4317923 entries, 0 to 4317922
Data columns (total 4 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   cik                    object 
 1   investment_name        object 
 2   value_USD              float64
 3   percentage_investment  float64
dtypes: float64(2), object(2)
memory usage: 131.8+ MB


In [24]:
investment.head()

Unnamed: 0,cik,investment_name,value_USD,percentage_investment
0,1605941,Garda World Security Corp.,36531.25,0.058394
1,1605941,Tempo Acquisition LLC,56925.0,0.090993
2,1605941,Fortive Corp.,310285.13,0.495985
3,1605941,Adobe Inc.,467016.2,0.746516
4,1605941,Beacon Roofing Supply Inc.,49735.0,0.0795


In [29]:
print('There are',len(investment.cik.unique()),'unique funds and',len(investment.investment_name.unique()),'investments, with',investment.shape[0],'observations')

There are 1640 unique funds and 564608 investments, with 4317923 observations


There are some strange values in the `percentage_investment` column.  We should probably discard those.

In [26]:
investment.percentage_investment.describe()

count    4.317923e+06
mean     2.551238e-01
std      1.410259e+01
min     -2.916107e+04
25%      1.807689e-03
50%      2.808844e-02
75%      1.618385e-01
max      1.725562e+02
Name: percentage_investment, dtype: float64

After discarding, we have about 3.96 million observations in this data set.

In [28]:
investment[(investment.percentage_investment >= 0) & (investment.percentage_investment <= 1)].shape

(3956863, 4)

## Chunking the file for upload to github

In [11]:
import pandas as pd
def write_chunk_csv(df,n,file_stem='out'):
    N = round(df.shape[0]/n)
    i=0
    while i < n-1:
        temp = df.iloc[(i*N):((i+1)*N),:]
        temp.to_csv(file_stem+'_'+str(i)+'.csv',index=False)
        i+=1
    temp = df.iloc[((n-1)*N):,:]
    temp.to_csv(file_stem+'_'+str(n-1)+'.csv',index=False)

In [12]:
write_chunk_csv(investment,20,file_stem='investment')