In [1]:

from bs4 import BeautifulSoup
from os.path import exists
import numpy as np
import pandas as pd
import requests as req
import time
import os
import lxml
import pickle

In [2]:
# Notebook-wide pandas display settings: show up to 500 rows/columns,
# cut cell text at 100 characters and render floats with two decimals.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 100
pd.set_option('display.float_format', '{:.2f}'.format)

In [8]:
# Root directories used throughout the notebook.
# Each can be overridden with an environment variable so the notebook is not
# tied to one machine; the defaults are the original hard-coded locations.
DATA_ROOT_PATH=os.environ.get("MD1_DATA_ROOT_PATH", "/mnt/data/projects/MD1/data/R1000/stage1/")
PROJ_ROOT_PATH=os.environ.get("MD1_PROJ_ROOT_PATH", "/home/priyesh/projects/MD1")

In [13]:
def get_report_list(soup):
  '''
  Build a list of report descriptors from a parsed FilingSummary document.

  Parameters
  ----------
  soup : parsed document exposing a BeautifulSoup-style ``find`` method. It is
         searched for a ``myreports`` element whose ``report`` children are read.

  Returns
  -------
  list of dict
      One dict per report with the keys ``short_name``, ``long_name``,
      ``filename`` and ``filetype``. ``filetype`` is 'htm' when the report has
      an ``htmlfilename`` element, 'xml' when it only has an ``xmlfilename``
      element, and 'notset' (with ``filename`` set to NaN) when it has neither.
      An empty list is returned when there is no ``myreports`` element.
  '''

  report_list = []

  #Find myreports tag

  reports = soup.find('myreports')

  if reports is None:
    # Nothing to enumerate; avoid calling find_all on None
    return report_list

  # Loop through each report, skipping the trailing two entries.
  # NOTE(review): the original comment said "ignore last one" while the slice
  # drops the last two - the slice is kept unchanged; confirm which is intended.

  for report in reports.find_all('report')[:-2]:

    report_dict = {
      'short_name': report.shortname.text,
      'long_name': report.longname.text,
    }

    if report.htmlfilename is not None:
      report_dict['filename'] = report.htmlfilename.text.strip()
      report_dict['filetype'] = 'htm'
    elif report.xmlfilename is not None:
      report_dict['filename'] = report.xmlfilename.text.strip()
      report_dict['filetype'] = 'xml'
    else:
      # Use the same lower-case 'filename' key as the other branches so every
      # dict has an identical schema (callers index the result by 'filename').
      report_dict['filename'] = np.nan
      report_dict['filetype'] = 'notset'

    report_list.append(report_dict)

  return report_list


In [3]:
def find_reports(master_report_list):
  '''
  Select the balance, income and cash reports from a list of report dicts.

  Each entry's 'short_name' is passed to bal_match_report, income_match_report
  and cash_match_report; an entry accepted by a matcher is stored under
  'Balance', 'Income' or 'Cash' respectively. When several entries satisfy the
  same matcher, the last one in the list is the one kept.

  Returns a dict containing only the categories that were matched.
  '''

  matched = {}

  for candidate in master_report_list:
    title = candidate['short_name']

    if bal_match_report(title):
      matched['Balance'] = candidate

    if income_match_report(title):
      matched['Income'] = candidate

    if cash_match_report(title):
      matched['Cash'] = candidate

  return matched

In [14]:
def get_reports(ticker):

  '''
  Generate a dictionary for given ticker which contains list of reports for each financial year.

  Every sub-directory of DATA_ROOT_PATH/<ticker> is inspected. When it holds a
  FilingSummary.xml, the file is parsed with get_report_list, the result is
  stored under the sub-directory name, and a "!"-separated filemap.csv
  (indexed by 'filename') is written next to it. Sub-directories without a
  FilingSummary.xml are reported on stdout and skipped.

  Parameters
  ----------
  ticker : name of the ticker directory below DATA_ROOT_PATH.

  Returns
  -------
  dict mapping sub-directory name -> list of report dicts.
  '''

  master_report_list = {}

  ticker_dir = os.path.join(DATA_ROOT_PATH, ticker)

  # iterate over sub-directories (sorted, because os.listdir order is arbitrary)

  for d in sorted(os.listdir(ticker_dir)):

    year_dir = os.path.join(ticker_dir, d)

    if not os.path.isdir(year_dir):
      continue

    summary_file = os.path.join(year_dir, "FilingSummary.xml")

    if not exists(summary_file):
      print("FilingSummary.xml not found: ",ticker,d)
      continue

    with open(summary_file, 'r') as f:
      soup = BeautifulSoup(f, 'lxml')

    report_list = get_report_list(soup)
    master_report_list[d] = report_list

    if not report_list:
      # An empty list yields a frame without a 'filename' column, so
      # set_index('filename') would raise KeyError; skip writing the mapping.
      print("No reports listed in FilingSummary.xml: ",ticker,d)
      continue

    # Save mapping to directory

    tmp = pd.DataFrame.from_dict(report_list)
    tmp = tmp.set_index('filename')
    tmp.to_csv(os.path.join(year_dir, "filemap.csv"), sep="!")

  return master_report_list
  

In [9]:
# Try the function on a single ticker; the result maps each sub-directory name
# to its list of report dicts (filemap.csv files are written as a side effect).
get_reports('AAL')

FilingSummary.xml not found:  AAL 10


{'12': [{'short_name': 'Document And Entity Information',
   'long_name': '00090 - Document - Document And Entity Information',
   'filename': 'R1.htm',
   'filetype': 'htmlfilename'},
  {'short_name': 'Consolidated Statements Of Operations',
   'long_name': '00100 - Statement - Consolidated Statements Of Operations',
   'filename': 'R2.htm',
   'filetype': 'htmlfilename'},
  {'short_name': 'Consolidated Statements Of Operations (Parenthetical)',
   'long_name': '00105 - Statement - Consolidated Statements Of Operations (Parenthetical)',
   'filename': 'R3.htm',
   'filetype': 'htmlfilename'},
  {'short_name': 'Consolidated Statements Of Comprehensive Income',
   'long_name': '00200 - Statement - Consolidated Statements Of Comprehensive Income',
   'filename': 'R4.htm',
   'filetype': 'htmlfilename'},
  {'short_name': 'Consolidated Balance Sheets',
   'long_name': '00300 - Statement - Consolidated Balance Sheets',
   'filename': 'R5.htm',
   'filetype': 'htmlfilename'},
  {'short_name'

In [11]:
# Read in ticker list

filepath=os.path.join(PROJ_ROOT_PATH,'files','tickers')

with open(filepath, 'r') as f:
  tickers=f.readlines()

# Strip surrounding whitespace (including the trailing \n) and drop blank
# lines, so an empty line in the file cannot become an empty ticker symbol
# (which would make os.path.join(DATA_ROOT_PATH, '') point at the data root).

ticker_list = [item.strip() for item in tickers if item.strip()]

ticker_list

['AA',
 'AAL',
 'AAP',
 'AAPL',
 'ABBV',
 'ABNB',
 'ABT',
 'ACGL',
 'ACHC',
 'ACI',
 'ACM',
 'ACN',
 'ADBE',
 'ADI',
 'ADM',
 'ADP',
 'ADSK',
 'ADT',
 'AEE',
 'AEP',
 'AES',
 'AFG',
 'AFL',
 'AFRM',
 'AGCO',
 'AGL',
 'AGNC',
 'AGO',
 'AGR',
 'AIG',
 'AIZ',
 'AJG',
 'AKAM',
 'AL',
 'ALB',
 'ALGM',
 'ALGN',
 'ALK',
 'ALL',
 'ALLE',
 'ALLY',
 'ALNY',
 'ALSN',
 'AM',
 'AMAT',
 'AMC',
 'AMCR',
 'AMD',
 'AME',
 'AMED',
 'AMG',
 'AMGN',
 'AMH',
 'AMP',
 'AMT',
 'AMZN',
 'AN',
 'ANET',
 'ANSS',
 'AON',
 'AOS',
 'APA',
 'APD',
 'APH',
 'APO',
 'APP',
 'APTV',
 'AR',
 'ARE',
 'ARES',
 'ARMK',
 'ARW',
 'ASH',
 'ATO',
 'ATR',
 'ATUS',
 'ATVI',
 'AVB',
 'AVGO',
 'AVT',
 'AVTR',
 'AWI',
 'AWK',
 'AXON',
 'AXS',
 'AXTA',
 'AYI',
 'AYX',
 'AZEK',
 'AZO',
 'AZTA',
 'BA',
 'BAC',
 'BAH',
 'BALL',
 'BAX',
 'BBWI',
 'BBY',
 'BC',
 'BDX',
 'BEN',
 'BERY',
 'BFAM',
 'BG',
 'BHF',
 'BIIB',
 'BILL',
 'BIO',
 'BJ',
 'BK',
 'BKI',
 'BKNG',
 'BLD',
 'BLDR',
 'BLK',
 'BMRN',
 'BMY',
 'BOKF',
 'BR',
 'BRKR',
 'BRO

In [15]:
#Generate a dataframe with a list of all files for all tickers

# Collect one frame per (ticker, year) and concatenate once at the end;
# concatenating inside the loop copies every accumulated row on each iteration.
filemap_frames = []

for ticker in ticker_list:
  print(ticker)
  reports = get_reports(ticker)

  for key, value in reports.items():
    tmp_df = pd.DataFrame.from_dict(value)
    tmp_df['FYR'] = key
    tmp_df['ticker'] = ticker

    filemap_frames.append(tmp_df)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
# The per-report index of each frame is kept (no ignore_index), as before.
df = pd.concat(filemap_frames) if filemap_frames else pd.DataFrame()


AA
AAL
FilingSummary.xml not found:  AAL 10
AAP
FilingSummary.xml not found:  AAP 10
AAPL
ABBV
FilingSummary.xml not found:  ABBV 13
ABNB
FilingSummary.xml not found:  ABNB 21
ABT
ACGL
FilingSummary.xml not found:  ACGL 10
ACHC
ACI
ACM
ACN
ADBE
ADI
ADM
ADP
ADSK
ADT
FilingSummary.xml not found:  ADT 18
AEE
AEP
AES
AFG
FilingSummary.xml not found:  AFG 10
AFL
AFRM
AGCO
FilingSummary.xml not found:  AGCO 10
AGL
AGNC
FilingSummary.xml not found:  AGNC 10
FilingSummary.xml not found:  AGNC 11
AGO
FilingSummary.xml not found:  AGO 10
AGR
AIG
AIZ
AJG
FilingSummary.xml not found:  AJG 10
AKAM
AL
ALB
FilingSummary.xml not found:  ALB 10
ALGM
ALGN
FilingSummary.xml not found:  ALGN 10
ALK
FilingSummary.xml not found:  ALK 10
ALL
ALLE
FilingSummary.xml not found:  ALLE 14
ALLY
FilingSummary.xml not found:  ALLY 10
FilingSummary.xml not found:  ALLY 11
ALNY
FilingSummary.xml not found:  ALNY 10
ALSN
AM
AMAT
AMC
FilingSummary.xml not found:  AMC 14
AMCR
AMD
FilingSummary.xml not found:  AMD 10
AME


GME
GMED
GNRC
FilingSummary.xml not found:  GNRC 10
FilingSummary.xml not found:  GNRC 11
GNTX
FilingSummary.xml not found:  GNTX 10
GO
GOOG
GOOGL
GPC
GPK
FilingSummary.xml not found:  GPK 10
FilingSummary.xml not found:  GPK 11
GPN
FilingSummary.xml not found:  GPN 10
GPS
GRMN
GS
GTES
FilingSummary.xml not found:  GTES 18
GWRE
GWW
GXO
H
FilingSummary.xml not found:  H 10
FilingSummary.xml not found:  H 11
HAL
HAS
FilingSummary.xml not found:  HAS 10
HAYW
HBAN
FilingSummary.xml not found:  HBAN 10
HBI
FilingSummary.xml not found:  HBI 10
HCA
FilingSummary.xml not found:  HCA 10
FilingSummary.xml not found:  HCA 11
HD
HE
FilingSummary.xml not found:  HE 10
HEI
HES
HHC
FilingSummary.xml not found:  HHC 11
HIG
FilingSummary.xml not found:  HIG 11
HII
HIW
FilingSummary.xml not found:  HIW 10
HLT
FilingSummary.xml not found:  HLT 14
HOG
HOLX
HON
HPE
HPP
FilingSummary.xml not found:  HPP 15
FilingSummary.xml not found:  HPP 11
HPQ
FilingSummary.xml not found:  HPQ 17
HR
HRB
HRL
HSIC
FilingSu

PKG
FilingSummary.xml not found:  PKG 10
PKI
FilingSummary.xml not found:  PKI 10
PLD
FilingSummary.xml not found:  PLD 10
PLNT
PLTK
PLTR
PLUG
FilingSummary.xml not found:  PLUG 10
FilingSummary.xml not found:  PLUG 11
PM
PNC
PNFP
FilingSummary.xml not found:  PNFP 10
FilingSummary.xml not found:  PNFP 11
PNR
FilingSummary.xml not found:  PNR 10
PNW
FilingSummary.xml not found:  PNW 10
PODD
FilingSummary.xml not found:  PODD 10
FilingSummary.xml not found:  PODD 11
POOL
FilingSummary.xml not found:  POOL 10
POST
PPC
FilingSummary.xml not found:  PPC 11
PPG
PPL
PRGO
PRI
FilingSummary.xml not found:  PRI 11
PRU
PSA
PSTG
PSX
PTC
PTON
PVH
FilingSummary.xml not found:  PVH 10
PWR
PXD
PYCR
PYPL
QCOM
QDEL
QRVO
QS
RARE
FilingSummary.xml not found:  RARE 14
RE
FilingSummary.xml not found:  RE 10
REG
FilingSummary.xml not found:  REG 10
REGN
FilingSummary.xml not found:  REGN 10
REXR
REYN
FilingSummary.xml not found:  REYN 20
RF
FilingSummary.xml not found:  RF 11
RGA
FilingSummary.xml not found

In [9]:
#Save the dataframe as a pickle file

# Serialize first: opening the target with 'wb' truncates it immediately, so
# if df is missing or cannot be pickled the existing filemap.pkl would
# otherwise be left empty (and a later read would fail).
filemap_bytes = pickle.dumps(df)

# Save to file

with open(PROJ_ROOT_PATH + '/pickle/filemap.pkl', 'wb') as f:
  f.write(filemap_bytes)


NameError: name 'df' is not defined

In [10]:
# Save ticker list

# Serialize before opening the target: 'wb' truncates the file on open, so a
# failure while pickling must not leave an empty ticker_list.pkl behind.
ticker_list_bytes = pickle.dumps(ticker_list)

with open(PROJ_ROOT_PATH + '/pickle/ticker_list.pkl', 'wb') as f:
  f.write(ticker_list_bytes)

In [9]:
# Reload the file map from the project's pickle directory.
# Unpickling can execute arbitrary code, so only load files this project wrote itself.
df = pd.read_pickle(PROJ_ROOT_PATH + '/pickle/filemap.pkl')

EOFError: Ran out of input

In [7]:
# Inspect the loaded file map (one row per report, with FYR and ticker columns)
df

Unnamed: 0,short_name,long_name,filename,filetype,FYR,ticker
0,Document and Entity Information,1001 - Document - Document and Entity Information,R1.htm,htmlfilename,17,AA
1,Statement of Consolidated Operations,1003 - Statement - Statement of Consolidated Operations,R2.htm,htmlfilename,17,AA
2,Statement of Consolidated Comprehensive (Loss) Income,1004 - Statement - Statement of Consolidated Comprehensive (Loss) Income,R3.htm,htmlfilename,17,AA
3,Consolidated Balance Sheet,1005 - Statement - Consolidated Balance Sheet,R4.htm,htmlfilename,17,AA
4,Statement of Consolidated Cash Flows,1006 - Statement - Statement of Consolidated Cash Flows,R5.htm,htmlfilename,17,AA
...,...,...,...,...,...,...
98,Self-Insurance - Additional Information (Detail),2488447 - Disclosure - Self-Insurance - Additional Information (Detail),R99.htm,htmlfilename,20,ZG
99,Employee Benefit Plan - Additional Information (Detail),2490448 - Disclosure - Employee Benefit Plan - Additional Information (Detail),R100.htm,htmlfilename,20,ZG
100,Segment Information and Revenue - Additional Information (Detail),2493449 - Disclosure - Segment Information and Revenue - Additional Information (Detail),R101.htm,htmlfilename,20,ZG
101,Segment Information and Revenue - Revenue Categories (Detail),2494450 - Disclosure - Segment Information and Revenue - Revenue Categories (Detail),R102.htm,htmlfilename,20,ZG


In [8]:
# Unique, lower-cased statement titles: the part of long_name that follows
# the " - Statement - " marker, for rows that contain the marker.
statement_marker = r" - Statement - "
is_statement = df['long_name'].str.contains(statement_marker)
statement_titles = df.loc[is_statement, 'long_name'].str.split(statement_marker, expand=True)[1]
s_temp = pd.Series(statement_titles.str.lower().unique())
s_temp

0                                                                      statement of consolidated operations
1                                                     statement of consolidated comprehensive (loss) income
2                                                                                consolidated balance sheet
3                                                                      statement of consolidated cash flows
4                                                               statement of changes in consolidated equity
                                                       ...                                                 
2658    consolidated and combined statements of changes in stockholders' equity and comprehensive income...
2659                                         consolidated and combined statements of cash flows (unaudited)
2660              consolidated statements of shareholders' equity (deficit) and comprehensive income (loss)
2661    parenthetical data f