In [1]:
import numpy as np
import pandas as pd
import time
import os
import pickle
import re

In [33]:
pd.set_option('display.max_rows', 700)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 200)
pd.options.display.float_format = '{:.2f}'.format

In [3]:
DATA_ROOT_PATH="/mnt/data/projects/MD1/data/R1000/reports/"
PROJ_ROOT_PATH="/home/priyesh/projects/MD1"

In [4]:
# Read from Pickle file

filepath = os.path.join(PROJ_ROOT_PATH,'pickle','clean_master_data_interim.pkl')
clean_master_data = pd.read_pickle(filepath)

In [5]:
def get_line_items(t):
 
  headings = []
  line_items = []
    
  for index, row in t.iterrows():
    
    if row.replace('', np.nan).isna().all():
      # Blank row, so assume heading
    
      headings.append(index)
    else:    
      line_items.append(index)

  return headings, line_items

In [6]:
def get_table_labels(stype):

  master_headings = []
  master_line_items = []

  ticker_list = list(clean_master_data.keys())

  for ticker in ticker_list:
    
    print(ticker)
    
    for yr in clean_master_data[ticker]:
      
      # Allow for the likelihood that statement might be missing for given ticker and yr
        
      if stype in clean_master_data[ticker][yr]:
        try:
          headings, line_items = get_line_items(clean_master_data[ticker][yr][stype]['table'])
    
          master_headings = master_headings + headings
          master_line_items = master_line_items + line_items
        except:
          print("Error : {} {}".format(ticker, yr))
          exit() 

  # Remove duplicates from headings and line items

  unique_list = set(master_headings)
  headings = list(unique_list)

  unique_list = set(master_line_items)
  line_items = list(unique_list)

  return {'headings' : headings,
          'line_items' : line_items}


In [7]:
table_labels = get_table_labels('income')

AAL
AAP
AAPL
ABBV
ABT
ACGL
ACHC
ACM
ACN
ADBE
ADI
ADM
ADP
ADSK
AEE
AEP
AES
AFG
AFL
AGCO
AGNC
AGO
AGR
AIG
AIZ
AJG
AKAM
AL
ALB
ALGN
ALK
ALL
ALLE
ALLY
ALNY
ALSN
AMAT
AMC
AMD
AME
AMED
AMG
AMGN
AMH
AMP
AMT
AMZN
AN
ANET
ANSS
AON
AOS
APD
APH
APTV
AR
ARE
ARES
ARMK
ARW
ATO
ATR
ATVI
AVB
AVT
AWI
AWK
AXS
AXTA
AYI
AZO
BA
BAC
BAH
BALL
BAX
BBY
BC
BDX
BEN
BERY
BFAM
BG
BIIB
BIO
BK
BKI
BKNG
BLD
BLDR
BLK
BMRN
BMY
BOKF
BR
BRKR
BRO
BRX
BSX
BURL
BWA
BWXT
BX
BXP
BYD
C
CABO
CACC
CACI
CAG
CAH
CAR
CASY
CAT
CB
CBOE
CBRE
CBSH
CC
CCI
CCK
CCL
CDNS
CDW
CE
CF
CFG
CFR
CG
CGNX
CHD
CHDN
CHE
CHH
CHK
CHRW
CHTR
CIEN
CINF
CL
CLH
CLX
CMA
CMCSA
CME
CMG
CMI
CMS
CNA
CNC
CNP
COF
COLB
COLM
COO
COST
COTY
CPB
CPRI
CPRT
CPT
CR
CRI
CRL
CRUS
CSCO
CSGP
CSL
CSX
CTAS
CTLT
CTSH
CUBE
CUZ
CVS
CVX
CW
D
DAL
DAR
DCI
DECK
DEI
DFS
DG
DGX
DHI
DHR
DISH
DKS
DLB
DLR
DLTR
DOV
DPZ
DRI
DTE
DUK
DVA
DVN
DXCM
EA
EBAY
ECL
ED
EEFT
EFX
EGP
EHC
EIX
EL
ELS
EMN
EMR
ENPH
ENTG
EOG
EPAM
EPR
EQIX
EQR
EQT
ERIE
ES
ESI
ESS
ETN
ETR
EVA
EVR
EW
EWBC
EXAS
EXC
EXEL
EXP
EXPD

In [8]:
all_table_labels = table_labels['headings'] + table_labels['line_items']
df = pd.DataFrame(all_table_labels,columns=['line_item'])

### Net Sales

In [9]:
mask = df['line_item'].str.contains(r"^Net Sales",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Related Part",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Passenger|Hardware|Software|Excl|Less",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Service|Estate|Customer|Cons|Gross",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Multi|Fee|Interest|Dividend|Oper",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Cloud|Subs|Maint|Util|Lease|Prod",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Insur|Rent|Office|Home|Comm|non",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Automotive|Cost|Property|Broker|Ext",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred|Recur|Member|Gas",case=True, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,1114,Net Sales:
1,3738,Net Sales And Revenue
2,4079,Net Sales
3,8501,Net Sales
4,9393,Net Sales Note
5,16277,Net Sales And Revenues
6,20641,Net Sales Including Special Charges Of 29.6 In 2011


In [10]:
mask = df['line_item'].str.contains(r"^Net Sales",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Related Part",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Including",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Passenger|Hardware|Software|Excl|Less",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Service|Estate|Customer|Cons|Gross",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Multi|Fee|Interest|Dividend|Oper",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Cloud|Subs|Maint|Util|Lease|Prod",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Insur|Rent|Office|Home|Comm|non",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Automotive|Cost|Property|Broker|Ext",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred|Recur|Member|Gas",case=True, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,9423,"Net Sales Including Sales To Related Parties Of 23.5 Million, 68.8 Million And 43.5 Million, In ..."
1,11339,"Net Sales Including Sales To Related Parties, See Note 17"
2,16592,"Net Sales Including Sales To Related Parties, See Note 18"
3,18034,Net Sales Including Related Party Sales Of 68.8 Million In 2011 And 43.5 Million In 2010
4,20519,"Net Sales Including Sales To Related Parties, See Note 16"


### Revenue

In [11]:
mask = df['line_item'].str.contains(r"Revenues",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Including",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Related Parties",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other",case=True, regex=True, na=False) 

df[mask].reset_index()

Unnamed: 0,index,line_item
0,5371,"Revenues Including Amounts From Related Parties Of 0, 13,882 And 49,788, Respectively"
1,12285,"Revenues Including Amounts From Related Parties Of 13,882, In 2011, Note 19"
2,14874,"Revenues Including Excise Taxes Includes 8,269 In 2022, 7,822 In 2021 And 7,572 In 2020 From Rel..."
3,15747,"Revenues Including Amounts From Related Parties Of 13,882, 49,788 And 60,192 For The Years Ended..."


### Total Revenue

In [12]:
mask = df['line_item'].str.contains(r"Revenue",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Passenger|Hardware|Software|Excl|Less",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Service|Estate|Customer|Cons|Gross",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Multi|Fee|Interest|Dividend|Oper",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Cloud|Subs|Maint|Util|Lease|Net",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Insur|Rent|Office|Home|Comm|non",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Automotive|Cost|Property|Broker|Sales",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred|Recur|Member|Gas",case=True, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,213,Total Revenues
1,1110,Total Revenue
2,8980,Total Revenues
3,9058,Total Revenues Note
4,17504,Total Revenue Note
5,17725,"Revenues, Total"
6,18898,Total Revenue
7,21502,"Revenue, Total"


### Total Net Revenue

In [13]:
mask = df['line_item'].str.contains(r"Revenue",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Net",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Passenger|Hardware|Software|Excl|Less",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Service|Estate|Customer|Cons|Gross",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Multi|Fee|Interest|Dividend|Oper",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Cloud|Subs|Maint|Util|Lease|Sales",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Insur|Rent|Office|Home|Comm|non",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Automotive|Cost|Property|Broker",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred|Recur|Member|Gas",case=True, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,7522,Total Net Revenue Loss
1,10474,"Total Revenues, Net"
2,15110,Total Net Revenue
3,16020,"Total Revenue, Net"
4,22095,Total Net Revenues


In [14]:
mask = df['line_item'].str.contains(r"Revenue",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Net",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Sales",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Passenger|Hardware|Software|Excl|Less",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Service|Estate|Customer|Cons|Gross",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Multi|Fee|Interest|Dividend|Oper",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Cloud|Subs|Maint|Util|Lease",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Insur|Rent|Office|Home|Comm|non",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Automotive|Cost|Property|Broker",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred|Recur|Member|Gas",case=True, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,10186,Total Net Sales And Revenue
1,19976,Total Net Sales And Revenue Note


In [15]:
mask = df['line_item'].str.contains(r"Revenue",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Net",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Includes",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Passenger|Hardware|Software|Excl|Less",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Service|Estate|Customer|Cons|Gross",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Multi|Fee|Interest|Dividend|Oper",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Cloud|Subs|Maint|Util|Lease",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Insur|Rent|Office|Home|Comm|non",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Automotive|Cost|Property|Broker",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred|Recur|Member|Gas",case=True, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,11209,"Net Revenues Includes 3,658 In 2022, 3,330 In 2021 And 3,233 In 2020 From Related Parties"


### Total Operating Revenue

In [16]:
mask = df['line_item'].str.contains(r"Revenue",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Oper",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Passenger|Hardware|Software|Excl|Less",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Multi|Fee|Interest|Dividend|Non",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Automotive|Cost|Property|Broker",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred|Recur|Member|Gas|Util",case=True, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,2332,Total Operating Revenues
1,12868,Total Operating Revenue
2,15694,Total Operating Revenues


### Total Cost of Revenue

In [17]:
mask = df['line_item'].str.contains(r"Revenue",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Cost",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Passenger|Hardware|Software",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Excl|Recurr",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Service|Estate|Customer",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Multi|Fee|Interest|Dividend",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Cloud|Subs|Maint|Util|Lease",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Insur|Rent|Office|Home|Comm|non",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Automotive|Property|Broker",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred",case=False, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,8643,Total Cost Of Revenue
1,10950,Total Cost Of Revenues
2,12650,Total Costs Of Revenue
3,16508,Total Costs Of Revenues


### Cost of Revenue

In [18]:
mask = df['line_item'].str.contains(r"Revenue",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"^Cost",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Total|Dep|Prod|Lic|Sup|Con|Bio",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Party|Related|Less|Rest|Land",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Passenger|Hardware|Software",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Service|Estate|Customer|Form",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Royal|Coal|Excl|Sales|Net",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Multi|Fee|Interest|Dividend",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Cloud|Subs|Maint|Util|Lease|Reimb",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Insur|Rent|Office|Home|Comm|non",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Automotive|Property|Broker",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred|Member|Abstract",case=False, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,1477,Cost Of Revenue
1,1657,Cost Of Revenues:
2,2036,Costs Of Revenues:
3,2050,Cost Of Revenue:
4,2551,Cost Of Revenues
5,2697,Costs Of Revenue:
6,5783,Cost Of Revenue
7,10544,Cost Of Revenues:
8,15910,Cost Of Revenues
9,18094,Cost Of Revenue.


In [19]:
mask = df['line_item'].str.contains(r"Revenue",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"^Cost",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Including",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Amort",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Impair",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Definite",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred|Member|Abstract",case=True, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,6507,"Cost Of Revenues Including Amortization And Impairments Of Feature Film Production Assets Of 39,..."
1,9079,Cost Of Revenues Including Amortization And Impairments Of Feature Film And Television Productio...
2,13303,"Cost Of Revenues Including Amortization And Impairments Of Feature Film Production Assets Of 8,7..."


In [20]:
mask = df['line_item'].str.contains(r"Revenue",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"^Cost",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Excluding|Exclusive",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Amort",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Definite|Dep|Intang",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred|Member|Abstract",case=True, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,1220,Cost Of Revenue Exclusive Of Amortization:
1,2009,Cost Of Revenue Exclusive Of Amortization
2,12002,Cost Of Revenues Exclusive Of Amortization Shown Below
3,15358,Cost Of Revenue Exclusive Of Amortization
4,18229,Cost Of Revenue Exclusive Of Amortization Shown Below


In [21]:
mask = df['line_item'].str.contains(r"Revenue",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"^Cost",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Excluding|Exclusive",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Amort",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Depr",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Definite|Intang",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Deferred|Member|Abstract",case=True, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,5600,"Costs Of Revenue, Excluding Depreciation And Amortization"
1,5929,Cost Of Revenue Exclusive Of Depreciation And Amortization Shown Separately Below
2,7773,Cost Of Revenues Exclusive Of Depreciation And Amortization
3,10737,"Cost Of Revenue Exclusive Of Depreciation And Amortization, Which Is Shown Separately Below"
4,11219,Cost Of Revenues Exclusive Of Depreciation And Amortization Expense Shown Separately Below
5,12032,Cost Of Revenue Exclusive Of Depreciation And Amortization As Shown Separately Below
6,14703,Cost Of Revenue Excluding Depreciation And Amortization
7,16995,"Costs Of Revenue, Exclusive Of Depreciation And Amortization"
8,21226,"Cost Of Revenues, Exclusive Of Depreciation And Amortization"


In [22]:
mask = df['line_item'].str.contains(r"Cost Of Sales",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Part|Serv|Prod|Unit|Comm|^Dep",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Software|Chem|Mids|Auto",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Excl|Incl|Res|Rev|Online|Fuel",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Occ|Member|Ext|Energy|Type",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Man|Oper|Sub|Food|Bev",case=False, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,1581,Cost Of Sales:
1,4301,Cost Of Sales
2,11913,Cost Of Sales-A
3,17679,Cost Of Sales
4,18124,Cost Of Sales A


In [23]:
mask = df['line_item'].str.contains(r"Cost Of Sales",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Excl",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Dep",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Part|Serv|Prod|Unit|Comm|^Dep",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Software|Chem|Mids|Auto",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Res|Rev|Online|Fuel",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Occ|Member|Ext|Energy|Type",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Other|Man|Oper|Sub|Food|Bev",case=False, regex=True, na=False)
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,1180,Cost Of Sales Excluding Depreciation Shown Below:
1,5837,Cost Of Sales Excluding Depreciation And Amortization Expense Shown Separately Below
2,5996,"Cost Of Sales Excludes Depreciation, Depletion And Amortization Of 3,584 In 2011, 3,145 In 2010 ..."
3,6584,"Cost Of Sales Excludes Depreciation, Depletion, And Amortization Of 4,000 In 2017, 4,266 In 2016..."
4,6878,"Cost Of Sales, Exclusive Of Depreciation And Amortization"
5,8041,"Cost Of Sales Excludes Depreciation, Depletion And Amortization Of 5,341 In 2013, 4,504 In 2012 ..."
6,8198,Cost Of Sales Exclusive Of Depreciation Shown Below
7,13451,"Cost Of Sales, Excluding Depreciation"
8,13697,Cost Of Sales Excluding Depreciation Shown Below
9,14137,"Cost Of Sales Excludes Depreciation, Depletion, And Amortization Of 3,976 In 2018, 4,000 In 2017..."


### Total Cost of Revenue

In [25]:
mask = df['line_item'].str.contains(r"Cost Of Sales",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Veh|Other|Oper|Excl",case=True, regex=True, na=False)
df[mask].reset_index()

Unnamed: 0,index,line_item
0,3341,Total Cost Of Sales
1,7691,Total Cost Of Sales


In [26]:
mask = df['line_item'].str.contains(r"Cost Of Sales",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Excl",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Veh|Other|Oper",case=True, regex=True, na=False)
df[mask].reset_index()

Unnamed: 0,index,line_item
0,4715,"Total Cost Of Sales, Excluding Restaurant Depreciation And Amortization Of 295.6, 283.4, 267.1, ..."
1,10340,"Total Cost Of Sales, Excluding Restaurant Depreciation And Amortization Of 373.7, 326.9 And 295...."
2,12276,"Total Cost Of Sales, Excluding Restaurant Depreciation And Amortization Of 326.9, 295.6, 283.4, ..."
3,16370,Total Cost Of Sales Excluding Depreciation Shown Below
4,17560,"Total Cost Of Sales, Excluding Restaurant Depreciation And Amortization Of 282.3, 257.5 And 219...."


In [27]:
mask = df['line_item'].str.contains(r"Cost Of Revenue",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Total",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Excl",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Veh|Other|Oper|Non|Auto",case=True, regex=True, na=False)
df[mask].reset_index()

Unnamed: 0,index,line_item
0,5962,Total Cost Of Revenues Exclusive Of Amortization Shown Separately Below
1,16308,Total Cost Of Revenues Exclusive Of Acquired Intangible Assets Amortization Shown Separately Below


### Selling, General, Administrative Including Stock-Based Comp

In [28]:
mask = df['line_item'].str.contains(r"Selling",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"General",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Admin",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Including Stock",case=False, regex=True, na=False)
       
df[mask].reset_index()

Unnamed: 0,index,line_item
0,4858,"Selling, General, Administrative And Development Expense Including Stock-Based Compensation Expe..."
1,4959,"Selling, General, Administrative And Development Expense Including Stock-Based Compensation Expe..."
2,5332,"Selling, General And Administrative Expenses Including Stock-Based Compensation Of 13,274 In 201..."
3,6243,"Selling, General And Administrative Including Stock-Based Compensation And Excluding Depreciatio..."
4,7083,"Selling, General, Administrative And Development Expense Including Stock-Based Compensation Expe..."
5,7335,"Selling, General And Administrative Expenses Including Stock-Based Compensation Of 19,530 And 13..."
6,8345,"Selling, General And Administrative, Including Stock-Based Compensation And Transaction Costs No..."
7,8497,"Selling, General And Administrative Expenses Including Stock-Based Compensation Of 21,462, 19,53..."
8,9063,"Selling, General, Administrative And Development Expense Including Stock-Based Compensation Expe..."
9,12475,"Selling, General And Administrative Including Stock-Based Compensation Of 62,875 In 2012, 54,261..."


In [34]:
mask = df['line_item'].str.contains(r"Selling",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"General",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Admin",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Excluding",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Dep|Amort",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"Including",case=False, regex=True, na=False) 
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,4793,"Selling, General And Administrative Expenses, Excluding Depreciation And Amortization"
1,5167,"Selling, General, And Administrative Expenses, Excluding Depreciation And Amortization"
2,17710,"Selling, General And Administrative Expenses Excluding Depreciation And Amortization Expense Of 81,695, 75,211, And 56,011 Shown Separately Below"


In [36]:
mask = df['line_item'].str.contains(r"Selling",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"General",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Admin",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Excluding",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Dep|Amort",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Including",case=False, regex=True, na=False) 
         
df[mask].reset_index()

Unnamed: 0,index,line_item
0,7063,"Selling, General And Administrative Expenses, Excluding Impairment Losses"
1,10992,"Selling, General And Administrative Expenses, Excluding Goodwill Impairment"
2,11618,"Selling, General And Administrative Expenses, Excluding Receivable Impairment"
3,14919,"Selling, General And Administrative Expenses, Excluding Goodwill Impairment And Receivable Impairment"


### General And Administrative

In [9]:
mask = df['line_item'].str.contains(r"General And Administrative",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Incl",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Stock|Equity",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Dep|Selling|Excl|Member",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Party",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Asset|Total|Other|Home|Prop",case=True, regex=True, na=False) 
       
df[mask].reset_index()

Unnamed: 0,index,line_item
0,4084,"General And Administrative Including 73,517, 12,778 And 13,529 Of Equity-Based Compensation In 2..."
1,4340,"General And Administrative Including Equity-Based Compensation Expense Of 365,280, 112,252 And 9..."
2,5320,"General And Administrative Including Equity-Based Compensation Expense Of 23,317, 20,437 And 35,..."
3,5770,"General And Administrative Expenses Including Non-Cash Equity-Based Compensation, Net Of Capital..."
4,5844,"General And Administrative Including Equity-Based Compensation Expense Of 70,414, 23,559 And 23,..."
5,5891,"General And Administrative Including 250, 102 And 28 Of Non-Cash Stock-Based Compensation, Respe..."
6,6452,"General And Administrative Expenses Including Non-Cash Equity-Based Compensation, Net Of Capital..."
7,8964,"General And Administrative Including Equity-Based Compensation Expense Of 23,559, 23,317 And 20,..."
8,9163,"General And Administrative Including 448, 250 And 102 For The Years Ending December 31, 2012, 20..."
9,10701,"General And Administrative Including 12,778, 13,529 And 19,654 Of Equity-Based Compensation In 2..."


In [18]:
mask = df['line_item'].str.contains(r"General And Admin",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Excl",case=True, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Sell|Asset",case=True, regex=True, na=False) 
       
df[mask].reset_index()

Unnamed: 0,index,line_item
0,8295,General And Administrative Expenses Excluding Depreciation And Amortization Expense Reflected Below


### Selling, General And Admin

In [None]:
mask = df['line_item'].str.contains(r"General",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Administrative",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Selling",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Including|Excluding",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"^Dep|Intangible|Amort",case=True, regex=True, na=False)
       
df[mask].reset_index()

In [None]:
mask = df['line_item'].str.contains(r"General",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Administrative",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Sales",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"^Dep|Home|Percent|Intan",case=True, regex=True, na=False)
       
df[mask].reset_index()

## Operating Expenses

In [22]:
mask = df['line_item'].str.contains(r"Operat.*Expense",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Excl",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Asset|Segment|Item",case=True, regex=True, na=False)
       
df[mask].reset_index()

Unnamed: 0,index,line_item
0,5448,Operating Costs And Expenses Excluding Depreciation And Amortization
1,7242,"Direct Operating Expenses, Excluding Depreciation And Amortization"
2,8734,"Operating Expenses, Excluding Depreciation And Amortization Below"
3,11495,Direct Operating Expense Exclusive Of Depreciation And Amortization
4,14250,Operating Expense Exclusive Of Depreciation And Amortization Shown Separately Below
5,15533,"Operating Expense, Excluding Depreciation And Amortization Below"
6,15580,Operating Expenses Exclusive Of Depreciation And Amortization
7,17692,Operating Expenses Excluding Depreciation And Amortization Shown Separately Below
8,19993,Operating Expenses Excluding Depreciation And Amortization Expense Reflected Below


In [27]:
mask = df['line_item'].str.contains(r"Operat.*Expense",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Incl|Excl",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Asset|Segment|Item|In |Other",case=True, regex=True, na=False)
       
df[mask].reset_index()

Unnamed: 0,index,line_item
0,283,Operating Expense [Member]
1,326,Non-Operating Expense:
2,386,"Non-Operating Income Expense, Net:"
3,399,Cost And Operating Expenses:
4,448,Nonoperating Expenses:
5,488,Operating Expenses
6,524,Non-Operating Income And Expense
7,554,"Non-Operating Expense Income, Net"
8,590,Non-Operating Expenses Income:
9,600,Non-Operating Income Expense


### Gross Profit

In [None]:
mask = df['line_item'].str.contains(r"Gross Profit",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"Member|Per|Total",case=False, regex=True, na=False)
   
df[mask].reset_index()

In [28]:
mask = df['line_item'].str.contains(r"Gross Profit",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"Member|Per",case=False, regex=True, na=False)
   
df[mask].reset_index()

Unnamed: 0,index,line_item
0,12117,Total Gross Profit
1,19692,"Gross Profit, Total"


# Depreciation