In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from os import listdir
import re

import csv
import datetime
import pickle
import os

import calendar
import random

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.options.display.float_format = '{:.0f}'.format

In [3]:
DATA_ROOT_PATH="/mnt/data/projects/MD1/data/R1000/reports/"
PROJ_ROOT_PATH="/home/priyesh/projects/MD1"

In [4]:
# Read from Pickle file

filepath = os.path.join(PROJ_ROOT_PATH,'pickle','std_statements_stage4.pkl')
master_std_statements = pd.read_pickle(filepath)

In [5]:
# Read from Pickle file

filepath = os.path.join(PROJ_ROOT_PATH,'pickle','clean_master_data_stage2.pkl')
clean_master_data = pd.read_pickle(filepath)

In [6]:
# Read from Pickle file

filepath = os.path.join(PROJ_ROOT_PATH,'pickle','master_data_stage2.pkl')
master_data = pd.read_pickle(filepath)

In [7]:
# Read from Pickle file

table_labels = {}

filepath = os.path.join(PROJ_ROOT_PATH,'pickle','mappings_cash_stage3.pkl')
table_labels['cash'] = pd.read_pickle(filepath)
                        
filepath = os.path.join(PROJ_ROOT_PATH,'pickle','mappings_bal_stage3.pkl')
table_labels['balance'] = pd.read_pickle(filepath)
                        
filepath = os.path.join(PROJ_ROOT_PATH,'pickle','mappings_income_stage3.pkl')
table_labels['income'] = pd.read_pickle(filepath)

In [9]:
stype='balance'
table_labels[stype]['headings']

Unnamed: 0_level_0,type
heading,Unnamed: 1_level_1
Eidp | 3.50 Series Preferred Stock [Member],EXCL
International Annuity [Member],EXCL
Non-Controlling Common Units In The Operating Partnership [Member],EXCL
Electric [Member],EXCL
"Houston Electric [Member] | Variable Interest Entity, Primary Beneficiary [Member]",EXCL
...,...
Ohio Power Co [Member],EXCL
First Horizon National Corporation Shareholders Equity:,SE
Ku [Member],EXCL
Unity Software Inc. Stockholders Equity:,SE


In [10]:
heading_lookup = {}
line_item_lookup = {}

for stype in ['balance','income','cash']:
  print(stype) 
  temp = table_labels[stype]['headings'].to_dict()
  heading_lookup[stype] = temp['type']

  temp = table_labels[stype]['line_items'].to_dict()
  line_item_lookup[stype] = temp['type']

balance
income
cash


In [11]:
clean_master_data['GOOGL'].keys()

dict_keys(['17', '23', '16', '22', '21', '19', '18', '20'])

In [12]:
def swap_text(type, text, stype):
  
  # Default to original text. Swap if found in lookup.
    
  new_text = text
    
  if text in swap_text_dict[stype][type]:
      
    new_text = swap_text_dict[stype][type][text]

  return new_text  

In [13]:
def find_line_items(table, heading_lookup, line_item_lookup, stype):
 
  df_new = pd.DataFrame()

  h = 'No Heading'
  items_found = []
    
  h_lookup = heading_lookup[stype]
  l_lookup = line_item_lookup[stype]
    
  items_required = list(set(l_lookup.values()))

  # Go through each line in table, if row contains only nan then assume heading

  for index, row in table.iterrows():

    if row.isnull().all():
   
      # If all columns are null then it is either a header we are interested in, a header not recognised 
      # or a line item where values are not specified.
    
      # If we encounter a header that has Member or Abstract then terminate loop and ignore subsequent items.
    
      if index in h_lookup:
        
        h = h_lookup[index]
        
        if h == 'EXCL':
          break
        else:
          r1 = row.copy()
          #r1 = r1.rename(swap_text('H',index,stype))
          r1['type'] = h
          r1['line_type'] = 'H'
          df_new = pd.concat([df_new, pd.DataFrame([r1])])
  
    else:
      # If row matches one of the items mapped for current header then store
    
      if index.startswith('Basic'):
        break
        
      if index.startswith('Net Revenues From Collaborators'):
        break
        
      if re.match(r'^Weighted.*Average.*Shares',index, re.IGNORECASE):
        break
        
      if re.match(r'^Weighted.*Average.*Basic',index, re.IGNORECASE):
        break
        
      if re.match(r'^Earnings.*Common Share.*Basic',index, re.IGNORECASE): 
        break
        
      if re.match(r'^Earnings.*per Share.*Basic',index, re.IGNORECASE): 
        break
        
      if re.match(r'^Net.*Income.*Attrib',index, re.IGNORECASE): 
        break
        
      if index in l_lookup:
        
        l = l_lookup[index]
        items_found.append(l)
          
        r1 = row.copy()
        #r1.rename(swap_text('L',index,stype))
        r1['type'] = l
        r1['line_type'] = 'L'
        df_new = pd.concat([df_new, pd.DataFrame([r1])])

  return(df_new)

In [14]:
stype='income'
l_lookup = line_item_lookup[stype]
l_lookup['Revenues']

KeyError: 'Revenues'

In [15]:
def extract_summary(stype):
    
  ticker_list = list(clean_master_data.keys())

  master_statements = {}

  for ticker in ticker_list:

    master_statements[ticker] = {}
    print(ticker)
    
    for yr in clean_master_data[ticker]:
    
      # Extract statement for each year
    
      master_statements[ticker]
      if stype in clean_master_data[ticker][yr]:
        
        master_statements[ticker][yr] = find_line_items(clean_master_data[ticker][yr][stype]['table'],
                                                          heading_lookup,line_item_lookup,stype)
  return master_statements


In [16]:
def conv_date_cols(cols):
    
  mnth_lookup = {month: index for index, month in enumerate(calendar.month_abbr) if month}

  new_cols = []

  for c in cols:
    
    x = re.search(r"^[A-Z][a-z][a-z]\. \d{2}, \d{4}", c)
    
    if x:
      mnth = mnth_lookup[c.split(".")[0]]
      yr = c[-4:]
      t = yr + '-' + str(mnth)
      new_cols = new_cols + [t]
    else:
      new_cols = new_cols + [c] 

  return new_cols

In [17]:
def extract_statements(stype, master_summary_data):
    
  ticker_list = list(master_summary_data.keys())

  master_statements = {}

  for ticker in master_summary_data:
    
    print(ticker)
    statement = master_summary_data[ticker]
    keys = list(statement.keys())
    
    if len(keys) > 0:
                                            
      keys = sorted(keys,reverse=True)
      df = statement[keys[0]].copy()
      df = df[~df.index.duplicated(keep='first')]

      for i in range(1,len(keys)):
        
        print("Building Table for",ticker,i)
        
        df1_cols = list(set(statement[keys[i]].columns)- set(df.columns))
        df1 = statement[keys[i]][df1_cols].copy()
        df1 = df1[~df1.index.duplicated(keep='first')] 
        df = df.merge(df1, right_index=True, left_index=True, how='left', suffixes=['_x', '_y'])  
    
      # We could end up with an empty dataframe if there are no matches. So skip if that is the case.
     
      if len(df) > 0:
        # Amend date format of column names
        
        cols = list(df.columns)
        new_cols = conv_date_cols(cols)
        df.columns = new_cols

        # Sort in reverse date order, place type and line_type at end
    
        new_cols.remove('type')
        new_cols.remove('line_type')
        cols = sorted(new_cols,reverse=True) + ['type','line_type']
        df = df[cols]
        
      master_statements[ticker] = df

  return master_statements

In [18]:
def find_duplicates(df1):

  df_temp = df1.copy()
  df_temp = df_temp.reset_index(names=['line_item'])
  df_dup = df_temp[df_temp.duplicated(subset=['line_item','line_type'])]

  return df_dup

In [19]:
df1 = clean_master_data['ADI']['11']['income']['table']
df1

Unnamed: 0_level_0,"Oct. 29, 2011","Oct. 30, 2010","Oct. 31, 2009"
line_item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Revenue,,,
Revenue,2993320.0,2761503.0,2014908.0
Costs And Expenses,,,
Cost Of Sales,1006779.0,962081.0,896271.0
Gross Margin,1986541.0,1799422.0,1118637.0
Operating Expenses:,,,
Research And Development,505570.0,492305.0,446980.0
SGA,406707.0,390560.0,333184.0
Special Charges,2239.0,16483.0,53656.0
Total Operating Expenses,914516.0,899348.0,833820.0


In [20]:
df1 = find_line_items(df1, heading_lookup, line_item_lookup, 'income')
df1

Unnamed: 0,"Oct. 29, 2011","Oct. 30, 2010","Oct. 31, 2009",type,line_type
Revenue,,,,R,H
Costs And Expenses,,,,OE,H
Cost Of Sales,1006779.0,962081.0,896271.0,CR,L
Gross Margin,1986541.0,1799422.0,1118637.0,GM,L
Research And Development,505570.0,492305.0,446980.0,RD,L
SGA,406707.0,390560.0,333184.0,SGA,L
Total Operating Expenses,914516.0,899348.0,833820.0,TOE,L
Operating Income From Continuing Operations,1072025.0,900074.0,284817.0,ICO,L
Income From Continuing Operations Before Income Taxes,1061447.0,901665.0,297444.0,ICO,L
"Income From Continuing Operations, Net Of Tax",860894.0,711225.0,247408.0,ICO,L


In [80]:
master_data['NFG']['22']['income']

Unnamed: 0_level_0,Consolidated Statements Of Income And Earnings Reinvested In The Business - USD ($) $ in Thousands,12 Months Ended,12 Months Ended,12 Months Ended
Unnamed: 0_level_1,Consolidated Statements Of Income And Earnings Reinvested In The Business - USD ($) $ in Thousands,"Sep. 30, 2022","Sep. 30, 2021","Sep. 30, 2020"
0,INCOME,,,
1,Operating Revenues,"$ 2,186,046","$ 1,742,659","$ 1,546,291"
2,Operating Expenses:,,,
3,Purchased Gas,392093,171827,233890
4,"Property, Franchise and Other Taxes",101182,94713,88400
5,"Depreciation, Depletion and Amortization",369790,335303,306158
6,Impairment of Oil and Gas Producing Properties,0,76152,449438
7,Total Operating Expenses,1384266,1153801,1516433
8,Gain on Sale of Assets,"(12,736)","(51,066)",0
9,Operating Income,814516,639924,29858


In [18]:
# Create lookup dictionaries

swap_text_dict = {}
swap_text_dict['income'] = {}
swap_text_dict['cash'] = {}
swap_text_dict['balance'] = {}
              
swap_text_dict['income']['H'] = {}
swap_text_dict['income']['L'] = {}

swap_text_dict['cash']['H'] = {}
swap_text_dict['cash']['L'] = {}

swap_text_dict['balance']['H'] = {}
swap_text_dict['balance']['L'] = {}

## Income Statement

In [19]:
income_summary = extract_summary('income')

AA
AAL
AAP
AAPL
ABBV
ABNB
ABT
ACGL
ACHC
ACI
ACM
ACN
ADBE
ADI
ADM
ADP
ADSK
ADT
AEE
AEP
AES
AFG
AFL
AFRM
AGCO
AGL
AGNC
AGO
AGR
AIG
AIZ
AJG
AKAM
AL
ALB
ALGM
ALGN
ALK
ALL
ALLE
ALLY
ALNY
ALSN
AM
AMAT
AMC
AMCR
AMD
AME
AMED
AMG
AMGN
AMP
AMZN
AN
ANET
ANSS
AON
AOS
APA
APD
APH
APO
APP
APTV
AR
ARES
ARMK
ARW
ASH
ATO
ATR
ATUS
ATVI
AVGO
AVT
AVTR
AWI
AWK
AXON
AXS
AXTA
AYI
AYX
AZEK
AZO
AZTA
BA
BAC
BAH
BALL
BAX
BBWI
BBY
BC
BDX
BEN
BERY
BFAM
BG
BHF
BIIB
BILL
BIO
BJ
BK
BKNG
BLD
BLDR
BLK
BMRN
BMY
BOKF
BR
BRKR
BRO
BSX
BSY
BURL
BWA
BWXT
BX
BYD
C
CABO
CACC
CAH
CAR
CARR
CAT
CB
CBOE
CBSH
CC
CCCS
CCK
CCL
CDAY
CDNS
CDW
CE
CEG
CERT
CF
CFG
CFLT
CFR
CG
CGNX
CHD
CHDN
CHE
CHH
CHPT
CHRW
CHTR
CIEN
CINF
CL
CLH
CLVT
CLX
CMA
CMCSA
CME
CMG
CMI
CMS
CNA
CNM
CNP
CNXC
COF
COIN
COLB
COLM
COO
COST
COTY
CPB
CPRI
CPRT
CR
CRI
CRL
CRUS
CRWD
CSCO
CSL
CTLT
CTRA
CTSH
CTVA
CVNA
CVX
CW
CZR
D
DAL
DAR
DASH
DBX
DCI
DD
DDOG
DECK
DFS
DG
DGX
DHI
DHR
DINO
DIS
DISH
DKNG
DKS
DLB
DLTR
DNA
DOCS
DOCU
DOV
DOW
DPZ
DRVN
DT
DTE
DTM
DUK
DV
DVA
DVN
DXC
DX

In [20]:
income_summary['NFG']['22']

Unnamed: 0,"Dec. 31, 2019","Dec. 31, 2018","Dec. 31, 2017",type,line_type
Revenues,,,,R,H
Total Revenue,4891.0,4494.0,4268.0,TR,L
Total Losses And Expenses,4369.0,4212.0,3975.0,TOE,L
Income From Continuing Operations Before Income Taxes,522.0,282.0,293.0,ICO,L
Income From Continuing Operations,429.0,239.0,216.0,ICO,L
Net Income,425.0,391.0,186.0,NI,L


In [21]:

dups_list = []
tickers_with_dups = {}

ticker_list = list(income_summary.keys())

for ticker in ticker_list:
  
  print(ticker)
    
  for yr in income_summary[ticker]:
    
    df_dup = find_duplicates(income_summary[ticker][yr])
    
    if len(df_dup) > 0:
      
       dups = {}
       dups['yr'] = yr
       dups['df_orig'] = income_summary[ticker][yr]
       dups['df_dup']  = df_dup
        
       tickers_with_dups[ticker] = dups


AA
AAL
AAP
AAPL
ABBV
ABNB
ABT
ACGL
ACHC
ACI
ACM
ACN
ADBE
ADI
ADM
ADP
ADSK
ADT
AEE
AEP
AES
AFG
AFL
AFRM
AGCO
AGL
AGNC
AGO
AGR
AIG
AIZ
AJG
AKAM
AL
ALB
ALGM
ALGN
ALK
ALL
ALLE
ALLY
ALNY
ALSN
AM
AMAT
AMC
AMCR
AMD
AME
AMED
AMG
AMGN
AMP
AMZN
AN
ANET
ANSS
AON
AOS
APA
APD
APH
APO
APP
APTV
AR
ARES
ARMK
ARW
ASH
ATO
ATR
ATUS
ATVI
AVGO
AVT
AVTR
AWI
AWK
AXON
AXS
AXTA
AYI
AYX
AZEK
AZO
AZTA
BA
BAC
BAH
BALL
BAX
BBWI
BBY
BC
BDX
BEN
BERY
BFAM
BG
BHF
BIIB
BILL
BIO
BJ
BK
BKNG
BLD
BLDR
BLK
BMRN
BMY
BOKF
BR
BRKR
BRO
BSX
BSY
BURL
BWA
BWXT
BX
BYD
C
CABO
CACC
CAH
CAR
CARR
CAT
CB
CBOE
CBSH
CC
CCCS
CCK
CCL
CDAY
CDNS
CDW
CE
CEG
CERT
CF
CFG
CFLT
CFR
CG
CGNX
CHD
CHDN
CHE
CHH
CHPT
CHRW
CHTR
CIEN
CINF
CL
CLH
CLVT
CLX
CMA
CMCSA
CME
CMG
CMI
CMS
CNA
CNM
CNP
CNXC
COF
COIN
COLB
COLM
COO
COST
COTY
CPB
CPRI
CPRT
CR
CRI
CRL
CRUS
CRWD
CSCO
CSL
CTLT
CTRA
CTSH
CTVA
CVNA
CVX
CW
CZR
D
DAL
DAR
DASH
DBX
DCI
DD
DDOG
DECK
DFS
DG
DGX
DHI
DHR
DINO
DIS
DISH
DKNG
DKS
DLB
DLTR
DNA
DOCS
DOCU
DOV
DOW
DPZ
DRVN
DT
DTE
DTM
DUK
DV
DVA
DVN
DXC
DX

In [22]:
tickers_with_dups.keys()

dict_keys(['AEE', 'ALLY', 'AMC', 'ARES', 'ATO', 'ATVI', 'BALL', 'BKNG', 'CE', 'CLH', 'CLX', 'CVNA', 'CVX', 'DECK', 'DISH', 'DLTR', 'DTE', 'DVA', 'ED', 'EIX', 'EPAM', 'EQT', 'EVA', 'EXEL', 'FCNCA', 'FITB', 'FOUR', 'FTNT', 'GPS', 'HAYW', 'HBI', 'HSY', 'HWM', 'IART', 'IEX', 'IQV', 'ITT', 'KHC', 'L', 'LII', 'LKQ', 'LSCC', 'LUV', 'MASI', 'MCHP', 'MRCY', 'MRTX', 'MS', 'MSA', 'NI', 'NOC', 'NVT', 'OWL', 'PARA', 'PCG', 'PENN', 'PKI', 'PNR', 'PSX', 'QS', 'RARE', 'RMD', 'SEB', 'SGEN', 'SMG', 'SPB', 'SSNC', 'STZ', 'TFX', 'TNDM', 'TRU', 'VRTX', 'VST', 'WRK', 'X', 'Z'])

In [23]:
len(tickers_with_dups)

76

In [24]:
# Remove companies with duplicates

for ticker in tickers_with_dups.keys():
   print(ticker)
   clean_master_data.pop(ticker)

AEE
ALLY
AMC
ARES
ATO
ATVI
BALL
BKNG
CE
CLH
CLX
CVNA
CVX
DECK
DISH
DLTR
DTE
DVA
ED
EIX
EPAM
EQT
EVA
EXEL
FCNCA
FITB
FOUR
FTNT
GPS
HAYW
HBI
HSY
HWM
IART
IEX
IQV
ITT
KHC
L
LII
LKQ
LSCC
LUV
MASI
MCHP
MRCY
MRTX
MS
MSA
NI
NOC
NVT
OWL
PARA
PCG
PENN
PKI
PNR
PSX
QS
RARE
RMD
SEB
SGEN
SMG
SPB
SSNC
STZ
TFX
TNDM
TRU
VRTX
VST
WRK
X
Z


In [25]:
# Repeat the extraction. This time we shouldn't have any duplicates.

income_summary = extract_summary('income')

AA
AAL
AAP
AAPL
ABBV
ABNB
ABT
ACGL
ACHC
ACI
ACM
ACN
ADBE
ADI
ADM
ADP
ADSK
ADT
AEP
AES
AFG
AFL
AFRM
AGCO
AGL
AGNC
AGO
AGR
AIG
AIZ
AJG
AKAM
AL
ALB
ALGM
ALGN
ALK
ALL
ALLE
ALNY
ALSN
AM
AMAT
AMCR
AMD
AME
AMED
AMG
AMGN
AMP
AMZN
AN
ANET
ANSS
AON
AOS
APA
APD
APH
APO
APP
APTV
AR
ARMK
ARW
ASH
ATR
ATUS
AVGO
AVT
AVTR
AWI
AWK
AXON
AXS
AXTA
AYI
AYX
AZEK
AZO
AZTA
BA
BAC
BAH
BAX
BBWI
BBY
BC
BDX
BEN
BERY
BFAM
BG
BHF
BIIB
BILL
BIO
BJ
BK
BLD
BLDR
BLK
BMRN
BMY
BOKF
BR
BRKR
BRO
BSX
BSY
BURL
BWA
BWXT
BX
BYD
C
CABO
CACC
CAH
CAR
CARR
CAT
CB
CBOE
CBSH
CC
CCCS
CCK
CCL
CDAY
CDNS
CDW
CEG
CERT
CF
CFG
CFLT
CFR
CG
CGNX
CHD
CHDN
CHE
CHH
CHPT
CHRW
CHTR
CIEN
CINF
CL
CLVT
CMA
CMCSA
CME
CMG
CMI
CMS
CNA
CNM
CNP
CNXC
COF
COIN
COLB
COLM
COO
COST
COTY
CPB
CPRI
CPRT
CR
CRI
CRL
CRUS
CRWD
CSCO
CSL
CTLT
CTRA
CTSH
CTVA
CW
CZR
D
DAL
DAR
DASH
DBX
DCI
DD
DDOG
DFS
DG
DGX
DHI
DHR
DINO
DIS
DKNG
DKS
DLB
DNA
DOCS
DOCU
DOV
DOW
DPZ
DRVN
DT
DTM
DUK
DV
DVN
DXC
DXCM
EA
EBAY
ECL
EEFT
EFX
EHAB
EHC
EL
ELAN
EMN
EMR
ENOV
ENPH
ENTG
EOG
EQH
ERIE
ES


In [90]:
random.randint(0,)

In [27]:
key_list = list(income_summary.keys())
len(key_list)

757

In [100]:
key_list = list(income_summary.keys())
i = random.randint(0,len(key_list))
ticker=key_list[i]
print(ticker)
income_summary[ticker]['22']

CABO


Unnamed: 0,"Dec. 31, 2021","Dec. 31, 2020","Dec. 31, 2019",type,line_type
Revenues,1605836.0,1325229.0,1167997.0,R,L
Costs And Expenses:,,,,OE,H
SGA,347058.0,255163.0,245120.0,SGA,L
Total Costs And Expenses,1149264.0,855879.0,857546.0,TOE,L
Net Income,291824.0,304391.0,178582.0,NI,L


In [101]:
print(ticker)
clean_master_data[ticker]['22']['income']['table']

CABO


Unnamed: 0_level_0,"Dec. 31, 2021","Dec. 31, 2020","Dec. 31, 2019"
line_item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Revenues,1605836.0,1325229.0,1167997.0
Costs And Expenses:,,,
Operating Excluding Depreciation And Amortization,455352.0,418704.0,388552.0
SGA,347058.0,255163.0,245120.0
Depreciation And Amortization,339025.0,265658.0,216687.0
"-Gain Loss On Asset Sales And Disposals, Net",7829.0,-1072.0,7187.0
Gain On Sale Of Business,0.0,-82574.0,0.0
Total Costs And Expenses,1149264.0,855879.0,857546.0
Income From Operations,456572.0,469350.0,310451.0
Interest Expense,-113449.0,-73607.0,-71729.0


In [72]:
tickers_with_dups['LUV']

{'yr': '19',
 'df_orig':                           Dec. 31, 2018  Dec. 31, 2017  Dec. 31, 2016 type  \
 Operating Revenues:                 NaN            NaN            NaN    R   
 Operating Revenue                 21965          21146          20289    R   
 Operating Expenses:                 NaN            NaN            NaN   OE   
 Total Operating Expenses          18759          17739          16767  TOE   
 Operating Income                   3206           3407           3522   OI   
 Net Income                         2465           3357           2183   NI   
 Operating Revenues:                 NaN            NaN            NaN    R   
 Operating Revenue                 20455          19763          19068    R   
 Operating Revenues:                 NaN            NaN            NaN    R   
 Operating Revenue                   175            173            171    R   
 Operating Revenues:                 NaN            NaN            NaN    R   
 Operating Revenue          

In [73]:
income_summary['LUV']['19']

Unnamed: 0,"Dec. 31, 2018","Dec. 31, 2017","Dec. 31, 2016",type,line_type
Operating Revenues:,,,,R,H
Operating Revenue,21965.0,21146.0,20289.0,R,L
Operating Expenses:,,,,OE,H
Total Operating Expenses,18759.0,17739.0,16767.0,TOE,L
Operating Income,3206.0,3407.0,3522.0,OI,L
Net Income,2465.0,3357.0,2183.0,NI,L
Operating Revenues:,,,,R,H
Operating Revenue,20455.0,19763.0,19068.0,R,L
Operating Revenues:,,,,R,H
Operating Revenue,175.0,173.0,171.0,R,L


In [74]:
clean_master_data['LUV']['19']['income']['table']

Unnamed: 0_level_0,"Dec. 31, 2018","Dec. 31, 2017","Dec. 31, 2016"
line_item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Operating Revenues:,,,
Operating Revenue,21965.0,21146.0,20289.0
Operating Expenses:,,,
"Salaries, Wages, And Benefits",7649.0,7305.0,6786.0
Fuel And Oil,4616.0,4076.0,3801.0
Maintenance Materials And Repairs,1107.0,1001.0,1045.0
Landing Fees And Airport Rentals,1334.0,1292.0,1211.0
Depreciation And Amortization,1201.0,1218.0,1221.0
Other Operating Expenses,2852.0,2847.0,2703.0
Total Operating Expenses,18759.0,17739.0,16767.0


In [112]:
master_data['ADM']['20']['income']

Unnamed: 0_level_0,"Consolidated Statements Of Earnings - USD ($) shares in Millions, $ in Millions",12 Months Ended,12 Months Ended,12 Months Ended
Unnamed: 0_level_1,"Consolidated Statements Of Earnings - USD ($) shares in Millions, $ in Millions","Dec. 31, 2019","Dec. 31, 2018","Dec. 31, 2017"
0,Income Statement [Abstract],,,
1,Revenues,"$ 64,656","$ 64,341","$ 60,828"
2,Cost of products sold,60509,60160,57310
3,Gross Profit,4147,4181,3518
4,"Selling, general and administrative expenses",2493,2165,1978
5,"Asset impairment, exit, and restructuring costs",303,171,173
6,Interest expense,402,364,330
7,Equity in earnings of unconsolidated affiliates,(454),(518),(456)
8,Interest income,(192),(162),(106)
9,Other (income) expense - net,7,101,(10)


In [None]:
ticker='TGT'
yr='20'
df_dup = find_duplicates(income_summary[ticker][yr])
df_dup

In [None]:
idx=65
print(dups_list[idx]['company'])
print(dups_list[idx]['yr'])
dups_list[idx]['df_dup']

In [None]:
dups_list[idx]['df_orig']

In [None]:
len(dups_list)

In [None]:
#df_temp = income_summary[income_summary['AAL']['21'].duplicated()]

df_temp = income_summary['MSFT']['21']
df_temp = df_temp.reset_index(names=['line_item'])
X = df_temp[df_temp.duplicated(subset=['line_item','line_type'])]
X

In [None]:
master_statements = extract_statements('income',income_summary)

In [None]:
master_statements['GOOG']

In [None]:
df_headings = df[df['line_type'] == 'H']
df_headings[['line_item','line_type']].value_counts()

In [None]:
swap_text_dict['income']['H'] = { 'Net Sales:' : 'Revenue:',
                                  'Revenues': 'Revenue:',
                                  'Revenues:': 'Revenue:',
                                  'Revenue': 'Revenue:',
                                  'Net Revenue:': 'Revenue:',
                                  'Operating Revenue:': 'Revenue:',
                                  'Sales And Revenues:': 'Revenue:',
                                  'Sales And Other Operating Revenues:': 'Revenue:',
                                  'Net Sales And Revenue': 'Revenue:',
                                  'Costs And Operating Expenses': 'Operating Expenses:',
                                  'Cost And Expenses:': 'Operating Expenses:',
                                  'Cost And Expenses': 'Operating Expenses:',
                                  'Cost Of Sales And Operating Expenses:': 'Operating Expenses:',
                                  'Cost Of Sales And Service Revenues': 'Cost of Sales'                 
                                }

In [None]:
df['line_item'] = df['line_item'].apply(lambda x: swap_text('H',x,'income'))


In [None]:
swap_text('H','Net Sales:', 'income')

In [None]:
df_line_items = df[df['line_type'] == 'L']
df_line_items[['line_item','type']].value_counts()

In [None]:
swap_text_dict['income']['L'] = 
         {'Selling, General And Administrative Expense' : 'Selling, General And Admin',
          'Selling, General, And Administrative Expenses': 'Selling, General And Admin',
          'Selling, General & Administrative Expenses': 'Selling, General And Admin',
          'Selling, General, And Administrative Expense': 'Selling, General And Admin',
          'Selling, General, And Administrative': 'Selling, General And Admin',
          'Selling, General and Administrative': 'Selling, General And Admin',
          'Operating, Selling, General And Administrative Expenses': 'Selling, General And Admin',
          'General And Administrative Expenses': 'General And Admin',
          'Marketing, General And Administrative': 'Marketing, General And Admin',
          'Marketing, General And Administrative Expenses': 'Marketing, General And Admin',
          'General, Administrative And Other': 'General And Admin',
          'Selling And Administrative': 'Selling, General And Admin',
          'General, Administrative And Other Expenses': 'General And Admin',
          'Total General And Administrative Expenses': 'Total General And Admin',
          'Corporate General And Administrative': 'General And Admin',
          'Selling, General Administrative, And Other Expenses': 'Selling, General And Admin',
          'General And Administrative Expenses -Exclusive Of Depreciation And Amortization': 'General And Admin',
          'Operating, General And Administrative': 'General And Admin'                             
          }


In [None]:
df['line_item'] = df['line_item'].apply(lambda x: swap_text('L',x,'income'))

In [None]:
df_headings

In [None]:
df_headings = df[df['line_type'] == 'H']
df_headings[['line_item','line_type']].value_counts()

In [None]:
df_line_items = df[df['line_type'] == 'L']
df_line_items[['line_item','line_type']].value_counts()

In [None]:
df = master_std_statements['income']['CSCO']
df.reset_index(names=['line_item'])

## Balance Statement

In [None]:
balance_summary = extract_summary('balance')

In [None]:
balance_summary['T']['17']

In [None]:
cols = list(df.columns)

new_cols = []

for c in cols:
    if (c != 'type') & (c != 'line_type'):
        new_cols = new_cols + [c.split("-")[0]]
        
new_cols

In [None]:
master_statements = extract_statements('balance',balance_summary)

In [None]:
master_statements['CSCO']

## Cash Statement

In [None]:
cash_summary = extract_summary('cash')

In [None]:
master_statements = extract_statements('cash',cash_summary)

In [None]:
master_statements['F']

In [None]:
df_headings = df[df['line_type'] == 'H']
df_headings[['line_item','type']].value_counts()

In [None]:
cash_summary['NVDA']['22']

In [None]:
len(master_statements)

In [84]:
# Save to file

with open(PROJ_ROOT_PATH + '/pickle/clean_master_data_stage5.pkl', 'wb') as f:
  pickle.dump(clean_master_data, f)

In [85]:
# Save to file

with open(PROJ_ROOT_PATH + '/pickle/income_summary_stage5.pkl', 'wb') as f:
  pickle.dump(income_summary, f)