In [1]:
from os.path import exists
import numpy as np
import pandas as pd
import requests as req
import time
import os
import pickle

import re

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
pd.options.display.float_format = '{:.2f}'.format

In [3]:
DATA_ROOT_PATH="/mnt/data/projects/MD1/data/R1000/reports/"
PROJ_ROOT_PATH="/home/priyesh/projects/MD1"

In [4]:
# Read from Pickle file

filepath = os.path.join(PROJ_ROOT_PATH,'pickle','clean_master_data_stage2.pkl')
clean_master_data = pd.read_pickle(filepath)

In [5]:
def get_line_items(t):
 
  headings = []
  line_items = []
    
  for index, row in t.iterrows():
    
    if row.replace('', np.nan).isna().all():
      # Blank row, so assume heading
    
      headings.append(index)
    else:    
      line_items.append(index)

  return headings, line_items

In [6]:
def get_table_labels(stype):

  master_headings = []
  master_line_items = []

  ticker_list = list(clean_master_data.keys())

  for ticker in ticker_list:
    
    print(ticker)
    
    for yr in clean_master_data[ticker]:
      
      # Allow for the likelihood that statement might be missing for given ticker and yr
        
      if stype in clean_master_data[ticker][yr]:
        try:
          headings, line_items = get_line_items(clean_master_data[ticker][yr][stype]['table'])
    
          master_headings = master_headings + headings
          master_line_items = master_line_items + line_items
        except:
          print("Error : {} {}".format(ticker, yr))
          exit() 

  # Remove duplicates from headings and line items

  unique_list = set(master_headings)
  headings = list(unique_list)

  unique_list = set(master_line_items)
  line_items = list(unique_list)

  return {'headings' : headings,
          'line_items' : line_items}

In [7]:
table_labels = get_table_labels('balance')

AA
AAL
AAP
AAPL
ABBV
ABNB
ABT
ACGL
ACHC
ACI
ACM
ACN
ADBE
ADI
ADM
ADP
ADSK
ADT
AEE
AEP
AES
AFG
AFL
AFRM
AGCO
AGL
AGNC
AGO
AGR
AIG
AIZ
AJG
AKAM
AL
ALB
ALGM
ALGN
ALK
ALL
ALLE
ALLY
ALNY
ALSN
AM
AMAT
AMC
AMCR
AMD
AME
AMED
AMG
AMGN
AMP
AMZN
AN
ANET
ANSS
AON
AOS
APA
APD
APH
APO
APP
APTV
AR
ARES
ARMK
ARW
ASH
ATO
ATR
ATUS
ATVI
AVGO
AVT
AVTR
AWI
AWK
AXON
AXS
AXTA
AYI
AYX
AZEK
AZO
AZTA
BA
BAC
BAH
BALL
BAX
BBWI
BBY
BC
BDX
BEN
BERY
BFAM
BG
BHF
BIIB
BILL
BIO
BJ
BK
BKNG
BLD
BLDR
BLK
BMRN
BMY
BOKF
BR
BRKR
BRO
BSX
BSY
BURL
BWA
BWXT
BX
BYD
C
CABO
CACC
CAH
CAR
CARR
CAT
CB
CBOE
CBSH
CC
CCCS
CCK
CCL
CDAY
CDNS
CDW
CE
CEG
CERT
CF
CFG
CFLT
CFR
CG
CGNX
CHD
CHDN
CHE
CHH
CHPT
CHRW
CHTR
CIEN
CINF
CL
CLH
CLVT
CLX
CMA
CMCSA
CME
CMG
CMI
CMS
CNA
CNM
CNP
CNXC
COF
COIN
COLB
COLM
COO
COST
COTY
CPB
CPRI
CPRT
CR
CRI
CRL
CRUS
CRWD
CSCO
CSL
CTLT
CTRA
CTSH
CTVA
CVNA
CVX
CW
CZR
D
DAL
DAR
DASH
DBX
DCI
DD
DDOG
DECK
DFS
DG
DGX
DHI
DHR
DINO
DIS
DISH
DKNG
DKS
DLB
DLTR
DNA
DOCS
DOCU
DOV
DOW
DPZ
DRVN
DT
DTE
DTM
DUK
DV
DVA
DVN
DXC
DX

In [8]:
clean_master_data['WMT']['15']['balance']['table']

Unnamed: 0_level_0,"Jan. 31, 2015","Jan. 31, 2014"
line_item,Unnamed: 1_level_1,Unnamed: 2_level_1
Current Assets:,,
Cash And Cash Equivalents,9135.0,7281.0
"Receivables, Net",6778.0,6677.0
Inventories,45141.0,44858.0
Prepaid Expenses And Other,2224.0,1909.0
Current Assets Of Discontinued Operations,0.0,460.0
Total Current Assets,63278.0,61185.0
Property And Equipment:,,
Property And Equipment,177395.0,173089.0
Less Accumulated Depreciation,-63115.0,-57725.0


## Map Headings

In [9]:
## Map Headings

#CA   Current Assets
#CL   Current Liabilities
#SE   Shareholder Equity

In [10]:
df = pd.DataFrame(table_labels['headings'],columns=['heading'])
df['type'] = 'X'
df = df.set_index('heading')
df

Unnamed: 0_level_0,type
heading,Unnamed: 1_level_1
Operating Lease Liability Current,X
Union Electric Company,X
Liabilities & Stockholders Equity,X
Class B [Member],X
"Accumulated Other Comprehensive Loss, Net Of Tax Benefits",X
...,...
Common Stock Voting [Member],X
"Common Stock .001 Par Value; Authorized 1,600,000 Shares; Issued 260,624 And 255,672 Shares, Respectively, And Outstanding 76,829 And 72,595 Shares, Respectively",X
Convertible Preferred Stock [Member],X
Non-Agency Mortgage-Backed Securities,X


For each heading of interest, create a search criteria which isolates strings of interests. Verify and then assign
code these rows.

In [11]:
# Current Assets

filter = df.index.str.contains(r"Asset",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"long|other|non|deferred|discontinued|Intangible",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"Construction|Estate|Credit|Collateral|Lease",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"Derivative|Beneficiary|Sale|Programs",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"Held|Banking|Finance|Management|Insurance",case=False, regex=True, na=False) & \
    ~ df.index.str.contains(r"Trading|Backed|Firm|land|Total|regulatory|investment",case=False, regex=True, na=False)

df[filter]

Unnamed: 0_level_0,type
heading,Unnamed: 1_level_1
Assets:,X
Current Assets:,X
Current Assets: [Abstract],X
Assets [Abstract],X
Current Assets [Abstract],X
"Assets, Current [Abstract]",X
Current Assets,X
Assets,X


In [12]:
df[filter] = 'CA'

In [13]:
# Current Liabilities

filter = df.index.str.contains(r"Liabilities",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"long|other|non|deferred|discontinued|Intangible",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"Construction|Estate|Credit|Collateral|Lease",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"Derivative|Beneficiary|Deposit|Programs",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"Commitments|Interest|Finance|Equity|Capital",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"Fair|Financial|Compromise|Firm|land|Total|Policy|investment",case=False, regex=True, na=False)

df[filter]

Unnamed: 0_level_0,type
heading,Unnamed: 1_level_1
Accrued Liabilities,X
Current Liabilities,X
Current Liabilities: [Abstract],X
Current And Accrued Liabilities,X
Regulatory Liabilities:,X
Liabilities:,X
"Liabilities, Current [Abstract]",X
Liabilities And Stockholders Deficit,X
Accrued Liabilities:,X
Current Liabilities:,X


In [14]:
df[filter] = 'CL'

In [15]:
# Shareholder Equity

filter = df.index.str.contains(r"Equity",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"long|other|non|deferred|discontinued|Intangible",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"Construction|Estate|Credit|Collateral|Lease",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"Derivative|Beneficiary|Deposit|Programs",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"Commitments|Securities|Common|Capital",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"Fair|Financial|Compromise|Note|land|Total|Policy|investment",case=False, regex=True, na=False)

df[filter]

Unnamed: 0_level_0,type
heading,Unnamed: 1_level_1
Liabilities & Stockholders Equity,X
Equity Attributable To Invesco Ltd.:,X
Liabilities And Stockholders Deficit Equity,X
Liabilities And Stockholdersʼ Equity,X
Equity Deficit,X
Liabilities And Equity,X
Equity/-Deficit,X
Liabilities And Ralcorp Equity,X
Stockholders Equity Attributable To Parent [Abstract],X
"Liabilities, Mezzanine Equity And Equity",X


In [16]:
df[filter] = 'SE'

In [17]:
filter = df.index.str.contains(r"Shareholders Equity",case=False, regex=True, na=False) * \
            ~df.index.str.contains(r"Total|Liabilities",case=False, regex=True, na=False)

df[filter]

Unnamed: 0_level_0,type
heading,Unnamed: 1_level_1
Shareholders Equity Deficit:,SE
Shareholders Equity [Abstract],SE
Common Shareholders Equity [Abstract],X
Shareholders Equity Deficit,SE
Shareholders Equity:,SE
Common Shareholders Equity,X
Shareholders Equity,SE
Shareholders Equity / Members Deficit:,SE
Company Shareholders Equity,SE
Shareholders Equity Attributable To Snap-On Incorporated:,SE


In [18]:
df[filter] = 'SE'

In [19]:
filter = df.index.str.contains(r"Member\]$",case=True, regex=True, na=False)
df[filter]

Unnamed: 0_level_0,type
heading,Unnamed: 1_level_1
Class B [Member],X
Investor [Member] | Contributed Shares [Member],X
Covered Loans [Member],X
"Kinder Morgan, Inc. [Member]",X
Cecony [Member] | Gas Transmission [Member],X
"Variable Interest Entity, Primary Beneficiary [Member] | Credit Card Securitization Trusts [Member]",X
Removal Costs [Member],X
Delmarva Power & Light Company [Member] | Common Stock [Member],X
Loyalty Program [Member],X
"Variable Interest Entity, Primary Beneficiary [Member] | Exelon Generation Co L L C [Member]",X


In [20]:
df[filter] = 'EXCL'

In [21]:
df_headings = df[df['type'] != 'X']

## Map Line Items

In [22]:
# Map Line Items

#TA   Total Assets
#TCA  Total Current Assets
#TNCA Total non current assets
#TL   Total Liabilities
#TCL  Total Current Liabilities
#TNCL Total Non Current Liabilities
#TLSE Total Liabilities and Shareholder Equity
#TSE  Total Shareholder Equity
#IA   Intangible assets
#TC   Total Cash
#NPPE Net property, plant and equipment
#D    Debt
#B    Borrowing

In [23]:
df = pd.DataFrame(table_labels['line_items'],columns=['line_items'])
df['type'] = 'X'
df = df.set_index('line_items')



In [24]:
# Total Assets

filter = df.index.str.contains(r"Total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"assets",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"banking|finance|intangible|interest",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"long|management|restricted|identifiable",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"program|insurance|consumer|trading|estate",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"-|services",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer|current",case=False, regex=True, na=False)

df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Total Assets A,X
"Assets, Total",X
Total Assets,X


In [25]:
df[filter] = 'TA'

In [26]:
# Total Current Assets

filter = df.index.str.contains(r"Total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"current",case=False, regex=True, na=False) & \
            df.index.str.contains(r"assets",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Current Assets Total,X
Total Current Assets,X
"Assets, Current, Total",X


In [27]:
df[filter] = 'TCA'

In [28]:
#TNCA Total non current assets

filter = df.index.str.contains(r"Total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"non*current",case=False, regex=True, na=False) & \
            df.index.str.contains(r"assets",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|vie|deferred|tax|income",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Total Noncurrent Assets,X


In [29]:
df[filter] = 'TNCA'

In [30]:
#TL   Total Liabilities

filter = df.index.str.contains(r"Total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"liabilities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"-|services|controllling|discontinued",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"long|current|sale|equity|obligations",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"investment|deposit|deficit|capital",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|subject|vehicle|deferred|tax",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"deficiency|Vie|insurance|accruals",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"banking|policy|accrued|consumer|lease",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"-|services|controllling|discontinued",case=False, regex=True, na=False)

df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Total Liabilities,X
Total Restricted Liabilities,X


In [31]:
df[filter] = 'TL'

In [32]:
#TCL  Total Current Liabilities

filter = df.index.str.contains(r"Total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"current",case=False, regex=True, na=False) & \
            df.index.str.contains(r"liabilities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Total Current Liabilities,X
Total Current And Accrued Liabilities,X


In [33]:
df[filter] = 'TCL'

In [34]:
#TNCL Total Non Current Liabilitie

filter = df.index.str.contains(r"Total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"non.*current",case=False, regex=True, na=False) & \
            df.index.str.contains(r"liabilities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|vie|deferred|tax|income",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)

df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Total Non-Current Liabilities,X


In [35]:
df[filter] = 'TNCL'

In [36]:
#TLSE Total Liabilities and Shareholder Equity

filter = df.index.str.contains(r"Total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"equity",case=False, regex=True, na=False) & \
            df.index.str.contains(r"liabilities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
"Total Liabilities, Redeemable Preferred Stock, And Stockholders Deficit Equity",X
"Total Liabilities, Redeemable Convertible Preferred Stock, Stockholders Equity, And Invested Equity",X
Total Liabilities And Equity Deficit,X
Total Liabilities And Total Equity,X
"Total Liabilities, Convertible Preferred Stock, And Shareholders Equity Deficit",X
"Total Liabilities And Shareholders And Parents Equity, Respectively",X
Total Liabilities And Owners Equity,X
Total Liabilities And Equity,X
"Total Liabilities, Redeemable Preferred Stock And Stockholders Equity",X
"Total Liabilities, Contingently Redeemable Common Stock And Stockholders Equity Deficit",X


In [37]:
df[filter] = 'TLSE'

In [38]:
#TSE  Total Shareholder Equity

pd.set_option('display.max_rows', 900)

filter = df.index.str.contains(r"Total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"equity",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
"Total Equity Attributable To Raymond James Financial, Inc.",X
Total Equity Attributable To Vf Corporation,X
Total Stockholders Equity Attributable To Amphenol Corporation,X
Total Owners Equity,X
Total Ipg Photonics Corporation Equity,X
Total Stockholders Equity Attributable To Blue Owl Capital Inc.,X
Total Stockholders Equity Attributable To Common Stockholders,X
Total Dte Energy Company Equity,X
"Total Stockholders Equity Attributable To Principal Financial Group, Inc.",X
Total Members Equity,X


In [39]:
df[filter] = 'TSE'

In [40]:
#TSE  Total Shareholder Equity

pd.set_option('display.max_rows', 900)

filter = df.index.str.contains(r"Equity",case=False, regex=True, na=False) & \
            df.index.str.contains(r"shareholder|stockholder|shareowner",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer|Total",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Common Shareholders Equity,X
Shareholders Equity Before Deduction Of Treasury Stock,X
"Ingersoll-Rand Plc Shareholders Equity Ordinary Shares, 1 Par Value 282,700,041 And 295,605,736 Shares Issued At December 31, 2013 And 2012, Respectively, And Net Of 21,137 And 22,562 Shares Owned By Subsidiary At December 31, 2013 And 2012, Respectively",X
"Shareholders Equity 82,280,033 Common Units Issued And Outstanding At December 31, 2017",X
Stockholders Equity Before Adjustment For Treasury Stock,X
"Enhabit, Inc. Stockholders Equity: Common Stock, 0.01 Par Value; 200,000,000 Shares Authorized; 50,099,716 And 49,618,402 Shares Issued And Outstanding In 2022 And 2021, Respectively",X
"Landstar System, Inc. And Subsidiary Shareholders Equity Common Stock, 0.01 Par Value, Authorized 160,000,000 Shares, Issued 68,083,419 And 67,870,962 Shares",X
Stockholders Equity Parent Before Treasury Stock,X
Shareholders Equity Attributable To Pentair Ltd.,X
Stockholders Equity Before Treasury Stock,X


In [41]:
df[filter] = 'SE'

In [42]:
#TIA  Total Intangible assets

filter = df.index.str.contains(r"Total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"intangible",case=False, regex=True, na=False) & \
            df.index.str.contains(r"assets",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Total Intangible Assets Acquired,X
Total Intangible Assets,X
"Total Intangible Assets, Net",X


In [43]:
df[filter] = 'TIA'

In [44]:
#IA Intangible assets

filter =    df.index.str.contains(r"intangible",case=False, regex=True, na=False) & \
            df.index.str.contains(r"assets",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income|Total",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
"Identifiable Intangible Assets, Net Of 15,411 And 4,592 In Accumulated Amortization, Respectively",X
"Identifiable Intangible Assets, Net Of 612 And 536 In Accumulated Amortization, Respectively",X
Intangible Assets-Trading Products,X
"Intangible Assets, Less Accumulated Amortization Of 367.7 And 339.9, Respectively",X
"Identifiable Intangible Assets, Net Note",X
"Intangible Assets, Less Accumulated Amortization Of 4 In 2015 And 2014",X
"Identifiable Intangible Assets Less Accumulated Amortization Of 32,887 2016 33,225 Note",X
"Identifiable Intangible Assets, Net Of 224.5 And 74.6 In Accumulated Amortization, Respectively",X
"Identifiable Intangible Assets, Net Of 377.1 And 210.2 In Accumulated Amortization, Respectively",X
"Intangible Assets, Less Accumulated Amortization Of 151,025 In 2015 And 114,922 In 2014",X


In [45]:
df[filter] ='IA'

In [46]:
#TC   Total Cash

filter = df.index.str.contains(r"Total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"cash",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Total Cash And Short-Term Investments In Marketable Securities,X
Total Investments And Cash,X
Total Cash And Investments,X
Total Cash And Marketable Securities,X
Total Restricted Cash And Cash Equivalents,X
Total Cash And Cash Equivalents,X


In [47]:
df[filter] = 'TC'

In [48]:
#NPPE Net property, plant and equipment

filter = df.index.str.contains(r"net",case=False, regex=True, na=False) & \
            df.index.str.contains(r"Plant",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income|investments",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"regulated|service|Mine",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]


Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
"Property, Plant, And Equipment, Net Of Accumulated Depreciation Of 419 And 386 Including Net Ppe Of 24 And 29 Owned By A Variable Interest Entity",X
Net Plant And Equipment,X
"Plant And Equipment, Net",X
"Total Net Property, Plant And Equipment",X
"Utility Plant-At Original Cost, Net Of Accumulated Depreciation Of 3,657,221 In 2012 And 3,360,005 In 2011",X
"Total Property, Plant And Equipment Net",X
"Property, Plant, And Equipment, Net Of Accumulated Depreciation Of 329 And 324 Including Net Ppe Of 34 And 36 Owned By A Variable Interest Entity",X
"Plant Assets, Net Of Depreciation",X
"Property, Plant, And Equipment, Net Of Accumulated Depreciation Of 385 And 397 Including Net Ppe Of 57 And 67 Owned By A Variable Interest Entity",X
"Total Property, Plant And Equipment, Net",X


In [49]:
df[filter] = 'NPPE'

In [50]:
#TD Total Debt

filter = df.index.str.contains(r"total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"debt",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income|investments",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"regulated|year|maturities|long|current|short|securities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]



Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Total Debt,X


In [51]:
df[filter] = 'TD'

In [52]:
#TLTD Total Long Term Debt

filter = df.index.str.contains(r"total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"debt",case=False, regex=True, na=False) & \
            df.index.str.contains(r"long",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income|investments",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"regulated|year|maturities|current|short|securities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]


Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Total Long-Term Debt,X


In [53]:
df[filter] = 'TLTD'

In [54]:
#TSTD Total Short term Debt

filter = df.index.str.contains(r"total",case=False, regex=True, na=False) & \
            df.index.str.contains(r"debt",case=False, regex=True, na=False) & \
            df.index.str.contains(r"short",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income|investments",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"regulated|year|maturities|long|securities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer",case=False, regex=True, na=False)
   
df[filter]


Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Total Short-Term Debt,X


In [55]:
df[filter] = 'TSTD'

In [56]:
#CD current Debt

filter = df.index.str.contains(r"debt",case=False, regex=True, na=False) & \
            df.index.str.contains(r"current",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income|investments",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"regulated|year|maturities|non|long|securities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"indebtedness|recourse|lease|indexed|secured",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"convertible|affiliates|party",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer|debtor",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
"Debt, Current",X
"Debt, Current Portion",X
"Current Debt, Net",X
"Current Debt, Net Of Discount",X
"Debt, Net Of Current Portion",X
"Current Portion Of Debt, Net",X
"Current Debt, Net Of Discount And Debt Issuance Costs",X
Current Portion Of Debt,X
"Term Debt, Current Portion",X
"Debt, Less Current Portion",X


In [57]:
df[filter] = 'CD'

In [58]:
#Short term Debt

filter = df.index.str.contains(r"short",case=False, regex=True, na=False) & \
            df.index.str.contains(r"debt",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income|investments",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"regulated|year|maturities|long|securities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer|Total",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Short-Term Debt,X
Short-Term Debt Financing,X
Convertible Short-Term Debt,X
"Short-Term Debt, Carrying Amount",X
Short-Term Debt To Affiliates,X
Unsecured Short-Term Debt,X
"Short-Term Convertible Debt, Net",X
Short-Term Intercompany Debt,X
Current Payables And Short-Term Debt To Affiliates,X
"Convertible Debt, Short-Term",X


In [59]:
df[filter] = 'CD'

In [60]:
#Short term Debt

filter = df.index.str.contains(r"short",case=False, regex=True, na=False) & \
            df.index.str.contains(r"Borrowing",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income|investments",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"regulated|year|maturities|long|securities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer|Total",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Short-Term Borrowings Due To Dte Energy,X
Short-Term Borrowings-Trade Receivable Securitization Facility,X
Short Term Borrowings Under Money Market Liquidity Facility,X
Short-Term Borrowings—Trade Receivable Securitization Facility,X


In [61]:
df[filter] = 'CD'

In [62]:
#NCD non current Debt

filter = df.index.str.contains(r"debt",case=False, regex=True, na=False) & \
            df.index.str.contains(r"non*current",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|vie|deferred|tax|income|investments",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"regulated|year|maturities|securities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"indebtedness|recourse|lease|indexed|secured",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"convertible|affiliates|party",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer|debtor",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
"Debt, Noncurrent",X
"Debt, Noncurrent, Net",X
Noncurrent Portion Of Debt,X


In [63]:
df[filter] = 'NCD'

In [64]:
#Long Term Debt

filter = df.index.str.contains(r"long",case=False, regex=True, na=False) & \
            df.index.str.contains(r"debt",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"other|non|vie|deferred|tax|income|investments",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"regulated|year|maturities|current|short|securities",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"obligations|liabilities|note|temporary|deficit",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|customer|total",case=False, regex=True, na=False)
   
df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
"Long-Term Debt Includes 35 And 63, Respectively, At Estimated Fair Value, Relating To Variable Interest Entities",X
Accrued Interest On Long-Term Debt,X
"Long-Term Debt Includes 5 And 6, Respectively, At Estimated Fair Value, Relating To Variable Interest Entities",X
"Long-Term Debt December 31, 2022 And December 31, 2021 Include 94.1 And 102.7, Respectively, Related To Wepco Environmental Trust",X
Long-Term Debt Of Variable Financing Trusts,X
Long-Term Debt Due To Former Parent,X
Long-Term Debt To Related Parties,X
Long-Term Debt 2014 And 2013 Include 82.3 Million And 47.7 Million Related To Consolidated Variable Interest Entities,X
"Long-Term Debt Includes 63 And 151, Respectively, At Estimated Fair Value, Relating To Variable Interest Entities",X
"Long-Term Debt Includes Securitization Bonds Of 1,070,556 As Of December 31, 2011 And 931,131 As Of December 31, 2010",X


In [65]:
df[filter] = 'NCD'

In [66]:
# Cash and Cash Equivalent

filter = df.index.str.contains(r"cash",case=False, regex=True, na=False) & \
            df.index.str.contains(r"equivalent",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"interest",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|total|note|vie",case=False, regex=True, na=False)

df[filter]

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Restricted Cash Equivalents,X
Cash And Cash Equivalents,X
"Cash And Cash Equivalents, At December 31, 2022 And 2021 Includes 242 And 480, Respectively, Of Pledged Cash Related To Secured Trust Deposits",X
"Default Funds And Margin Deposits Including Restricted Cash And Cash Equivalents Of 5,074 And 3,197, Respectively",X
Cash And Equivalents Segregated Under Federal Or Other Regulations,X
"Cash And Cash Equivalents, At December 31, 2016 And 2015, Includes Pledged Cash Of 331 And 108, Respectively, Related To Secured Trust Deposits",X
"Cash, Cash Equivalents, Restricted Cash And Restricted Cash Equivalents",X
"Cash And Cash Equivalents, At December 31, 2018 And 2017, Includes Pledged Cash Of 412 And 475, Respectively, Related To Secured Trust Deposits",X
"Cash And Cash Equivalents, At December 31, 2020 And December 31, 2019 Includes 270 And 384, Respectively, Of Pledged Cash Related To Secured Trust Deposits",X
Cash Equivalents Including Restricted,X


In [67]:
df[filter] = 'CCE'

In [68]:
filter = df.index.str.contains(r"retained",case=False, regex=True, na=False) & \
            df.index.str.contains(r"earnings",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"restricuted|interest",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|total|note|vie",case=False, regex=True, na=False)
df[filter] 

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
"Retained Earnings, Appropriated",X
Appropriated Retained Earnings Of Consolidated Investment Entities,X
Retained Earnings Deficit Appropriated,X
Retained Earnings Appropriated For Investors In Consolidated Investment Products,X
Retained Earnings / Accumulated Deficit,X
"Retained Earnings, Unappropriated",X
Retained Earnings/-Accumulated Deficit,X
Retained Earnings Loss,X
Retained Earnings/-Deficit,X
Retained Earnings And Net Proceeds From All Sources,X


In [69]:
df[filter] = 'RE'

In [70]:
filter = df.index.str.contains(r"goodwill",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"beg|restricuted|interest",case=False, regex=True, na=False) & \
          ~ df.index.str.contains(r"held|sale|discontinued|total|note|vie",case=False, regex=True, na=False)
df[filter] 

Unnamed: 0_level_0,type
line_items,Unnamed: 1_level_1
Goodwill Acquired From Business Acquisition,X
Goodwill And Indefinite-Lived Intangible Assets,IA
Goodwill And Intangibles,X
Intangible And Goodwill,X
"Intangible Assets, Gross Excluding Goodwill",IA
"Goodwill And Tradename, Net Of Amortization",X
Goodwill Recorded For Palmetto Heritage,X
"Goodwill And Tradenames, Net Of Amortization",X
Indefinite-Lived Intangible Assets Excluding Goodwill,IA
"Goodwill, Intangible Assets And Deferred Costs",X


In [71]:
df[filter] = 'IA'

In [72]:
df_line_items = df[df['type'] != 'X']

In [73]:
# Save mappings

mappings = {}

mappings['headings'] = df_headings
mappings['line_items'] = df_line_items

# Save to file

filepath = os.path.join(PROJ_ROOT_PATH,'pickle','mappings_balance_stage3.pkl')

with open(filepath, 'wb') as f:
  pickle.dump(mappings, f)
