In [1]:
import numpy as np
import pandas as pd
import time
import os
import pickle
import re

In [2]:
# Notebook-wide pandas display settings: show up to 600 rows/columns, cap cell
# text at 100 characters, and render floats with two decimal places.
for _option, _value in (
    ('display.max_rows', 600),
    ('display.max_columns', 600),
    ('display.max_colwidth', 100),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(_option, _value)
del _option, _value  # keep the loop helpers out of the notebook namespace

In [3]:
# Root folders used by the rest of the notebook.
# The defaults are the original hard-coded locations; set MD1_DATA_ROOT and/or
# MD1_PROJ_ROOT in the environment to run the notebook on another machine
# without editing this cell.
DATA_ROOT_PATH = os.environ.get("MD1_DATA_ROOT", "/mnt/data/projects/MD1")
PROJ_ROOT_PATH = os.environ.get("MD1_PROJ_ROOT", "/home/priyesh/projects/MD1")

In [4]:
# Read from Pickle file
# Loads <PROJ_ROOT_PATH>/pickle/clean_master_data_interim.pkl and binds the
# unpickled object to `clean_master_data` (presumably a DataFrame - confirm
# against the notebook that wrote the file).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that this project produced itself.

filepath = os.path.join(PROJ_ROOT_PATH,'pickle','clean_master_data_interim.pkl')
clean_master_data = pd.read_pickle(filepath)

In [5]:
def bal_fix_labels(orig_df):
  
  df = orig_df.copy()
  df = df.reset_index()

  # Company Shareholder Equity

  mask = df['line_item'].str.contains(r"[A-Z]* Shareholder",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Common",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Ordinary|Par Value|Per Share",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Abstract",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Total|Liabilit|Non",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Company Shareholders Equity'

  mask = df['line_item'].str.contains(r"Stockholder[s]?",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Common",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Ordinary|Par Value|Per Share",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Temp|Abstract|Before",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Total|Liabilit|Non|Sub",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Company Shareholders Equity'
    
  mask = df['line_item'].str.contains(r" Shareowner[s]?",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Total|Liabilit|Abs|Attrib",case=False, regex=True, na=False)
   
  df.loc[mask,'line_item'] = 'Company Shareholders Equity'  

  # Common Shareholders Equity
    
  mask = df['line_item'].str.contains(r" Shareholder",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Common",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Ordinary|Par Value|Per Share",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Abstract|Attrib",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Total|Liabilit|Non",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Common Shareholders Equity'  
    
  mask = df['line_item'].str.contains(r" Stockholder[s]?",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Common",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Ordinary|Par Value|Per Share",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Temp|Abstract|Before",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Total|Liabilit|Non|Sub",case=True, regex=True, na=False)
  
  df.loc[mask,'line_item'] = 'Common Shareholders Equity' 

  # Shareholders Equity
    
  mask = df['line_item'].str.contains(r"^Shareowner|^Stockholder|^Shareholder",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity$",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"NonCon",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Total|Liabilit|Abs|Attrib",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Shareholders Equity' 

  # Total Company Shareholders Equity

  mask = df['line_item'].str.contains(r"Total.* Shareholder",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Common",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Ordinary|Par Value|Per Share",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Abstract",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Liabilit|Non",case=True, regex=True, na=False)
 
  df.loc[mask,'line_item'] = 'Total Company Shareholders Equity' 
 
  mask = df['line_item'].str.contains(r"Total.* Stockholder[s]?",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Common",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Ordinary|Par Value|Per Share",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Temp|Abstract|Before",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Liabilit|Attrib",case=True, regex=True, na=False)
  
  df.loc[mask,'line_item'] = 'Total Company Shareholders Equity' 
    
  mask = df['line_item'].str.contains(r"Total.* Shareowner[s]?",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Liabilit|Abs|Attrib",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Total Company Shareholders Equity' 
    
  # Total Common Shareholders Equity

  mask = df['line_item'].str.contains(r"Total.* Shareholder",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Common",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Ordinary|Par Value|Per Share",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Abstract",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Liabilit|Non|Attrib",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Total Common Shareholders Equity' 

  mask = df['line_item'].str.contains(r"Total.* Stockholder[s]?",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Common",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Ordinary|Par Value|Per Share",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Temp|Abstract|Before",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Liabilit|Non|Sub|Attrib",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Total Common Shareholders Equity' 

  mask = df['line_item'].str.contains(r"Total.* Shareowner[s]?",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Common",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Ordinary|Par Value|Per Share",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Temp|Abstract|Before",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Liabilit|Non|Sub|Attrib",case=True, regex=True, na=False)
  
  df.loc[mask,'line_item'] = 'Total Common Shareholders Equity' 
   
  # Total Liabilities And Shareholder Equity
    
  mask = df['line_item'].str.contains(r"Stockholder",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Total",case=True, regex=True, na=False) & \
           df['line_item'].str.contains(r"Liab",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Indemnity",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Non|Convert|Redeem",case=True, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Total Liabilities And Shareholders Equity' 

  mask = df['line_item'].str.contains(r"Shareholder",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Total",case=True, regex=True, na=False) & \
           df['line_item'].str.contains(r"Liab",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Indemnity",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Non|Convert|Redeem",case=True, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Total Liabilities And Shareholders Equity'

  mask = df['line_item'].str.contains(r"Shareowner",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equity",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Total",case=True, regex=True, na=False) & \
           df['line_item'].str.contains(r"Liab",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Parent|Indemnity",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Non|Convert|Redeem",case=True, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Total Liabilities And Shareholders Equity' 

  # Total Liabilities
    
  mask = df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"liabilities",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"Variable Interest Entities|Res",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"-|services|controllling|discontinued",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"long|current|sale|equity|obligations",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"investment|deposit|deficit|capital",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"other|subject|vehicle|deferred|tax",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"deficiency|Vie|insurance|accruals",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"banking|policy|accrued|consumer|lease",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"-|services|controllling|discontinued",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Total Liabilities' 
    
  # Total Restricted Liabilities

  mask = df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"liabilities",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Res",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"Variable Interest Entities",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"long|current|sale|equity|obligations",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"investment|deposit|deficit|capital",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"other|subject|vehicle|deferred|tax",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"deficiency|Vie|insurance|accruals",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"banking|policy|accrued|consumer|lease",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"-|services|controllling|discontinued",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Total Restricted Liabilities' 

  # Property, Plant and Equipment
    
  mask = df['line_item'].str.contains(r"Property",case=True, regex=True, na=False) & \
         df['line_item'].str.contains(r"Plant",case=True, regex=True, na=False) & \
         df['line_item'].str.contains(r"Equipment",case=True, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Vie|Exchange|interest|Held",case=False, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Related|Agree|^Other",case=False, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Disc|Marg|Reg|Total|Member",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Property Plant And Equipment' 
    
  # Total Current Liabilities

  mask = df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"liabilities",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"current",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"Variable Interest Entities",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"long|Res|Vie|sale|equity|obligations",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"investment|deposit|deficit|capital",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"other|subject|vehicle|deferred|tax",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"deficiency|insurance|accruals|non",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"banking|policy|accrued|consumer|lease",case=False, regex=True, na=False) & \
         ~ df['line_item'].str.contains(r"-|services|controllling|discontinued",case=False, regex=True, na=False)
  
  df.loc[mask,'line_item'] = 'Total Current Liabilities' 
  
  #Total Non-Current Liabilities

  mask = df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
            df['line_item'].str.contains(r"liabilities",case=False, regex=True, na=False) & \
            df['line_item'].str.contains(r"Non.*Current",case=False, regex=True, na=False) & \
          ~ df['line_item'].str.contains(r"Variable Interest Entities",case=False, regex=True, na=False) & \
          ~ df['line_item'].str.contains(r"long|Res|Vie|sale|equity|obligations",case=False, regex=True, na=False) & \
          ~ df['line_item'].str.contains(r"investment|deposit|deficit|capital",case=False, regex=True, na=False) & \
          ~ df['line_item'].str.contains(r"other|subject|vehicle|deferred|tax",case=False, regex=True, na=False) & \
          ~ df['line_item'].str.contains(r"deficiency|insurance|accruals",case=False, regex=True, na=False) & \
          ~ df['line_item'].str.contains(r"banking|policy|accrued|consumer|lease",case=False, regex=True, na=False) & \
          ~ df['line_item'].str.contains(r"-|services|controllling|discontinued",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Total Non-Current Liabilities'  

  # Accounts Receivables Net Of Allowances
    
  mask = df['line_item'].str.contains(r"^Accounts Rec",case=False, regex=True, na=False) & \
            df['line_item'].str.contains(r"Net Of Allow",case=True, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Reser|Vies|Cred|Resp",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Accounts Receivables Net Of Allowances' 
    
  # Accounts Receivables Less Allowances

  mask = df['line_item'].str.contains(r"^Accounts Rec",case=False, regex=True, na=False) & \
            df['line_item'].str.contains(r"Less",case=False, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Net of allow|Party|Parties",case=False, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Pledge|Total|Finan",case=False, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Collat|Trade|From|Other|Aff",case=True, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Reser|Vies|Cred|Resp|bill",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Accounts Receivables Net Of Allowances' 

  # Accounts Receivables
    
  mask = df['line_item'].str.contains(r"^Accounts Rec",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Net|Allow",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Interest|Carrying",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Less|Sec|No Allow|Non[- ]?Current",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Net of allow|Party|Parties|Invest",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Pledge|Total|FinanInvest|Abstract",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Collat|Trade|From|Other|Aff|Long",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Reser|Vies|Cred|Resp|bill|Fina|Accru",case=False, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Accounts Receivables' 

  # Accounts Receivables No Allowance
    
  mask = df['line_item'].str.contains(r"^Accounts Rec",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"No Allow",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Less|Sec|Non[- ]?Current",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Net of allow|Party|Parties",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Pledge|Total|FinanInvest|Abstract",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Collat|Trade|From|Other|Aff",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Reser|Vies|Cred|Resp|bill",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Accounts Receivables'

  # Accounts Receivables Non-Current

  mask = df['line_item'].str.contains(r"^Accounts Rec",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Non|Long",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Interest",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Less|Sec|No Allow",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Net of allow|Party|Parties|Invest",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Pledge|Total|FinanInvest|Abstract",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Collat|Trade|From|Other|Aff",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Reser|Vies|Cred|Resp|bill|Fina|Accru",case=False, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Accounts Receivables Non-Current'

  # Accounts Receivables Current
    
  mask = df['line_item'].str.contains(r"^Accounts Rec",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Current|Short",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Interest",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Less|Sec|No Allow|Non",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Net of allow|Party|Parties|Invest",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Pledge|Total|FinanInvest|Abstract",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Collat|Trade|From|Other|Aff",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Reser|Vies|Cred|Resp|bill|Fina|Accru",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Accounts Receivables Current'
   
  # Accounts Payable Current
    
  mask = df['line_item'].str.contains(r"^Accounts Payable",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Current",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Affiliat|Party|Parties|Subsid",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Cost|Entit|Interest",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Merchant|Invest|Secu|Proj|Equip",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Other|To|Trade|Finan|Client",case=True, regex=True, na=False)
   
  df.loc[mask,'line_item'] = 'Accounts Payable Current'
    
  # Accounts Payable

  mask = df['line_item'].str.contains(r"^Accounts Payable",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Current|Abstract",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Affiliat|Party|Parties|Subsid",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Cost|Entit|Interest",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Merchant|Invest|Secu|Proj|Equip",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Other|To|Trade|Finan|Client",case=True, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Accounts Payable'
  
  # Total Long Term Debt

  mask = df['line_item'].str.contains(r"Debt",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Long",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Total",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Lease",case=False, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Total Long-Term Debt'  

  # Long Term Debt
 
  mask = df['line_item'].str.contains(r"Debt",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Long",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Current",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Net",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Intercompany|Party|Convert|Abstract",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"To|Due Within|Fin|Bank|Hedg",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Inv|Sec|Mark|Payable Within",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Other|Subord|Mort|Affil|Vie",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Total|Short|Interest|Mat",case=True, regex=True, na=False)
         
  df.loc[mask,'line_item'] = 'Long-Term Debt'  

  # Net Long Term Debt
    
  mask = df['line_item'].str.contains(r"Debt",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Long",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Current",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Net",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Intercompany|Party|Convert|Abstract",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"To|Due Within|Fin|Bank|Hedg",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Inv|Sec|Mark|Payable Within",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Other|Subord|Mort|Affil|Vie",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Total|Short|Interest|Mat",case=True, regex=True, na=False)
         
  df.loc[mask,'line_item'] = 'Net Long-Term Debt'  

  # Short-Term Debt
    
  mask = df['line_item'].str.contains(r"Debt",case=False, regex=True, na=False) & \
            df['line_item'].str.contains(r"Short",case=False, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Interest|Long|Total|Part|Aff|Carr",case=True, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Mort|Conver|Mat|Sub|Fin|Inv|Sec",case=True, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Note|Intercompany|Current|Other",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Short-Term Debt'
    
  mask = df['line_item'].str.contains(r"Borrow",case=False, regex=True, na=False) & \
           df['line_item'].str.contains(r"Short",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Due|Under|Unsec",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Interest|Long|Total|Part|Aff|Carr",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Mort|Conver|Mat|Sub|Fin|Inv|Sec",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Note|Intercompany|Current|Other",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Short-Term Debt'
    
  # Unsecured Short-Term Borrowing

  mask = df['line_item'].str.contains(r"Borrow",case=False, regex=True, na=False) & \
            df['line_item'].str.contains(r"Short",case=False, regex=True, na=False) & \
            df['line_item'].str.contains(r"Unsec",case=False, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Due|Under",case=False, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Interest|Long|Total|Part|Aff|Carr",case=True, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Mort|Conver|Mat|Sub|Fin|Inv",case=True, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Note|Intercompany|Current|Other",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Unsecured Short-Term Debt'
    
  # Retained Earnings

  mask = df['line_item'].str.contains(r"Earnings$",case=False, regex=True, na=False) & \
          df['line_item'].str.contains(r"Retained",case=False, regex=True, na=False) & \
        ~ df['line_item'].str.contains(r"Dist|Appropriated|Capital",case=False, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Retained Earnings'

  # Intangible Assets
    
  mask = df['line_item'].str.contains(r"Intangible",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Goodwill|other|brand|acquired",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Finite|Liabil",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Held|Prod|Trad|Dep|Ind|Ide|Ins",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Acquisition|Not Subject|Amort",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Estate|Deferred|Leasing|Customer",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"market|Total|Purchase|Lease",case=False, regex=True, na=False) 
  
  df.loc[mask,'line_item'] = 'Intangible Assets'

  mask = df['line_item'].str.contains(r"Intangible",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Amort",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Net",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Goodwill|other|brand|acquired",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Finite|Less",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Held|Prod|Trad|Dep|Ind|Ide|Ins",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Acquisition|Not Subject",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Estate|Deferred|Leasing|Customer",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"market|Total|Purchase|Lease",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Intangible Assets Net Of Amortization'

  mask = df['line_item'].str.contains(r"Intangible",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Amort",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Net",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Goodwill|other|brand|acquired",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Finite|Less",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Held|Prod|Trad|Dep|Ind|Ide|Ins",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Acquisition|Not Subject",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Estate|Deferred|Leasing|Customer",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"market|Total|Purchase|Lease",case=False, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Amortizable Intangible Assets'

  # Acquired Intangible Assets
    
  mask = df['line_item'].str.contains(r"Intangible",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Acqui",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Goodwill|other|brand",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Finite",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Held|Prod|Trad|Dep|Ind|Ide|Ins",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Not Subject",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Estate|Deferred|Leasing|Customer",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"market|Total|Purchase|Lease",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Acquired Intangible Assets'

  # Goodwill Net
    
  mask = df['line_item'].str.contains(r"Goodwill",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Net",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Intangible|other|brand|acquired",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Finite|Beg|End|Rec",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Held|Prod|Trad|Dep|Ind|Ide|Ins",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Acquisition|Not Subject|Amort",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Estate|Deferred|Leasing|Customer",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"market|Total|Purchase|Lease",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Goodwill - Net'
    
  mask = df['line_item'].str.contains(r"Goodwill",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Net",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Intangible",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Finite|Beg|End|Rec|Total",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Held|Trad|Dep|Ind|Ide|Ins",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Acquisition|Not Subject",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Estate|Deferred|Leasing|Customer",case=False, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"market|Purchase|Lease",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Goodwill'
    
  # Intangible and Goodwill
    
  mask = df['line_item'].str.contains(r"Goodwill",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Intangible",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Net|Other",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Subsid|Eclud",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Intangible And Goodwill'
    
  mask = df['line_item'].str.contains(r"Goodwill",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Intangible",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Other",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Excl|Amort|Subsid|Total",case=True, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Goodwill And Other Intangible Assets'

  mask = df['line_item'].str.contains(r"Goodwill",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Intangible",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Other",case=True, regex=True, na=False) & \
          df['line_item'].str.contains(r"Accum",case=True, regex=True, na=False) & \
         ~df['line_item'].str.contains(r"Excl|Subsid|Total",case=True, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Goodwill And Intangible Assets Net Of Amortization'

  # Common Stock
    
  mask = df['line_item'].str.contains(r"^Common Stock",case=True, regex=True, na=False) & \
         df['line_item'].str.contains(r"Additional Paid",case=False, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Common Stock'
    
  mask = df['line_item'].str.contains(r"^Preferred Stock",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Preferred Stock'

  # Cash And Cash Equivalents

  mask = df['line_item'].str.contains(r"Cash",case=True, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equivalent",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Vie|Exchange|interest|Held",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Related|Agree",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Total|Restricted|Disc|Marg|Reg",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Cash And Cash Equivalents'

  mask = df['line_item'].str.contains(r"Cash",case=True, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equivalent",case=True, regex=True, na=False) & \
           df['line_item'].str.contains(r"Total",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Total Restricted",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Vie|Exchange|interest|Held",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Related|Agree",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Disc|Marg|Reg",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Total Cash And Cash Equivalents'

  mask = df['line_item'].str.contains(r"Cash",case=True, regex=True, na=False) & \
           df['line_item'].str.contains(r"Equivalent",case=True, regex=True, na=False) & \
           df['line_item'].str.contains(r"^Restricted",case=True, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Vie|Exchange|interest|Held",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Related|Agree",case=False, regex=True, na=False) & \
          ~df['line_item'].str.contains(r"Disc|Marg|Reg",case=True, regex=True, na=False)
    
  df.loc[mask,'line_item'] = 'Restricted Cash Equivalents'

  mask = df['line_item'].str.contains(r"Cash",case=True, regex=True, na=False) & \
         df['line_item'].str.contains(r"Equivalent",case=True, regex=True, na=False) & \
         df['line_item'].str.contains(r"Restricted$",case=True, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"^Restricted",case=True, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Vie|Exchange|interest|Held",case=False, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Related|Agree",case=False, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Disc|Marg|Reg",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Restricted Cash Equivalents'

  mask = df['line_item'].str.contains(r"Cash",case=True, regex=True, na=False) & \
            df['line_item'].str.contains(r"Equivalent",case=True, regex=True, na=False) & \
            df['line_item'].str.contains(r"Restricted",case=True, regex=True, na=False) & \
            df['line_item'].str.contains(r"Incl",case=True, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"^Restricted",case=True, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Vie|Exchange|interest|Held",case=False, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Related|Agree",case=False, regex=True, na=False) & \
           ~df['line_item'].str.contains(r"Disc|Marg|Reg|Total",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Cash Equivalents Including Restricted'

  # Short-Term Investments
    
  mask = df['line_item'].str.contains(r"Invest",case=True, regex=True, na=False) & \
         df['line_item'].str.contains(r"Short.*Term",case=True, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Cash|Secur",case=True, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Vie|Exchange|interest|Held",case=False, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Related|Agree|Amort|^Other",case=False, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Disc|Marg|Reg|Total",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Short-Term Investments'
    
  mask = df['line_item'].str.contains(r"Invest",case=True, regex=True, na=False) & \
         df['line_item'].str.contains(r"Short.*Term",case=True, regex=True, na=False) & \
         df['line_item'].str.contains(r"Amort",case=True, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Avail",case=True, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Vie|Exchange|interest|Held",case=False, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Related|Agree|^Other",case=False, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Disc|Marg|Reg|Total",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Short-Term Investments'
    
  # Short-Term Investments Available For Sale
    
  mask = df['line_item'].str.contains(r"Invest",case=True, regex=True, na=False) & \
         df['line_item'].str.contains(r"Short.*Term",case=True, regex=True, na=False) & \
         df['line_item'].str.contains(r"Amort",case=True, regex=True, na=False) & \
         df['line_item'].str.contains(r"Avail",case=True, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Vie|Exchange|interest|Held",case=False, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Related|Agree|^Other",case=False, regex=True, na=False) & \
        ~df['line_item'].str.contains(r"Disc|Marg|Reg|Total",case=True, regex=True, na=False)

  df.loc[mask,'line_item'] = 'Short-Term Investments Available For Sale'
   
  df = df.set_index('line_item',drop=True)

  return df

In [6]:
# Spot-check: run the label fixer on one table and display the returned frame.
# NOTE(review): this feeds the 'income' table to bal_fix_labels, whereas the
# batch cell below applies the function only to 'balance' tables — confirm
# that 'income' is the intended statement type for this check.
df = clean_master_data['AA']['20']['income']['table']
bal_fix_labels(df)


Unnamed: 0_level_0,"Dec. 31, 2019","Dec. 31, 2018","Dec. 31, 2017"
line_item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Income Statement [Abstract],,,
Sales,10433.0,13403.0,11652.0
Cost Of Goods Sold Exclusive Of Expenses Below,8537.0,10053.0,8950.0
"Selling, General Administrative, And Other Expenses",280.0,248.0,280.0
Research And Development Expenses,27.0,31.0,32.0
"Provision For Depreciation, Depletion, And Amortization",713.0,733.0,750.0
"Restructuring And Other Charges, Net",1031.0,527.0,309.0
Interest Expense,121.0,122.0,104.0
"Other Expenses, Net",162.0,64.0,27.0
Total Costs And Expenses,10871.0,11778.0,10452.0


In [7]:
# Statement type whose tables get their line-item labels normalised.
stype = 'balance'

ticker_list = list(clean_master_data.keys())

# Walk every ticker / year and replace the stored table with the relabelled
# copy returned by bal_fix_labels. Years lacking this statement type are skipped.
for ticker in ticker_list:

  print(ticker)

  for yr in clean_master_data[ticker]:

    statements = clean_master_data[ticker][yr]

    if stype not in statements:
      continue

    relabelled_table = bal_fix_labels(statements[stype]['table'])
    statements[stype]['table'] = relabelled_table

AA
AAL
AAP
AAPL
ABBV
ABNB
ABT
ACGL
ACHC
ACI
ACM
ACN
ADBE
ADI
ADM
ADP
ADSK
ADT
AEE
AEP
AES
AFG
AFL
AFRM
AGCO
AGL
AGNC
AGO
AGR
AIG
AIZ
AJG
AKAM
AL
ALB
ALGM
ALGN
ALK
ALL
ALLE
ALLY
ALNY
ALSN
AM
AMAT
AMC
AMCR
AMD
AME
AMED
AMG
AMGN
AMP
AMZN
AN
ANET
ANSS
AON
AOS
APA
APD
APH
APO
APP
APTV
AR
ARES
ARMK
ARW
ASH
ATO
ATR
ATUS
ATVI
AVGO
AVT
AVTR
AWI
AWK
AXON
AXS
AXTA
AYI
AYX
AZEK
AZO
AZTA
BA
BAC
BAH
BALL
BAX
BBWI
BBY
BC
BDX
BEN
BERY
BFAM
BG
BHF
BIIB
BILL
BIO
BJ
BK
BKNG
BLD
BLDR
BLK
BMRN
BMY
BOKF
BR
BRKR
BRO
BSX
BSY
BURL
BWA
BWXT
BX
BYD
C
CABO
CACC
CAH
CAR
CARR
CAT
CB
CBOE
CBSH
CC
CCCS
CCK
CCL
CDAY
CDNS
CDW
CE
CEG
CERT
CF
CFG
CFLT
CFR
CG
CGNX
CHD
CHDN
CHE
CHH
CHPT
CHRW
CHTR
CIEN
CINF
CL
CLH
CLVT
CLX
CMA
CMCSA
CME
CMG
CMI
CMS
CNA
CNM
CNP
CNXC
COF
COIN
COLB
COLM
COO
COST
COTY
CPB
CPRI
CPRT
CR
CRI
CRL
CRUS
CRWD
CSCO
CSL
CTLT
CTRA
CTSH
CTVA
CVNA
CVX
CW
CZR
D
DAL
DAR
DASH
DBX
DCI
DD
DDOG
DECK
DFS
DG
DGX
DHI
DHR
DINO
DIS
DISH
DKNG
DKS
DLB
DLTR
DNA
DOCS
DOCU
DOV
DOW
DPZ
DRVN
DT
DTE
DTM
DUK
DV
DVA
DVN
DXC
DX

In [8]:
# Save to file

# Build the output path with os.path.join, the same way the input pickle is
# located when it is read near the top of the notebook, instead of
# concatenating path fragments by hand.
stage2_filepath = os.path.join(PROJ_ROOT_PATH,'pickle','clean_master_data_stage2.pkl')

# Persist the relabelled master data for the next processing stage.
with open(stage2_filepath, 'wb') as f:
  pickle.dump(clean_master_data, f)