In [1]:
import numpy as np
import pandas as pd
import time
import os
import pickle
import re

In [2]:
# Notebook-wide pandas display settings: show up to 600 rows/columns, cap
# cell text at 100 characters and render floats with two decimals.
for option_name, option_value in (
    ('display.max_rows', 600),
    ('display.max_columns', 600),
    ('display.max_colwidth', 100),
    ('display.float_format', '{:.2f}'.format),
):
  pd.set_option(option_name, option_value)

In [3]:
# Root directories used by this notebook. The defaults are the original
# machine-specific locations; set MD1_DATA_ROOT_PATH / MD1_PROJ_ROOT_PATH in
# the environment to run the notebook elsewhere without editing this cell.
DATA_ROOT_PATH = os.environ.get("MD1_DATA_ROOT_PATH", "/mnt/data/projects/MD1")
PROJ_ROOT_PATH = os.environ.get("MD1_PROJ_ROOT_PATH", "/home/priyesh/projects/MD1")

In [4]:
# Load the interim dataset produced by the previous stage.
# NOTE: unpickling executes arbitrary code, so only load files that were
# written by this project's own pipeline.
filepath = os.path.join(
  PROJ_ROOT_PATH,
  'pickle',
  'clean_master_data_interim.pkl',
)
clean_master_data = pd.read_pickle(filepath)

In [5]:
# ---------------------------------------------------------------------------
# Exclusion fragments shared by several rules. Every string is a regex
# alternation; a row is rejected as soon as any alternative occurs in it.
# ---------------------------------------------------------------------------
_CONT_OPS_NOT = ("Tax|Discon|Part|Share", "Dil|Pro|Begin|Less|Other|Basic")

_FLOW_NOT = ("Addition|Increase|Decrease|Rel", "Disc|Adj|Reco|Net|Paid|Non|lea")
_OPERATING_FLOW_NOT = ("Addition|Increase|Decrease|Rel",
                       "Disc|Adj|Reco|Net|Paid|Non|Fin|lea")
_INVESTING_NOT = ("Lessee|Dis|Req|Other",) + _FLOW_NOT

_CASH_EQ_NOT = ("Cust|Investment|Proceed",
                "^Restricted|Decon|Use Of Rest",
                "Acqui|Tran|Client|Less|Excl")
_CASH_EQ_CONTEXT_NOT = ("Finan|Invest|Operat", "Jan|Dec",
                        "Settlement|Sales|Cont|Recon")
_CASH_EQ_END_NOT = ("Disc|Exchange|Foreign|Beg|Curr",) + _CASH_EQ_NOT
_CASH_EQ_BEGIN_NOT = (("End Of|End",) + _CASH_EQ_CONTEXT_NOT +
                      ("Disc|Exchange|Foreign|Curre",) + _CASH_EQ_NOT)
# Case-insensitive exclusions used by the cash-equivalent rules.
_CASH_EQ_CI_NARROW = ("Increase|Decrease|Recon|Change",)
_CASH_EQ_CI_WIDE = ("Held|Consisted|Retain|vie|var",
                    "Increase|Decrease|Reclass|Change")

_DIVIDEND_NOT = ("Ret|Withhold|Opt|Manage|Trustee",
                 "Proc|Grant|Excess|Cap|Cash",
                 "Liability|Increase|Specia",
                 "Kind|Wend|Dowd|Not Paid|Dec",
                 "Changes|Ren|Trad",
                 "Equi|Class|Vor|Ugi|Ppg",
                 "Fina|Member|Accr|Renn|Contra",
                 "Net Of Dividend|Other|Former",
                 "Reinvest|Part|Unvest|Not Yet",
                 "Non.Employee|Aff|Mod|Pref|Unpaid",
                 "Quarter|Invest|Subsid|Rec|From|Non",
                 "Acqui|Tran|Client|Less|Excl|Parent")

_CAPEX_NOT = ("Total", "Funding",
              "Repay|Project|Adj|Increa|Decre",
              "Prop|Constr|Estate|Rel|Financed",
              "Account|Disallow|Other|Addi|Fina",
              "Unpaid|Accru|Rec|Discon|Not Yet",
              "Acqui|Tran|Client|Less|Excl|Parent")

# ---------------------------------------------------------------------------
# Ordered relabelling rules: (label, all_of, none_of, none_of_ci).
#   all_of     - case-sensitive regexes that must ALL be found
#   none_of    - case-sensitive regexes of which NONE may be found
#   none_of_ci - like none_of, but matched case-insensitively
# Order matters: each rule is evaluated against the labels left behind by the
# rules before it.
# ---------------------------------------------------------------------------
_CASH_LABEL_RULES = (
  # --- Net income and its adjustments ---
  ('Adjustments To Reconcile Net Income',
   ("Net Income", "Adjustment"), ("Other",), ()),
  ('Adjustments To Reconcile Net Income From Continuing Operations',
   ("Operations", "Continuing", "Earn|Income", "Adjust"), _CONT_OPS_NOT, ()),
  ('Net Income',
   ("Net Income",),
   ("Adjustment|Non|Equity|Oper|Ins",
    "Attrib|Incl|Deduct|Tax|Tran|Comp",
    "Invest|Comm|Stock|Share|Nume",
    "Extr|Acq|Dives|Cons|Cont|Citi"), ()),
  ('Net Income From Continuing Operations',
   ("Operations", "Continuing", "Earn|Income", "Net"),
   ("Adjust|Non|Attrib|Common",) + _CONT_OPS_NOT, ()),
  ('Other Non-Cash Adjustments to Net Income',
   ("Net Income", "Adjustment", "Non", "Other"), (), ()),
  # Label previously ended in "Net Amount"; the rule targets Net Income.
  ('Other Non-Cash Amounts Included In Net Income',
   ("Net Income", "Non", "Incl", "Other"), (), ()),
  ('Income From Continuing Operations',
   ("Operations", "Continuing", "Earn|Income"),
   ("Net", "Adjust|Non|Attrib|Common", "Tax|Discon|Part|Share|Retain",
    "Dil|Pro|Begin|Less|Other|Basic"), ()),

  # --- Operating cash flow ---
  # Previously written as 'Income From Continuing Operations', which merged
  # operating cash flow rows with the income rows produced just above.
  ('Cash Flows From Operating Activities',
   ("Cash.Flow", "Operating"),
   ("Working|Total|Convert|Changes",) + _OPERATING_FLOW_NOT, ()),
  # Previously written as 'Total Cash Flows From Continuing Operations'.
  ('Total Cash Flows From Operating Activities',
   ("Cash.Flow", "Operating", "Total"),
   ("Working|Convert|Changes",) + _OPERATING_FLOW_NOT, ()),

  # --- Financing cash flow ---
  ('Cash Flows From Financing Activities',
   ("Cash.Flow", "Fina"),
   ("Working|Total|Convert|Changes", "Attrib|Cont|Dis|Lease") + _FLOW_NOT, ()),
  ('Cash Flows From Financing Activities Attributable to Continuing Operations',
   ("Cash.Flow", "Fina", "Cont"),
   ("Working|Convert|Changes", "Dis|Lease|Total") + _FLOW_NOT, ()),
  ('Total Cash Flows From Financing Activities',
   ("Cash.Flow", "Fina", "Total"),
   ("Working|Convert|Changes", "Attrib|Cont|Dis|Lease") + _FLOW_NOT, ()),
  ('Total Cash Flows From Financing Activities From Continuing Operations',
   ("Cash.Flow", "Fina", "Total", "Cont"),
   ("Working|Convert|Changes", "Attrib|Dis|Lease") + _FLOW_NOT, ()),

  # --- Cash and cash equivalents, end of period ---
  ('Cash and Cash Equivalent At End Of Year',
   ("Cash Equivalents", "End Of"),
   ("Total",) + _CASH_EQ_END_NOT, _CASH_EQ_CI_NARROW),
  ('Cash and Cash Equivalent At End Of Year',
   ("Cash Equivalents", "End Of|End|Period"),
   ("Total",) + _CASH_EQ_CONTEXT_NOT + _CASH_EQ_END_NOT, _CASH_EQ_CI_WIDE),
  ('Total Cash and Cash Equivalent At End Of Period',
   ("Cash Equivalents", "End Of", "Total"),
   _CASH_EQ_END_NOT, _CASH_EQ_CI_NARROW),
  ('Total Cash and Cash Equivalent At End Of Period',
   ("Cash Equivalents", "Total", "End Of|End|Period"),
   _CASH_EQ_CONTEXT_NOT + _CASH_EQ_END_NOT, _CASH_EQ_CI_WIDE),

  # --- Cash and cash equivalents, beginning of period ---
  # Previously written as '... At End Of Period', which made beginning and
  # ending balances indistinguishable.
  ('Total Cash and Cash Equivalent At Beginning Of Period',
   ("Cash Equivalents", "Total", "Beg"),
   _CASH_EQ_BEGIN_NOT, _CASH_EQ_CI_WIDE),
  # Previously written as 'Total ... Begining Of Period' although this rule
  # explicitly excludes "Total" rows.
  ('Cash and Cash Equivalent At Beginning Of Year',
   ("Cash Equivalents", "Beg"),
   ("Total",) + _CASH_EQ_BEGIN_NOT, _CASH_EQ_CI_WIDE),

  # --- Cash and cash equivalents, no period qualifier ---
  ('Cash and Cash Equivalent',
   ("Cash Equivalents",),
   ("End Of|End|Period", "Total", "Ending|Jan|Dec|Of")
   + _CASH_EQ_CONTEXT_NOT + _CASH_EQ_END_NOT, _CASH_EQ_CI_WIDE),
  ('Total Cash and Cash Equivalent',
   ("Cash Equivalents", "Total"),
   ("End Of|End|Period", "Ending|Jan|Dec|Of")
   + _CASH_EQ_CONTEXT_NOT + _CASH_EQ_END_NOT, _CASH_EQ_CI_WIDE),

  # --- Investing cash flow ---
  ('Cash Flow From Investing',
   ("Cash.Flow", "Invest"),
   ("Cont", "Working|Total|Convert|Changes") + _INVESTING_NOT, ()),
  ('Cash Flow Investing Activities From Continuing Activities',
   ("Cash.Flow", "Invest", "Cont"),
   ("Working|Total|Convert|Changes",) + _INVESTING_NOT, ()),
  # Previously written as 'Cash Flow Investing Acitvities' (typo, no "Total").
  ('Total Cash Flow From Investing',
   ("Cash.Flow", "Invest", "Total"),
   ("Cont", "Working|Convert|Changes") + _INVESTING_NOT, ()),
  ('Total Cash Flow From Investing From Continuing Activities',
   ("Cash.Flow", "Invest", "Total", "Cont"),
   ("Working|Convert|Changes",) + _INVESTING_NOT, ()),

  # --- Stock-based compensation ---
  ('Stock-Based Compensation',
   ("Compensation", "Stock|Share"),
   ("Director|Recog|Long|Settle|Adj",
    "Ret|Withhold|Opt|Manage|Trustee",
    "Proc|Grant|Excess|Cap|Cash",
    "Non.Employee|Aff|Mod",
    "Rel|Tax|Common|Inter|Amort|Issu",
    "Acqui|Tran|Client"), ()),

  # --- Dividends (labels previously misspelled 'Divdends...') ---
  ('Dividends',
   ("Dividends", "Common|Ord"), _DIVIDEND_NOT, ()),
  ('Dividends Paid',
   ("Dividends", "Paid"), ("Common|Ord",) + _DIVIDEND_NOT, ()),
  ('Dividends Not Yet Paid',
   ("Dividends", "Not Yet|Not Paid"),
   ("Acqui|Tran|Client|Less|Excl|Parent",), ()),

  # --- Capital expenditure ---
  ('Capital Expenditure',
   ("Expenditure", "Capital"), ("Not Paid",) + _CAPEX_NOT, ()),
  ('Capital Expenditure Not Paid',
   ("Expenditure", "Capital", "Not Paid"), _CAPEX_NOT, ()),
)


def _rule_mask(items, all_of, none_of, none_of_ci):
  """Return a boolean mask of the entries in `items` selected by one rule."""
  mask = items.str.contains(all_of[0], case=True, regex=True, na=False)
  for pattern in all_of[1:]:
    mask = mask & items.str.contains(pattern, case=True, regex=True, na=False)
  # "not A and not B" is the same as "not (A|B)", so every exclusion group
  # is checked with a single alternation instead of one pass per fragment.
  if none_of:
    mask = mask & ~items.str.contains("|".join(none_of),
                                      case=True, regex=True, na=False)
  if none_of_ci:
    mask = mask & ~items.str.contains("|".join(none_of_ci),
                                      case=False, regex=True, na=False)
  return mask


def cash_adj_labels(orig_df):
  """Normalise cash-flow statement line-item labels to canonical names.

  The rules in `_CASH_LABEL_RULES` are applied in order. For each rule, every
  row whose current 'line_item' text matches all of the rule's required
  patterns and none of its excluded patterns is renamed to the rule's label.
  Missing labels (NaN/None) never match and are left as they are.

  Compared with the earlier copy-pasted implementation, labels that
  contradicted their own rule were corrected: operating cash flow is no
  longer labelled 'Income From Continuing Operations', beginning-of-period
  cash is no longer labelled as end-of-period, the total investing rule keeps
  its 'Total' prefix, and the 'Divdends', 'Begining', 'Acitvities' and
  'Net Amount' spellings were fixed. Code that looks rows up by the old
  spellings has to be updated accordingly.

  Parameters
  ----------
  orig_df : pandas.DataFrame
    Table that exposes a 'line_item' column after `reset_index()` (for
    example an index named 'line_item'). It is copied, never modified.

  Returns
  -------
  pandas.DataFrame
    Copy of `orig_df` indexed by the relabelled 'line_item' values.
  """
  df = orig_df.copy().reset_index()

  for label, all_of, none_of, none_of_ci in _CASH_LABEL_RULES:
    # Re-read the column on every pass so a rule sees earlier relabelling.
    mask = _rule_mask(df['line_item'], all_of, none_of, none_of_ci)
    df.loc[mask, 'line_item'] = label

  return df.set_index('line_item', drop=True)

In [7]:
# Relabel the cash-flow table of every ticker/year that has one.
stype = 'cash'
ticker_list = list(clean_master_data.keys())

for ticker in ticker_list:
  print(ticker)
  for yr in clean_master_data[ticker]:
    statements = clean_master_data[ticker][yr]
    # Skip years that carry no statement of this type.
    if stype not in statements:
      continue
    df_temp = cash_adj_labels(statements[stype]['table'])
    statements[stype]['table'] = df_temp

AA
AAL
AAP
AAPL
ABBV
ABNB
ABT
ACGL
ACHC
ACI
ACM
ACN
ADBE
ADI
ADM
ADP
ADSK
ADT
AEE
AEP
AES
AFG
AFL
AFRM
AGCO
AGL
AGNC
AGO
AGR
AIG
AIZ
AJG
AKAM
AL
ALB
ALGM
ALGN
ALK
ALL
ALLE
ALLY
ALNY
ALSN
AM
AMAT
AMC
AMCR
AMD
AME
AMED
AMG
AMGN
AMP
AMZN
AN
ANET
ANSS
AON
AOS
APA
APD
APH
APO
APP
APTV
AR
ARES
ARMK
ARW
ASH
ATO
ATR
ATUS
ATVI
AVGO
AVT
AVTR
AWI
AWK
AXON
AXS
AXTA
AYI
AYX
AZEK
AZO
AZTA
BA
BAC
BAH
BALL
BAX
BBWI
BBY
BC
BDX
BEN
BERY
BFAM
BG
BHF
BIIB
BILL
BIO
BJ
BK
BKNG
BLD
BLDR
BLK
BMRN
BMY
BOKF
BR
BRKR
BRO
BSX
BSY
BURL
BWA
BWXT
BX
BYD
C
CABO
CACC
CAH
CAR
CARR
CAT
CB
CBOE
CBSH
CC
CCCS
CCK
CCL
CDAY
CDNS
CDW
CE
CEG
CERT
CF
CFG
CFLT
CFR
CG
CGNX
CHD
CHDN
CHE
CHH
CHPT
CHRW
CHTR
CIEN
CINF
CL
CLH
CLVT
CLX
CMA
CMCSA
CME
CMG
CMI
CMS
CNA
CNM
CNP
CNXC
COF
COIN
COLB
COLM
COO
COST
COTY
CPB
CPRI
CPRT
CR
CRI
CRL
CRUS
CRWD
CSCO
CSL
CTLT
CTRA
CTSH
CTVA
CVNA
CVX
CW
CZR
D
DAL
DAR
DASH
DBX
DCI
DD
DDOG
DECK
DFS
DG
DGX
DHI
DHR
DINO
DIS
DISH
DKNG
DKS
DLB
DLTR
DNA
DOCS
DOCU
DOV
DOW
DPZ
DRVN
DT
DTE
DTM
DUK
DV
DVA
DVN
DXC
DX

In [8]:
# Save to file
# Persist the relabelled dataset for the next stage. The path is built with
# os.path.join, the same way the load cell builds its path, instead of by
# string concatenation.
stage2_path = os.path.join(PROJ_ROOT_PATH, 'pickle', 'clean_master_data_stage2.pkl')
with open(stage2_path, 'wb') as f:
  pickle.dump(clean_master_data, f)