In [1]:
import numpy as np
import pandas as pd
import time
import os
import pickle
import re

In [2]:
# Notebook-wide pandas display settings: show up to 600 rows/columns,
# truncate cell text at 100 characters, and render floats with two decimals.
pd.options.display.max_rows = 600
pd.options.display.max_columns = 600
pd.options.display.max_colwidth = 100
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Root directories used by this notebook. The defaults are the original
# machine-specific locations; set MD1_DATA_ROOT / MD1_PROJ_ROOT in the
# environment to run the notebook against a different directory layout
# without editing this cell.
DATA_ROOT_PATH = os.environ.get("MD1_DATA_ROOT", "/mnt/data/projects/MD1")
PROJ_ROOT_PATH = os.environ.get("MD1_PROJ_ROOT", "/home/priyesh/projects/MD1")

In [4]:
# Load the interim master data from <PROJ_ROOT_PATH>/pickle.
# Later cells index the loaded object as
# clean_master_data[ticker][year][statement_type]['table'].
filepath = os.path.join(
    PROJ_ROOT_PATH,
    'pickle',
    'clean_master_data_interim.pkl',
)
clean_master_data = pd.read_pickle(filepath)

In [5]:
def _ci(pattern):
  """Return a (pattern, case) pair that is matched case-insensitively."""
  return (pattern, False)


def _cs(pattern):
  """Return a (pattern, case) pair that is matched case-sensitively."""
  return (pattern, True)


# Ordered relabelling rules for income-statement line items.
#
# Each entry is (new_label, require, exclude). A row is relabelled when its
# current 'line_item' text matches EVERY pattern in `require` and NONE of the
# patterns in `exclude`. Rules are applied top to bottom and each rule sees the
# labels written by the rules before it, so the order is significant.
_INC_LABEL_RULES = (

  # Operating Expenses Excl Dep And Amort
  ('Operating Expenses Excl Dep And Amort',
   [_ci(r"Operat.*Expense"), _ci(r"Excl")],
   [_cs(r"Asset|Segment|Item")]),

  # Revenues Including From Related Parties
  ('Revenues Including Amount From Related Parties',
   [_cs(r"Revenues"), _cs(r"Including"), _cs(r"Related Parties")],
   [_cs(r"Other")]),

  # Net Sales (no related-party wording)
  ('Net Sales',
   [_ci(r"^Net Sales")],
   [_ci(r"Related Part"),
    _ci(r"Passenger|Hardware|Software|Excl|Less"),
    _ci(r"Service|Estate|Customer|Cons|Gross"),
    _ci(r"Multi|Fee|Interest|Dividend|Oper"),
    _ci(r"Cloud|Subs|Maint|Util|Lease|Prod"),
    _ci(r"Insur|Rent|Office|Home|Comm|non"),
    _ci(r"Automotive|Cost|Property|Broker|Ext"),
    _cs(r"Other|Deferred|Recur|Member|Gas")]),

  # Net Sales (explicitly "Including ... Related Part...")
  ('Net Sales',
   [_ci(r"^Net Sales"), _cs(r"Related Part"), _cs(r"Including")],
   [_ci(r"Passenger|Hardware|Software|Excl|Less"),
    _ci(r"Service|Estate|Customer|Cons|Gross"),
    _ci(r"Multi|Fee|Interest|Dividend|Oper"),
    _ci(r"Cloud|Subs|Maint|Util|Lease|Prod"),
    _ci(r"Insur|Rent|Office|Home|Comm|non"),
    _ci(r"Automotive|Cost|Property|Broker|Ext"),
    _cs(r"Other|Deferred|Recur|Member|Gas")]),

  # Total Revenue
  ('Total Revenue',
   [_ci(r"Revenue"), _ci(r"Total")],
   [_ci(r"Passenger|Hardware|Software|Excl|Less"),
    _ci(r"Service|Estate|Customer|Cons|Gross"),
    _ci(r"Multi|Fee|Interest|Dividend|Oper"),
    _ci(r"Cloud|Subs|Maint|Util|Lease|Net"),
    _ci(r"Insur|Rent|Office|Home|Comm|non"),
    _ci(r"Automotive|Cost|Property|Broker|Sales"),
    _cs(r"Other|Deferred|Recur|Member|Gas")]),

  # Total Net Revenue
  ('Total Net Revenue',
   [_ci(r"Revenue"), _ci(r"Total"), _ci(r"Net")],
   [_ci(r"Passenger|Hardware|Software|Excl|Less"),
    _ci(r"Service|Estate|Customer|Cons|Gross"),
    _ci(r"Multi|Fee|Interest|Dividend|Oper"),
    _ci(r"Cloud|Subs|Maint|Util|Lease|Sales"),
    _ci(r"Insur|Rent|Office|Home|Comm|non"),
    _ci(r"Automotive|Cost|Property|Broker"),
    _cs(r"Other|Deferred|Recur|Member|Gas")]),

  # Total Net Sales And Revenue
  ('Total Net Sales And Revenue',
   [_ci(r"Revenue"), _ci(r"Total"), _ci(r"Net"), _ci(r"Sales")],
   [_ci(r"Passenger|Hardware|Software|Excl|Less"),
    _ci(r"Service|Estate|Customer|Cons|Gross"),
    _ci(r"Multi|Fee|Interest|Dividend|Oper"),
    _ci(r"Cloud|Subs|Maint|Util|Lease"),
    _ci(r"Insur|Rent|Office|Home|Comm|non"),
    _ci(r"Automotive|Cost|Property|Broker"),
    _cs(r"Other|Deferred|Recur|Member|Gas")]),

  # Total Operating Revenue
  ('Total Operating Revenue',
   [_ci(r"Revenue"), _ci(r"Total"), _ci(r"Oper")],
   [_ci(r"Passenger|Hardware|Software|Excl|Less"),
    _ci(r"Multi|Fee|Interest|Dividend|Non"),
    _ci(r"Automotive|Cost|Property|Broker"),
    _cs(r"Other|Deferred|Recur|Member|Gas|Util")]),

  # Net Revenue Including From Related Parties (keyed on the word "Includes")
  ('Net Revenue Incl From Related Parties',
   [_ci(r"Revenue"), _ci(r"Net"), _ci(r"Includes")],
   [_ci(r"Total"),
    _ci(r"Passenger|Hardware|Software|Excl|Less"),
    _ci(r"Service|Estate|Customer|Cons|Gross"),
    _ci(r"Multi|Fee|Interest|Dividend|Oper"),
    _ci(r"Cloud|Subs|Maint|Util|Lease"),
    _ci(r"Insur|Rent|Office|Home|Comm|non"),
    _ci(r"Automotive|Cost|Property|Broker"),
    _cs(r"Other|Deferred|Recur|Member|Gas")]),

  # Total Cost Of Sales
  ('Total Cost Of Sales',
   [_ci(r"Cost Of Sales"), _ci(r"Total")],
   [_cs(r"Veh|Other|Oper|Excl")]),

  # Total Cost Of Sales Excluding Deprec And Amort
  ('Total Cost Of Sales Excluding Deprec And Amort',
   [_ci(r"Cost Of Sales"), _ci(r"Total"), _ci(r"Excl")],
   [_cs(r"Veh|Other|Oper")]),

  # Total Cost Of Revenue
  ('Total Cost Of Revenue',
   [_ci(r"Revenue"), _ci(r"Total"), _ci(r"Cost")],
   [_ci(r"Passenger|Hardware|Software"),
    _ci(r"Excl|Recurr"),
    _ci(r"Service|Estate|Customer"),
    _ci(r"Multi|Fee|Interest|Dividend"),
    _ci(r"Cloud|Subs|Maint|Util|Lease"),
    _ci(r"Insur|Rent|Office|Home|Comm|non"),
    _ci(r"Automotive|Property|Broker"),
    _ci(r"Other|Deferred")]),

  # Cost Of Sales Excluding Deprec And Amort
  ('Cost Of Sales Excluding Deprec And Amort',
   [_ci(r"Cost Of Sales"), _ci(r"Excl"), _ci(r"Dep")],
   [_ci(r"Total"),
    _cs(r"Part|Serv|Prod|Unit|Comm|^Dep"),
    _cs(r"Software|Chem|Mids|Auto"),
    _cs(r"Res|Rev|Online|Fuel"),
    _cs(r"Occ|Member|Ext|Energy|Type"),
    _ci(r"Other|Man|Oper|Sub|Food|Bev")]),

  # Cost Of Revenue
  ('Cost Of Revenue',
   [_ci(r"Revenue"), _ci(r"^Cost")],
   [_ci(r"Total|Dep|Prod|Lic|Sup|Con|Bio"),
    _cs(r"Party|Related|Less|Rest|Land"),
    _ci(r"Passenger|Hardware|Software"),
    _ci(r"Royal|Coal|Excl|Sales|Net"),
    _ci(r"Multi|Fee|Interest|Dividend"),
    _ci(r"Cloud|Subs|Maint|Util|Lease|Reimb"),
    _ci(r"Insur|Rent|Office|Home|Comm|non"),
    _ci(r"Automotive|Property|Broker"),
    _ci(r"Other|Deferred|Member|Abstract")]),

  # Cost Of Revenue Including Amortization and Impairments
  # NOTE(review): the generic 'Cost Of Revenue' rule directly above does not
  # exclude "Including", so a label that satisfies this rule but contains none
  # of that rule's excluded tokens has already been rewritten to
  # 'Cost Of Revenue' before this rule runs. Confirm whether that is intended.
  ('Cost Of Revenue Including Amortization and Impairments',
   [_ci(r"Revenue"), _ci(r"^Cost"), _ci(r"Including"),
    _ci(r"Amort"), _ci(r"Impair")],
   [_cs(r"Definite"),
    _cs(r"Other|Deferred|Member|Abstract")]),

  # Cost Of Revenue Excluding Amortization
  ('Cost Of Revenue Excluding Amortization',
   [_ci(r"Revenue"), _ci(r"^Cost"), _ci(r"Excluding|Exclusive"),
    _ci(r"Amort")],
   [_cs(r"Definite|Dep|Intang"),
    _cs(r"Other|Deferred|Member|Abstract")]),

  # Cost Of Revenue Excluding Depreciation and Amortization
  ('Cost Of Revenue Excluding Depreciation and Amortization',
   [_ci(r"Revenue"), _ci(r"^Cost"), _ci(r"Excluding|Exclusive"),
    _ci(r"Amort"), _ci(r"Depr")],
   [_cs(r"Definite|Intang"),
    _cs(r"Other|Deferred|Member|Abstract")]),

  # Total Cost Of Sales -> Total Cost Of Revenue
  # The "Excl" exclusion keeps this rule from overwriting the
  # 'Total Cost Of Sales Excluding Deprec And Amort' label assigned above;
  # without it that earlier rule has no lasting effect.
  ('Total Cost Of Revenue',
   [_ci(r"Total Cost Of Sales")],
   [_cs(r"Veh|Other|Oper"),
    _ci(r"Excl")]),

  # Total Cost Of Revenue (case-sensitive variant)
  ('Total Cost Of Revenue',
   [_cs(r"Cost Of Revenue"), _cs(r"Total")],
   [_cs(r"Veh|Other|Oper|Non|Auto")]),

  # Selling, General and Admin Including Stock Based Comp
  ('SGA Including Stock-Based Comp',
   [_ci(r"Selling"), _ci(r"General"), _ci(r"Admin"),
    _ci(r"Including Stock")],
   []),

  # SGA Excluding Depreciation and Amortisation
  ('SGA Excluding Dep and Amort',
   [_ci(r"Selling"), _ci(r"General"), _ci(r"Admin"), _ci(r"Excluding"),
    _ci(r"Dep|Amort")],
   [_ci(r"Including")]),

  # SGA Excluding Impairment Losses
  ('SGA Excluding Impairment Losses',
   [_ci(r"Selling"), _ci(r"General"), _ci(r"Admin"), _ci(r"Excluding")],
   [_ci(r"Dep|Amort"),
    _ci(r"Including")]),

  # SGA ("Sales, General And Administrative" wording)
  ('SGA',
   [_ci(r"General"), _ci(r"Administrative"), _ci(r"Sales")],
   [_cs(r"^Dep|Home|Percent|Intan")]),

  # SGA ("Selling, General And Admin..." wording)
  ('SGA',
   [_ci(r"General"), _cs(r"Admin"), _ci(r"Selling")],
   [_cs(r"Other Selling|Div"),
    _cs(r"Including|Excluding|Part"),
    _cs(r"^Dep|Home|Percent|Member|Direct")]),

  # General And Administrative
  ('General And Admin',
   [_ci(r"General And Administrative")],
   [_cs(r"Dep|Selling|Incl|Excl|Member"),
    _cs(r"Party"),
    _cs(r"Asset|Total|Other|Home|Prop")]),

  # General And Admin Excluding Dep and Amort
  ('General And Admin Excluding Dep and Amort',
   [_ci(r"General And Admin"), _cs(r"Excl")],
   [_cs(r"Sell|Asset")]),

  # General And Admin Including Stock Based Comp
  ('GA Including Stock Based Comp',
   [_ci(r"General And Administrative"), _cs(r"Incl"),
    _cs(r"Stock|Equity")],
   [_cs(r"Dep|Selling|Excl|Member"),
    _cs(r"Party"),
    _cs(r"Asset|Total|Other|Home|Prop")]),

  # Gross Profit
  ('Gross Profit',
   [_ci(r"Gross Profit")],
   [_ci(r"Member|Per|Total")]),

  # Total Gross Profit
  ('Total Gross Profit',
   [_ci(r"Gross Profit"), _ci(r"Total")],
   [_ci(r"Member|Per")]),
)


def _inc_label_mask(labels, require, exclude):
  """Return a boolean mask of `labels` matching all `require` and no `exclude` patterns."""
  mask = pd.Series(True, index=labels.index, dtype=bool)
  for pattern, case in require:
    mask &= labels.str.contains(pattern, case=case, regex=True, na=False)
  for pattern, case in exclude:
    mask &= ~labels.str.contains(pattern, case=case, regex=True, na=False)
  return mask


def inc_fix_labels(orig_df):
  """Normalise income-statement line-item labels to a canonical vocabulary.

  The frame's index is moved into a 'line_item' column, every rule in
  `_INC_LABEL_RULES` is applied in order (regex matches on the label text,
  missing labels never match), and 'line_item' is set back as the index.

  Parameters
  ----------
  orig_df : pandas.DataFrame
      Statement table whose `reset_index()` yields a 'line_item' column,
      i.e. the index is named 'line_item'. It is not modified.

  Returns
  -------
  pandas.DataFrame
      A new frame with the same data columns, indexed by the normalised
      'line_item' labels. Rows matching no rule keep their original label.

  Raises
  ------
  KeyError
      If resetting the index does not produce a 'line_item' column.
  """
  # reset_index() already returns a new frame, so the caller's data is safe.
  df = orig_df.reset_index()

  for new_label, require, exclude in _INC_LABEL_RULES:
    # Recompute against the current labels: later rules intentionally see
    # the output of earlier ones.
    mask = _inc_label_mask(df['line_item'], require, exclude)
    df.loc[mask, 'line_item'] = new_label

  return df.set_index('line_item', drop=True)

In [6]:
# Spot-check the relabelling on one table: ticker 'CSCO', year key '20',
# income statement. The cell's last expression is the relabelled frame, so
# the notebook displays it; nothing is written back to clean_master_data.
df = clean_master_data['CSCO']['20']['income']['table']
inc_fix_labels(orig_df=df)


Unnamed: 0_level_0,"Jul. 25, 2020","Jul. 27, 2019","Jul. 28, 2018"
line_item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Revenue:,,,
Revenue,49301.0,51904.0,49330.0
Cost Of Sales:,,,
Total Cost Of Revenue,17618.0,19238.0,18724.0
Gross Margin,31683.0,32666.0,30606.0
Operating Expenses:,,,
Research And Development,6347.0,6577.0,6332.0
Sales And Marketing,9169.0,9571.0,9242.0
General And Admin,1925.0,1827.0,2144.0
Amortization Of Purchased Intangible Assets,141.0,150.0,221.0


In [7]:
# Relabel the income-statement table of every ticker/year in place.
stype = 'income'

ticker_list = list(clean_master_data.keys())

for ticker in ticker_list:

  # Progress trace: one line per ticker.
  print(ticker)

  for yr, statements in clean_master_data[ticker].items():

    # Years without this statement type are left untouched.
    if stype not in statements:
      continue

    df_temp = inc_fix_labels(statements[stype]['table'])
    statements[stype]['table'] = df_temp

AAL
AAP
AAPL
ABBV
ABT
ACGL
ACHC
ACM
ACN
ADBE
ADI
ADM
ADP
ADSK
AEE
AEP
AES
AFG
AFL
AGCO
AGNC
AGO
AGR
AIG
AIZ
AJG
AKAM
AL
ALB
ALGN
ALK
ALL
ALLE
ALLY
ALNY
ALSN
AMAT
AMC
AMD
AME
AMED
AMG
AMGN
AMH
AMP
AMT
AMZN
AN
ANET
ANSS
AON
AOS
APD
APH
APTV
AR
ARE
ARES
ARMK
ARW
ATO
ATR
ATVI
AVB
AVT
AWI
AWK
AXS
AXTA
AYI
AZO
BA
BAC
BAH
BALL
BAX
BBY
BC
BDX
BEN
BERY
BFAM
BG
BIIB
BIO
BK
BKI
BKNG
BLD
BLDR
BLK
BMRN
BMY
BOKF
BR
BRKR
BRO
BRX
BSX
BURL
BWA
BWXT
BX
BXP
BYD
C
CABO
CACC
CACI
CAG
CAH
CAR
CASY
CAT
CB
CBOE
CBRE
CBSH
CC
CCI
CCK
CCL
CDNS
CDW
CE
CF
CFG
CFR
CG
CGNX
CHD
CHDN
CHE
CHH
CHK
CHRW
CHTR
CIEN
CINF
CL
CLH
CLX
CMA
CMCSA
CME
CMG
CMI
CMS
CNA
CNC
CNP
COF
COLB
COLM
COO
COST
COTY
CPB
CPRI
CPRT
CPT
CR
CRI
CRL
CRUS
CSCO
CSGP
CSL
CSX
CTAS
CTLT
CTSH
CUBE
CUZ
CVS
CVX
CW
D
DAL
DAR
DCI
DECK
DEI
DFS
DG
DGX
DHI
DHR
DISH
DKS
DLB
DLR
DLTR
DOV
DPZ
DRI
DTE
DUK
DVA
DVN
DXCM
EA
EBAY
ECL
ED
EEFT
EFX
EGP
EHC
EIX
EL
ELS
EMN
EMR
ENPH
ENTG
EOG
EPAM
EPR
EQIX
EQR
EQT
ERIE
ES
ESI
ESS
ETN
ETR
EVA
EVR
EW
EWBC
EXAS
EXC
EXEL
EXP
EXPD

In [8]:
# Persist the relabelled master data as the stage-2 pickle.

stage2_path = PROJ_ROOT_PATH + '/pickle/clean_master_data_stage2.pkl'
with open(stage2_path, 'wb') as out_file:
  pickle.dump(clean_master_data, out_file)