## Extract Cash Sheet Data

Take the data downloaded from Yahoo, select a subset of useful features, validate date and exclude problem 
companies to create a clean data sets.

In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
# Functions shared across multiple notebooks are stored in yahoo_data_ext_kit.py which can be found in the same
# directory as the notebooks.

import yahoo_data_ext_kit as ext

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
pd.options.display.float_format = '{:.0f}'.format

In [5]:
DATA_ROOT_DIR='/mnt/data/projects/MD3'
PROJ_ROOT_DIR='/home/priyesh/projects/MD3'

## Cash Sheet

Extract Cash sheet from data downloaded from Yahoo and format as a dataframe.

In [6]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_company_data.pkl')
master_company_data = pd.read_pickle(filepath)
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_drop_data_details_stage4.pkl')
drop_data_details = pd.read_pickle(filepath)

In [7]:
cash_sheets = ext.extract_statements('cash',master_company_data)

AM
AR
APA
BKR
LNG
CHK
CVX
COP
CTRA
DVN
FANG
DTM
EVA
EOG
EQT
XOM
HAL
HES
DINO
KMI
MRO
MPC
NFE
NOV
OXY
OKE
OVV
PDCE
PSX
PXD
RRC
SLB
SWN
TRGP
TPL
VLO
VTS
WMB
AES
LNT
AEE
AEP
AWK
ATO
AGR
BEPC
CNP
CMS
ED
CEG
D
DTE
DUK
EIX
ETR
WTRG
EVRG
ES
EXC
FE
HE
IDA
MDU
NFG
NEE
NI
NRG
OGE
PCG
PNW
PPL
PEG
SRE
SO
UGI
VST
WEC
XEL
ATVI
GOOG
ATUS
AMC
T
CABO
CHTR
CMCSA
DISH
DIS
EA
FOXA
FOX
FYBR
IAC
IPG
LBRDA
LBRDK
FWONA
FWONK
LSXMA
LSXMK
LYV
LUMN
MSGS
MTCH
META
NFLX
NYT
NWSA
NWS
NXST
OMC
PARAA
PARA
PINS
PLTK
RBLX
ROKU
SIRI
SPOT
TMUS
TTWO
TRIP
VZ
WBD
WWE
ZI
ADM
ACI
MO
BJ
SAM
BF-A
BG
CPB
CASY
CHD
CLX
KO
CL
CAG
STZ
COST
COTY
DAR
DG
DLTR
EL
FLO
FRPT
GIS
GO
HSY
HRL
INGR
K
KDP
KMB
KHC
KR
LW
MKC
TAP
MDLZ
MNST
OLPX
PEP
PFGC
PM
PPC
POST
PG
REYN
SEB
SJM
SPB
SYY
TGT
TSN
USFD
WBA
WMT
APD
ALB
AA
AMCR
ATR
AMBP
ASH
AVY
AXTA
BALL
BERY
CE
CF
CC
CLF
CTVA
CCK
DOW
DD
EXP
EMN
ECL
ESI
FMC
FCX
DNA
GPK
HUN
IFF
IP
LIN
LPX
LYB
MLM
MOS
MP
NEU
NEM
NUE
OLN
PKG
PPG
RS
RGLD
RPM
SMG
SEE
SHW
SLGN
SON
SCCO
SSRM
STLD
X
VVV
VMC
WLK
WRK
ARE
AMH
A

In [8]:
# By taking a copy to modify, we always have the original to refer back to if and when required.

df = cash_sheets.copy()
df.head()

Unnamed: 0,st_date,amortizationCashFlow,amortizationOfIntangibles,assetImpairmentCharge,beginningCashPosition,capitalExpenditure,cashDividendsPaid,cashFlowFromContinuingFinancingActivities,cashFlowFromContinuingInvestingActivities,cashFlowFromContinuingOperatingActivities,changeInAccountPayable,changeInAccruedExpense,changeInCashSupplementalAsReported,changeInOtherCurrentAssets,changeInPayable,changeInPayablesAndAccruedExpense,changeInReceivables,changeInWorkingCapital,changesInAccountReceivables,commonStockDividendPaid,commonStockPayments,deferredIncomeTax,deferredTax,depreciation,depreciationAmortizationDepletion,depreciationAndAmortization,dividendReceivedCFO,earningsLossesFromEquityInvestments,endCashPosition,financingCashFlow,freeCashFlow,incomeTaxPaidSupplementalData,interestPaidSupplementalData,investingCashFlow,issuanceOfDebt,longTermDebtIssuance,longTermDebtPayments,netBusinessPurchaseAndSale,netCommonStockIssuance,netIncome,netIncomeFromContinuingOperations,netIssuancePaymentsOfDebt,netLongTermDebtIssuance,netOtherFinancingCharges,netOtherInvestingChanges,netPPEPurchaseAndSale,netShortTermDebtIssuance,operatingCashFlow,operatingGainsLosses,otherNonCashItems,preferredStockDividendPaid,purchaseOfBusiness,purchaseOfPPE,repaymentOfDebt,repurchaseOfCapitalStock,saleOfBusiness,stockBasedCompensation,changeInIncomeTaxPayable,changeInTaxPayable,changesInCash,dividendsReceivedCFI,shortTermDebtPayments,company,industry,yahoo_sector,gics_sector,st_YR,st_Mnth,changeInDividendPayable,changeInOtherCurrentLiabilities,commonStockIssuance,depletion,gainLossOnInvestmentSecurities,gainLossOnSaleOfBusiness,issuanceOfCapitalStock,otherCashAdjustmentOutsideChangeinCash,changeInPrepaidAssets,changeInInventory,changeInOtherWorkingCapital,netInvestmentPurchaseAndSale,saleOfInvestment,saleOfPPE,shortTermDebtIssuance,capitalExpenditureReported,effectOfExchangeRateChanges,unrealizedGainLossOnInvestmentSecurities,interestPaidCFO,gainLossOnSaleOfPPE,netForeignCurrencyExchangeGainLoss,proceedsFromStockOptionExercised,purchaseOfInvestment,changeInInterestPayable,netPreferredStockIssuance,preferredStockPayments,cashFlowFromDiscontinuedOperation,cashFromDiscontinuedFinancingActivities,cashFromDiscontinuedInvestingActivities,cashFromDiscontinuedOperatingActivities,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestmentProperties,dividendPaidCFO,pensionAndEmployeeBenefitExpense,provisionandWriteOffofAssets,amortizationOfSecurities,otherCashAdjustmentInsideChangeinCash,preferredStockIssuance,saleOfInvestmentProperties,netIntangiblesPurchaseAndSale,purchaseOfIntangibles,taxesRefundPaid,saleOfIntangibles,interestReceivedCFI,interestReceivedCFO,excessTaxBenefitFromStockBasedCompensation,interestPaidCFF,cashFlowsfromusedinOperatingActivitiesDirect,classesofCashPayments,classesofCashReceiptsfromOperatingActivities,dividendsReceivedDirect,interestPaidDirect,interestReceivedDirect,otherCashPaymentsfromOperatingActivities,otherCashReceiptsfromOperatingActivities,paymentstoSuppliersforGoodsandServices
0,2022-12-31,70672000,70672000,3702000.0,0,-515650000,-433375000,-205778000,-493826000,699604000,8755000,-747000,0,-313000,8755000,8008000,-2631000,5064000,-2631000,-432825000,0.0,117494000,117494000,131762000,202434000,202434000,120460000,-94218000,0.0,-205778000,183954000,0.0,183079000,-493826000,0.0,0.0,0.0,17000000.0,0.0,326242000,326242000,234800000,0.0,-7203000,4824000,-515650000,234800000,699604000,-96469000,1023000,-550000,0.0,-515650000,0.0,0.0,17000000.0,19654000,,,,,,AM,Oil & Gas Midstream,Energy,Energy,2022,12,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2023-03-31,70672000,70672000,,0,-474376000,-433590000,-245828000,-451831000,697659000,-3836000,60000,0,-951000,-3836000,-3776000,-14921000,-19648000,-14921000,-433040000,,121097000,121097000,138658000,209330000,209330000,123435000,-95442000,,-245828000,223283000,,192742000,-451831000,,,,,,332709000,332709000,194500000,,-6738000,5545000,-474376000,194500000,697659000,-97820000,1705000,-550000,,-474376000,,,,23149000,,,,,,AM,Oil & Gas Midstream,Energy,Energy,2023,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-12-31,57010000,57010000,761960000.0,2822000,-267383000,-496197000,-98299000,-525675000,622387000,-11865000,8681000,-1587000,-335000,-27543000,-18862000,42669000,23472000,42669000,-495823000,-125519000.0,-101927000,-101927000,95526000,152536000,152536000,107812000,-51315000,1235000.0,-98299000,355004000,16079000.0,83016000,-525675000,650000000.0,650000000.0,-115500000.0,-133536000.0,-125519000.0,-355114000,-355114000,534500000,650000000.0,-11083000,-124756000,-267383000,-115500000,622387000,-51315000,11446000,-374000,-753068000.0,-267383000,-115500000.0,-125519000.0,619532000.0,73517000,-15678000.0,-15678000.0,-1587000.0,-598709000.0,-115500000.0,AM,Oil & Gas Midstream,Energy,Energy,2019,12,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2020-12-31,70672000,70672000,673640000.0,1235000,-157931000,-590190000,-534746000,-219231000,753382000,1917000,-13142000,-595000,155000,1917000,-11225000,11489000,419000,28740000,-589640000,-24713000.0,-171000,-171000,108790000,179462000,179462000,98858000,-86430000,640000.0,-534746000,595451000,,140732000,-219231000,550000000.0,550000000.0,-346000000.0,-25267000.0,-24713000.0,-122527000,-122527000,204000000,550000000.0,-123843000,-36033000,-157931000,-346000000,753382000,-83501000,-5576000,-550000,-25267000.0,-157931000,-346000000.0,-24713000.0,0.0,12778000,0.0,0.0,-595000.0,0.0,-346000000.0,AM,Oil & Gas Midstream,Energy,Energy,2020,12,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2021-12-31,70672000,70672000,5042000.0,640000,-232825000,-471721000,-477150000,-233242000,709752000,1487000,-7346000,-640000,550000,1487000,-5859000,9740000,4431000,-6571000,-471171000,0.0,117123000,117123000,108790000,179462000,179462000,118990000,-90451000,0.0,-477150000,476927000,16311000.0,179748000,-233242000,750000000.0,750000000.0,-667472000.0,-2070000.0,0.0,78626000,331617000,16228000,82528000.0,-21657000,1653000,-232825000,-66300000,709752000,-65066000,4624000,-550000,-2070000.0,-232825000,-733772000.0,0.0,0.0,13529000,0.0,0.0,-640000.0,,-66300000.0,AM,Oil & Gas Midstream,Energy,Energy,2021,12,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [9]:
len(df['company'].unique())

848

In [10]:
# Remove sectors, industries and individual companies previously identified for removal.

# Note: as we go through the analysis of the data for each of the financial statements, companies, industries and 
# sectors are identified for removal and stored in drop_data_details. 

for sector in drop_data_details['sector']:
   print(sector)
   df.drop(df[df['yahoo_sector'] == sector].index, inplace=True, axis=0)
    
for industry in drop_data_details['industry']:
   print(industry)
   df.drop(df[df['industry'] == industry].index, inplace=True, axis=0)
    
for company in drop_data_details['companies']:
   print(company)
   df.drop(df[df['company'] == company].index, inplace=True, axis=0)

Real Estate
Healthcare Plans
AMBP
CI
ORCL
LHX
LSXMK
VTS
BF-A
CCCS
AGL
FWONK
NWSA
DRI
AR
FOXA
RPM
FDX
PARAA
LCID
GIS
HUM
RXO
HRB
LBRDK
UAA
BEPC
LSXMA
FYBR
CAG
DNA
ELV
PAYX
MBC
NKE
CASY
EHAB
ZG
ESAB
CXT
GEHC
FWONA
CTAS
CHPT
LW
CHK
RGLD
OSK
AZPN
KD
PYCR
UA
DH
DNB
BKI
CACI
CSX
VFC
KMX
NTAP
KR
DXC
CPRI
MDT
SJM
UHAL
ALGM
DOCU
ESTC
SMAR


In [11]:
len(df['company'].unique())

711

In [12]:
list(df.columns.sort_values())

['amortizationCashFlow',
 'amortizationOfIntangibles',
 'amortizationOfSecurities',
 'assetImpairmentCharge',
 'beginningCashPosition',
 'capitalExpenditure',
 'capitalExpenditureReported',
 'cashDividendsPaid',
 'cashFlowFromContinuingFinancingActivities',
 'cashFlowFromContinuingInvestingActivities',
 'cashFlowFromContinuingOperatingActivities',
 'cashFlowFromDiscontinuedOperation',
 'cashFlowsfromusedinOperatingActivitiesDirect',
 'cashFromDiscontinuedFinancingActivities',
 'cashFromDiscontinuedInvestingActivities',
 'cashFromDiscontinuedOperatingActivities',
 'changeInAccountPayable',
 'changeInAccruedExpense',
 'changeInCashSupplementalAsReported',
 'changeInDividendPayable',
 'changeInIncomeTaxPayable',
 'changeInInterestPayable',
 'changeInInventory',
 'changeInOtherCurrentAssets',
 'changeInOtherCurrentLiabilities',
 'changeInOtherWorkingCapital',
 'changeInPayable',
 'changeInPayablesAndAccruedExpense',
 'changeInPrepaidAssets',
 'changeInReceivables',
 'changeInTaxPayable',
 

In [13]:
len(list(df.columns.sort_values()))

124

In [14]:
summary=['company', 'yahoo_sector','industry','st_date', 
         'netIncome', 
         'changeInWorkingCapital',
         'capitalExpenditure',
         'beginningCashPosition',
         'endCashPosition',
         'operatingCashFlow',
         'cashFlowFromContinuingOperatingActivities',
         'cashFlowFromContinuingInvestingActivities',
         'cashFlowFromContinuingFinancingActivities',
         'purchaseOfBusiness',
         'purchaseOfPPE',
         'repaymentOfDebt',
         'repurchaseOfCapitalStock',
         'cashDividendsPaid',
         'commonStockDividendPaid',
         'gainLossOnSaleOfBusiness',
         'saleOfPPE',
         'freeCashFlow']

operating=['company', 'yahoo_sector','industry','st_date', 
           'netIncome',
           'netIncomeFromContinuingOperations',
           'operatingGainsLosses',
           'depreciation',
           'depreciationAmortizationDepletion',
           'depreciationAndAmortization',
           'stockBasedCompensation',
           'deferredIncomeTax',
           'changeInInterestPayable',
           'changeInWorkingCapital',
           'changeInOtherWorkingCapital',
           'changesInAccountReceivables',
           'changeInReceivables',
           'changeInAccountPayable',
           'changeInPayable',
           'paymentstoSuppliersforGoodsandServices',
           'changeInPayablesAndAccruedExpense',
           'otherCashPaymentsfromOperatingActivities',
           'otherCashReceiptsfromOperatingActivities',
           'otherNonCashItems',
           'cashFlowsfromusedinOperatingActivitiesDirect',
           'cashFlowFromContinuingOperatingActivities']

investing=['company','yahoo_sector','industry','st_date', 
           'capitalExpenditure',
           'capitalExpenditureReported',
           'cashFlowFromContinuingInvestingActivities',
           'purchaseOfBusiness',
           'saleOfBusiness',
           'netBusinessPurchaseAndSale',
           'purchaseOfPPE',
           'saleOfPPE',
           'netPPEPurchaseAndSale',
           'purchaseOfInvestmentProperties',
           'saleOfInvestmentProperties',     
           'netInvestmentPropertiesPurchaseAndSale',
           'purchaseOfInvestment',
           'saleOfInvestment',
           'netInvestmentPurchaseAndSale',
           'purchaseOfIntangibles',
           'saleOfIntangibles',
           'netIntangiblesPurchaseAndSale',
           'netOtherInvestingChanges',
           'investingCashFlow']

finance=['company','yahoo_sector','industry','st_date', 
         'cashFlowFromContinuingFinancingActivities',
         'shortTermDebtIssuance',
         'shortTermDebtPayments',
         'netShortTermDebtIssuance', 
         'longTermDebtIssuance',
         'longTermDebtPayments',
         'netLongTermDebtIssuance',
         'issuanceOfDebt',
         'repaymentOfDebt',
         'netIssuancePaymentsOfDebt',
         'cashDividendsPaid',
         'commonStockDividendPaid',
         'preferredStockDividendPaid',
         'commonStockIssuance',
         'commonStockPayments',
         'netCommonStockIssuance',      
         'issuanceOfCapitalStock',
         'netPreferredStockIssuance',
         'netOtherFinancingCharges',
         'repurchaseOfCapitalStock']

adhoc=['company','yahoo_sector','industry',
       'st_date', 
       'cashFlowFromContinuingOperatingActivities',
       'capitalExpenditure',
       'freeCashFlow']

In [15]:
# Identify companies which have less than 4 years of data

df.groupby('company').filter(lambda x: len(x) < 4)

Unnamed: 0,st_date,amortizationCashFlow,amortizationOfIntangibles,assetImpairmentCharge,beginningCashPosition,capitalExpenditure,cashDividendsPaid,cashFlowFromContinuingFinancingActivities,cashFlowFromContinuingInvestingActivities,cashFlowFromContinuingOperatingActivities,changeInAccountPayable,changeInAccruedExpense,changeInCashSupplementalAsReported,changeInOtherCurrentAssets,changeInPayable,changeInPayablesAndAccruedExpense,changeInReceivables,changeInWorkingCapital,changesInAccountReceivables,commonStockDividendPaid,commonStockPayments,deferredIncomeTax,deferredTax,depreciation,depreciationAmortizationDepletion,depreciationAndAmortization,dividendReceivedCFO,earningsLossesFromEquityInvestments,endCashPosition,financingCashFlow,freeCashFlow,incomeTaxPaidSupplementalData,interestPaidSupplementalData,investingCashFlow,issuanceOfDebt,longTermDebtIssuance,longTermDebtPayments,netBusinessPurchaseAndSale,netCommonStockIssuance,netIncome,netIncomeFromContinuingOperations,netIssuancePaymentsOfDebt,netLongTermDebtIssuance,netOtherFinancingCharges,netOtherInvestingChanges,netPPEPurchaseAndSale,netShortTermDebtIssuance,operatingCashFlow,operatingGainsLosses,otherNonCashItems,preferredStockDividendPaid,purchaseOfBusiness,purchaseOfPPE,repaymentOfDebt,repurchaseOfCapitalStock,saleOfBusiness,stockBasedCompensation,changeInIncomeTaxPayable,changeInTaxPayable,changesInCash,dividendsReceivedCFI,shortTermDebtPayments,company,industry,yahoo_sector,gics_sector,st_YR,st_Mnth,changeInDividendPayable,changeInOtherCurrentLiabilities,commonStockIssuance,depletion,gainLossOnInvestmentSecurities,gainLossOnSaleOfBusiness,issuanceOfCapitalStock,otherCashAdjustmentOutsideChangeinCash,changeInPrepaidAssets,changeInInventory,changeInOtherWorkingCapital,netInvestmentPurchaseAndSale,saleOfInvestment,saleOfPPE,shortTermDebtIssuance,capitalExpenditureReported,effectOfExchangeRateChanges,unrealizedGainLossOnInvestmentSecurities,interestPaidCFO,gainLossOnSaleOfPPE,netForeignCurrencyExchangeGainLoss,proceedsFromStockOptionExercised,purchaseOfInvestment,changeInInterestPayable,netPreferredStockIssuance,preferredStockPayments,cashFlowFromDiscontinuedOperation,cashFromDiscontinuedFinancingActivities,cashFromDiscontinuedInvestingActivities,cashFromDiscontinuedOperatingActivities,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestmentProperties,dividendPaidCFO,pensionAndEmployeeBenefitExpense,provisionandWriteOffofAssets,amortizationOfSecurities,otherCashAdjustmentInsideChangeinCash,preferredStockIssuance,saleOfInvestmentProperties,netIntangiblesPurchaseAndSale,purchaseOfIntangibles,taxesRefundPaid,saleOfIntangibles,interestReceivedCFI,interestReceivedCFO,excessTaxBenefitFromStockBasedCompensation,interestPaidCFF,cashFlowsfromusedinOperatingActivitiesDirect,classesofCashPayments,classesofCashReceiptsfromOperatingActivities,dividendsReceivedDirect,interestPaidDirect,interestReceivedDirect,otherCashPaymentsfromOperatingActivities,otherCashReceiptsfromOperatingActivities,paymentstoSuppliersforGoodsandServices


In [16]:
df.groupby('company').filter(lambda x: len(x) > 4)[summary]

Unnamed: 0,company,yahoo_sector,industry,st_date,netIncome,changeInWorkingCapital,capitalExpenditure,beginningCashPosition,endCashPosition,operatingCashFlow,cashFlowFromContinuingOperatingActivities,cashFlowFromContinuingInvestingActivities,cashFlowFromContinuingFinancingActivities,purchaseOfBusiness,purchaseOfPPE,repaymentOfDebt,repurchaseOfCapitalStock,cashDividendsPaid,commonStockDividendPaid,gainLossOnSaleOfBusiness,saleOfPPE,freeCashFlow
0,AM,Energy,Oil & Gas Midstream,2022-12-31,326242000,5064000,-515650000,0,0,699604000,699604000,-493826000,-205778000,0,-515650000,0,0,-433375000,-432825000,,,183954000
1,AM,Energy,Oil & Gas Midstream,2023-03-31,332709000,-19648000,-474376000,0,,697659000,697659000,-451831000,-245828000,,-474376000,,,-433590000,-433040000,,,223283000
2,AM,Energy,Oil & Gas Midstream,2019-12-31,-355114000,23472000,-267383000,2822000,1235000,622387000,622387000,-525675000,-98299000,-753068000,-267383000,-115500000,-125519000,-496197000,-495823000,,,355004000
3,AM,Energy,Oil & Gas Midstream,2020-12-31,-122527000,419000,-157931000,1235000,640000,753382000,753382000,-219231000,-534746000,-25267000,-157931000,-346000000,-24713000,-590190000,-589640000,,,595451000
4,AM,Energy,Oil & Gas Midstream,2021-12-31,78626000,4431000,-232825000,640000,0,709752000,709752000,-233242000,-477150000,-2070000,-232825000,-733772000,0,-471721000,-471171000,,,476927000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4463,ZTS,Healthcare,Drug Manufacturers—Specialty & Generic,2022-12-31,2114000000,-699000000,-586000000,3485000000,3581000000,1912000000,1912000000,-883000000,-904000000,-312000000,,0,-1594000000,-611000000,-611000000,,,1326000000
4464,ZTS,Healthcare,Drug Manufacturers—Specialty & Generic,2023-03-31,2071000000,-469000000,-694000000,3135000000,2145000000,2152000000,2152000000,-981000000,-2161000000,-315000000,,-1350000000,-1516000000,-631000000,-631000000,,,1458000000
4465,ZTS,Healthcare,Drug Manufacturers—Specialty & Generic,2019-12-31,1500000000,-151000000,-460000000,1602000000,1934000000,1795000000,1795000000,-504000000,-951000000,-195000000,,,-626000000,-314000000,-314000000,,,1335000000
4466,ZTS,Healthcare,Drug Manufacturers—Specialty & Generic,2020-12-31,1638000000,-85000000,-453000000,1934000000,3604000000,2126000000,2126000000,-572000000,123000000,-113000000,,-500000000,-250000000,-380000000,-380000000,,,1673000000


In [17]:
# Check counts 

df.groupby('company').filter(lambda x: len(x) > 4).groupby('company').size().sort_values()

company
LFUS     5
NCNO     5
NCR      5
NEE      5
NEM      5
NET      5
NEU      5
NCLH     5
NEWR     5
NI       5
NOC      5
NOV      5
NOW      5
NRG      5
NSC      5
NFLX     5
NBIX     5
NATI     5
MTZ      5
MLM      5
MMM      5
MNST     5
MOS      5
MP       5
MPC      5
MRK      5
MRNA     5
MRO      5
MRVI     5
MRVL     5
MSA      5
MSI      5
MTD      5
MTN      5
NVDA     5
MKSI     5
NVR      5
NVT      5
PDCE     5
PEG      5
PEGA     5
PEN      5
PENN     5
PEP      5
PCOR     5
PFE      5
PII      5
PINS     5
PKG      5
PLNT     5
PLUG     5
PM       5
PHM      5
PCG      5
PCAR     5
PAYC     5
NWL      5
NYT      5
OC       5
ODFL     5
OGE      5
OGN      5
OKE      5
OKTA     5
OLN      5
OLPX     5
OMC      5
ORLY     5
OTIS     5
OXY      5
PARA     5
NVST     5
MKC      5
MIDD     5
MHK      5
JAMF     5
JAZZ     5
JBHT     5
JNJ      5
JNPR     5
JWN      5
ITW      5
K        5
KMB      5
KMI      5
KNX      5
KO       5
LAD      5
LBRDA    5
KHC      5
IT

We know from analysis of data for income sheets that the unexpected extra rows corespond to spurious quarterly 
accounts plus TTM.

Delete rows which do not match the annual anniversary of statement date. This is determined by finding the most 
common month for a given company.

In [18]:
df = ext.remove_non_annual_data(df)

AM
APA
BKR
LNG
CVX
COP
CTRA
DVN
FANG
DTM
EVA
EOG
EQT
XOM
HAL
HES
DINO
KMI
MRO
MPC
NFE
NOV
OXY
OKE
OVV
PDCE
PSX
PXD
RRC
SLB
SWN
TRGP
TPL
VLO
WMB
AES
LNT
AEE
AEP
AWK
ATO
AGR
CNP
CMS
ED
CEG
D
DTE
DUK
EIX
ETR
WTRG
EVRG
ES
EXC
FE
HE
IDA
MDU
NFG
NEE
NI
NRG
OGE
PCG
PNW
PPL
PEG
SRE
SO
UGI
VST
WEC
XEL
ATVI
GOOG
ATUS
AMC
T
CABO
CHTR
CMCSA
DISH
DIS
EA
FOX
IAC
IPG
LBRDA
LYV
LUMN
MSGS
MTCH
META
NFLX
NYT
NWS
NXST
OMC
PARA
PINS
PLTK
RBLX
ROKU
SIRI
SPOT
TMUS
TTWO
TRIP
VZ
WBD
WWE
ZI
ADM
ACI
MO
BJ
SAM
BG
CPB
CHD
CLX
KO
CL
STZ
COST
COTY
DAR
DG
DLTR
EL
FLO
FRPT
GO
HSY
HRL
INGR
K
KDP
KMB
KHC
MKC
TAP
MDLZ
MNST
OLPX
PEP
PFGC
PM
PPC
POST
PG
REYN
SEB
SPB
SYY
TGT
TSN
USFD
WBA
WMT
APD
ALB
AA
AMCR
ATR
ASH
AVY
AXTA
BALL
BERY
CE
CF
CC
CLF
CTVA
CCK
DOW
DD
EXP
EMN
ECL
ESI
FMC
FCX
GPK
HUN
IFF
IP
LIN
LPX
LYB
MLM
MOS
MP
NEU
NEM
NUE
OLN
PKG
PPG
RS
SMG
SEE
SHW
SLGN
SON
SCCO
SSRM
STLD
X
VVV
VMC
WLK
WRK
Z
MMM
AYI
ADP
WMS
ACM
AGCO
AL
ALK
ALLE
ALSN
AAL
AME
AWI
CAR
AXON
AZEK
BA
BAH
BR
BLDR
BWXT
CHRW
CSL
CARR
CLVT
CLH
CPA
CPRT


In [19]:
# Let's see if that leaves us with companies with less than 4 years of accounts

df.groupby('company').filter(lambda x: len(x) < 4)

Unnamed: 0,st_date,amortizationCashFlow,amortizationOfIntangibles,assetImpairmentCharge,beginningCashPosition,capitalExpenditure,cashDividendsPaid,cashFlowFromContinuingFinancingActivities,cashFlowFromContinuingInvestingActivities,cashFlowFromContinuingOperatingActivities,changeInAccountPayable,changeInAccruedExpense,changeInCashSupplementalAsReported,changeInOtherCurrentAssets,changeInPayable,changeInPayablesAndAccruedExpense,changeInReceivables,changeInWorkingCapital,changesInAccountReceivables,commonStockDividendPaid,commonStockPayments,deferredIncomeTax,deferredTax,depreciation,depreciationAmortizationDepletion,depreciationAndAmortization,dividendReceivedCFO,earningsLossesFromEquityInvestments,endCashPosition,financingCashFlow,freeCashFlow,incomeTaxPaidSupplementalData,interestPaidSupplementalData,investingCashFlow,issuanceOfDebt,longTermDebtIssuance,longTermDebtPayments,netBusinessPurchaseAndSale,netCommonStockIssuance,netIncome,netIncomeFromContinuingOperations,netIssuancePaymentsOfDebt,netLongTermDebtIssuance,netOtherFinancingCharges,netOtherInvestingChanges,netPPEPurchaseAndSale,netShortTermDebtIssuance,operatingCashFlow,operatingGainsLosses,otherNonCashItems,preferredStockDividendPaid,purchaseOfBusiness,purchaseOfPPE,repaymentOfDebt,repurchaseOfCapitalStock,saleOfBusiness,stockBasedCompensation,changeInIncomeTaxPayable,changeInTaxPayable,changesInCash,dividendsReceivedCFI,shortTermDebtPayments,company,industry,yahoo_sector,gics_sector,st_YR,st_Mnth,changeInDividendPayable,changeInOtherCurrentLiabilities,commonStockIssuance,depletion,gainLossOnInvestmentSecurities,gainLossOnSaleOfBusiness,issuanceOfCapitalStock,otherCashAdjustmentOutsideChangeinCash,changeInPrepaidAssets,changeInInventory,changeInOtherWorkingCapital,netInvestmentPurchaseAndSale,saleOfInvestment,saleOfPPE,shortTermDebtIssuance,capitalExpenditureReported,effectOfExchangeRateChanges,unrealizedGainLossOnInvestmentSecurities,interestPaidCFO,gainLossOnSaleOfPPE,netForeignCurrencyExchangeGainLoss,proceedsFromStockOptionExercised,purchaseOfInvestment,changeInInterestPayable,netPreferredStockIssuance,preferredStockPayments,cashFlowFromDiscontinuedOperation,cashFromDiscontinuedFinancingActivities,cashFromDiscontinuedInvestingActivities,cashFromDiscontinuedOperatingActivities,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestmentProperties,dividendPaidCFO,pensionAndEmployeeBenefitExpense,provisionandWriteOffofAssets,amortizationOfSecurities,otherCashAdjustmentInsideChangeinCash,preferredStockIssuance,saleOfInvestmentProperties,netIntangiblesPurchaseAndSale,purchaseOfIntangibles,taxesRefundPaid,saleOfIntangibles,interestReceivedCFI,interestReceivedCFO,excessTaxBenefitFromStockBasedCompensation,interestPaidCFF,cashFlowsfromusedinOperatingActivitiesDirect,classesofCashPayments,classesofCashReceiptsfromOperatingActivities,dividendsReceivedDirect,interestPaidDirect,interestReceivedDirect,otherCashPaymentsfromOperatingActivities,otherCashReceiptsfromOperatingActivities,paymentstoSuppliersforGoodsandServices,M


In [20]:
# Sanity check. Do we still have companies with more than 4.

df.groupby('company').filter(lambda x: len(x) > 4)

Unnamed: 0,st_date,amortizationCashFlow,amortizationOfIntangibles,assetImpairmentCharge,beginningCashPosition,capitalExpenditure,cashDividendsPaid,cashFlowFromContinuingFinancingActivities,cashFlowFromContinuingInvestingActivities,cashFlowFromContinuingOperatingActivities,changeInAccountPayable,changeInAccruedExpense,changeInCashSupplementalAsReported,changeInOtherCurrentAssets,changeInPayable,changeInPayablesAndAccruedExpense,changeInReceivables,changeInWorkingCapital,changesInAccountReceivables,commonStockDividendPaid,commonStockPayments,deferredIncomeTax,deferredTax,depreciation,depreciationAmortizationDepletion,depreciationAndAmortization,dividendReceivedCFO,earningsLossesFromEquityInvestments,endCashPosition,financingCashFlow,freeCashFlow,incomeTaxPaidSupplementalData,interestPaidSupplementalData,investingCashFlow,issuanceOfDebt,longTermDebtIssuance,longTermDebtPayments,netBusinessPurchaseAndSale,netCommonStockIssuance,netIncome,netIncomeFromContinuingOperations,netIssuancePaymentsOfDebt,netLongTermDebtIssuance,netOtherFinancingCharges,netOtherInvestingChanges,netPPEPurchaseAndSale,netShortTermDebtIssuance,operatingCashFlow,operatingGainsLosses,otherNonCashItems,preferredStockDividendPaid,purchaseOfBusiness,purchaseOfPPE,repaymentOfDebt,repurchaseOfCapitalStock,saleOfBusiness,stockBasedCompensation,changeInIncomeTaxPayable,changeInTaxPayable,changesInCash,dividendsReceivedCFI,shortTermDebtPayments,company,industry,yahoo_sector,gics_sector,st_YR,st_Mnth,changeInDividendPayable,changeInOtherCurrentLiabilities,commonStockIssuance,depletion,gainLossOnInvestmentSecurities,gainLossOnSaleOfBusiness,issuanceOfCapitalStock,otherCashAdjustmentOutsideChangeinCash,changeInPrepaidAssets,changeInInventory,changeInOtherWorkingCapital,netInvestmentPurchaseAndSale,saleOfInvestment,saleOfPPE,shortTermDebtIssuance,capitalExpenditureReported,effectOfExchangeRateChanges,unrealizedGainLossOnInvestmentSecurities,interestPaidCFO,gainLossOnSaleOfPPE,netForeignCurrencyExchangeGainLoss,proceedsFromStockOptionExercised,purchaseOfInvestment,changeInInterestPayable,netPreferredStockIssuance,preferredStockPayments,cashFlowFromDiscontinuedOperation,cashFromDiscontinuedFinancingActivities,cashFromDiscontinuedInvestingActivities,cashFromDiscontinuedOperatingActivities,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestmentProperties,dividendPaidCFO,pensionAndEmployeeBenefitExpense,provisionandWriteOffofAssets,amortizationOfSecurities,otherCashAdjustmentInsideChangeinCash,preferredStockIssuance,saleOfInvestmentProperties,netIntangiblesPurchaseAndSale,purchaseOfIntangibles,taxesRefundPaid,saleOfIntangibles,interestReceivedCFI,interestReceivedCFO,excessTaxBenefitFromStockBasedCompensation,interestPaidCFF,cashFlowsfromusedinOperatingActivitiesDirect,classesofCashPayments,classesofCashReceiptsfromOperatingActivities,dividendsReceivedDirect,interestPaidDirect,interestReceivedDirect,otherCashPaymentsfromOperatingActivities,otherCashReceiptsfromOperatingActivities,paymentstoSuppliersforGoodsandServices,M


In [21]:
# Check count of companies

len(df['company'].unique())

711

## Operating Activities

In [22]:
df[operating].isnull().sum().sort_values()

company                                            0
changeInWorkingCapital                             0
depreciationAmortizationDepletion                  0
netIncomeFromContinuingOperations                  0
cashFlowFromContinuingOperatingActivities          0
st_date                                            0
industry                                           0
yahoo_sector                                       0
netIncome                                          0
changeInPayablesAndAccruedExpense                 49
changeInReceivables                              113
depreciationAndAmortization                      137
otherNonCashItems                                210
deferredIncomeTax                                286
stockBasedCompensation                           308
changeInPayable                                  450
changeInAccountPayable                           500
operatingGainsLosses                             599
changeInOtherWorkingCapital                   

In [23]:
operating

['company',
 'yahoo_sector',
 'industry',
 'st_date',
 'netIncome',
 'netIncomeFromContinuingOperations',
 'operatingGainsLosses',
 'depreciation',
 'depreciationAmortizationDepletion',
 'depreciationAndAmortization',
 'stockBasedCompensation',
 'deferredIncomeTax',
 'changeInInterestPayable',
 'changeInWorkingCapital',
 'changeInOtherWorkingCapital',
 'changesInAccountReceivables',
 'changeInReceivables',
 'changeInAccountPayable',
 'changeInPayable',
 'paymentstoSuppliersforGoodsandServices',
 'changeInPayablesAndAccruedExpense',
 'otherCashPaymentsfromOperatingActivities',
 'otherCashReceiptsfromOperatingActivities',
 'otherNonCashItems',
 'cashFlowsfromusedinOperatingActivitiesDirect',
 'cashFlowFromContinuingOperatingActivities']

In [24]:
# Select fields for final cut

final_operating = ['netIncome',
                   'netIncomeFromContinuingOperations',
                   'depreciationAmortizationDepletion',
                   'stockBasedCompensation',
                   'cashFlowFromContinuingOperatingActivities']

## Investing

In [25]:
df[investing].isnull().sum().sort_values()

company                                         0
cashFlowFromContinuingInvestingActivities       0
st_date                                         0
investingCashFlow                               0
yahoo_sector                                    0
industry                                        0
capitalExpenditure                             11
netBusinessPurchaseAndSale                    515
purchaseOfBusiness                            670
netPPEPurchaseAndSale                         688
netOtherInvestingChanges                      908
purchaseOfPPE                                1030
netInvestmentPurchaseAndSale                 1055
purchaseOfInvestment                         1308
saleOfInvestment                             1318
capitalExpenditureReported                   1493
saleOfPPE                                    1832
saleOfBusiness                               1867
netIntangiblesPurchaseAndSale                2274
purchaseOfIntangibles                        2313


In [26]:
cols = investing + ['freeCashFlow','cashFlowFromContinuingOperatingActivities']
df[df['capitalExpenditure'].isnull()].sort_values(by=['industry','company','st_date'])[cols]

Unnamed: 0,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,purchaseOfBusiness,saleOfBusiness,netBusinessPurchaseAndSale,purchaseOfPPE,saleOfPPE,netPPEPurchaseAndSale,purchaseOfInvestmentProperties,saleOfInvestmentProperties,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestment,saleOfInvestment,netInvestmentPurchaseAndSale,purchaseOfIntangibles,saleOfIntangibles,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,investingCashFlow,freeCashFlow,cashFlowFromContinuingOperatingActivities
4343,RPRX,Healthcare,Biotechnology,2019-12-31,,,-2116142000,-27042000,,-27042000,,,,,,,-2363651000.0,524551000.0,-1839100000.0,,,,-250000000.0,-2116142000,1667239000,1667239000
4344,RPRX,Healthcare,Biotechnology,2020-12-31,,,-2759000000,-40000000,,-40000000,,,,,,,-3937000000.0,1203000000.0,-2734000000.0,,,,-2182000000.0,-2759000000,2035000000,2034629000
4345,RPRX,Healthcare,Biotechnology,2021-12-31,,,-1870280000,-34855000,,-34855000,,,,,,,-3593656000.0,1776308000.0,-1817348000.0,,,,-18600000.0,-1870280000,2017536000,2017536000
4346,RPRX,Healthcare,Biotechnology,2022-12-31,,,-1029421000,-9896000,,-9896000,,,,,,,-2565068000.0,1545543000.0,-1019525000.0,,,,,-1029421000,2143980000,2143980000
4435,VEEV,Healthcare,Health Information Services,2021-01-31,,,-333634000,0,,0,,,,,,,-979292000.0,654341000.0,-324951000.0,,,,-8683000.0,-333634000,551246000,551246000
4436,VEEV,Healthcare,Health Information Services,2022-01-31,,,-346152000,-7780000,,-7780000,,,,,,,-1117076000.0,792918000.0,-324158000.0,,,,-14214000.0,-346152000,764463000,764463000
4432,VEEV,Healthcare,Health Information Services,2023-01-31,,,-1007683000,0,,0,,,,,,,-1996878000.0,1002707000.0,-994171000.0,,,,-13512000.0,-1007683000,780470000,780470000
203,LNT,Utilities,Utilities—Regulated Electric,2019-12-31,,,-1287300000,-1640100000,,-1640100000,,,,,,,,,,,,,352800000.0,-1287300000,660400000,660400000
204,LNT,Utilities,Utilities—Regulated Electric,2020-12-31,,,-951000000,-1366000000,,-1366000000,,,,,,,,,,,,,415000000.0,-951000000,501000000,501000000
202,LNT,Utilities,Utilities—Regulated Electric,2021-12-31,,,-728000000,-1169000000,,-1169000000,,,,,,,,,,,,,441000000.0,-728000000,582000000,582000000


Only 3 companies with capex set to null. Free cashflow matches operating cashflow so the inference is that capeX is 0.


In [27]:
df['capitalExpenditure'] = df['capitalExpenditure'].fillna(0)

In [28]:
df[df['netBusinessPurchaseAndSale'].isnull()].sort_values(by=['industry','company','st_date'])[investing]

Unnamed: 0,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,purchaseOfBusiness,saleOfBusiness,netBusinessPurchaseAndSale,purchaseOfPPE,saleOfPPE,netPPEPurchaseAndSale,purchaseOfInvestmentProperties,saleOfInvestmentProperties,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestment,saleOfInvestment,netInvestmentPurchaseAndSale,purchaseOfIntangibles,saleOfIntangibles,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,investingCashFlow
1682,BA,Industrials,Aerospace & Defense,2020-12-31,-1303000000,,-18366000000,,,,-1303000000.0,296000000.0,-1007000000.0,,,,-37616000000.0,20275000000.0,-17341000000.0,,,,-18000000.0,-18366000000
1679,BA,Industrials,Aerospace & Defense,2022-12-31,-1222000000,,4370000000,,,,-1222000000.0,35000000.0,-1187000000.0,,,,-5051000000.0,10619000000.0,5568000000.0,,,,-11000000.0,4370000000
1903,GD,Industrials,Aerospace & Defense,2021-12-31,-887000000,-887000000.0,-882000000,,,,,,,,,,,,,,,,5000000.0,-882000000
1904,GD,Industrials,Aerospace & Defense,2022-12-31,-1114000000,-1114000000.0,-1489000000,,,,,,,,,,,,,,,,-375000000.0,-1489000000
1949,HWM,Industrials,Aerospace & Defense,2019-12-31,-586000000,-586000000.0,583000000,,,,,,,,,,,73000000.0,73000000.0,,,,1096000000.0,583000000
1950,HWM,Industrials,Aerospace & Defense,2020-12-31,-267000000,-267000000.0,271000000,,,,,,,,,,,0.0,0.0,,,,538000000.0,271000000
1951,HWM,Industrials,Aerospace & Defense,2021-12-31,-199000000,-199000000.0,107000000,,,,,,,,,,,6000000.0,6000000.0,,,,300000000.0,107000000
1947,HWM,Industrials,Aerospace & Defense,2022-12-31,-193000000,-193000000.0,-135000000,,,,,,,,,,,0.0,0.0,,,,58000000.0,-135000000
1937,HXL,Industrials,Aerospace & Defense,2022-12-31,-76300000,-76300000.0,-54600000,,,,,21200000.0,21200000.0,,,,,500000.0,500000.0,,,,,-54600000
2053,LMT,Industrials,Aerospace & Defense,2019-12-31,-1484000000,-1484000000.0,-1241000000,,,,,,,,,,,,,,,,243000000.0,-1241000000


We are interested in net spend on acquisitions and we can allow for any offsets resulting from disposals. So based
on count and verification from above, we can take netBusinessPurchaseAndSale as amount spent on acquisitions. If 
positive then it is amount resulting from disposals.

It's reasonable to set netBusinessPurchaseAndSale to 0 if null. I think it is fair to apply the same to purchaseOfBusiness
and saleOfBusiness.

In [29]:
df['netBusinessPurchaseAndSale'] = df['netBusinessPurchaseAndSale'].fillna(0)
df['purchaseOfBusiness'] = df['purchaseOfBusiness'].fillna(0)
df['saleOfBusiness'] = df['saleOfBusiness'].fillna(0)

In [30]:
df[df['netInvestmentPurchaseAndSale'].isnull()].sort_values(by=['industry','company','st_date'])[investing]

Unnamed: 0,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,purchaseOfBusiness,saleOfBusiness,netBusinessPurchaseAndSale,purchaseOfPPE,saleOfPPE,netPPEPurchaseAndSale,purchaseOfInvestmentProperties,saleOfInvestmentProperties,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestment,saleOfInvestment,netInvestmentPurchaseAndSale,purchaseOfIntangibles,saleOfIntangibles,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,investingCashFlow
484,IPG,Communication Services,Advertising Agencies,2019-12-31,-198500000,-198500000,-161700000,-600000,0,-600000,,,,,,,,,,,,,37400000,-161700000
485,IPG,Communication Services,Advertising Agencies,2020-12-31,-167500000,-167500000,-216200000,-4900000,0,-4900000,,,,,,,,,,,,,-43800000,-216200000
1791,CW,Industrials,Aerospace & Defense,2019-12-31,-69752000,,-240040000,-185209000,0,-185209000,-69752000,15093000,-54659000,,,,,,,,,,-172000,-240040000
1792,CW,Industrials,Aerospace & Defense,2020-12-31,-47499000,,-532530000,-487944000,0,-487944000,-47499000,2930000,-44569000,,,,,,,,,,-17000,-532530000
1793,CW,Industrials,Aerospace & Defense,2021-12-31,-41108000,,-42403000,-5340000,0,-5340000,-41108000,4045000,-37063000,,,,,,,,,,,-42403000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,SRCL,Industrials,Waste Management,2022-12-31,-132200000,-132200000,-84600000,0,46700000,46700000,,,,,,,,,,,,,900000,-84600000
2389,WM,Industrials,Waste Management,2019-12-31,-1818000000,-1818000000,-2376000000,-521000000,49000000,-472000000,,,,,,,,,,,,,-86000000,-2376000000
2390,WM,Industrials,Waste Management,2020-12-31,-1632000000,-1632000000,-4847000000,-4085000000,885000000,-3200000000,,,,,,,,,,,,,-15000000,-4847000000
2391,WM,Industrials,Waste Management,2021-12-31,-1904000000,-1904000000,-1894000000,-75000000,96000000,21000000,,,,,,,,,,,,,-11000000,-1894000000


In [31]:
df['netInvestmentPurchaseAndSale'] = df['netInvestmentPurchaseAndSale'].fillna(0)
df['purchaseOfInvestment'] = df['purchaseOfInvestment'].fillna(0)
df['saleOfInvestment'] = df['saleOfInvestment'].fillna(0)

In [32]:
df[df['netInvestmentPropertiesPurchaseAndSale'].isnull()].sort_values(by=['industry','company','st_date'])[investing]

Unnamed: 0,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,purchaseOfBusiness,saleOfBusiness,netBusinessPurchaseAndSale,purchaseOfPPE,saleOfPPE,netPPEPurchaseAndSale,purchaseOfInvestmentProperties,saleOfInvestmentProperties,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestment,saleOfInvestment,netInvestmentPurchaseAndSale,purchaseOfIntangibles,saleOfIntangibles,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,investingCashFlow
484,IPG,Communication Services,Advertising Agencies,2019-12-31,-198500000,-198500000,-161700000,-600000,0,-600000,,,,,,,0,0,0,,,,37400000,-161700000
485,IPG,Communication Services,Advertising Agencies,2020-12-31,-167500000,-167500000,-216200000,-4900000,0,-4900000,,,,,,,0,0,0,,,,-43800000,-216200000
486,IPG,Communication Services,Advertising Agencies,2021-12-31,-195300000,-195300000,-185300000,-16300000,0,-16300000,,,,,,,0,34800000,34800000,,,,-8500000,-185300000
487,IPG,Communication Services,Advertising Agencies,2022-12-31,-178100000,-178100000,-430100000,-252600000,0,-252600000,,,,,,,0,2600000,2600000,,,,-2000000,-430100000
579,OMC,Communication Services,Advertising Agencies,2019-12-31,-102200000,-102200000,-30900000,-10000000,79400000,69400000,,,,,,,0,1900000,1900000,,,,,-30900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,SRCL,Industrials,Waste Management,2022-12-31,-132200000,-132200000,-84600000,0,46700000,46700000,,,,,,,0,0,0,,,,900000,-84600000
2389,WM,Industrials,Waste Management,2019-12-31,-1818000000,-1818000000,-2376000000,-521000000,49000000,-472000000,,,,,,,0,0,0,,,,-86000000,-2376000000
2390,WM,Industrials,Waste Management,2020-12-31,-1632000000,-1632000000,-4847000000,-4085000000,885000000,-3200000000,,,,,,,0,0,0,,,,-15000000,-4847000000
2391,WM,Industrials,Waste Management,2021-12-31,-1904000000,-1904000000,-1894000000,-75000000,96000000,21000000,,,,,,,0,0,0,,,,-11000000,-1894000000


In [33]:
df['netInvestmentPropertiesPurchaseAndSale'] = df['netInvestmentPropertiesPurchaseAndSale'].fillna(0)
df['purchaseOfInvestmentProperties'] = df['purchaseOfInvestmentProperties'].fillna(0)
df['saleOfInvestmentProperties'] = df['saleOfInvestmentProperties'].fillna(0)

In [34]:
df[df['netIntangiblesPurchaseAndSale'].notna()].sort_values(by=['industry','company','st_date'])[investing].reset_index()

Unnamed: 0,index,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,purchaseOfBusiness,saleOfBusiness,netBusinessPurchaseAndSale,purchaseOfPPE,saleOfPPE,netPPEPurchaseAndSale,purchaseOfInvestmentProperties,saleOfInvestmentProperties,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestment,saleOfInvestment,netInvestmentPurchaseAndSale,purchaseOfIntangibles,saleOfIntangibles,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,investingCashFlow
0,1667,AXON,Industrials,Aerospace & Defense,2019-12-31,-16343000,,-240737000,0,0,0,-15939000.0,0.0,-15939000.0,0,0,0,-354477000,130083000,-224394000,-404000.0,,-404000,,-240737000
1,1668,AXON,Industrials,Aerospace & Defense,2020-12-31,-72870000,,-356526000,-7068000,0,-7068000,-72629000.0,95000.0,-72534000.0,0,0,0,-656522000,379839000,-276683000,-241000.0,,-241000,,-356526000
2,1669,AXON,Industrials,Aerospace & Defense,2021-12-31,-50278000,,252556000,-22393000,0,-22393000,-49886000.0,43000.0,-49843000.0,0,0,0,-407979000,733163000,325184000,-392000.0,,-392000,,252556000
3,1670,AXON,Industrials,Aerospace & Defense,2022-12-31,-56109000,,-830967000,-2104000,0,-2104000,-55802000.0,287000.0,-55515000.0,0,0,0,-845179000,72138000,-773041000,-307000.0,,-307000,,-830967000
4,1681,BA,Industrials,Aerospace & Defense,2019-12-31,-1961000000,,-1530000000,-455000000,464000000,9000000,-1834000000.0,334000000.0,-1500000000.0,0,0,0,-1658000000,1759000000,101000000,-127000000.0,,-127000000,-13000000.0,-1530000000
5,2176,RTX,Industrials,Aerospace & Defense,2019-12-31,-2607000000,-2256000000.0,-3092000000,-56000000,82000000,82000000,,,,0,0,0,-658000000,336000000,-322000000,-351000000.0,,-351000000,-245000000.0,-3092000000
6,2177,RTX,Industrials,Aerospace & Defense,2020-12-31,-1967000000,-1795000000.0,3343000000,-419000000,5764000000,5345000000,,,,0,0,0,-312000000,368000000,56000000,-172000000.0,,-172000000,-91000000.0,3102000000
7,2178,RTX,Industrials,Aerospace & Defense,2021-12-31,-2322000000,-2134000000.0,-1364000000,-1088000000,1879000000,791000000,,,,0,0,0,-16000000,158000000,142000000,-188000000.0,,-188000000,25000000.0,-1364000000
8,2174,RTX,Industrials,Aerospace & Defense,2022-12-31,-2775000000,-2288000000.0,-2829000000,-66000000,94000000,28000000,,,,0,0,0,-355000000,179000000,-176000000,-487000000.0,,-487000000,94000000.0,-2829000000
9,1017,CF,Basic Materials,Agricultural Inputs,2021-12-31,-524000000,,-466000000,0,0,0,-514000000.0,1000000.0,-513000000.0,0,0,0,-13000000,12000000,-1000000,-10000000.0,58000000.0,48000000,-1000000.0,-466000000


Conclusion: purchase and sale of intangibles is significant. Net purchases and sales of intangibles is the best 
field to take in final cut. 

Reasonable to set fields to 0 where they are null.

In [35]:
df['netIntangiblesPurchaseAndSale'] = df['netIntangiblesPurchaseAndSale'].fillna(0)
df['purchaseOfIntangibles'] = df['purchaseOfIntangibles'].fillna(0)
df['saleOfIntangibles'] = df['saleOfIntangibles'].fillna(0)

In [36]:
df[df['netPPEPurchaseAndSale'].notna()].sort_values(by=['industry','company','st_date'])[investing]

Unnamed: 0,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,purchaseOfBusiness,saleOfBusiness,netBusinessPurchaseAndSale,purchaseOfPPE,saleOfPPE,netPPEPurchaseAndSale,purchaseOfInvestmentProperties,saleOfInvestmentProperties,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestment,saleOfInvestment,netInvestmentPurchaseAndSale,purchaseOfIntangibles,saleOfIntangibles,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,investingCashFlow
1667,AXON,Industrials,Aerospace & Defense,2019-12-31,-16343000,,-240737000,0,0,0,-15939000,0,-15939000,0,0,0,-354477000,130083000,-224394000,-404000,0,-404000,,-240737000
1668,AXON,Industrials,Aerospace & Defense,2020-12-31,-72870000,,-356526000,-7068000,0,-7068000,-72629000,95000,-72534000,0,0,0,-656522000,379839000,-276683000,-241000,0,-241000,,-356526000
1669,AXON,Industrials,Aerospace & Defense,2021-12-31,-50278000,,252556000,-22393000,0,-22393000,-49886000,43000,-49843000,0,0,0,-407979000,733163000,325184000,-392000,0,-392000,,252556000
1670,AXON,Industrials,Aerospace & Defense,2022-12-31,-56109000,,-830967000,-2104000,0,-2104000,-55802000,287000,-55515000,0,0,0,-845179000,72138000,-773041000,-307000,0,-307000,,-830967000
1681,BA,Industrials,Aerospace & Defense,2019-12-31,-1961000000,,-1530000000,-455000000,464000000,9000000,-1834000000,334000000,-1500000000,0,0,0,-1658000000,1759000000,101000000,-127000000,0,-127000000,-13000000,-1530000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1745,CLH,Industrials,Waste Management,2022-12-31,-347022000,,-388944000,-86278000,16811000,-69467000,-345056000,8779000,-336277000,0,0,0,-49845000,68611000,18766000,-1966000,0,-1966000,,-388944000
2184,RSG,Industrials,Waste Management,2019-12-31,-1207100000,,-1719000000,-575100000,42800000,-532300000,-1207100000,21700000,-1185400000,0,0,0,0,0,0,0,0,0,-1300000,-1719000000
2185,RSG,Industrials,Waste Management,2020-12-31,-1194600000,,-1922800000,-769500000,32900000,-736600000,-1194600000,30100000,-1164500000,0,0,0,0,0,0,0,0,0,-21700000,-1922800000
2186,RSG,Industrials,Waste Management,2021-12-31,-1316300000,,-2466100000,-1221700000,46300000,-1175400000,-1316300000,19500000,-1296800000,0,0,0,0,0,0,0,0,0,6100000,-2466100000


In [37]:
df['netPPEPurchaseAndSale'] = df['netPPEPurchaseAndSale'].fillna(0)
df['purchaseOfPPE'] = df['purchaseOfPPE'].fillna(0)
df['saleOfPPE'] = df['saleOfPPE'].fillna(0)

In [38]:
# Check count of nulls to see what's left

df[investing].isnull().sum().sort_values()

company                                         0
netIntangiblesPurchaseAndSale                   0
saleOfIntangibles                               0
purchaseOfIntangibles                           0
netInvestmentPurchaseAndSale                    0
saleOfInvestment                                0
purchaseOfInvestment                            0
netInvestmentPropertiesPurchaseAndSale          0
saleOfInvestmentProperties                      0
purchaseOfInvestmentProperties                  0
netPPEPurchaseAndSale                           0
saleOfPPE                                       0
purchaseOfPPE                                   0
netBusinessPurchaseAndSale                      0
saleOfBusiness                                  0
purchaseOfBusiness                              0
cashFlowFromContinuingInvestingActivities       0
capitalExpenditure                              0
st_date                                         0
industry                                        0


In [39]:
df['netOtherInvestingChanges'] = df['netOtherInvestingChanges'].fillna(0)

In [40]:
investing

['company',
 'yahoo_sector',
 'industry',
 'st_date',
 'capitalExpenditure',
 'capitalExpenditureReported',
 'cashFlowFromContinuingInvestingActivities',
 'purchaseOfBusiness',
 'saleOfBusiness',
 'netBusinessPurchaseAndSale',
 'purchaseOfPPE',
 'saleOfPPE',
 'netPPEPurchaseAndSale',
 'purchaseOfInvestmentProperties',
 'saleOfInvestmentProperties',
 'netInvestmentPropertiesPurchaseAndSale',
 'purchaseOfInvestment',
 'saleOfInvestment',
 'netInvestmentPurchaseAndSale',
 'purchaseOfIntangibles',
 'saleOfIntangibles',
 'netIntangiblesPurchaseAndSale',
 'netOtherInvestingChanges',
 'investingCashFlow']

In [41]:
# Select fields for final cut

final_investing = ['capitalExpenditure', 
                   'saleOfBusiness',
                   'purchaseOfBusiness',
                   'netBusinessPurchaseAndSale',
                   'saleOfPPE',
                   'purchaseOfPPE',          
                   'netPPEPurchaseAndSale',
                   'saleOfInvestment',
                   'purchaseOfInvestment',
                   'netInvestmentPurchaseAndSale',
                   'saleOfInvestmentProperties',
                   'purchaseOfInvestmentProperties',
                   'netInvestmentPropertiesPurchaseAndSale',
                   'saleOfIntangibles',
                   'purchaseOfIntangibles',
                   'netIntangiblesPurchaseAndSale',
                   'netOtherInvestingChanges',
                   'cashFlowFromContinuingInvestingActivities']

## Finance

In [42]:
df[finance].isnull().sum().sort_values()

company                                         0
yahoo_sector                                    0
industry                                        0
st_date                                         0
cashFlowFromContinuingFinancingActivities       0
netIssuancePaymentsOfDebt                     110
netLongTermDebtIssuance                       155
repaymentOfDebt                               184
longTermDebtPayments                          250
issuanceOfDebt                                277
netCommonStockIssuance                        284
netOtherFinancingCharges                      301
longTermDebtIssuance                          378
repurchaseOfCapitalStock                      628
commonStockPayments                           660
cashDividendsPaid                             986
netShortTermDebtIssuance                     1247
commonStockDividendPaid                      1283
issuanceOfCapitalStock                       1666
commonStockIssuance                          1707


In [43]:
# Investigate net debt issuance

df[df['netIssuancePaymentsOfDebt'].isnull()].sort_values(by=['industry','company','st_date'])[finance]

Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingFinancingActivities,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock
1667,AXON,Industrials,Aerospace & Defense,2019-12-31,-3937000,,,,,,,,,,,,,0.0,,0.0,0.0,,-4051000.0,
1668,AXON,Industrials,Aerospace & Defense,2020-12-31,299265000,,,,,,,,,,,,,306779000.0,,306779000.0,306779000.0,,-7809000.0,
1669,AXON,Industrials,Aerospace & Defense,2021-12-31,-174181000,,,,,,,,,,,,,105514000.0,,105514000.0,105514000.0,,-331309000.0,
3561,LULU,Consumer Cyclical,Apparel Retail,2020-01-31,-177173000,,,,,,,,,,,,,,-173399000.0,-173399000.0,,,-21944000.0,-173399000.0
3558,LULU,Consumer Cyclical,Apparel Retail,2021-01-31,-80788000,,,,,,,,,,,,,,-63663000.0,-63663000.0,,,-32388000.0,-63663000.0
3559,LULU,Consumer Cyclical,Apparel Retail,2022-01-31,-844987000,,,,,,,,,,,,,,-812602000.0,-812602000.0,,,-50579000.0,-812602000.0
3560,LULU,Consumer Cyclical,Apparel Retail,2023-01-31,-467487000,,,,,,,,,,,,,,-479159000.0,-479159000.0,,,-32000.0,-479159000.0
3707,ROST,Consumer Cyclical,Apparel Retail,2020-01-31,-1683249000,,,,,,,,,,-369793000.0,-369793000.0,,,-1335665000.0,-1335665000.0,,,,-1335665000.0
3761,TJX,Consumer Cyclical,Apparel Retail,2020-01-31,-2414900000,,,,,,,,,,-1071600000.0,-1071600000.0,,232100000.0,-1552000000.0,-1319900000.0,232100000.0,,-23400000.0,-1552000000.0
3689,QS,Consumer Cyclical,Auto Parts,2020-12-31,953724000,,,,,,,,,,,,,99800000.0,,99800000.0,276262000.0,176462000.0,676863000.0,


In [44]:
df[df['shortTermDebtPayments'].notnull()].sort_values(by=['industry','company','st_date'])[finance].head(500)

Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingFinancingActivities,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock
1791,CW,Industrials,Aerospace & Defense,2019-12-31,-68145000,37692000.0,-37934000,-242000,37692000.0,0.0,0.0,37692000.0,-37934000,-242000,-28200000.0,-28200000.0,,,-50661000.0,-50661000.0,,,-812000.0,-50661000.0
1792,CW,Industrials,Aerospace & Defense,2020-12-31,82081000,570675000.0,-570675000,0,300000000.0,0.0,300000000.0,870675000.0,-570675000,300000000,-28175000.0,-28175000.0,,,-200018000.0,-200018000.0,,,-874000.0,-200018000.0
1793,CW,Industrials,Aerospace & Defense,2021-12-31,-369129000,455950000.0,-362050000,93900000,0.0,-100000000.0,-100000000.0,455950000.0,-462050000,-6100000,-28660000.0,,,,-343129000.0,-343129000.0,,,-945000.0,-343129000.0
1794,CW,Industrials,Aerospace & Defense,2022-12-31,129428000,1697647000.0,-1791547000,-93900000,300000000.0,0.0,300000000.0,1997647000.0,-1791547000,206100000,-28779000.0,-28779000.0,,,-56870000.0,-56870000.0,,,-1020000.0,-56870000.0
1901,GD,Industrials,Aerospace & Defense,2019-12-31,-1997000000,,-850000000,-850000000,0.0,-850000000.0,-850000000.0,0.0,-850000000,-850000000,-1152000000.0,-1152000000.0,,,-231000000.0,-231000000.0,,,236000000.0,-231000000.0
1902,GD,Industrials,Aerospace & Defense,2020-12-31,-903000000,420000000.0,-861000000,-441000000,3960000000.0,-2500000000.0,1460000000.0,4380000000.0,-3361000000,1019000000,-1240000000.0,-1240000000.0,,,-587000000.0,-587000000.0,,,-95000000.0,-587000000.0
1903,GD,Industrials,Aerospace & Defense,2021-12-31,-4590000000,2003000000.0,-1997000000,6000000,1497000000.0,-3000000000.0,-1503000000.0,3500000000.0,-4997000000,-1497000000,-1315000000.0,-1315000000.0,,,-1828000000.0,-1828000000.0,,,50000000.0,-1828000000.0
1904,GD,Industrials,Aerospace & Defense,2022-12-31,-3471000000,0.0,0,0,0.0,-1000000000.0,-1000000000.0,0.0,-1000000000,-1000000000,-1369000000.0,,,,-1229000000.0,-1229000000.0,,,127000000.0,-1229000000.0
1930,HEI,Industrials,Aerospace & Defense,2021-10-31,-558968000,0.0,-505000000,-505000000,,,,0.0,-505000000,-505000000,-23002000.0,-23002000.0,,,,,,,-32519000.0,
1957,HII,Industrials,Aerospace & Defense,2019-12-31,-434000000,5119000000.0,-5119000000,0,0.0,0.0,0.0,5119000000.0,-5119000000,0,-149000000.0,-149000000.0,,,-262000000.0,-262000000.0,,,-23000000.0,-262000000.0


Take netIssuancePaymentOfDebt as net debt issued. If positive then this is net debt repaid. We can ignore the other
debt related fields.

Reasonable to set the debt related fields to 0 if null.

In [45]:
# Check whether issuanceOfDebt includes long and short term debt.

df[(df['shortTermDebtIssuance'] + df['longTermDebtIssuance']) != df['issuanceOfDebt']][finance].head(400)

Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingFinancingActivities,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock
0,AM,Energy,Oil & Gas Midstream,2022-12-31,-205778000,,,234800000.0,0.0,0.0,0.0,0.0,0.0,234800000.0,-433375000.0,-432825000.0,-550000.0,,0.0,0.0,,,-7203000.0,0.0
2,AM,Energy,Oil & Gas Midstream,2019-12-31,-98299000,,-115500000.0,-115500000.0,650000000.0,-115500000.0,650000000.0,650000000.0,-115500000.0,534500000.0,-496197000.0,-495823000.0,-374000.0,,-125519000.0,-125519000.0,,,-11083000.0,-125519000.0
3,AM,Energy,Oil & Gas Midstream,2020-12-31,-534746000,,-346000000.0,-346000000.0,550000000.0,-346000000.0,550000000.0,550000000.0,-346000000.0,204000000.0,-590190000.0,-589640000.0,-550000.0,,-24713000.0,-24713000.0,,,-123843000.0,-24713000.0
4,AM,Energy,Oil & Gas Midstream,2021-12-31,-477150000,,-66300000.0,-66300000.0,750000000.0,-667472000.0,82528000.0,750000000.0,-733772000.0,16228000.0,-471721000.0,-471171000.0,-550000.0,,0.0,0.0,,,-21657000.0,0.0
17,BKR,Energy,Oil & Gas Equipment & Services,2019-12-31,-1534000000,,,-542000000.0,525000000.0,-587000000.0,-587000000.0,525000000.0,-587000000.0,-587000000.0,-745000000.0,-745000000.0,,,-250000000.0,-250000000.0,,,48000000.0,-250000000.0
19,BKR,Energy,Oil & Gas Equipment & Services,2021-12-31,-2143000000,,-873000000.0,-832000000.0,1250000000.0,-1354000000.0,-104000000.0,1250000000.0,-1354000000.0,-936000000.0,-749000000.0,,,,-434000000.0,-434000000.0,,,-24000000.0,-434000000.0
20,LNG,Energy,Oil & Gas Midstream,2019-12-31,1168000000,,,,6434000000.0,-4346000000.0,2088000000.0,6434000000.0,-4346000000.0,2088000000.0,,,,,-249000000.0,-249000000.0,,,-671000000.0,-249000000.0
21,LNG,Energy,Oil & Gas Midstream,2020-12-31,-235000000,,,,7823000000.0,-6940000000.0,883000000.0,7823000000.0,-6940000000.0,883000000.0,,,,,-155000000.0,-155000000.0,,,-963000000.0,-155000000.0
22,LNG,Energy,Oil & Gas Midstream,2021-12-31,-1817000000,,,,5911000000.0,-6810000000.0,-899000000.0,5911000000.0,-6810000000.0,-899000000.0,-85000000.0,-85000000.0,,,-9000000.0,-9000000.0,,,-824000000.0,-9000000.0
23,LNG,Energy,Oil & Gas Midstream,2022-12-31,-8014000000,,,,1575000000.0,-6778000000.0,-5203000000.0,1575000000.0,-6778000000.0,-5203000000.0,-349000000.0,-349000000.0,,,-1373000000.0,-1373000000.0,,,-1089000000.0,-1373000000.0


Observation : issuance of debt does not include short term debt, at least in some cases.

In [46]:
df[df['longTermDebtIssuance'] != df['issuanceOfDebt']][finance]

Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingFinancingActivities,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock
10,APA,Energy,Oil & Gas E&P,2022-12-31,-3489000000,24000000,,24000000,0,-1493000000,-1493000000,24000000,-1493000000,-1469000000,-218000000,-207000000,-11000000,,-1423000000,-1423000000,,,-379000000,-1423000000
12,APA,Energy,Oil & Gas E&P,2019-12-31,112000000,396000000,,396000000,989000000,-1150000000,235000000,1385000000,-1150000000,235000000,-376000000,-376000000,,2000000,,2000000,2000000,,251000000,
13,APA,Energy,Oil & Gas E&P,2020-12-31,93000000,228000000,,378000000,145000000,-1243000000,-5000000,373000000,-1243000000,373000000,-146000000,-123000000,-23000000,1000000,,1000000,1000000,,93000000,
14,APA,Energy,Oil & Gas E&P,2021-12-31,-2623000000,425000000,,425000000,0,-1795000000,-1795000000,425000000,-1795000000,-1370000000,-98000000,-52000000,-46000000,,-847000000,-847000000,,,-308000000,-847000000
18,BKR,Energy,Oil & Gas Equipment & Services,2020-12-31,225000000,737000000,-204000000,737000000,500000000,-246000000,254000000,1237000000,-246000000,991000000,-744000000,-744000000,,,0,0,,,-22000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4436,VEEV,Healthcare,Health Information Services,2022-01-31,-4140000,,,,,-384000,-384000,,-384000,-384000,,,,,,,,,-55294000,
4453,WST,Healthcare,Medical Instruments & Supplies,2019-12-31,-36800000,108500000,-136300000,-27800000,90000000,-100000,89900000,198500000,-136400000,62100000,-45100000,-45100000,,,-83100000,-83100000,,,-1200000,-83100000
4456,WST,Healthcare,Medical Instruments & Supplies,2022-12-31,-293600000,,,,,-44300000,-44300000,,-44300000,-44300000,-54100000,,,,-222200000,-222200000,,,-1200000,-222200000
4461,ZBH,Healthcare,Medical Devices,2022-12-31,-775700000,595000000,-220000000,375000000,83000000,-1738700000,-1060700000,678000000,-1738700000,-1060700000,-201200000,-201200000,,,-126400000,-126400000,,,534500000,-126400000


Here we see examples where issuance of debt is only made up of short term debt.

In [47]:
# Check that netIssuancePaymentsOfDebt is equal to the sum of short and long term debt.

col_list = ['company','st_date',
            'shortTermDebtIssuance','longTermDebtIssuance',
            'shortTermDebtPayments','longTermDebtPayments',
            'netShortTermDebtIssuance','netLongTermDebtIssuance','netIssuancePaymentsOfDebt',
           ]
df[(df['netShortTermDebtIssuance'] + df['netLongTermDebtIssuance']) != df['netIssuancePaymentsOfDebt']][col_list].head(500)

Unnamed: 0,company,st_date,shortTermDebtIssuance,longTermDebtIssuance,shortTermDebtPayments,longTermDebtPayments,netShortTermDebtIssuance,netLongTermDebtIssuance,netIssuancePaymentsOfDebt
12,APA,2019-12-31,396000000.0,989000000.0,,-1150000000.0,396000000.0,235000000.0,235000000.0
15,BKR,2022-12-31,0.0,0.0,0.0,-28000000.0,-28000000.0,-28000000.0,-28000000.0
17,BKR,2019-12-31,,525000000.0,,-587000000.0,-542000000.0,-587000000.0,-587000000.0
20,LNG,2019-12-31,,6434000000.0,,-4346000000.0,,2088000000.0,2088000000.0
21,LNG,2020-12-31,,7823000000.0,,-6940000000.0,,883000000.0,883000000.0
22,LNG,2021-12-31,,5911000000.0,,-6810000000.0,,-899000000.0,-899000000.0
23,LNG,2022-12-31,,1575000000.0,,-6778000000.0,,-5203000000.0,-5203000000.0
29,CVX,2022-12-31,263000000.0,0.0,0.0,-8742000000.0,263000000.0,-8742000000.0,-8500000000.0
30,CVX,2019-12-31,2586000000.0,0.0,-5407000000.0,-5025000000.0,-2821000000.0,-7800000000.0,-7800000000.0
31,CVX,2020-12-31,10846000000.0,12308000000.0,-10195000000.0,-5489000000.0,651000000.0,7500000000.0,7500000000.0


Didn't expect this. Totally inconsistent whether netIssuancePaymentsOfDebt includes long and short term. Disregard
this field as it is unreliable.

In [48]:
# Let's check that netShortTermDebtIssuance is equal to sum of shortTermIssuance and shortTermPayments

df[(df['shortTermDebtIssuance'] + df['shortTermDebtPayments']) != df['netShortTermDebtIssuance']][finance].head(500)

Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingFinancingActivities,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock
0,AM,Energy,Oil & Gas Midstream,2022-12-31,-205778000,,,234800000.0,0.0,0.0,0.0,0.0,0.0,234800000.0,-433375000.0,-432825000.0,-550000.0,,0.0,0.0,,,-7203000.0,0.0
2,AM,Energy,Oil & Gas Midstream,2019-12-31,-98299000,,-115500000.0,-115500000.0,650000000.0,-115500000.0,650000000.0,650000000.0,-115500000.0,534500000.0,-496197000.0,-495823000.0,-374000.0,,-125519000.0,-125519000.0,,,-11083000.0,-125519000.0
3,AM,Energy,Oil & Gas Midstream,2020-12-31,-534746000,,-346000000.0,-346000000.0,550000000.0,-346000000.0,550000000.0,550000000.0,-346000000.0,204000000.0,-590190000.0,-589640000.0,-550000.0,,-24713000.0,-24713000.0,,,-123843000.0,-24713000.0
4,AM,Energy,Oil & Gas Midstream,2021-12-31,-477150000,,-66300000.0,-66300000.0,750000000.0,-667472000.0,82528000.0,750000000.0,-733772000.0,16228000.0,-471721000.0,-471171000.0,-550000.0,,0.0,0.0,,,-21657000.0,0.0
10,APA,Energy,Oil & Gas E&P,2022-12-31,-3489000000,24000000.0,,24000000.0,0.0,-1493000000.0,-1493000000.0,24000000.0,-1493000000.0,-1469000000.0,-218000000.0,-207000000.0,-11000000.0,,-1423000000.0,-1423000000.0,,,-379000000.0,-1423000000.0
12,APA,Energy,Oil & Gas E&P,2019-12-31,112000000,396000000.0,,396000000.0,989000000.0,-1150000000.0,235000000.0,1385000000.0,-1150000000.0,235000000.0,-376000000.0,-376000000.0,,2000000.0,,2000000.0,2000000.0,,251000000.0,
13,APA,Energy,Oil & Gas E&P,2020-12-31,93000000,228000000.0,,378000000.0,145000000.0,-1243000000.0,-5000000.0,373000000.0,-1243000000.0,373000000.0,-146000000.0,-123000000.0,-23000000.0,1000000.0,,1000000.0,1000000.0,,93000000.0,
14,APA,Energy,Oil & Gas E&P,2021-12-31,-2623000000,425000000.0,,425000000.0,0.0,-1795000000.0,-1795000000.0,425000000.0,-1795000000.0,-1370000000.0,-98000000.0,-52000000.0,-46000000.0,,-847000000.0,-847000000.0,,,-308000000.0,-847000000.0
15,BKR,Energy,Oil & Gas Equipment & Services,2022-12-31,-1592000000,0.0,0.0,-28000000.0,0.0,-28000000.0,-28000000.0,0.0,-28000000.0,-28000000.0,-743000000.0,-743000000.0,,,-828000000.0,-828000000.0,,,7000000.0,-828000000.0
17,BKR,Energy,Oil & Gas Equipment & Services,2019-12-31,-1534000000,,,-542000000.0,525000000.0,-587000000.0,-587000000.0,525000000.0,-587000000.0,-587000000.0,-745000000.0,-745000000.0,,,-250000000.0,-250000000.0,,,48000000.0,-250000000.0


Plenty of inconsistencies. Many cases where netShortTermDebtIssuance is populated but the other two fields aren't.
The totals do not always add up. 
Best we can do is take netShortTermDebtIssuance in the final cut as representative of net ShortTermDebt.


In [49]:
# Let's check that netLongTermDebtIssuance is equal to sum of longTermIssuance and longTermPayments

df[(df['longTermDebtIssuance'] + df['longTermDebtPayments']) != df['netLongTermDebtIssuance']][finance].head(500)

Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingFinancingActivities,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock
2,AM,Energy,Oil & Gas Midstream,2019-12-31,-98299000,,-115500000.0,-115500000.0,650000000.0,-115500000.0,650000000.0,650000000.0,-115500000.0,534500000.0,-496197000.0,-495823000.0,-374000.0,,-125519000.0,-125519000.0,,,-11083000.0,-125519000.0
3,AM,Energy,Oil & Gas Midstream,2020-12-31,-534746000,,-346000000.0,-346000000.0,550000000.0,-346000000.0,550000000.0,550000000.0,-346000000.0,204000000.0,-590190000.0,-589640000.0,-550000.0,,-24713000.0,-24713000.0,,,-123843000.0,-24713000.0
12,APA,Energy,Oil & Gas E&P,2019-12-31,112000000,396000000.0,,396000000.0,989000000.0,-1150000000.0,235000000.0,1385000000.0,-1150000000.0,235000000.0,-376000000.0,-376000000.0,,2000000.0,,2000000.0,2000000.0,,251000000.0,
13,APA,Energy,Oil & Gas E&P,2020-12-31,93000000,228000000.0,,378000000.0,145000000.0,-1243000000.0,-5000000.0,373000000.0,-1243000000.0,373000000.0,-146000000.0,-123000000.0,-23000000.0,1000000.0,,1000000.0,1000000.0,,93000000.0,
17,BKR,Energy,Oil & Gas Equipment & Services,2019-12-31,-1534000000,,,-542000000.0,525000000.0,-587000000.0,-587000000.0,525000000.0,-587000000.0,-587000000.0,-745000000.0,-745000000.0,,,-250000000.0,-250000000.0,,,48000000.0,-250000000.0
30,CVX,Energy,Oil & Gas Integrated,2019-12-31,-19700000000,2586000000.0,-5407000000.0,-2821000000.0,0.0,-5025000000.0,-7800000000.0,2586000000.0,-10432000000.0,-7800000000.0,-9000000000.0,-9000000000.0,,,-2900000000.0,-2900000000.0,,,-100000000.0,-2900000000.0
31,CVX,Energy,Oil & Gas Integrated,2020-12-31,-3700000000,10846000000.0,-10195000000.0,651000000.0,12308000000.0,-5489000000.0,7500000000.0,23154000000.0,-15684000000.0,7500000000.0,-9700000000.0,-9700000000.0,,,-1500000000.0,-1500000000.0,,,-24000000.0,-1500000000.0
32,CVX,Energy,Oil & Gas Integrated,2021-12-31,-23100000000,4448000000.0,-10020000000.0,-5572000000.0,0.0,-7364000000.0,-12900000000.0,4448000000.0,-17384000000.0,-12900000000.0,-10200000000.0,-10200000000.0,,,,0.0,,,-36000000.0,
36,COP,Energy,Oil & Gas E&P,2019-12-31,-5229000000,,,,,-80000000.0,-80000000.0,,-80000000.0,-80000000.0,-1500000000.0,-1500000000.0,,,-3530000000.0,-3530000000.0,,,-119000000.0,-3530000000.0
39,CTRA,Energy,Oil & Gas E&P,2022-12-31,-4145000000,,,,0.0,-6000000.0,-880000000.0,0.0,-6000000.0,-880000000.0,-1992000000.0,-1992000000.0,,,-1250000000.0,-1250000000.0,,-10000000.0,-25000000.0,-1260000000.0


Similar inconsistencies as with shortTermDebt. Make a decision to just use netLongTermDebtIssuance.

In [50]:
# Check the following.

df[(df['issuanceOfDebt'] + df['repaymentOfDebt']) != df['netIssuancePaymentsOfDebt']][finance].head(500)

Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingFinancingActivities,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock
0,AM,Energy,Oil & Gas Midstream,2022-12-31,-205778000,,,234800000.0,0.0,0.0,0.0,0.0,0.0,234800000.0,-433375000.0,-432825000.0,-550000.0,,0.0,0.0,,,-7203000.0,0.0
13,APA,Energy,Oil & Gas E&P,2020-12-31,93000000,228000000.0,,378000000.0,145000000.0,-1243000000.0,-5000000.0,373000000.0,-1243000000.0,373000000.0,-146000000.0,-123000000.0,-23000000.0,1000000.0,,1000000.0,1000000.0,,93000000.0,
17,BKR,Energy,Oil & Gas Equipment & Services,2019-12-31,-1534000000,,,-542000000.0,525000000.0,-587000000.0,-587000000.0,525000000.0,-587000000.0,-587000000.0,-745000000.0,-745000000.0,,,-250000000.0,-250000000.0,,,48000000.0,-250000000.0
19,BKR,Energy,Oil & Gas Equipment & Services,2021-12-31,-2143000000,,-873000000.0,-832000000.0,1250000000.0,-1354000000.0,-104000000.0,1250000000.0,-1354000000.0,-936000000.0,-749000000.0,,,,-434000000.0,-434000000.0,,,-24000000.0,-434000000.0
29,CVX,Energy,Oil & Gas Integrated,2022-12-31,-24978000000,263000000.0,0.0,263000000.0,0.0,-8742000000.0,-8742000000.0,263000000.0,-8742000000.0,-8500000000.0,-10968000000.0,-11000000000.0,,,-11300000000.0,-11300000000.0,,,-114000000.0,-11300000000.0
30,CVX,Energy,Oil & Gas Integrated,2019-12-31,-19700000000,2586000000.0,-5407000000.0,-2821000000.0,0.0,-5025000000.0,-7800000000.0,2586000000.0,-10432000000.0,-7800000000.0,-9000000000.0,-9000000000.0,,,-2900000000.0,-2900000000.0,,,-100000000.0,-2900000000.0
31,CVX,Energy,Oil & Gas Integrated,2020-12-31,-3700000000,10846000000.0,-10195000000.0,651000000.0,12308000000.0,-5489000000.0,7500000000.0,23154000000.0,-15684000000.0,7500000000.0,-9700000000.0,-9700000000.0,,,-1500000000.0,-1500000000.0,,,-24000000.0,-1500000000.0
32,CVX,Energy,Oil & Gas Integrated,2021-12-31,-23100000000,4448000000.0,-10020000000.0,-5572000000.0,0.0,-7364000000.0,-12900000000.0,4448000000.0,-17384000000.0,-12900000000.0,-10200000000.0,-10200000000.0,,,,0.0,,,-36000000.0,
36,COP,Energy,Oil & Gas E&P,2019-12-31,-5229000000,,,,,-80000000.0,-80000000.0,,-80000000.0,-80000000.0,-1500000000.0,-1500000000.0,,,-3530000000.0,-3530000000.0,,,-119000000.0,-3530000000.0
39,CTRA,Energy,Oil & Gas E&P,2022-12-31,-4145000000,,,,0.0,-6000000.0,-880000000.0,0.0,-6000000.0,-880000000.0,-1992000000.0,-1992000000.0,,,-1250000000.0,-1250000000.0,,-10000000.0,-25000000.0,-1260000000.0


Plenty of discrepenacies. Take netIssuancePaymentsOfDebt, netShortTermDebtIssuance and netLongTermDebtIssuance 
in final cut and accept as reported. 

In [51]:
# Check for the case where both related fields are null but netCommonStockIssuance isn't.

df[df['commonStockIssuance'].isnull() & \
   df['commonStockPayments'].isnull() & \
   df['netCommonStockIssuance'].notnull()][finance].head(400)


Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingFinancingActivities,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock
32,CVX,Energy,Oil & Gas Integrated,2021-12-31,-23100000000,4448000000.0,-10020000000.0,-5572000000,0,-7364000000,-12900000000,4448000000,-17384000000,-12900000000,-10200000000.0,-10200000000.0,,,,0,,,-36000000.0,
324,NFG,Energy,Oil & Gas Integrated,2019-09-30,-101095000,,,55200000,0,0,0,0,0,55200000,-147418000.0,-147418000.0,,,,-8877000,,,,
325,NFG,Energy,Oil & Gas Integrated,2020-09-30,476088000,,,-25200000,493007000,0,493007000,493007000,0,467807000,-153322000.0,-153322000.0,,,,161603000,,,,
326,NFG,Energy,Oil & Gas Integrated,2021-09-30,-58739000,,,128500000,495267000,-515715000,-20448000,495267000,-515715000,108052000,-163089000.0,-163089000.0,,,,-3702000,,,,
354,PNW,Utilities,Utilities—Regulated Electric,2022-12-31,371468000,48720000.0,0.0,48720000,875537000,-150000000,725537000,924257000,-150000000,774257000,-378881000.0,-378881000.0,,,,-2653000,,,-21255000.0,
356,PNW,Utilities,Utilities—Regulated Electric,2019-12-31,178768000,103275000.0,-65000000.0,38275000,1092188000,-600000000,492188000,1195463000,-665000000,530463000,-329643000.0,-329643000.0,,,,692000,,,-22744000.0,
357,PNW,Utilities,Utilities—Regulated Electric,2020-12-31,361138000,825015000.0,-770690000.0,54325000,1596672000,-915150000,681522000,2421687000,-1685840000,735847000,-350577000.0,-350577000.0,,,,-1389000,,,-22743000.0,
358,PNW,Utilities,Utilities—Regulated Electric,2021-12-31,476916000,142000000.0,-19000000.0,123000000,746999000,0,746999000,888999000,-19000000,869999000,-369478000.0,-369478000.0,,,,-2350000,,,-21255000.0,
800,INGR,Consumer Defensive,Packaged Foods,2021-12-31,-373000000,,,250000000,1300000000,-1690000000,-390000000,1300000000,-1690000000,-140000000,-184000000.0,-184000000.0,,,,-49000000,,,,
1893,GE,Industrials,Specialty Industrial Machinery,2019-12-31,-15764000000,,,280000000,2185000000,-16567000000,-14382000000,2185000000,-16567000000,-14102000000,-649000000.0,-649000000.0,,,,29000000,,,-1043000000.0,


In [52]:
# Check for case where netCommonStockIssuance is null but related fields aren't

df[(df['commonStockIssuance'].notnull() | \
   df['commonStockPayments'].notnull()) & \
   df['netCommonStockIssuance'].isnull()][finance].head(400)

Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingFinancingActivities,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock


Conclusion: netCommonStockIssuance is the most reliable field for stock issued and should be included in final cut
as stock issued.

In [53]:
# cashDividendsPaid has a lower null count than commonStockDividendPaid. Check if there are cases where 
# cashDividendsPaid is null but commonStockDividendPaid is populated.

df[df['cashDividendsPaid'].isnull() & df['commonStockDividendPaid'].notna()][finance]

Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingFinancingActivities,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock


In [54]:
# Sanity check to confirm that cashDividends paid is the sum of dividends paid for common and preferred shares.

df_temp = df[['company','st_date','cashDividendsPaid',
             'commonStockDividendPaid','preferredStockDividendPaid']].copy()
df_temp['totalDividendsPaid'] = df_temp['commonStockDividendPaid'] + df_temp['preferredStockDividendPaid']

df_temp[df_temp['cashDividendsPaid'].notna()].head(300)

Unnamed: 0,company,st_date,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,totalDividendsPaid
0,AM,2022-12-31,-433375000,-432825000.0,-550000.0,-433375000.0
2,AM,2019-12-31,-496197000,-495823000.0,-374000.0,-496197000.0
3,AM,2020-12-31,-590190000,-589640000.0,-550000.0,-590190000.0
4,AM,2021-12-31,-471721000,-471171000.0,-550000.0,-471721000.0
10,APA,2022-12-31,-218000000,-207000000.0,-11000000.0,-218000000.0
12,APA,2019-12-31,-376000000,-376000000.0,,
13,APA,2020-12-31,-146000000,-123000000.0,-23000000.0,-146000000.0
14,APA,2021-12-31,-98000000,-52000000.0,-46000000.0,-98000000.0
15,BKR,2022-12-31,-743000000,-743000000.0,,
17,BKR,2019-12-31,-745000000,-745000000.0,,


In [55]:
# Set selected fields to 0 if not popupated

col_list = ['netIssuancePaymentsOfDebt',
            'netLongTermDebtIssuance',
            'netShortTermDebtIssuance',
            'commonStockDividendPaid', 
            'preferredStockDividendPaid', 
            'cashDividendsPaid',
            'netCommonStockIssuance', 
            'netPreferredStockIssuance', 
            'repurchaseOfCapitalStock',
            'netOtherFinancingCharges']

df[col_list] = df[col_list].fillna(0)

In [56]:
# Check null counts

df[finance].isnull().sum().sort_values()

company                                         0
netPreferredStockIssuance                       0
netCommonStockIssuance                          0
preferredStockDividendPaid                      0
commonStockDividendPaid                         0
cashDividendsPaid                               0
netIssuancePaymentsOfDebt                       0
netOtherFinancingCharges                        0
netLongTermDebtIssuance                         0
repurchaseOfCapitalStock                        0
netShortTermDebtIssuance                        0
cashFlowFromContinuingFinancingActivities       0
st_date                                         0
industry                                        0
yahoo_sector                                    0
repaymentOfDebt                               184
longTermDebtPayments                          250
issuanceOfDebt                                277
longTermDebtIssuance                          378
commonStockPayments                           660


In [57]:
final_finance = ['netIssuancePaymentsOfDebt','netLongTermDebtIssuance','netShortTermDebtIssuance',
                 'commonStockDividendPaid','preferredStockDividendPaid','cashDividendsPaid',
                 'netCommonStockIssuance',  'netPreferredStockIssuance','repurchaseOfCapitalStock',
                 'netOtherFinancingCharges','cashFlowFromContinuingFinancingActivities']


## Adhoc

In [58]:
df[adhoc].isnull().sum().sort_values()

company                                      0
yahoo_sector                                 0
industry                                     0
st_date                                      0
cashFlowFromContinuingOperatingActivities    0
capitalExpenditure                           0
freeCashFlow                                 0
dtype: int64

In [59]:
df[adhoc].sort_values(by=['industry','company','st_date']).head(500)

Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingOperatingActivities,capitalExpenditure,freeCashFlow
484,IPG,Communication Services,Advertising Agencies,2019-12-31,1529200000,-198500000,1330700000
485,IPG,Communication Services,Advertising Agencies,2020-12-31,1847200000,-167500000,1679700000
486,IPG,Communication Services,Advertising Agencies,2021-12-31,2075600000,-195300000,1880300000
487,IPG,Communication Services,Advertising Agencies,2022-12-31,608800000,-178100000,430700000
579,OMC,Communication Services,Advertising Agencies,2019-12-31,1856000000,-102200000,1753800000
580,OMC,Communication Services,Advertising Agencies,2020-12-31,1724600000,-75400000,1649200000
581,OMC,Communication Services,Advertising Agencies,2021-12-31,1945400000,-665800000,1279600000
582,OMC,Communication Services,Advertising Agencies,2022-12-31,926500000,-78200000,848300000
1667,AXON,Industrials,Aerospace & Defense,2019-12-31,65673000,-16343000,49330000
1668,AXON,Industrials,Aerospace & Defense,2020-12-31,38481000,-72870000,-34389000


In [60]:
final_adhoc = ['freeCashFlow']

## Save Modified Data

In [61]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_cash_sheets_modified_stage5.pkl')
with open(filepath,'wb') as f:
    pickle.dump(df,f)

In [62]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_drop_data_details_stage5.pkl')
with open(filepath,'wb') as f:
    pickle.dump(drop_data_details,f)

## Build final cut data for cash sheet

In [63]:
col_list = ['company', 'yahoo_sector','gics_sector','industry','st_date', 'st_YR','st_Mnth'] + \
           final_operating + final_investing + final_finance + final_adhoc

df_final = df[col_list].copy()
df_final

Unnamed: 0,company,yahoo_sector,gics_sector,industry,st_date,st_YR,st_Mnth,netIncome,netIncomeFromContinuingOperations,depreciationAmortizationDepletion,stockBasedCompensation,cashFlowFromContinuingOperatingActivities,capitalExpenditure,saleOfBusiness,purchaseOfBusiness,netBusinessPurchaseAndSale,saleOfPPE,purchaseOfPPE,netPPEPurchaseAndSale,saleOfInvestment,purchaseOfInvestment,netInvestmentPurchaseAndSale,saleOfInvestmentProperties,purchaseOfInvestmentProperties,netInvestmentPropertiesPurchaseAndSale,saleOfIntangibles,purchaseOfIntangibles,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,cashFlowFromContinuingInvestingActivities,netIssuancePaymentsOfDebt,netLongTermDebtIssuance,netShortTermDebtIssuance,commonStockDividendPaid,preferredStockDividendPaid,cashDividendsPaid,netCommonStockIssuance,netPreferredStockIssuance,repurchaseOfCapitalStock,netOtherFinancingCharges,cashFlowFromContinuingFinancingActivities,freeCashFlow
0,AM,Energy,Energy,Oil & Gas Midstream,2022-12-31,2022,12,326242000,326242000,202434000,19654000,699604000,-515650000,17000000,0,17000000,0,-515650000,-515650000,0,0,0,0,0,0,0,0,0,4824000,-493826000,234800000,0,234800000,-432825000,-550000,-433375000,0,0,0,-7203000,-205778000,183954000
2,AM,Energy,Energy,Oil & Gas Midstream,2019-12-31,2019,12,-355114000,-355114000,152536000,73517000,622387000,-267383000,619532000,-753068000,-133536000,0,-267383000,-267383000,0,0,0,0,0,0,0,0,0,-124756000,-525675000,534500000,650000000,-115500000,-495823000,-374000,-496197000,-125519000,0,-125519000,-11083000,-98299000,355004000
3,AM,Energy,Energy,Oil & Gas Midstream,2020-12-31,2020,12,-122527000,-122527000,179462000,12778000,753382000,-157931000,0,-25267000,-25267000,0,-157931000,-157931000,0,0,0,0,0,0,0,0,0,-36033000,-219231000,204000000,550000000,-346000000,-589640000,-550000,-590190000,-24713000,0,-24713000,-123843000,-534746000,595451000
4,AM,Energy,Energy,Oil & Gas Midstream,2021-12-31,2021,12,78626000,331617000,179462000,13529000,709752000,-232825000,0,-2070000,-2070000,0,-232825000,-232825000,0,0,0,0,0,0,0,0,0,1653000,-233242000,16228000,82528000,-66300000,-471171000,-550000,-471721000,0,0,0,-21657000,-477150000,476927000
10,APA,Energy,Energy,Oil & Gas E&P,2022-12-31,2022,12,3604000000,4082000000,1233000000,,4943000000,-2398000000,0,-143000000,-143000000,778000000,-2398000000,-2398000000,224000000,0,224000000,0,0,0,0,0,0,806000000,-1511000000,-1469000000,-1493000000,24000000,-207000000,-11000000,-218000000,-1423000000,0,-1423000000,-379000000,-3489000000,2545000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4461,ZBH,Healthcare,Health Care,Medical Devices,2022-12-31,2022,12,231400000,291200000,926400000,105000000,1356200000,-187900000,0,-99800000,-99800000,0,-187900000,-187900000,89400000,-258300000,-168900000,0,0,0,0,0,0,-65400000,-522000000,-1060700000,-1060700000,375000000,-201200000,0,-201200000,-126400000,0,-126400000,534500000,-775700000,1096800000
4463,ZTS,Healthcare,Health Care,Drug Manufacturers—Specialty & Generic,2022-12-31,2022,12,2114000000,2111000000,465000000,62000000,1912000000,-586000000,0,-312000000,-312000000,0,0,0,23000000,-9000000,14000000,0,0,0,0,0,0,1000000,-883000000,1350000000,1348000000,2000000,-611000000,0,-611000000,-1594000000,0,-1594000000,-11000000,-904000000,1326000000
4465,ZTS,Healthcare,Health Care,Drug Manufacturers—Specialty & Generic,2019-12-31,2019,12,1500000000,1500000000,412000000,67000000,1795000000,-460000000,0,-195000000,-195000000,0,0,0,138000000,0,138000000,0,0,0,0,0,0,13000000,-504000000,-9000000,0,-9000000,-314000000,0,-314000000,-626000000,0,-626000000,-2000000,-951000000,1335000000
4466,ZTS,Healthcare,Health Care,Drug Manufacturers—Specialty & Generic,2020-12-31,2020,12,1638000000,1636000000,441000000,59000000,2126000000,-453000000,0,-113000000,-113000000,0,0,0,0,-27000000,-27000000,0,0,0,0,0,0,21000000,-572000000,744000000,740000000,4000000,-380000000,0,-380000000,-250000000,0,-250000000,9000000,123000000,1673000000


In [64]:
# Order by company and year.

df_final = df_final.sort_values(by=['company','st_YR'])

In [65]:
# Save final cut

filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_cash_sheets_final_stage5.pkl')
with open(filepath,'wb') as f:
    pickle.dump(df_final,f)