In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from os import listdir
import re

import csv
import datetime
import pickle

In [2]:
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
pd.options.display.float_format = '{:.0f}'.format

In [3]:
# Functions shared across multiple notebooks are stored in yahoo_data_ext_kit.py which can be found in the same
# directory as the notebooks.

import yahoo_data_ext_kit as ext

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
master_company_data = pd.read_pickle('/home/priyesh/projects/TS1/pickle/yahoo_company_data.pkl')
drop_data_details = pd.read_pickle('/home/priyesh/projects/TS1/pickle/yahoo_drop_data_details_stage4.pkl')

## Cash Sheet

Extract Cash sheet from data downloaded from Yahoo and format as a dataframe.

In [6]:
cash_sheets = ext.extract_statements('cash',master_company_data)

AM
AR
APA
BKR
LNG
CHK
CVX
COP
CTRA
DVN
FANG
DTM
EVA
EOG
EQT
XOM
HAL
HES
DINO
KMI
MRO
MPC
NFE
NOV
OXY
OKE
OVV
PDCE
PSX
PXD
RRC
SLB
SWN
TRGP
TPL
VLO
VTS
WMB
AES
LNT
AEE
AEP
AWK
ATO
AGR
BEPC
CNP
CMS
ED
CEG
D
DTE
DUK
EIX
ETR
WTRG
EVRG
ES
EXC
FE
HE
IDA
MDU
NFG
NEE
NI
NRG
OGE
PCG
PNW
PPL
PEG
SRE
SO
UGI
VST
WEC
XEL
ATVI
GOOG
ATUS
AMC
T
CABO
CHTR
CMCSA
DISH
DIS
EA
FOXA
FOX
FYBR
IAC
IPG
LBRDA
LBRDK
FWONA
FWONK
LSXMA
LSXMK
LYV
LUMN
MSGS
MTCH
META
NFLX
NYT
NWSA
NWS
NXST
OMC
PARAA
PARA
PINS
PLTK
RBLX
ROKU
SIRI
SPOT
TMUS
TTWO
TRIP
VZ
WBD
WWE
ZI
ADM
ACI
MO
BJ
SAM
BF-A
BG
CPB
CASY
CHD
CLX
KO
CL
CAG
STZ
COST
COTY
DAR
DG
DLTR
EL
FLO
FRPT
GIS
GO
HSY
HRL
INGR
K
KDP
KMB
KHC
KR
LW
MKC
TAP
MDLZ
MNST
OLPX
PEP
PFGC
PM
PPC
POST
PG
REYN
SEB
SJM
SPB
SYY
TGT
TSN
USFD
WBA
WMT
APD
ALB
AA
AMCR
ATR
AMBP
ASH
AVY
AXTA
BALL
BERY
CE
CF
CC
CLF
CTVA
CCK
DOW
DD
EXP
EMN
ECL
ESI
FMC
FCX
DNA
GPK
HUN
IFF
IP
LIN
LPX
LYB
MLM
MOS
MP
NEU
NEM
NUE
OLN
PKG
PPG
RS
RGLD
RPM
SMG
SEE
SHW
SLGN
SON
SCCO
SSRM
STLD
X
VVV
VMC
WLK
WRK
ARE
AMH
A

In [2]:
df = cash_sheets.copy()

NameError: name 'cash_sheets' is not defined

In [1]:
df

NameError: name 'df' is not defined

In [8]:
len(df['company'].unique())

848

In [9]:
# Remove sectors, industries and individual companies previously identified for removal.

# Note: as we go through the analysis of the data for each of the financial statements, companies, industries and 
# sectors are identified for removal and stored in drop_data_details. 

for sector in drop_data_details['sector']:
   print(sector)
   df.drop(df[df['yahoo_sector'] == sector].index, inplace=True, axis=0)
    
for industry in drop_data_details['industry']:
   print(industry)
   df.drop(df[df['industry'] == industry].index, inplace=True, axis=0)
    
for company in drop_data_details['companies']:
   print(company)
   df.drop(df[df['company'] == company].index, inplace=True, axis=0)

Real Estate
Healthcare Plans
Healthcare Plans
Healthcare Plans
FOXA
LBRDK
FWONK
LSXMK
FWONK
LSXMK
UAA
ZG
PARAA
NWSA
BEPC
FWONA
LSXMA
CHK
VTS
BF-A
CASY
CAG
GIS
LW
RGLD
RPM
CTAS
CXT
FDX
OSK
PAYX
AZPN
KD
ORCL
PYCR
HRB
DRI
NKE
UA
DH
FYBR
DNB
BKI
DNB
BKI
DNB
BKI
DNB
BKI
CACI
CSX


In [10]:
list(df.columns.sort_values())

['amortizationCashFlow',
 'amortizationOfIntangibles',
 'amortizationOfSecurities',
 'assetImpairmentCharge',
 'beginningCashPosition',
 'capitalExpenditure',
 'capitalExpenditureReported',
 'cashDividendsPaid',
 'cashFlowFromContinuingFinancingActivities',
 'cashFlowFromContinuingInvestingActivities',
 'cashFlowFromContinuingOperatingActivities',
 'cashFlowFromDiscontinuedOperation',
 'cashFlowsfromusedinOperatingActivitiesDirect',
 'cashFromDiscontinuedFinancingActivities',
 'cashFromDiscontinuedInvestingActivities',
 'cashFromDiscontinuedOperatingActivities',
 'changeInAccountPayable',
 'changeInAccruedExpense',
 'changeInCashSupplementalAsReported',
 'changeInDividendPayable',
 'changeInIncomeTaxPayable',
 'changeInInterestPayable',
 'changeInInventory',
 'changeInOtherCurrentAssets',
 'changeInOtherCurrentLiabilities',
 'changeInOtherWorkingCapital',
 'changeInPayable',
 'changeInPayablesAndAccruedExpense',
 'changeInPrepaidAssets',
 'changeInReceivables',
 'changeInTaxPayable',
 

In [87]:
is_col_name_mapping = {'cf_accountsPayable': 'changeInPayablesAndAccruedExpense',
                       'cf_accountsReceivables': 'changeInReceivables'}

final = ['company', 'yahoo_sector','industry','st_date', 
         'netIncome',
         'capitalExpenditure',
         'beginningCashPosition',
         'endCashPosition',
         'depreciationAmortizationDepletion',
         'stockBasedCompensation',
         'changeInPayablesAndAccruedExpense',
         'changeInReceivables',
         'cashFlowFromContinuingOperatingActivities',
         'netBusinessPurchaseAndSale',
         'netPPEPurchaseAndSale',
         'netInvestmentPurchaseAndSale',
         'netIntangiblesPurchaseAndSale',
         'netOtherInvestingChanges',
         'cashFlowFromContinuingInvestingActivities',
         'netShortTermDebtIssuance',
         'netLongTermDebtIssuance',
         'netIssuancePaymentsOfDebt',
         'netOtherFinancingCharges',
         'netCommonStockIssuance',     
         'repurchaseOfCapitalStock',
         'cashDividendsPaid',
         'cashFlowFromContinuingFinancingActivities',      
         'freeCashFlow']

summary=['company', 'yahoo_sector','industry','st_date', 
         'netIncome', 
         'changeInWorkingCapital',
         'capitalExpenditure',
         'beginningCashPosition',
         'endCashPosition',
         'operatingCashFlow',
         'cashFlowFromContinuingOperatingActivities',
         'cashFlowFromContinuingInvestingActivities',
         'cashFlowFromContinuingFinancingActivities',
         'purchaseOfBusiness',
         'purchaseOfPPE',
         'repaymentOfDebt',
         'repurchaseOfCapitalStock',
         'cashDividendsPaid',
         'commonStockDividendPaid',
         'gainLossOnSaleOfBusiness',
         'saleOfPPE',
         'freeCashFlow']

operating=['company', 'yahoo_sector','industry','st_date', 
           'netIncome',
           'netIncomeFromContinuingOperations',
           'operatingGainsLosses',
           'depreciation',
           'depreciationAmortizationDepletion',
           'depreciationAndAmortization',
           'stockBasedCompensation',
           'deferredIncomeTax',
           'changeInInterestPayable',
           'changeInWorkingCapital',
           'changeInOtherWorkingCapital',
           'changesInAccountReceivables',
           'changeInReceivables',
           'changeInAccountPayable',
           'changeInPayable',
           'paymentstoSuppliersforGoodsandServices',
           'changeInPayablesAndAccruedExpense',
           'otherCashPaymentsfromOperatingActivities',
           'otherCashReceiptsfromOperatingActivities',
           'otherNonCashItems',
           'cashFlowsfromusedinOperatingActivitiesDirect',
           'cashFlowFromContinuingOperatingActivities']

investing=['company','yahoo_sector','industry','st_date', 
           'capitalExpenditure',
           'capitalExpenditureReported',
           'cashFlowFromContinuingInvestingActivities',
           'purchaseOfBusiness',
           'saleOfBusiness',
           'netBusinessPurchaseAndSale',
           'purchaseOfPPE',
           'saleOfPPE',
           'netPPEPurchaseAndSale',
           'purchaseOfInvestmentProperties',
           'saleOfInvestmentProperties',     
           'netInvestmentPropertiesPurchaseAndSale',
           'purchaseOfInvestment',
           'saleOfInvestment',
           'netInvestmentPurchaseAndSale',
           'purchaseOfIntangibles',
           'saleOfIntangibles',
           'netIntangiblesPurchaseAndSale',
           'netOtherInvestingChanges',
           'investingCashFlow']

finance=['company','yahoo_sector','industry','st_date', 
         'shortTermDebtIssuance',
         'shortTermDebtPayments',
         'netShortTermDebtIssuance', 
         'longTermDebtIssuance',
         'longTermDebtPayments',
         'netLongTermDebtIssuance',
         'issuanceOfDebt',
         'repaymentOfDebt',
         'netIssuancePaymentsOfDebt',
         'cashDividendsPaid',
         'commonStockDividendPaid',
         'preferredStockDividendPaid',
         'commonStockIssuance',
         'commonStockPayments',
         'netCommonStockIssuance',      
         'issuanceOfCapitalStock',
         'netPreferredStockIssuance',
         'netOtherFinancingCharges',
         'repurchaseOfCapitalStock']

adhoc=['company','yahoo_sector','industry',
       'st_date', 
       'cashFlowFromContinuingOperatingActivities',
       'capitalExpenditure',
       'freeCashFlow']

In [12]:
# Identify companies which have less than 4 years of data

len(df.groupby('company').filter(lambda x: len(x) < 4))

0

In [13]:
df.groupby('company').filter(lambda x: len(x) > 4)[summary]

Unnamed: 0,company,yahoo_sector,industry,st_date,netIncome,changeInWorkingCapital,capitalExpenditure,beginningCashPosition,endCashPosition,operatingCashFlow,cashFlowFromContinuingOperatingActivities,cashFlowFromContinuingInvestingActivities,cashFlowFromContinuingFinancingActivities,purchaseOfBusiness,purchaseOfPPE,repaymentOfDebt,repurchaseOfCapitalStock,cashDividendsPaid,commonStockDividendPaid,gainLossOnSaleOfBusiness,saleOfPPE,freeCashFlow
0,AM,Energy,Oil & Gas Midstream,2022-12-31,326242000.00,5064000.00,-515650000.00,0.00,0.00,699604000.00,699604000.00,-493826000.00,-205778000.00,0.00,-515650000.00,0.00,0.00,-433375000.00,-432825000.00,,,183954000.00
1,AM,Energy,Oil & Gas Midstream,2023-03-31,332709000.00,-19648000.00,-474376000.00,0.00,,697659000.00,697659000.00,-451831000.00,-245828000.00,,-474376000.00,,,-433590000.00,-433040000.00,,,223283000.00
2,AM,Energy,Oil & Gas Midstream,2019-12-31,-355114000.00,23472000.00,-267383000.00,2822000.00,1235000.00,622387000.00,622387000.00,-525675000.00,-98299000.00,-753068000.00,-267383000.00,-115500000.00,-125519000.00,-496197000.00,-495823000.00,,,355004000.00
3,AM,Energy,Oil & Gas Midstream,2020-12-31,-122527000.00,419000.00,-157931000.00,1235000.00,640000.00,753382000.00,753382000.00,-219231000.00,-534746000.00,-25267000.00,-157931000.00,-346000000.00,-24713000.00,-590190000.00,-589640000.00,,,595451000.00
4,AM,Energy,Oil & Gas Midstream,2021-12-31,78626000.00,4431000.00,-232825000.00,640000.00,0.00,709752000.00,709752000.00,-233242000.00,-477150000.00,-2070000.00,-232825000.00,-733772000.00,0.00,-471721000.00,-471171000.00,,,476927000.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4463,ZTS,Healthcare,Drug Manufacturers—Specialty & Generic,2022-12-31,2114000000.00,-699000000.00,-586000000.00,3485000000.00,3581000000.00,1912000000.00,1912000000.00,-883000000.00,-904000000.00,-312000000.00,,0.00,-1594000000.00,-611000000.00,-611000000.00,,,1326000000.00
4464,ZTS,Healthcare,Drug Manufacturers—Specialty & Generic,2023-03-31,2071000000.00,-469000000.00,-694000000.00,3135000000.00,2145000000.00,2152000000.00,2152000000.00,-981000000.00,-2161000000.00,-315000000.00,,-1350000000.00,-1516000000.00,-631000000.00,-631000000.00,,,1458000000.00
4465,ZTS,Healthcare,Drug Manufacturers—Specialty & Generic,2019-12-31,1500000000.00,-151000000.00,-460000000.00,1602000000.00,1934000000.00,1795000000.00,1795000000.00,-504000000.00,-951000000.00,-195000000.00,,,-626000000.00,-314000000.00,-314000000.00,,,1335000000.00
4466,ZTS,Healthcare,Drug Manufacturers—Specialty & Generic,2020-12-31,1638000000.00,-85000000.00,-453000000.00,1934000000.00,3604000000.00,2126000000.00,2126000000.00,-572000000.00,123000000.00,-113000000.00,,-500000000.00,-250000000.00,-380000000.00,-380000000.00,,,1673000000.00


In [14]:
# Check counts 

df.groupby('company').filter(lambda x: len(x) > 4).groupby('company').size().sort_values()

company
LEG      5
NATI     5
NBIX     5
NCLH     5
NCNO     5
NCR      5
NEE      5
NEM      5
NET      5
NEU      5
NEWR     5
NFLX     5
NI       5
NOC      5
NOV      5
NOW      5
MTZ      5
NRG      5
MTN      5
MSI      5
MIDD     5
MKC      5
MKSI     5
MLM      5
MMM      5
MNST     5
MOS      5
MP       5
MPC      5
MRK      5
MRNA     5
MRO      5
MRVI     5
MRVL     5
MSA      5
MTD      5
NSC      5
NTAP     5
NVDA     5
PCG      5
PCOR     5
PDCE     5
PEG      5
PEGA     5
PEN      5
PENN     5
PEP      5
PFE      5
PHM      5
PII      5
PINS     5
PKG      5
PLNT     5
PLUG     5
PCAR     5
PAYC     5
PARA     5
OXY      5
NVR      5
NVST     5
NVT      5
NWL      5
NYT      5
OC       5
ODFL     5
MHK      5
OGE      5
OKE      5
OKTA     5
OLN      5
OLPX     5
OMC      5
ORLY     5
OTIS     5
OGN      5
MGM      5
META     5
MDU      5
ITW      5
JAMF     5
JAZZ     5
JBHT     5
JNJ      5
JNPR     5
JWN      5
K        5
KHC      5
KMB      5
KMI      5
KMX      5
KN

We know from analysis of data for income sheets that the unexpected extra rows corespond to spurious quarterly 
accounts plus TTM.

Delete rows which do not match the annual anniversary of statement date. This is determined by finding the most 
common month for a given company.

In [15]:
df = ext.remove_non_annual_data(df)

AM
AR
APA
BKR
LNG
CVX
COP
CTRA
DVN
FANG
DTM
EVA
EOG
EQT
XOM
HAL
HES
DINO
KMI
MRO
MPC
NFE
NOV
OXY
OKE
OVV
PDCE
PSX
PXD
RRC
SLB
SWN
TRGP
TPL
VLO
WMB
AES
LNT
AEE
AEP
AWK
ATO
AGR
CNP
CMS
ED
CEG
D
DTE
DUK
EIX
ETR
WTRG
EVRG
ES
EXC
FE
HE
IDA
MDU
NFG
NEE
NI
NRG
OGE
PCG
PNW
PPL
PEG
SRE
SO
UGI
VST
WEC
XEL
ATVI
GOOG
ATUS
AMC
T
CABO
CHTR
CMCSA
DISH
DIS
EA
FOX
IAC
IPG
LBRDA
LYV
LUMN
MSGS
MTCH
META
NFLX
NYT
NWS
NXST
OMC
PARA
PINS
PLTK
RBLX
ROKU
SIRI
SPOT
TMUS
TTWO
TRIP
VZ
WBD
WWE
ZI
ADM
ACI
MO
BJ
SAM
BG
CPB
CHD
CLX
KO
CL
STZ
COST
COTY
DAR
DG
DLTR
EL
FLO
FRPT
GO
HSY
HRL
INGR
K
KDP
KMB
KHC
KR
MKC
TAP
MDLZ
MNST
OLPX
PEP
PFGC
PM
PPC
POST
PG
REYN
SEB
SJM
SPB
SYY
TGT
TSN
USFD
WBA
WMT
APD
ALB
AA
AMCR
ATR
AMBP
ASH
AVY
AXTA
BALL
BERY
CE
CF
CC
CLF
CTVA
CCK
DOW
DD
EXP
EMN
ECL
ESI
FMC
FCX
DNA
GPK
HUN
IFF
IP
LIN
LPX
LYB
MLM
MOS
MP
NEU
NEM
NUE
OLN
PKG
PPG
RS
SMG
SEE
SHW
SLGN
SON
SCCO
SSRM
STLD
X
VVV
VMC
WLK
WRK
Z
MMM
AYI
ADP
WMS
ACM
AGCO
AL
ALK
ALLE
ALSN
AAL
AME
AWI
CAR
AXON
AZEK
BA
BAH
BR
BLDR
BWXT
CHRW
CSL
CARR

In [16]:
# Let's see if that leaves us with companies with less than 4 years of accounts

df.groupby('company').filter(lambda x: len(x) < 4)

Unnamed: 0,st_date,amortizationCashFlow,amortizationOfIntangibles,assetImpairmentCharge,beginningCashPosition,capitalExpenditure,cashDividendsPaid,cashFlowFromContinuingFinancingActivities,cashFlowFromContinuingInvestingActivities,cashFlowFromContinuingOperatingActivities,changeInAccountPayable,changeInAccruedExpense,changeInCashSupplementalAsReported,changeInOtherCurrentAssets,changeInPayable,changeInPayablesAndAccruedExpense,changeInReceivables,changeInWorkingCapital,changesInAccountReceivables,commonStockDividendPaid,commonStockPayments,deferredIncomeTax,deferredTax,depreciation,depreciationAmortizationDepletion,depreciationAndAmortization,dividendReceivedCFO,earningsLossesFromEquityInvestments,endCashPosition,financingCashFlow,freeCashFlow,incomeTaxPaidSupplementalData,interestPaidSupplementalData,investingCashFlow,issuanceOfDebt,longTermDebtIssuance,longTermDebtPayments,netBusinessPurchaseAndSale,netCommonStockIssuance,netIncome,netIncomeFromContinuingOperations,netIssuancePaymentsOfDebt,netLongTermDebtIssuance,netOtherFinancingCharges,netOtherInvestingChanges,netPPEPurchaseAndSale,netShortTermDebtIssuance,operatingCashFlow,operatingGainsLosses,otherNonCashItems,preferredStockDividendPaid,purchaseOfBusiness,purchaseOfPPE,repaymentOfDebt,repurchaseOfCapitalStock,saleOfBusiness,stockBasedCompensation,changeInIncomeTaxPayable,changeInTaxPayable,changesInCash,dividendsReceivedCFI,shortTermDebtPayments,company,industry,yahoo_sector,gics_sector,st_YR,st_Mnth,changeInDividendPayable,changeInOtherCurrentLiabilities,commonStockIssuance,depletion,gainLossOnInvestmentSecurities,gainLossOnSaleOfBusiness,issuanceOfCapitalStock,otherCashAdjustmentOutsideChangeinCash,changeInPrepaidAssets,changeInInventory,changeInOtherWorkingCapital,netInvestmentPurchaseAndSale,saleOfInvestment,saleOfPPE,shortTermDebtIssuance,capitalExpenditureReported,effectOfExchangeRateChanges,unrealizedGainLossOnInvestmentSecurities,interestPaidCFO,gainLossOnSaleOfPPE,netForeignCurrencyExchangeGainLoss,proceedsFromStockOptionExercised,purchaseOfInvestment,changeInInterestPayable,netPreferredStockIssuance,preferredStockPayments,cashFlowFromDiscontinuedOperation,cashFromDiscontinuedFinancingActivities,cashFromDiscontinuedInvestingActivities,cashFromDiscontinuedOperatingActivities,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestmentProperties,dividendPaidCFO,pensionAndEmployeeBenefitExpense,provisionandWriteOffofAssets,amortizationOfSecurities,otherCashAdjustmentInsideChangeinCash,preferredStockIssuance,saleOfInvestmentProperties,netIntangiblesPurchaseAndSale,purchaseOfIntangibles,taxesRefundPaid,saleOfIntangibles,interestReceivedCFI,interestReceivedCFO,excessTaxBenefitFromStockBasedCompensation,interestPaidCFF,cashFlowsfromusedinOperatingActivitiesDirect,classesofCashPayments,classesofCashReceiptsfromOperatingActivities,dividendsReceivedDirect,interestPaidDirect,interestReceivedDirect,otherCashPaymentsfromOperatingActivities,otherCashReceiptsfromOperatingActivities,paymentstoSuppliersforGoodsandServices,M


In [17]:
# Sanity check. Do we still have companies with more than 4.

df.groupby('company').filter(lambda x: len(x) > 4)

Unnamed: 0,st_date,amortizationCashFlow,amortizationOfIntangibles,assetImpairmentCharge,beginningCashPosition,capitalExpenditure,cashDividendsPaid,cashFlowFromContinuingFinancingActivities,cashFlowFromContinuingInvestingActivities,cashFlowFromContinuingOperatingActivities,changeInAccountPayable,changeInAccruedExpense,changeInCashSupplementalAsReported,changeInOtherCurrentAssets,changeInPayable,changeInPayablesAndAccruedExpense,changeInReceivables,changeInWorkingCapital,changesInAccountReceivables,commonStockDividendPaid,commonStockPayments,deferredIncomeTax,deferredTax,depreciation,depreciationAmortizationDepletion,depreciationAndAmortization,dividendReceivedCFO,earningsLossesFromEquityInvestments,endCashPosition,financingCashFlow,freeCashFlow,incomeTaxPaidSupplementalData,interestPaidSupplementalData,investingCashFlow,issuanceOfDebt,longTermDebtIssuance,longTermDebtPayments,netBusinessPurchaseAndSale,netCommonStockIssuance,netIncome,netIncomeFromContinuingOperations,netIssuancePaymentsOfDebt,netLongTermDebtIssuance,netOtherFinancingCharges,netOtherInvestingChanges,netPPEPurchaseAndSale,netShortTermDebtIssuance,operatingCashFlow,operatingGainsLosses,otherNonCashItems,preferredStockDividendPaid,purchaseOfBusiness,purchaseOfPPE,repaymentOfDebt,repurchaseOfCapitalStock,saleOfBusiness,stockBasedCompensation,changeInIncomeTaxPayable,changeInTaxPayable,changesInCash,dividendsReceivedCFI,shortTermDebtPayments,company,industry,yahoo_sector,gics_sector,st_YR,st_Mnth,changeInDividendPayable,changeInOtherCurrentLiabilities,commonStockIssuance,depletion,gainLossOnInvestmentSecurities,gainLossOnSaleOfBusiness,issuanceOfCapitalStock,otherCashAdjustmentOutsideChangeinCash,changeInPrepaidAssets,changeInInventory,changeInOtherWorkingCapital,netInvestmentPurchaseAndSale,saleOfInvestment,saleOfPPE,shortTermDebtIssuance,capitalExpenditureReported,effectOfExchangeRateChanges,unrealizedGainLossOnInvestmentSecurities,interestPaidCFO,gainLossOnSaleOfPPE,netForeignCurrencyExchangeGainLoss,proceedsFromStockOptionExercised,purchaseOfInvestment,changeInInterestPayable,netPreferredStockIssuance,preferredStockPayments,cashFlowFromDiscontinuedOperation,cashFromDiscontinuedFinancingActivities,cashFromDiscontinuedInvestingActivities,cashFromDiscontinuedOperatingActivities,netInvestmentPropertiesPurchaseAndSale,purchaseOfInvestmentProperties,dividendPaidCFO,pensionAndEmployeeBenefitExpense,provisionandWriteOffofAssets,amortizationOfSecurities,otherCashAdjustmentInsideChangeinCash,preferredStockIssuance,saleOfInvestmentProperties,netIntangiblesPurchaseAndSale,purchaseOfIntangibles,taxesRefundPaid,saleOfIntangibles,interestReceivedCFI,interestReceivedCFO,excessTaxBenefitFromStockBasedCompensation,interestPaidCFF,cashFlowsfromusedinOperatingActivitiesDirect,classesofCashPayments,classesofCashReceiptsfromOperatingActivities,dividendsReceivedDirect,interestPaidDirect,interestReceivedDirect,otherCashPaymentsfromOperatingActivities,otherCashReceiptsfromOperatingActivities,paymentstoSuppliersforGoodsandServices,M


In [18]:
# Check count of companies

len(df['company'].unique())

737

## Operating Activities

In [19]:
df[operating].isnull().sum().sort_values()

company                                            0
yahoo_sector                                       0
industry                                           0
st_date                                            0
netIncome                                          0
changeInWorkingCapital                             1
depreciationAmortizationDepletion                  1
cashFlowFromContinuingOperatingActivities          1
netIncomeFromContinuingOperations                  1
changeInPayablesAndAccruedExpense                 54
changeInReceivables                              118
depreciationAndAmortization                      142
otherNonCashItems                                216
deferredIncomeTax                                299
stockBasedCompensation                           320
changeInPayable                                  459
changeInAccountPayable                           509
operatingGainsLosses                             627
changeInOtherWorkingCapital                   

In [20]:
df[df['cashFlowFromContinuingOperatingActivities'].isnull()].sort_values(by=['industry','company','st_date'])[summary]

Unnamed: 0,company,yahoo_sector,industry,st_date,netIncome,changeInWorkingCapital,capitalExpenditure,beginningCashPosition,endCashPosition,operatingCashFlow,cashFlowFromContinuingOperatingActivities,cashFlowFromContinuingInvestingActivities,cashFlowFromContinuingFinancingActivities,purchaseOfBusiness,purchaseOfPPE,repaymentOfDebt,repurchaseOfCapitalStock,cashDividendsPaid,commonStockDividendPaid,gainLossOnSaleOfBusiness,saleOfPPE,freeCashFlow
2337,UHAL,Industrials,Rental & Leasing Services,2023-03-31,922998000.0,,,,,,,,,,,,,,,,,


UHAL - Data for 2023 is missing. Verified this is the case by check Yahoo Finance website. 

Delete this company.

In [21]:
drop_data_details['companies'] = drop_data_details['companies'] + ['UHAL']
df.drop(df[df['company'] == 'UHAL'].index, inplace=True, axis=0)

In [22]:
df[df['changeInPayablesAndAccruedExpense'].isnull()].sort_values(by=['industry','company','st_date'])[operating]

Unnamed: 0,company,yahoo_sector,industry,st_date,netIncome,netIncomeFromContinuingOperations,operatingGainsLosses,depreciation,depreciationAmortizationDepletion,depreciationAndAmortization,stockBasedCompensation,deferredIncomeTax,changeInInterestPayable,changeInWorkingCapital,changeInOtherWorkingCapital,changesInAccountReceivables,changeInReceivables,changeInAccountPayable,changeInPayable,paymentstoSuppliersforGoodsandServices,changeInPayablesAndAccruedExpense,otherCashPaymentsfromOperatingActivities,otherCashReceiptsfromOperatingActivities,otherNonCashItems,cashFlowsfromusedinOperatingActivitiesDirect,cashFlowFromContinuingOperatingActivities
1632,ALK,Industrials,Airlines,2019-12-31,769000000.0,769000000.0,,,423000000.0,423000000.0,29000000.0,,,501000000.0,346000000.0,43000000.0,43000000.0,,,,,,,,,1722000000.0
1633,ALK,Industrials,Airlines,2020-12-31,-1307000000.0,-1324000000.0,,,420000000.0,420000000.0,24000000.0,,,-201000000.0,-214000000.0,-160000000.0,-160000000.0,,,,,,,220000000.0,,-234000000.0
1634,ALK,Industrials,Airlines,2021-12-31,478000000.0,478000000.0,,,394000000.0,394000000.0,51000000.0,,,118000000.0,94000000.0,-66000000.0,-66000000.0,,,,,,,434000000.0,,1030000000.0
1635,ALK,Industrials,Airlines,2022-12-31,58000000.0,58000000.0,,,415000000.0,415000000.0,42000000.0,,,407000000.0,140000000.0,-45000000.0,250000000.0,,,,,,,953000000.0,,1418000000.0
4362,SGEN,Healthcare,Biotechnology,2019-12-31,-158650000.0,-158650000.0,-48271000.0,33514000.0,33514000.0,33514000.0,127349000.0,,,-112763000.0,-33600000.0,-89720000.0,-89720000.0,,,,,,,,,-163737000.0
4363,SGEN,Healthcare,Biotechnology,2020-12-31,613670000.0,613670000.0,-11630000.0,47039000.0,63384000.0,63384000.0,147233000.0,-2053000.0,,42860000.0,184022000.0,-88727000.0,-88727000.0,,,,,,,,,856568000.0
4364,SGEN,Healthcare,Biotechnology,2021-12-31,-674471000.0,-674471000.0,-4744000.0,55539000.0,78626000.0,78626000.0,173117000.0,548000.0,,-88016000.0,,-64268000.0,-64268000.0,,,,,,,,,-499007000.0
4360,SGEN,Healthcare,Biotechnology,2022-12-31,-610308000.0,-610308000.0,10154000.0,59081000.0,82158000.0,82158000.0,221297000.0,553000.0,,-150540000.0,,-112656000.0,-112656000.0,,,,,,,,,-453751000.0
897,SEB,Industrials,Conglomerates,2019-12-31,283000000.0,283000000.0,-184000000.0,,138000000.0,138000000.0,,-66000000.0,,-17000000.0,,,-84000000.0,,,,,,,7000000.0,,171000000.0
1843,EFX,Industrials,Consulting Services,2019-12-31,-398800000.0,-392800000.0,,,337300000.0,337300000.0,49700000.0,-87200000.0,,406800000.0,,-61300000.0,-61300000.0,,,,,,,,,313800000.0


Scrollling down, we are able to verify that where changeInPayablesAndAccruedExpense is null, so are related 
fields, so we cannot derive change in accounts payable. Of the fields available, changeInPayablesAndAccruedExpense
is the one we with the best count and one we can include in the final dataset.

In [26]:
df[df['changeInReceivables'].isnull()].sort_values(by=['industry','company','st_date'])[operating]

Unnamed: 0,company,yahoo_sector,industry,st_date,netIncome,netIncomeFromContinuingOperations,operatingGainsLosses,depreciation,depreciationAmortizationDepletion,depreciationAndAmortization,stockBasedCompensation,deferredIncomeTax,changeInInterestPayable,changeInWorkingCapital,changeInOtherWorkingCapital,changesInAccountReceivables,changeInReceivables,changeInAccountPayable,changeInPayable,paymentstoSuppliersforGoodsandServices,changeInPayablesAndAccruedExpense,otherCashPaymentsfromOperatingActivities,otherCashReceiptsfromOperatingActivities,otherNonCashItems,cashFlowsfromusedinOperatingActivitiesDirect,cashFlowFromContinuingOperatingActivities
3443,GPS,Consumer Cyclical,Apparel Retail,2020-01-31,351000000.0,351000000.0,-191000000.0,,557000000.0,557000000.0,68000000.0,-81000000.0,,4000000.0,-61000000.0,,,66000000.0,152000000.0,,262000000.0,,,353000000.0,,1411000000.0
3444,GPS,Consumer Cyclical,Apparel Retail,2021-01-31,-665000000.0,-665000000.0,58000000.0,,507000000.0,507000000.0,77000000.0,-137000000.0,,-45000000.0,-189000000.0,,,564000000.0,260000000.0,,260000000.0,,,-144000000.0,,237000000.0
3445,GPS,Consumer Cyclical,Apparel Retail,2022-01-31,256000000.0,256000000.0,384000000.0,,504000000.0,504000000.0,139000000.0,-61000000.0,,-492000000.0,-102000000.0,,,186000000.0,101000000.0,,101000000.0,,,148000000.0,,809000000.0
3446,GPS,Consumer Cyclical,Apparel Retail,2023-01-31,-202000000.0,-202000000.0,-48000000.0,531000000.0,540000000.0,540000000.0,37000000.0,42000000.0,,197000000.0,-107000000.0,,,-540000000.0,-123000000.0,,-366000000.0,,,129000000.0,,607000000.0
3561,LULU,Consumer Cyclical,Apparel Retail,2020-01-31,645596000.0,645596000.0,-1925000.0,,161933000.0,161933000.0,45593000.0,24129000.0,,-194071000.0,82164000.0,,,-14810000.0,-55074000.0,,-64672000.0,,,23720000.0,,669316000.0
3558,LULU,Consumer Cyclical,Apparel Retail,2021-01-31,588913000.0,588913000.0,4485000.0,,185478000.0,185478000.0,50797000.0,34908000.0,,-47549000.0,54537000.0,,,82663000.0,58538000.0,,157699000.0,,,214423000.0,,803336000.0
3559,LULU,Consumer Cyclical,Apparel Retail,2022-01-31,975322000.0,975322000.0,15191000.0,,224206000.0,224206000.0,69137000.0,-5180000.0,,129131000.0,160208000.0,,,117655000.0,238433000.0,,342311000.0,,,413786000.0,,1389108000.0
3560,LULU,Consumer Cyclical,Apparel Retail,2023-01-31,854800000.0,854800000.0,-48829000.0,,291791000.0,291791000.0,78075000.0,3042000.0,,-596992000.0,139425000.0,,,-107280000.0,-71294000.0,,-5930000.0,,,-23337000.0,,966463000.0
3707,ROST,Consumer Cyclical,Apparel Retail,2020-01-31,1660928000.0,1660928000.0,,,350892000.0,350892000.0,95438000.0,32009000.0,,32279000.0,15064000.0,,,114153000.0,78914000.0,,78914000.0,,,,,2171546000.0
3708,ROST,Consumer Cyclical,Apparel Retail,2021-01-31,85382000.0,85382000.0,239953000.0,,364245000.0,364245000.0,101568000.0,-27812000.0,,1482597000.0,48559000.0,,,938837000.0,978643000.0,,978643000.0,,,,,2245933000.0


Verified that alternative fields are also null. Conclude that cahngeInReceivables is the best one to use in final
data set and some values will be null.

In [91]:
operating

['company',
 'yahoo_sector',
 'industry',
 'st_date',
 'netIncome',
 'netIncomeFromContinuingOperations',
 'operatingGainsLosses',
 'depreciation',
 'depreciationAmortizationDepletion',
 'depreciationAndAmortization',
 'stockBasedCompensation',
 'deferredIncomeTax',
 'changeInInterestPayable',
 'changeInWorkingCapital',
 'changeInOtherWorkingCapital',
 'changesInAccountReceivables',
 'changeInReceivables',
 'changeInAccountPayable',
 'changeInPayable',
 'paymentstoSuppliersforGoodsandServices',
 'changeInPayablesAndAccruedExpense',
 'otherCashPaymentsfromOperatingActivities',
 'otherCashReceiptsfromOperatingActivities',
 'otherNonCashItems',
 'cashFlowsfromusedinOperatingActivitiesDirect',
 'cashFlowFromContinuingOperatingActivities']

In [94]:
# Select fields for final cut

final_operating = ['netIncome',
                   'netIncomeFromContinuingOperations',
                   'depreciationAmortizationDepletion',
                   'stockBasedCompensation',
                   'cashFlowFromContinuingOperatingActivities']

is_col_name_mapping['cf_operatingIncome'] = ['netIncomeFromContinuingOperations']

## Investing

In [29]:
df[investing].isnull().sum().sort_values()

company                                         0
investingCashFlow                               0
cashFlowFromContinuingInvestingActivities       0
cashFlowFromContinuingInvestingActivities       0
st_date                                         0
industry                                        0
yahoo_sector                                    0
capitalExpenditure                             11
netBusinessPurchaseAndSale                    532
netPPEPurchaseAndSale                         696
purchaseOfBusiness                            701
netOtherInvestingChanges                      959
purchaseOfPPE                                1046
netInvestmentPurchaseAndSale                 1101
purchaseOfInvestment                         1364
saleOfInvestment                             1377
capitalExpenditureReported                   1565
saleOfPPE                                    1889
saleOfBusiness                               1935
purchaseOfIntangibles                        2383


In [32]:
df[df['capitalExpenditure'].isnull()].sort_values(by=['industry','company','st_date'])[investing]

Unnamed: 0,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,netInvestmentPropertiesPurchaseAndSale,netInvestmentPurchaseAndSale,netBusinessPurchaseAndSale,netPPEPurchaseAndSale,purchaseOfBusiness,purchaseOfIntangibles,purchaseOfInvestment,purchaseOfInvestmentProperties,purchaseOfPPE,saleOfBusiness,saleOfIntangibles,saleOfInvestment,saleOfInvestmentProperties,saleOfPPE,netOtherInvestingChanges,investingCashFlow,cashFlowFromContinuingInvestingActivities.1
4343,RPRX,Healthcare,Biotechnology,2019-12-31,,,-2116142000.0,,-1839100000.0,-27042000.0,,-27042000.0,,-2363651000.0,,,,,524551000.0,,,-250000000.0,-2116142000.0,-2116142000.0
4344,RPRX,Healthcare,Biotechnology,2020-12-31,,,-2759000000.0,,-2734000000.0,-40000000.0,,-40000000.0,,-3937000000.0,,,,,1203000000.0,,,-2182000000.0,-2759000000.0,-2759000000.0
4345,RPRX,Healthcare,Biotechnology,2021-12-31,,,-1870280000.0,,-1817348000.0,-34855000.0,,-34855000.0,,-3593656000.0,,,,,1776308000.0,,,-18600000.0,-1870280000.0,-1870280000.0
4346,RPRX,Healthcare,Biotechnology,2022-12-31,,,-1029421000.0,,-1019525000.0,-9896000.0,,-9896000.0,,-2565068000.0,,,,,1545543000.0,,,,-1029421000.0,-1029421000.0
4435,VEEV,Healthcare,Health Information Services,2021-01-31,,,-333634000.0,,-324951000.0,0.0,,0.0,,-979292000.0,,,,,654341000.0,,,-8683000.0,-333634000.0,-333634000.0
4436,VEEV,Healthcare,Health Information Services,2022-01-31,,,-346152000.0,,-324158000.0,-7780000.0,,-7780000.0,,-1117076000.0,,,,,792918000.0,,,-14214000.0,-346152000.0,-346152000.0
4432,VEEV,Healthcare,Health Information Services,2023-01-31,,,-1007683000.0,,-994171000.0,0.0,,0.0,,-1996878000.0,,,,,1002707000.0,,,-13512000.0,-1007683000.0,-1007683000.0
203,LNT,Utilities,Utilities—Regulated Electric,2019-12-31,,,-1287300000.0,,,-1640100000.0,,-1640100000.0,,,,,,,,,,352800000.0,-1287300000.0,-1287300000.0
204,LNT,Utilities,Utilities—Regulated Electric,2020-12-31,,,-951000000.0,,,-1366000000.0,,-1366000000.0,,,,,,,,,,415000000.0,-951000000.0,-951000000.0
202,LNT,Utilities,Utilities—Regulated Electric,2021-12-31,,,-728000000.0,,,-1169000000.0,,-1169000000.0,,,,,,,,,,441000000.0,-728000000.0,-728000000.0


In [34]:
col_list=['company','st_date',
          'cashFlowFromContinuingOperatingActivities','capitalExpenditure','freeCashFlow']
df[df['company'].isin(['RPRX','VEEV','LNT'])][col_list].sort_values(by=['company','st_date'])

Unnamed: 0,company,st_date,cashFlowFromContinuingOperatingActivities,capitalExpenditure,freeCashFlow
203,LNT,2019-12-31,660400000.0,,660400000.0
204,LNT,2020-12-31,501000000.0,,501000000.0
202,LNT,2021-12-31,582000000.0,,582000000.0
200,LNT,2022-12-31,486000000.0,,486000000.0
4343,RPRX,2019-12-31,1667239000.0,,1667239000.0
4344,RPRX,2020-12-31,2034629000.0,,2035000000.0
4345,RPRX,2021-12-31,2017536000.0,,2017536000.0
4346,RPRX,2022-12-31,2143980000.0,,2143980000.0
4434,VEEV,2020-01-31,437375000.0,-4321000.0,433054000.0
4435,VEEV,2021-01-31,551246000.0,,551246000.0


Where CapEx is null, free cash flow is equal to operating cash flow. So we can deduce that capex is 0 if null.
Set capitalExpenditure to 0 if null on this basis.

In [35]:
df['capitalExpenditure'] = df['capitalExpenditure'].fillna(0)

In [38]:
df[df['netBusinessPurchaseAndSale'].isnull()].sort_values(by=['industry','company','st_date'])[investing]

Unnamed: 0,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,netBusinessPurchaseAndSale,purchaseOfBusiness,saleOfBusiness,netPPEPurchaseAndSale,netInvestmentPropertiesPurchaseAndSale,purchaseOfIntangibles,purchaseOfInvestment,purchaseOfInvestmentProperties,purchaseOfPPE,saleOfInvestmentProperties,saleOfPPE,saleOfIntangibles,purchaseOfInvestment.1,saleOfInvestment,netInvestmentPurchaseAndSale,netOtherInvestingChanges,investingCashFlow,cashFlowFromContinuingInvestingActivities.1
1682,BA,Industrials,Aerospace & Defense,2020-12-31,-1303000000.0,,-18366000000.0,,,,-1007000000.0,,,-37616000000.0,,-1303000000.0,,296000000.0,,-37616000000.0,20275000000.0,-17341000000.0,-18000000.0,-18366000000.0,-18366000000.0
1679,BA,Industrials,Aerospace & Defense,2022-12-31,-1222000000.0,,4370000000.0,,,,-1187000000.0,,,-5051000000.0,,-1222000000.0,,35000000.0,,-5051000000.0,10619000000.0,5568000000.0,-11000000.0,4370000000.0,4370000000.0
1903,GD,Industrials,Aerospace & Defense,2021-12-31,-887000000.0,-887000000.0,-882000000.0,,,,,,,,,,,,,,,,5000000.0,-882000000.0,-882000000.0
1904,GD,Industrials,Aerospace & Defense,2022-12-31,-1114000000.0,-1114000000.0,-1489000000.0,,,,,,,,,,,,,,,,-375000000.0,-1489000000.0,-1489000000.0
1949,HWM,Industrials,Aerospace & Defense,2019-12-31,-586000000.0,-586000000.0,583000000.0,,,,,,,,,,,,,,73000000.0,73000000.0,1096000000.0,583000000.0,583000000.0
1950,HWM,Industrials,Aerospace & Defense,2020-12-31,-267000000.0,-267000000.0,271000000.0,,,,,,,,,,,,,,0.0,0.0,538000000.0,271000000.0,271000000.0
1951,HWM,Industrials,Aerospace & Defense,2021-12-31,-199000000.0,-199000000.0,107000000.0,,,,,,,,,,,,,,6000000.0,6000000.0,300000000.0,107000000.0,107000000.0
1947,HWM,Industrials,Aerospace & Defense,2022-12-31,-193000000.0,-193000000.0,-135000000.0,,,,,,,,,,,,,,0.0,0.0,58000000.0,-135000000.0,-135000000.0
1937,HXL,Industrials,Aerospace & Defense,2022-12-31,-76300000.0,-76300000.0,-54600000.0,,,,21200000.0,,,,,,,21200000.0,,,500000.0,500000.0,,-54600000.0,-54600000.0
2053,LMT,Industrials,Aerospace & Defense,2019-12-31,-1484000000.0,-1484000000.0,-1241000000.0,,,,,,,,,,,,,,,,243000000.0,-1241000000.0,-1241000000.0


We are interested in net spend on acquisitions and we can allow for any offsets resulting from disposals. So based
on count and verification from above, we can take netBusinessPurchaseAndSale as amount spent on acquisitions. If 
positive then it is amount resulting from disposals.

It's reasonable to set netBusinessPurchaseAndSale to 0 if null. I think it is fair to apply the same to purchaseOfBusiness
and saleOfBusiness.

In [39]:
df['netBusinessPurchaseAndSale'] = df['netBusinessPurchaseAndSale'].fillna(0)
df['purchaseOfBusiness'] = df['purchaseOfBusiness'].fillna(0)
df['saleOfBusiness'] = df['saleOfBusiness'].fillna(0)

In [59]:
df[df['netInvestmentPurchaseAndSale'].isnull()].sort_values(by=['industry','company','st_date'])[investing]

Unnamed: 0,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,netBusinessPurchaseAndSale,purchaseOfBusiness,saleOfBusiness,netPPEPurchaseAndSale,netInvestmentPropertiesPurchaseAndSale,purchaseOfIntangibles,purchaseOfInvestment,purchaseOfInvestmentProperties,purchaseOfPPE,saleOfInvestmentProperties,saleOfPPE,saleOfIntangibles,purchaseOfInvestment.1,saleOfInvestment,netInvestmentPurchaseAndSale,purchaseOfIntangibles.1,saleOfIntangibles.1,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,investingCashFlow,cashFlowFromContinuingInvestingActivities.1


In [43]:
df['netInvestmentPurchaseAndSale'] = df['netInvestmentPurchaseAndSale'].fillna(0)
df['purchaseOfInvestment'] = df['purchaseOfInvestment'].fillna(0)
df['saleOfInvestment'] = df['saleOfInvestment'].fillna(0)

In [42]:
df[df['netInvestmentPropertiesPurchaseAndSale'].isnull()].sort_values(by=['industry','company','st_date'])[investing]

Unnamed: 0,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,netBusinessPurchaseAndSale,purchaseOfBusiness,saleOfBusiness,netPPEPurchaseAndSale,netInvestmentPropertiesPurchaseAndSale,purchaseOfIntangibles,purchaseOfInvestment,purchaseOfInvestmentProperties,purchaseOfPPE,saleOfInvestmentProperties,saleOfPPE,saleOfIntangibles,purchaseOfInvestment.1,saleOfInvestment,netInvestmentPurchaseAndSale,netOtherInvestingChanges,investingCashFlow,cashFlowFromContinuingInvestingActivities.1
484,IPG,Communication Services,Advertising Agencies,2019-12-31,-198500000.00,-198500000.00,-161700000.00,-600000.00,-600000.00,0.00,,,,0.00,,,,,,0.00,0.00,0.00,37400000.00,-161700000.00,-161700000.00
485,IPG,Communication Services,Advertising Agencies,2020-12-31,-167500000.00,-167500000.00,-216200000.00,-4900000.00,-4900000.00,0.00,,,,0.00,,,,,,0.00,0.00,0.00,-43800000.00,-216200000.00,-216200000.00
486,IPG,Communication Services,Advertising Agencies,2021-12-31,-195300000.00,-195300000.00,-185300000.00,-16300000.00,-16300000.00,0.00,,,,0.00,,,,,,0.00,34800000.00,34800000.00,-8500000.00,-185300000.00,-185300000.00
487,IPG,Communication Services,Advertising Agencies,2022-12-31,-178100000.00,-178100000.00,-430100000.00,-252600000.00,-252600000.00,0.00,,,,0.00,,,,,,0.00,2600000.00,2600000.00,-2000000.00,-430100000.00,-430100000.00
579,OMC,Communication Services,Advertising Agencies,2019-12-31,-102200000.00,-102200000.00,-30900000.00,69400000.00,-10000000.00,79400000.00,,,,0.00,,,,,,0.00,1900000.00,1900000.00,,-30900000.00,-30900000.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,SRCL,Industrials,Waste Management,2022-12-31,-132200000.00,-132200000.00,-84600000.00,46700000.00,0.00,46700000.00,,,,0.00,,,,,,0.00,0.00,0.00,900000.00,-84600000.00,-84600000.00
2389,WM,Industrials,Waste Management,2019-12-31,-1818000000.00,-1818000000.00,-2376000000.00,-472000000.00,-521000000.00,49000000.00,,,,0.00,,,,,,0.00,0.00,0.00,-86000000.00,-2376000000.00,-2376000000.00
2390,WM,Industrials,Waste Management,2020-12-31,-1632000000.00,-1632000000.00,-4847000000.00,-3200000000.00,-4085000000.00,885000000.00,,,,0.00,,,,,,0.00,0.00,0.00,-15000000.00,-4847000000.00,-4847000000.00
2391,WM,Industrials,Waste Management,2021-12-31,-1904000000.00,-1904000000.00,-1894000000.00,21000000.00,-75000000.00,96000000.00,,,,0.00,,,,,,0.00,0.00,0.00,-11000000.00,-1894000000.00,-1894000000.00


In [56]:
df['netInvestmentPropertiesPurchaseAndSale'] = df['netInvestmentPropertiesPurchaseAndSale'].fillna(0)
df['purchaseOfInvestmentProperties'] = df['purchaseOfInvestmentProperties'].fillna(0)
df['saleOfInvestmentProperties'] = df['saleOfInvestmentProperties'].fillna(0)

In [50]:
df[df['netIntangiblesPurchaseAndSale'].notna()].sort_values(by=['industry','company','st_date'])[investing]

Unnamed: 0,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,netBusinessPurchaseAndSale,purchaseOfBusiness,saleOfBusiness,netPPEPurchaseAndSale,netInvestmentPropertiesPurchaseAndSale,purchaseOfIntangibles,purchaseOfInvestment,purchaseOfInvestmentProperties,purchaseOfPPE,saleOfInvestmentProperties,saleOfPPE,saleOfIntangibles,purchaseOfInvestment.1,saleOfInvestment,netInvestmentPurchaseAndSale,purchaseOfIntangibles.1,saleOfIntangibles.1,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,investingCashFlow,cashFlowFromContinuingInvestingActivities.1
1667,AXON,Industrials,Aerospace & Defense,2019-12-31,-16343000.0,,-240737000.0,0.0,0.0,0.0,-15939000.0,0.0,-404000.0,-354477000.0,,-15939000.0,,0.0,,-354477000.0,130083000.0,-224394000.0,-404000.0,,-404000.0,,-240737000.0,-240737000.0
1668,AXON,Industrials,Aerospace & Defense,2020-12-31,-72870000.0,,-356526000.0,-7068000.0,-7068000.0,0.0,-72534000.0,0.0,-241000.0,-656522000.0,,-72629000.0,,95000.0,,-656522000.0,379839000.0,-276683000.0,-241000.0,,-241000.0,,-356526000.0,-356526000.0
1669,AXON,Industrials,Aerospace & Defense,2021-12-31,-50278000.0,,252556000.0,-22393000.0,-22393000.0,0.0,-49843000.0,0.0,-392000.0,-407979000.0,,-49886000.0,,43000.0,,-407979000.0,733163000.0,325184000.0,-392000.0,,-392000.0,,252556000.0,252556000.0
1670,AXON,Industrials,Aerospace & Defense,2022-12-31,-56109000.0,,-830967000.0,-2104000.0,-2104000.0,0.0,-55515000.0,0.0,-307000.0,-845179000.0,,-55802000.0,,287000.0,,-845179000.0,72138000.0,-773041000.0,-307000.0,,-307000.0,,-830967000.0,-830967000.0
1681,BA,Industrials,Aerospace & Defense,2019-12-31,-1961000000.0,,-1530000000.0,9000000.0,-455000000.0,464000000.0,-1500000000.0,0.0,-127000000.0,-1658000000.0,,-1834000000.0,,334000000.0,,-1658000000.0,1759000000.0,101000000.0,-127000000.0,,-127000000.0,-13000000.0,-1530000000.0,-1530000000.0
2176,RTX,Industrials,Aerospace & Defense,2019-12-31,-2607000000.0,-2256000000.0,-3092000000.0,82000000.0,-56000000.0,82000000.0,,0.0,-351000000.0,-658000000.0,,0.0,,0.0,,-658000000.0,336000000.0,-322000000.0,-351000000.0,,-351000000.0,-245000000.0,-3092000000.0,-3092000000.0
2177,RTX,Industrials,Aerospace & Defense,2020-12-31,-1967000000.0,-1795000000.0,3343000000.0,5345000000.0,-419000000.0,5764000000.0,,0.0,-172000000.0,-312000000.0,,0.0,,0.0,,-312000000.0,368000000.0,56000000.0,-172000000.0,,-172000000.0,-91000000.0,3102000000.0,3343000000.0
2178,RTX,Industrials,Aerospace & Defense,2021-12-31,-2322000000.0,-2134000000.0,-1364000000.0,791000000.0,-1088000000.0,1879000000.0,,0.0,-188000000.0,-16000000.0,,0.0,,0.0,,-16000000.0,158000000.0,142000000.0,-188000000.0,,-188000000.0,25000000.0,-1364000000.0,-1364000000.0
2174,RTX,Industrials,Aerospace & Defense,2022-12-31,-2775000000.0,-2288000000.0,-2829000000.0,28000000.0,-66000000.0,94000000.0,,0.0,-487000000.0,-355000000.0,,0.0,,0.0,,-355000000.0,179000000.0,-176000000.0,-487000000.0,,-487000000.0,94000000.0,-2829000000.0,-2829000000.0
1017,CF,Basic Materials,Agricultural Inputs,2021-12-31,-524000000.0,,-466000000.0,0.0,0.0,0.0,-513000000.0,0.0,-10000000.0,-13000000.0,,-514000000.0,,1000000.0,58000000.0,-13000000.0,12000000.0,-1000000.0,-10000000.0,58000000.0,48000000.0,-1000000.0,-466000000.0,-466000000.0


Conclusion: purchase and sale of intangibles is significant. Net purchases and sales of intangibles is the best 
field to take in final cut. 

Reasonable to set fields to 0 where they are null.

In [54]:
df['netIntangiblesPurchaseAndSale'] = df['netIntangiblesPurchaseAndSale'].fillna(0)
df['purchaseOfIntangibles'] = df['purchaseOfIntangibles'].fillna(0)
df['saleOfIntangibles'] = df['saleOfIntangibles'].fillna(0)

In [60]:
df[df['netPPEPurchaseAndSale'].notna()].sort_values(by=['industry','company','st_date'])[investing]

Unnamed: 0,company,yahoo_sector,industry,st_date,capitalExpenditure,capitalExpenditureReported,cashFlowFromContinuingInvestingActivities,netBusinessPurchaseAndSale,purchaseOfBusiness,saleOfBusiness,netPPEPurchaseAndSale,netInvestmentPropertiesPurchaseAndSale,purchaseOfIntangibles,purchaseOfInvestment,purchaseOfInvestmentProperties,purchaseOfPPE,saleOfInvestmentProperties,saleOfPPE,saleOfIntangibles,purchaseOfInvestment.1,saleOfInvestment,netInvestmentPurchaseAndSale,purchaseOfIntangibles.1,saleOfIntangibles.1,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,investingCashFlow,cashFlowFromContinuingInvestingActivities.1
1667,AXON,Industrials,Aerospace & Defense,2019-12-31,-16343000.00,,-240737000.00,0.00,0.00,0.00,-15939000.00,0.00,-404000.00,-354477000.00,0.00,-15939000.00,0.00,0.00,0.00,-354477000.00,130083000.00,-224394000.00,-404000.00,0.00,-404000.00,,-240737000.00,-240737000.00
1668,AXON,Industrials,Aerospace & Defense,2020-12-31,-72870000.00,,-356526000.00,-7068000.00,-7068000.00,0.00,-72534000.00,0.00,-241000.00,-656522000.00,0.00,-72629000.00,0.00,95000.00,0.00,-656522000.00,379839000.00,-276683000.00,-241000.00,0.00,-241000.00,,-356526000.00,-356526000.00
1669,AXON,Industrials,Aerospace & Defense,2021-12-31,-50278000.00,,252556000.00,-22393000.00,-22393000.00,0.00,-49843000.00,0.00,-392000.00,-407979000.00,0.00,-49886000.00,0.00,43000.00,0.00,-407979000.00,733163000.00,325184000.00,-392000.00,0.00,-392000.00,,252556000.00,252556000.00
1670,AXON,Industrials,Aerospace & Defense,2022-12-31,-56109000.00,,-830967000.00,-2104000.00,-2104000.00,0.00,-55515000.00,0.00,-307000.00,-845179000.00,0.00,-55802000.00,0.00,287000.00,0.00,-845179000.00,72138000.00,-773041000.00,-307000.00,0.00,-307000.00,,-830967000.00,-830967000.00
1681,BA,Industrials,Aerospace & Defense,2019-12-31,-1961000000.00,,-1530000000.00,9000000.00,-455000000.00,464000000.00,-1500000000.00,0.00,-127000000.00,-1658000000.00,0.00,-1834000000.00,0.00,334000000.00,0.00,-1658000000.00,1759000000.00,101000000.00,-127000000.00,0.00,-127000000.00,-13000000.00,-1530000000.00,-1530000000.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1745,CLH,Industrials,Waste Management,2022-12-31,-347022000.00,,-388944000.00,-69467000.00,-86278000.00,16811000.00,-336277000.00,0.00,-1966000.00,-49845000.00,0.00,-345056000.00,0.00,8779000.00,0.00,-49845000.00,68611000.00,18766000.00,-1966000.00,0.00,-1966000.00,,-388944000.00,-388944000.00
2184,RSG,Industrials,Waste Management,2019-12-31,-1207100000.00,,-1719000000.00,-532300000.00,-575100000.00,42800000.00,-1185400000.00,0.00,0.00,0.00,0.00,-1207100000.00,0.00,21700000.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,-1300000.00,-1719000000.00,-1719000000.00
2185,RSG,Industrials,Waste Management,2020-12-31,-1194600000.00,,-1922800000.00,-736600000.00,-769500000.00,32900000.00,-1164500000.00,0.00,0.00,0.00,0.00,-1194600000.00,0.00,30100000.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,-21700000.00,-1922800000.00,-1922800000.00
2186,RSG,Industrials,Waste Management,2021-12-31,-1316300000.00,,-2466100000.00,-1175400000.00,-1221700000.00,46300000.00,-1296800000.00,0.00,0.00,0.00,0.00,-1316300000.00,0.00,19500000.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,6100000.00,-2466100000.00,-2466100000.00


In [62]:
df['netPPEPurchaseAndSale'] = df['netPPEPurchaseAndSale'].fillna(0)
df['purchaseOfPPE'] = df['purchaseOfPPE'].fillna(0)
df['saleOfPPE'] = df['saleOfPPE'].fillna(0)

In [63]:
# Check count of nulls to see what's left

df[investing].isnull().sum().sort_values()

company                                         0
netIntangiblesPurchaseAndSale                   0
saleOfIntangibles                               0
purchaseOfIntangibles                           0
netInvestmentPurchaseAndSale                    0
saleOfInvestment                                0
purchaseOfInvestment                            0
saleOfIntangibles                               0
saleOfPPE                                       0
saleOfInvestmentProperties                      0
purchaseOfPPE                                   0
purchaseOfInvestmentProperties                  0
investingCashFlow                               0
purchaseOfInvestment                            0
netInvestmentPropertiesPurchaseAndSale          0
netPPEPurchaseAndSale                           0
saleOfBusiness                                  0
purchaseOfBusiness                              0
netBusinessPurchaseAndSale                      0
cashFlowFromContinuingInvestingActivities       0


In [64]:
df['netOtherInvestingChanges'] = df['netOtherInvestingChanges'].fillna(0)

In [96]:
# Select fields for final cut

final_investing = ['capitalExpenditure', 
                   'netBusinessPurchaseAndSale',
                   'netPPEPurchaseAndSale',
                   'netInvestmentPropertiesPurchaseAndSale',
                   'netInvestmentPurchaseAndSale',
                   'netIntangiblesPurchaseAndSale',
                   'cashFlowFromContinuingInvestingActivities']

is_col_name_mapping['cf_investingCashFlow'] = 'cashFlowFromContinuingInvestingActivities'
is_col_name_mapping['cf_netAcquisitions'] = 'netBusinessPurchaseAndSale'

## Finance

In [67]:
df[finance].isnull().sum().sort_values()

company                         0
yahoo_sector                    0
industry                        0
st_date                         0
netIssuancePaymentsOfDebt     111
netLongTermDebtIssuance       159
repaymentOfDebt               188
longTermDebtPayments          257
issuanceOfDebt                290
netOtherFinancingCharges      311
netCommonStockIssuance        311
longTermDebtIssuance          394
repurchaseOfCapitalStock      667
commonStockPayments           699
cashDividendsPaid            1041
netShortTermDebtIssuance     1298
commonStockDividendPaid      1351
issuanceOfCapitalStock       1717
commonStockIssuance          1765
shortTermDebtPayments        1949
shortTermDebtIssuance        1988
netPreferredStockIssuance    2702
dtype: int64

In [72]:
df[df['netIssuancePaymentsOfDebt'].isnull()].sort_values(by=['industry','company','st_date'])[finance]

Unnamed: 0,company,yahoo_sector,industry,st_date,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,commonStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,commonStockDividendPaid.1,netOtherFinancingCharges,repurchaseOfCapitalStock
1667,AXON,Industrials,Aerospace & Defense,2019-12-31,,,,,,,,,,,0.0,,0.0,0.0,,,-4051000.0,
1668,AXON,Industrials,Aerospace & Defense,2020-12-31,,,,,,,,,,,306779000.0,,306779000.0,306779000.0,,,-7809000.0,
1669,AXON,Industrials,Aerospace & Defense,2021-12-31,,,,,,,,,,,105514000.0,,105514000.0,105514000.0,,,-331309000.0,
3561,LULU,Consumer Cyclical,Apparel Retail,2020-01-31,,,,,,,,,,,,-173399000.0,-173399000.0,,,,-21944000.0,-173399000.0
3558,LULU,Consumer Cyclical,Apparel Retail,2021-01-31,,,,,,,,,,,,-63663000.0,-63663000.0,,,,-32388000.0,-63663000.0
3559,LULU,Consumer Cyclical,Apparel Retail,2022-01-31,,,,,,,,,,,,-812602000.0,-812602000.0,,,,-50579000.0,-812602000.0
3560,LULU,Consumer Cyclical,Apparel Retail,2023-01-31,,,,,,,,,,,,-479159000.0,-479159000.0,,,,-32000.0,-479159000.0
3707,ROST,Consumer Cyclical,Apparel Retail,2020-01-31,,,,,,,,,,-369793000.0,,-1335665000.0,-1335665000.0,,,-369793000.0,,-1335665000.0
3761,TJX,Consumer Cyclical,Apparel Retail,2020-01-31,,,,,,,,,,-1071600000.0,232100000.0,-1552000000.0,-1319900000.0,232100000.0,,-1071600000.0,-23400000.0,-1552000000.0
3689,QS,Consumer Cyclical,Auto Parts,2020-12-31,,,,,,,,,,,99800000.0,,99800000.0,276262000.0,176462000.0,,676863000.0,


In [74]:
df[df['shortTermDebtPayments'].notna()].sort_values(by=['industry','company','st_date'])[finance].head(500)

Unnamed: 0,company,yahoo_sector,industry,st_date,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,commonStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,commonStockDividendPaid.1,netOtherFinancingCharges,repurchaseOfCapitalStock
1791,CW,Industrials,Aerospace & Defense,2019-12-31,37692000.0,-37934000.0,-242000.0,37692000.0,0.0,0.0,37692000.0,-37934000.0,-242000.0,-28200000.0,,-50661000.0,-50661000.0,,,-28200000.0,-812000.0,-50661000.0
1792,CW,Industrials,Aerospace & Defense,2020-12-31,570675000.0,-570675000.0,0.0,300000000.0,0.0,300000000.0,870675000.0,-570675000.0,300000000.0,-28175000.0,,-200018000.0,-200018000.0,,,-28175000.0,-874000.0,-200018000.0
1793,CW,Industrials,Aerospace & Defense,2021-12-31,455950000.0,-362050000.0,93900000.0,0.0,-100000000.0,-100000000.0,455950000.0,-462050000.0,-6100000.0,,,-343129000.0,-343129000.0,,,,-945000.0,-343129000.0
1794,CW,Industrials,Aerospace & Defense,2022-12-31,1697647000.0,-1791547000.0,-93900000.0,300000000.0,0.0,300000000.0,1997647000.0,-1791547000.0,206100000.0,-28779000.0,,-56870000.0,-56870000.0,,,-28779000.0,-1020000.0,-56870000.0
1901,GD,Industrials,Aerospace & Defense,2019-12-31,,-850000000.0,-850000000.0,0.0,-850000000.0,-850000000.0,0.0,-850000000.0,-850000000.0,-1152000000.0,,-231000000.0,-231000000.0,,,-1152000000.0,236000000.0,-231000000.0
1902,GD,Industrials,Aerospace & Defense,2020-12-31,420000000.0,-861000000.0,-441000000.0,3960000000.0,-2500000000.0,1460000000.0,4380000000.0,-3361000000.0,1019000000.0,-1240000000.0,,-587000000.0,-587000000.0,,,-1240000000.0,-95000000.0,-587000000.0
1903,GD,Industrials,Aerospace & Defense,2021-12-31,2003000000.0,-1997000000.0,6000000.0,1497000000.0,-3000000000.0,-1503000000.0,3500000000.0,-4997000000.0,-1497000000.0,-1315000000.0,,-1828000000.0,-1828000000.0,,,-1315000000.0,50000000.0,-1828000000.0
1904,GD,Industrials,Aerospace & Defense,2022-12-31,0.0,0.0,0.0,0.0,-1000000000.0,-1000000000.0,0.0,-1000000000.0,-1000000000.0,,,-1229000000.0,-1229000000.0,,,,127000000.0,-1229000000.0
1930,HEI,Industrials,Aerospace & Defense,2021-10-31,0.0,-505000000.0,-505000000.0,,,,0.0,-505000000.0,-505000000.0,-23002000.0,,,,,,-23002000.0,-32519000.0,
1957,HII,Industrials,Aerospace & Defense,2019-12-31,5119000000.0,-5119000000.0,0.0,0.0,0.0,0.0,5119000000.0,-5119000000.0,0.0,-149000000.0,,-262000000.0,-262000000.0,,,-149000000.0,-23000000.0,-262000000.0


Take netIssuancePaymentOfDebt as net debt issued. If positive then this is net debt repaid. We can ignore the other
debt related fields.

Reasonable to set the debt related fields to 0 if null.

In [None]:
TBD - Need to revisit this section and complete analysis

In [32]:
# Check whether issuanceOfDebt includes long and short term debt.

df[(df['shortTermDebtIssuance'] + df['longTermDebtIssuance']) != df['issuanceOfDebt']][finance].head(400)

Unnamed: 0,company,yahoo_sector,industry,st_date,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,commonStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,commonStockDividendPaid.1,netOtherFinancingCharges,repurchaseOfCapitalStock
0,AM,Energy,Oil & Gas Midstream,2022-12-31,,,234800000.0,0.0,0.0,0.0,0.0,0.0,234800000.0,-432825000.0,,0.0,0.0,,,-432825000.0,-7203000.0,0.0
2,AM,Energy,Oil & Gas Midstream,2019-12-31,,-115500000.0,-115500000.0,650000000.0,-115500000.0,650000000.0,650000000.0,-115500000.0,534500000.0,-495823000.0,,-125519000.0,-125519000.0,,,-495823000.0,-11083000.0,-125519000.0
3,AM,Energy,Oil & Gas Midstream,2020-12-31,,-346000000.0,-346000000.0,550000000.0,-346000000.0,550000000.0,550000000.0,-346000000.0,204000000.0,-589640000.0,,-24713000.0,-24713000.0,,,-589640000.0,-123843000.0,-24713000.0
4,AM,Energy,Oil & Gas Midstream,2021-12-31,,-66300000.0,-66300000.0,750000000.0,-667472000.0,82528000.0,750000000.0,-733772000.0,16228000.0,-471171000.0,,0.0,0.0,,,-471171000.0,-21657000.0,0.0
5,AR,Energy,Oil & Gas E&P,2019-12-31,,,,882000000.0,-191092000.0,690908000.0,882000000.0,-191092000.0,690908000.0,,0.0,-38772000.0,-38772000.0,0.0,,,-94572000.0,-38772000.0
6,AR,Energy,Oil & Gas E&P,2020-12-31,,,,752500000.0,-1219019000.0,-466519000.0,752500000.0,-1219019000.0,-466519000.0,,,-43443000.0,-43443000.0,,,,304383000.0,-43443000.0
7,AR,Energy,Oil & Gas E&P,2021-12-31,,,-1017000000.0,1800000000.0,-1640305000.0,159695000.0,1800000000.0,-1640305000.0,-857305000.0,,,0.0,0.0,,,,-92027000.0,0.0
8,AR,Energy,Oil & Gas E&P,2022-12-31,,,34800000.0,0.0,-1027728000.0,-1027728000.0,0.0,-1027728000.0,-992928000.0,,,-873744000.0,-873744000.0,,,,-241058000.0,-873744000.0
17,BKR,Energy,Oil & Gas Equipment & Services,2019-12-31,,,-542000000.0,525000000.0,-587000000.0,-587000000.0,525000000.0,-587000000.0,-587000000.0,-745000000.0,,-250000000.0,-250000000.0,,,-745000000.0,48000000.0,-250000000.0
19,BKR,Energy,Oil & Gas Equipment & Services,2021-12-31,,-873000000.0,-832000000.0,1250000000.0,-1354000000.0,-104000000.0,1250000000.0,-1354000000.0,-936000000.0,,,-434000000.0,-434000000.0,,,,-24000000.0,-434000000.0


Conclusion: issuanceOfDebt is longTermDebtIssuance only.

In [None]:
df[(df['shortTermDebtIssuance'] + df['longTermDebtIssuance']) != df['issuanceOfDebt']][finance]

In [37]:
# Check that netIssuancePaymentsOfDebt is equal to the sum of short and long term debt.

col_list = ['company','st_date','netShortTermDebtIssuance','netLongTermDebtIssuance','netIssuancePaymentsOfDebt']
df[(df['netShortTermDebtIssuance'] + df['netLongTermDebtIssuance']) != df['netIssuancePaymentsOfDebt']][col_list].head(500)

Unnamed: 0,company,st_date,netShortTermDebtIssuance,netLongTermDebtIssuance,netIssuancePaymentsOfDebt
5,AR,2019-12-31,,690908000.0,690908000.0
6,AR,2020-12-31,,-466519000.0,-466519000.0
12,APA,2019-12-31,396000000.0,235000000.0,235000000.0
15,BKR,2022-12-31,-28000000.0,-28000000.0,-28000000.0
17,BKR,2019-12-31,-542000000.0,-587000000.0,-587000000.0
20,LNG,2019-12-31,,2088000000.0,2088000000.0
21,LNG,2020-12-31,,883000000.0,883000000.0
22,LNG,2021-12-31,,-899000000.0,-899000000.0
23,LNG,2022-12-31,,-5203000000.0,-5203000000.0
29,CVX,2022-12-31,263000000.0,-8742000000.0,-8500000000.0


Didn't expect this. Totally inconsistent whether netIssuancePaymentsOfDebt includes long and short term. Disregard
this field as it is unreliable.

In [40]:
# Let's check that netShortTermDebtIssuance is equal to sum of shortTermIssuance and shortTermPayments

df[(df['shortTermDebtIssuance'] + df['shortTermDebtPayments']) != df['netShortTermDebtIssuance']][finance].head(500)

Unnamed: 0,company,yahoo_sector,industry,st_date,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,commonStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,commonStockDividendPaid.1,netOtherFinancingCharges,repurchaseOfCapitalStock
0,AM,Energy,Oil & Gas Midstream,2022-12-31,,,234800000.0,0.0,0.0,0.0,0.0,0.0,234800000.0,-432825000.0,,0.0,0.0,,,-432825000.0,-7203000.0,0.0
2,AM,Energy,Oil & Gas Midstream,2019-12-31,,-115500000.0,-115500000.0,650000000.0,-115500000.0,650000000.0,650000000.0,-115500000.0,534500000.0,-495823000.0,,-125519000.0,-125519000.0,,,-495823000.0,-11083000.0,-125519000.0
3,AM,Energy,Oil & Gas Midstream,2020-12-31,,-346000000.0,-346000000.0,550000000.0,-346000000.0,550000000.0,550000000.0,-346000000.0,204000000.0,-589640000.0,,-24713000.0,-24713000.0,,,-589640000.0,-123843000.0,-24713000.0
4,AM,Energy,Oil & Gas Midstream,2021-12-31,,-66300000.0,-66300000.0,750000000.0,-667472000.0,82528000.0,750000000.0,-733772000.0,16228000.0,-471171000.0,,0.0,0.0,,,-471171000.0,-21657000.0,0.0
5,AR,Energy,Oil & Gas E&P,2019-12-31,,,,882000000.0,-191092000.0,690908000.0,882000000.0,-191092000.0,690908000.0,,0.0,-38772000.0,-38772000.0,0.0,,,-94572000.0,-38772000.0
6,AR,Energy,Oil & Gas E&P,2020-12-31,,,,752500000.0,-1219019000.0,-466519000.0,752500000.0,-1219019000.0,-466519000.0,,,-43443000.0,-43443000.0,,,,304383000.0,-43443000.0
7,AR,Energy,Oil & Gas E&P,2021-12-31,,,-1017000000.0,1800000000.0,-1640305000.0,159695000.0,1800000000.0,-1640305000.0,-857305000.0,,,0.0,0.0,,,,-92027000.0,0.0
8,AR,Energy,Oil & Gas E&P,2022-12-31,,,34800000.0,0.0,-1027728000.0,-1027728000.0,0.0,-1027728000.0,-992928000.0,,,-873744000.0,-873744000.0,,,,-241058000.0,-873744000.0
10,APA,Energy,Oil & Gas E&P,2022-12-31,24000000.0,,24000000.0,0.0,-1493000000.0,-1493000000.0,24000000.0,-1493000000.0,-1469000000.0,-207000000.0,,-1423000000.0,-1423000000.0,,,-207000000.0,-379000000.0,-1423000000.0
12,APA,Energy,Oil & Gas E&P,2019-12-31,396000000.0,,396000000.0,989000000.0,-1150000000.0,235000000.0,1385000000.0,-1150000000.0,235000000.0,-376000000.0,2000000.0,,2000000.0,2000000.0,,-376000000.0,251000000.0,


Plenty of inconsistencies. Many cases where netShortTermDebtIssuance is populated but the other two fields aren't.
The totals do not always add up. 
Best we can do is take netShortTermDebtIssuance in the final cut as representative of net ShortTermDebt.


In [41]:
# Let's check that netLongTermDebtIssuance is equal to sum of longTermIssuance and longTermPayments

df[(df['longTermDebtIssuance'] + df['longTermDebtPayments']) != df['netLongTermDebtIssuance']][finance].head(500)

Unnamed: 0,company,yahoo_sector,industry,st_date,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,commonStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,commonStockDividendPaid.1,netOtherFinancingCharges,repurchaseOfCapitalStock
2,AM,Energy,Oil & Gas Midstream,2019-12-31,,-115500000.0,-115500000.0,650000000.0,-115500000.0,650000000.0,650000000.0,-115500000.0,534500000.0,-495823000.0,,-125519000.0,-125519000.0,,,-495823000.0,-11083000.0,-125519000.0
3,AM,Energy,Oil & Gas Midstream,2020-12-31,,-346000000.0,-346000000.0,550000000.0,-346000000.0,550000000.0,550000000.0,-346000000.0,204000000.0,-589640000.0,,-24713000.0,-24713000.0,,,-589640000.0,-123843000.0,-24713000.0
12,APA,Energy,Oil & Gas E&P,2019-12-31,396000000.0,,396000000.0,989000000.0,-1150000000.0,235000000.0,1385000000.0,-1150000000.0,235000000.0,-376000000.0,2000000.0,,2000000.0,2000000.0,,-376000000.0,251000000.0,
13,APA,Energy,Oil & Gas E&P,2020-12-31,228000000.0,,378000000.0,145000000.0,-1243000000.0,-5000000.0,373000000.0,-1243000000.0,373000000.0,-123000000.0,1000000.0,,1000000.0,1000000.0,,-123000000.0,93000000.0,
17,BKR,Energy,Oil & Gas Equipment & Services,2019-12-31,,,-542000000.0,525000000.0,-587000000.0,-587000000.0,525000000.0,-587000000.0,-587000000.0,-745000000.0,,-250000000.0,-250000000.0,,,-745000000.0,48000000.0,-250000000.0
30,CVX,Energy,Oil & Gas Integrated,2019-12-31,2586000000.0,-5407000000.0,-2821000000.0,0.0,-5025000000.0,-7800000000.0,2586000000.0,-10432000000.0,-7800000000.0,-9000000000.0,,-2900000000.0,-2900000000.0,,,-9000000000.0,-100000000.0,-2900000000.0
31,CVX,Energy,Oil & Gas Integrated,2020-12-31,10846000000.0,-10195000000.0,651000000.0,12308000000.0,-5489000000.0,7500000000.0,23154000000.0,-15684000000.0,7500000000.0,-9700000000.0,,-1500000000.0,-1500000000.0,,,-9700000000.0,-24000000.0,-1500000000.0
32,CVX,Energy,Oil & Gas Integrated,2021-12-31,4448000000.0,-10020000000.0,-5572000000.0,0.0,-7364000000.0,-12900000000.0,4448000000.0,-17384000000.0,-12900000000.0,-10200000000.0,,,0.0,,,-10200000000.0,-36000000.0,
36,COP,Energy,Oil & Gas E&P,2019-12-31,,,,,-80000000.0,-80000000.0,,-80000000.0,-80000000.0,-1500000000.0,,-3530000000.0,-3530000000.0,,,-1500000000.0,-119000000.0,-3530000000.0
39,CTRA,Energy,Oil & Gas E&P,2022-12-31,,,,0.0,-6000000.0,-880000000.0,0.0,-6000000.0,-880000000.0,-1992000000.0,,-1250000000.0,-1250000000.0,,-10000000.0,-1992000000.0,-25000000.0,-1260000000.0


Similar inconsistencies as with shortTermDebt. Make a decision to just use netLongTermDebtIssuance.

In [42]:
# Check the following.

df[(df['issuanceOfDebt'] + df['repaymentOfDebt']) != df['netIssuancePaymentsOfDebt']][finance].head(500)

Unnamed: 0,company,yahoo_sector,industry,st_date,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,commonStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,commonStockDividendPaid.1,netOtherFinancingCharges,repurchaseOfCapitalStock
0,AM,Energy,Oil & Gas Midstream,2022-12-31,,,234800000.0,0.0,0.0,0.0,0.0,0.0,234800000.0,-432825000.0,,0.0,0.0,,,-432825000.0,-7203000.0,0.0
7,AR,Energy,Oil & Gas E&P,2021-12-31,,,-1017000000.0,1800000000.0,-1640305000.0,159695000.0,1800000000.0,-1640305000.0,-857305000.0,,,0.0,0.0,,,,-92027000.0,0.0
8,AR,Energy,Oil & Gas E&P,2022-12-31,,,34800000.0,0.0,-1027728000.0,-1027728000.0,0.0,-1027728000.0,-992928000.0,,,-873744000.0,-873744000.0,,,,-241058000.0,-873744000.0
13,APA,Energy,Oil & Gas E&P,2020-12-31,228000000.0,,378000000.0,145000000.0,-1243000000.0,-5000000.0,373000000.0,-1243000000.0,373000000.0,-123000000.0,1000000.0,,1000000.0,1000000.0,,-123000000.0,93000000.0,
17,BKR,Energy,Oil & Gas Equipment & Services,2019-12-31,,,-542000000.0,525000000.0,-587000000.0,-587000000.0,525000000.0,-587000000.0,-587000000.0,-745000000.0,,-250000000.0,-250000000.0,,,-745000000.0,48000000.0,-250000000.0
19,BKR,Energy,Oil & Gas Equipment & Services,2021-12-31,,-873000000.0,-832000000.0,1250000000.0,-1354000000.0,-104000000.0,1250000000.0,-1354000000.0,-936000000.0,,,-434000000.0,-434000000.0,,,,-24000000.0,-434000000.0
29,CVX,Energy,Oil & Gas Integrated,2022-12-31,263000000.0,0.0,263000000.0,0.0,-8742000000.0,-8742000000.0,263000000.0,-8742000000.0,-8500000000.0,-11000000000.0,,-11300000000.0,-11300000000.0,,,-11000000000.0,-114000000.0,-11300000000.0
30,CVX,Energy,Oil & Gas Integrated,2019-12-31,2586000000.0,-5407000000.0,-2821000000.0,0.0,-5025000000.0,-7800000000.0,2586000000.0,-10432000000.0,-7800000000.0,-9000000000.0,,-2900000000.0,-2900000000.0,,,-9000000000.0,-100000000.0,-2900000000.0
31,CVX,Energy,Oil & Gas Integrated,2020-12-31,10846000000.0,-10195000000.0,651000000.0,12308000000.0,-5489000000.0,7500000000.0,23154000000.0,-15684000000.0,7500000000.0,-9700000000.0,,-1500000000.0,-1500000000.0,,,-9700000000.0,-24000000.0,-1500000000.0
32,CVX,Energy,Oil & Gas Integrated,2021-12-31,4448000000.0,-10020000000.0,-5572000000.0,0.0,-7364000000.0,-12900000000.0,4448000000.0,-17384000000.0,-12900000000.0,-10200000000.0,,,0.0,,,-10200000000.0,-36000000.0,


Plenty of discrepenacies. Take netIssuancePaymentsOfDebt, netShortTermDebtIssuance and netLongTermDebtIssuance 
in final cut and accept as reported. 

In [50]:
# Check for the case where both related fields are null but netCommonStockIssuance isn't.

df[df['issuanceOfDebt'].isnull() & \
   df['repaymentOfDebt'].isnull() & \
   df['netIssuancePaymentsOfDebt'].notna()][finance].head(400)

Unnamed: 0,company,yahoo_sector,industry,st_date,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,commonStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,commonStockDividendPaid.1,netOtherFinancingCharges,repurchaseOfCapitalStock
848,MNST,Consumer Defensive,Beverages—Non-Alcoholic,2022-12-31,,,,,,75000.0,,,75000.0,,64015000.0,-771028000.0,-707013000.0,64015000.0,,,,-771028000.0
852,MNST,Consumer Defensive,Beverages—Non-Alcoholic,2021-12-31,,,,,,2928000.0,,,2928000.0,,45723000.0,-13830000.0,31893000.0,45723000.0,,,,-13830000.0
1724,CARR,Industrials,Building Products & Equipment,2019-12-31,,,25000000.0,,,,,,25000000.0,,,,,,,,-1992000000.0,
1817,DOV,Industrials,Specialty Industrial Machinery,2022-12-31,,,629891000.0,,,,,,629891000.0,,,-585000000.0,-585000000.0,,,,-17605000.0,-585000000.0
2048,LECO,Industrials,Tools & Accessories,2021-12-31,,,,,,45968000.0,,,45968000.0,,,-164526000.0,-164526000.0,,,,-763000.0,-164526000.0
2135,OTIS,Industrials,Specialty Industrial Machinery,2019-12-31,,,6000000.0,,,,,,6000000.0,,,,,,,,-1139000000.0,
2249,AOS,Industrials,Specialty Industrial Machinery,2019-12-31,,,,,,62600000.0,,,62600000.0,-149200000.0,,-287700000.0,-287700000.0,,,-149200000.0,-1000000.0,-287700000.0
2250,AOS,Industrials,Specialty Industrial Machinery,2020-12-31,,,,,,-170800000.0,,,-170800000.0,-158700000.0,,-56700000.0,-56700000.0,,,-158700000.0,,-56700000.0
2427,ACN,Technology,Information Technology Services,2019-08-31,,,,,,-4772000.0,,,-4772000.0,-1864353000.0,848445000.0,-2691114000.0,-1842669000.0,848445000.0,,-1864353000.0,-60149000.0,-2691114000.0
2428,ACN,Technology,Information Technology Services,2020-08-31,,,,,,-6719000.0,,,-6719000.0,-2037733000.0,955308000.0,-2915847000.0,-1960539000.0,955308000.0,,-2037733000.0,-50820000.0,-2915847000.0


In [52]:
df[(df['issuanceOfDebt'].notna() |\
   df['repaymentOfDebt'].notna()) & \
   df['netIssuancePaymentsOfDebt'].isnull()][finance].head(400)

Unnamed: 0,company,yahoo_sector,industry,st_date,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,commonStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,commonStockDividendPaid.1,netOtherFinancingCharges,repurchaseOfCapitalStock


In [48]:
# Check for the case where both related fields are null but netCommonStockIssuance isn't.

df[df['commonStockIssuance'].isnull() & \
   df['commonStockPayments'].isnull() & \
   df['netCommonStockIssuance'].notna()][finance].head(400)


Unnamed: 0,company,yahoo_sector,industry,st_date,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,commonStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,commonStockDividendPaid.1,netOtherFinancingCharges,repurchaseOfCapitalStock
32,CVX,Energy,Oil & Gas Integrated,2021-12-31,4448000000.0,-10020000000.0,-5572000000.0,0.0,-7364000000.0,-12900000000.0,4448000000.0,-17384000000.0,-12900000000.0,-10200000000.0,,,0.0,,,-10200000000.0,-36000000.0,
324,NFG,Energy,Oil & Gas Integrated,2019-09-30,,,55200000.0,0.0,0.0,0.0,0.0,0.0,55200000.0,-147418000.0,,,-8877000.0,,,-147418000.0,,
325,NFG,Energy,Oil & Gas Integrated,2020-09-30,,,-25200000.0,493007000.0,0.0,493007000.0,493007000.0,0.0,467807000.0,-153322000.0,,,161603000.0,,,-153322000.0,,
326,NFG,Energy,Oil & Gas Integrated,2021-09-30,,,128500000.0,495267000.0,-515715000.0,-20448000.0,495267000.0,-515715000.0,108052000.0,-163089000.0,,,-3702000.0,,,-163089000.0,,
354,PNW,Utilities,Utilities—Regulated Electric,2022-12-31,48720000.0,0.0,48720000.0,875537000.0,-150000000.0,725537000.0,924257000.0,-150000000.0,774257000.0,-378881000.0,,,-2653000.0,,,-378881000.0,-21255000.0,
356,PNW,Utilities,Utilities—Regulated Electric,2019-12-31,103275000.0,-65000000.0,38275000.0,1092188000.0,-600000000.0,492188000.0,1195463000.0,-665000000.0,530463000.0,-329643000.0,,,692000.0,,,-329643000.0,-22744000.0,
357,PNW,Utilities,Utilities—Regulated Electric,2020-12-31,825015000.0,-770690000.0,54325000.0,1596672000.0,-915150000.0,681522000.0,2421687000.0,-1685840000.0,735847000.0,-350577000.0,,,-1389000.0,,,-350577000.0,-22743000.0,
358,PNW,Utilities,Utilities—Regulated Electric,2021-12-31,142000000.0,-19000000.0,123000000.0,746999000.0,0.0,746999000.0,888999000.0,-19000000.0,869999000.0,-369478000.0,,,-2350000.0,,,-369478000.0,-21255000.0,
800,INGR,Consumer Defensive,Packaged Foods,2021-12-31,,,250000000.0,1300000000.0,-1690000000.0,-390000000.0,1300000000.0,-1690000000.0,-140000000.0,-184000000.0,,,-49000000.0,,,-184000000.0,,
1893,GE,Industrials,Specialty Industrial Machinery,2019-12-31,,,280000000.0,2185000000.0,-16567000000.0,-14382000000.0,2185000000.0,-16567000000.0,-14102000000.0,-649000000.0,,,29000000.0,,,-649000000.0,-1043000000.0,


In [49]:
# Check for case where netCommonStockIssuance is null but related fields aren't

df[(df['commonStockIssuance'].notna() | \
   df['commonStockPayments'].notna()) & \
   df['netCommonStockIssuance'].isnull()][finance].head(400)

Unnamed: 0,company,yahoo_sector,industry,st_date,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,commonStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,commonStockDividendPaid.1,netOtherFinancingCharges,repurchaseOfCapitalStock


Conclusion: netCommonStockIssuance is the most reliable field for stock issued and should be included in final cut
as stock issued.

In [73]:
# cashDividendsPaid has a lower null count than commonStockDividendPaid. Check if there are cases where 
# cashDividendsPaid is null but commonStockDividendPaid is populated.

df[df['cashDividendsPaid'].isnull() & df['commonStockDividendPaid'].notna()][finance]

Unnamed: 0,company,yahoo_sector,industry,st_date,shortTermDebtIssuance,shortTermDebtPayments,netShortTermDebtIssuance,longTermDebtIssuance,longTermDebtPayments,netLongTermDebtIssuance,issuanceOfDebt,repaymentOfDebt,netIssuancePaymentsOfDebt,cashDividendsPaid,commonStockDividendPaid,commonStockIssuance,commonStockPayments,netCommonStockIssuance,issuanceOfCapitalStock,netPreferredStockIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock


In [83]:
# Sanity check to confirm that cashDividends paid is the sum of dividends paid for common and preferred shares.

df_temp = df[['company','st_date','cashDividendsPaid',
             'commonStockDividendPaid','preferredStockDividendPaid']].copy()
df_temp['totalDividendsPaid'] = df_temp['commonStockDividendPaid'] + df_temp['preferredStockDividendPaid']

df_temp[df_temp['cashDividendsPaid'].notna()].head(300)

Unnamed: 0,company,st_date,cashDividendsPaid,commonStockDividendPaid,preferredStockDividendPaid,totalDividendsPaid
0,AM,2022-12-31,-433375000.0,-432825000.0,-550000.0,-433375000.0
2,AM,2019-12-31,-496197000.0,-495823000.0,-374000.0,-496197000.0
3,AM,2020-12-31,-590190000.0,-589640000.0,-550000.0,-590190000.0
4,AM,2021-12-31,-471721000.0,-471171000.0,-550000.0,-471721000.0
10,APA,2022-12-31,-218000000.0,-207000000.0,-11000000.0,-218000000.0
12,APA,2019-12-31,-376000000.0,-376000000.0,,
13,APA,2020-12-31,-146000000.0,-123000000.0,-23000000.0,-146000000.0
14,APA,2021-12-31,-98000000.0,-52000000.0,-46000000.0,-98000000.0
15,BKR,2022-12-31,-743000000.0,-743000000.0,,
17,BKR,2019-12-31,-745000000.0,-745000000.0,,


In [88]:
# Set selected fields to 0 if not popupated

col_list = ['netIssuancePaymentsOfDebt','netLongTermDebtIssuance','netShortTermDebtIssuance',
            'commonStockDividendPaid', 'netCommonStockIssuance', 'netOtherFinancingCharges',
            'cashDividendsPaid']

df[col_list] = df[col_list].fillna(0)

In [86]:
# Check null counts

df[finance].isnull().sum().sort_values()

company                          0
yahoo_sector                     0
industry                         0
st_date                          0
netCommonStockIssuance           0
commonStockDividendPaid          0
netShortTermDebtIssuance         0
cashDividendsPaid                0
netIssuancePaymentsOfDebt        0
netLongTermDebtIssuance          0
netOtherFinancingCharges         0
repaymentOfDebt                188
longTermDebtPayments           257
issuanceOfDebt                 290
longTermDebtIssuance           394
repurchaseOfCapitalStock       667
commonStockPayments            699
issuanceOfCapitalStock        1717
commonStockIssuance           1765
shortTermDebtPayments         1949
shortTermDebtIssuance         1988
netPreferredStockIssuance     2702
preferredStockDividendPaid    2846
dtype: int64

In [102]:
final_finance = ['netCommonStockIssuance','cashDividendsPaid','netIssuancePaymentsOfDebt',
                 'netLongTermDebtIssuance','netShortTermDebtIssuance','netOtherFinancingCharges',
                 'repurchaseOfCapitalStock', 'cashFlowFromContinuingFinancingActivities']

is_col_name_mapping['cf_financeCashFlow'] = 'cashFlowFromContinuingFinanceActivities'

## Adhoc

In [76]:
df[adhoc].isnull().sum().sort_values()

company                                      0
yahoo_sector                                 0
industry                                     0
st_date                                      0
cashFlowFromContinuingOperatingActivities    0
capitalExpenditure                           0
freeCashFlow                                 0
dtype: int64

In [78]:
df[adhoc].sort_values(by=['industry','company','st_date']).head(500)

Unnamed: 0,company,yahoo_sector,industry,st_date,cashFlowFromContinuingOperatingActivities,capitalExpenditure,freeCashFlow
484,IPG,Communication Services,Advertising Agencies,2019-12-31,1529200000.0,-198500000.0,1330700000.0
485,IPG,Communication Services,Advertising Agencies,2020-12-31,1847200000.0,-167500000.0,1679700000.0
486,IPG,Communication Services,Advertising Agencies,2021-12-31,2075600000.0,-195300000.0,1880300000.0
487,IPG,Communication Services,Advertising Agencies,2022-12-31,608800000.0,-178100000.0,430700000.0
579,OMC,Communication Services,Advertising Agencies,2019-12-31,1856000000.0,-102200000.0,1753800000.0
580,OMC,Communication Services,Advertising Agencies,2020-12-31,1724600000.0,-75400000.0,1649200000.0
581,OMC,Communication Services,Advertising Agencies,2021-12-31,1945400000.0,-665800000.0,1279600000.0
582,OMC,Communication Services,Advertising Agencies,2022-12-31,926500000.0,-78200000.0,848300000.0
1667,AXON,Industrials,Aerospace & Defense,2019-12-31,65673000.0,-16343000.0,49330000.0
1668,AXON,Industrials,Aerospace & Defense,2020-12-31,38481000.0,-72870000.0,-34389000.0


In [97]:
final_adhoc = ['freeCashFlow']

## Save Modified Data

In [89]:
with open('/home/priyesh/projects/TS1/pickle/yahoo_modified_cash_sheets.pkl','wb') as f:
    pickle.dump(df,f)

In [90]:
with open('/home/priyesh/projects/TS1/pickle/yahoo_drop_data_details_stage5.pkl','wb') as f:
    pickle.dump(drop_data_details,f)

## Build final cut data for cash sheet

In [3]:
df = df.reset_index(drop=True)

NameError: name 'df' is not defined

In [103]:
col_list = ['company', 'yahoo_sector','gics_sector','industry','st_date', 'st_YR','st_Mnth'] + \
           final_operating + final_investing + final_finance + final_adhoc

df_final = df[col_list].copy()

In [105]:
df_final.head(100)

Unnamed: 0,company,yahoo_sector,gics_sector,industry,st_date,st_YR,st_Mnth,netIncome,netIncomeFromContinuingOperations,depreciationAmortizationDepletion,stockBasedCompensation,cashFlowFromContinuingOperatingActivities,capitalExpenditure,netBusinessPurchaseAndSale,netPPEPurchaseAndSale,netInvestmentPropertiesPurchaseAndSale,netInvestmentPurchaseAndSale,netIntangiblesPurchaseAndSale,cashFlowFromContinuingInvestingActivities,netCommonStockIssuance,cashDividendsPaid,netIssuancePaymentsOfDebt,netLongTermDebtIssuance,netShortTermDebtIssuance,netOtherFinancingCharges,repurchaseOfCapitalStock,cashFlowFromContinuingFinancingActivities,freeCashFlow
0,AM,Energy,Energy,Oil & Gas Midstream,2022-12-31,2022,2022,326242000.0,326242000.0,202434000.0,19654000.0,699604000.0,-515650000.0,17000000.0,-515650000.0,,,,-493826000.0,0.0,-433375000.0,234800000.0,0.0,234800000.0,-7203000.0,0.0,-205778000.0,183954000.0
2,AM,Energy,Energy,Oil & Gas Midstream,2019-12-31,2019,2019,-355114000.0,-355114000.0,152536000.0,73517000.0,622387000.0,-267383000.0,-133536000.0,-267383000.0,,,,-525675000.0,-125519000.0,-496197000.0,534500000.0,650000000.0,-115500000.0,-11083000.0,-125519000.0,-98299000.0,355004000.0
3,AM,Energy,Energy,Oil & Gas Midstream,2020-12-31,2020,2020,-122527000.0,-122527000.0,179462000.0,12778000.0,753382000.0,-157931000.0,-25267000.0,-157931000.0,,,,-219231000.0,-24713000.0,-590190000.0,204000000.0,550000000.0,-346000000.0,-123843000.0,-24713000.0,-534746000.0,595451000.0
4,AM,Energy,Energy,Oil & Gas Midstream,2021-12-31,2021,2021,78626000.0,331617000.0,179462000.0,13529000.0,709752000.0,-232825000.0,-2070000.0,-232825000.0,,,,-233242000.0,0.0,-471721000.0,16228000.0,82528000.0,-66300000.0,-21657000.0,0.0,-477150000.0,476927000.0
5,AR,Energy,Energy,Oil & Gas E&P,2019-12-31,2019,2019,-340129000.0,-293136000.0,918629000.0,23559000.0,1103458000.0,-168037000.0,371591000.0,-168037000.0,,,,-1041490000.0,-38772000.0,0.0,690908000.0,690908000.0,0.0,-94572000.0,-38772000.0,557564000.0,935421000.0
6,AR,Energy,Energy,Oil & Gas E&P,2020-12-31,2020,2020,-1267897000.0,-1260411000.0,865291000.0,23317000.0,735640000.0,-48092000.0,125000000.0,-48092000.0,,,,-530061000.0,-43443000.0,0.0,-466519000.0,-466519000.0,0.0,304383000.0,-43443000.0,-205579000.0,687548000.0
7,AR,Energy,Energy,Oil & Gas E&P,2021-12-31,2021,2021,-186899000.0,-154109000.0,745829000.0,20437000.0,1660116000.0,-114761000.0,0.0,-114761000.0,,,,-710784000.0,0.0,0.0,-857305000.0,159695000.0,-1017000000.0,-92027000.0,0.0,-949332000.0,1545355000.0
8,AR,Energy,Energy,Oil & Gas E&P,2022-12-31,2022,2022,1898771000.0,2025972000.0,685227000.0,35443000.0,3051342000.0,-163322000.0,0.0,-163322000.0,,,,-943612000.0,-873744000.0,0.0,-992928000.0,-1027728000.0,34800000.0,-241058000.0,-873744000.0,-2107730000.0,2888020000.0
10,APA,Energy,Energy,Oil & Gas E&P,2022-12-31,2022,2022,3604000000.0,4082000000.0,1233000000.0,,4943000000.0,-2398000000.0,-143000000.0,-2398000000.0,,224000000.0,,-1511000000.0,-1423000000.0,-218000000.0,-1469000000.0,-1493000000.0,24000000.0,-379000000.0,-1423000000.0,-3489000000.0,2545000000.0
12,APA,Energy,Energy,Oil & Gas E&P,2019-12-31,2019,2019,-3515000000.0,-3682000000.0,2680000000.0,,2867000000.0,-2961000000.0,-1172000000.0,-2243000000.0,,,,-3446000000.0,2000000.0,-376000000.0,235000000.0,235000000.0,396000000.0,251000000.0,,112000000.0,-94000000.0


In [106]:
# Save final cut

with open('/home/priyesh/projects/TS1/pickle/yahoo_cash_sheets_final_stage5.pkl','wb') as f:
    pickle.dump(df_final,f)