## Extract Income Sheet Data

Take the data downloaded from Yahoo, select a subset of useful features, validate date and exclude problem 
companies to create a clean data sets.

In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
# Functions shared across multiple notebooks are stored in yahoo_data_ext_kit.py which can be found in the same
# directory as the notebooks.

import yahoo_data_ext_kit as ext

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
pd.options.display.float_format = '{:.0f}'.format

In [5]:
DATA_ROOT_DIR='/mnt/data/projects/MD3'
PROJ_ROOT_DIR='/home/priyesh/projects/MD3'

## Income Sheet

Extract Income Sheets from data downloaded from Yahoo and format as a DataFrame

In [6]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_company_data.pkl')
master_company_data = pd.read_pickle(filepath)
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_drop_data_details_stage3.pkl')
drop_data_details = pd.read_pickle(filepath)

In [7]:
income_sheets = ext.extract_statements('income',master_company_data)

AM
AR
APA
BKR
LNG
CHK
CVX
COP
CTRA
DVN
FANG
DTM
EVA
EOG
EQT
XOM
HAL
HES
DINO
KMI
MRO
MPC
NFE
NOV
OXY
OKE
OVV
PDCE
PSX
PXD
RRC
SLB
SWN
TRGP
TPL
VLO
VTS
WMB
AES
LNT
AEE
AEP
AWK
ATO
AGR
BEPC
CNP
CMS
ED
CEG
D
DTE
DUK
EIX
ETR
WTRG
EVRG
ES
EXC
FE
HE
IDA
MDU
NFG
NEE
NI
NRG
OGE
PCG
PNW
PPL
PEG
SRE
SO
UGI
VST
WEC
XEL
ATVI
GOOG
ATUS
AMC
T
CABO
CHTR
CMCSA
DISH
DIS
EA
FOXA
FOX
FYBR
IAC
IPG
LBRDA
LBRDK
FWONA
FWONK
LSXMA
LSXMK
LYV
LUMN
MSGS
MTCH
META
NFLX
NYT
NWSA
NWS
NXST
OMC
PARAA
PARA
PINS
PLTK
RBLX
ROKU
SIRI
SPOT
TMUS
TTWO
TRIP
VZ
WBD
WWE
ZI
ADM
ACI
MO
BJ
SAM
BF-A
BG
CPB
CASY
CHD
CLX
KO
CL
CAG
STZ
COST
COTY
DAR
DG
DLTR
EL
FLO
FRPT
GIS
GO
HSY
HRL
INGR
K
KDP
KMB
KHC
KR
LW
MKC
TAP
MDLZ
MNST
OLPX
PEP
PFGC
PM
PPC
POST
PG
REYN
SEB
SJM
SPB
SYY
TGT
TSN
USFD
WBA
WMT
APD
ALB
AA
AMCR
ATR
AMBP
ASH
AVY
AXTA
BALL
BERY
CE
CF
CC
CLF
CTVA
CCK
DOW
DD
EXP
EMN
ECL
ESI
FMC
FCX
DNA
GPK
HUN
IFF
IP
LIN
LPX
LYB
MLM
MOS
MP
NEU
NEM
NUE
OLN
PKG
PPG
RS
RGLD
RPM
SMG
SEE
SHW
SLGN
SON
SCCO
SSRM
STLD
X
VVV
VMC
WLK
WRK
ARE
AMH
A

In [8]:
# By taking a copy to modify, we always have the original to refer back to if and when required.

df = income_sheets.copy()
df.head()

Unnamed: 0,st_date,basicAverageShares,basicEPS,costOfRevenue,dilutedAverageShares,dilutedEPS,dilutedNIAvailtoComStockholders,eBITDA,earningsFromEquityInterest,ebit,gainOnSaleOfPPE,generalAndAdministrativeExpense,grossProfit,impairmentOfCapitalAssets,interestExpense,interestExpenseNonOperating,netIncome,netIncomeCommonStockholders,netIncomeContinuousOperations,netIncomeFromContinuingAndDiscontinuedOperation,netIncomeFromContinuingOperationNetMinorityInterest,netIncomeIncludingNoncontrollingInterests,netInterestIncome,netNonOperatingInterestIncomeExpense,normalizedEBITDA,normalizedIncome,operatingExpense,operatingIncome,operatingRevenue,otherGandA,otherIncomeExpense,otherOperatingExpenses,preferredStockDividends,pretaxIncome,reconciledCostOfRevenue,reconciledDepreciation,sellingGeneralAndAdministration,specialIncomeCharges,taxEffectOfUnusualItems,taxProvision,taxRateForCalcs,totalExpenses,totalOperatingIncomeAsReported,totalRevenue,totalUnusualItems,totalUnusualItemsExcludingGoodwill,otherunderPreferredStockDividend,restructuringAndMergernAcquisition,salariesAndWages,totalOtherFinanceCost,otherSpecialCharges,company,industry,yahoo_sector,gics_sector,st_YR,st_Mnth,gainOnSaleOfSecurity,minorityInterests,otherNonOperatingIncomeExpenses,sellingAndMarketingExpense,writeOff,gainOnSaleOfBusiness,interestIncome,interestIncomeNonOperating,otherTaxes,averageDilutionEarnings,researchAndDevelopment,amortization,amortizationOfIntangiblesIncomeStatement,depletionIncomeStatement,depreciationAmortizationDepletionIncomeStatement,depreciationAndAmortizationInIncomeStatement,depreciationIncomeStatement,netIncomeDiscontinuousOperations,rentAndLandingFees,rentExpenseSupplemental,securitiesAmortization,earningsFromEquityInterestNetOfTax,provisionForDoubtfulAccounts,insuranceAndClaims,exciseTaxes,netIncomeExtraordinary,netIncomeFromTaxLossCarryforward
0,2022-12-31,478232000.0,1.0,382688000,480300000.0,1.0,325692000,836118000.0,94218000,633684000,2251000.0,62125000,607969000,3702000.0,189948000,189948000,326242000,325692000,326242000,326242000,326242000,652484000,-189948000,-189948000,837569000,327308485,67052000,540917000,990657000,62125000,92767000,4927000,550000,443736000,382688000,202434000,62125000,-1451000,-384515,117494000,0,449740000,539466000,990657000,-1451000,-1451000,,,,,,AM,Oil & Gas Midstream,Energy,Energy,2022,12,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2023-03-31,,,405445000,,,332159000,863429000.0,95442000,654099000,2378000.0,61541000,626196000,,200293000,200293000,332709000,332159000,332709000,332709000,332709000,665418000,-200293000,-200293000,864753000,333679694,66215000,559981000,1031641000,61541000,94118000,4674000,550000,453806000,405445000,209330000,61541000,-1324000,-353306,121097000,0,471660000,558657000,1031641000,-1324000,-1324000,,,,,,AM,Oil & Gas Midstream,Energy,Energy,2023,3,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-12-31,442640000.0,-1.0,348354000,442640000.0,-1.0,-355114000,,51315000,-347178000,,118113000,501244000,761960000.0,110402000,110402000,-355114000,-355114000,-355114000,-355114000,-355114000,-355114000,-110402000,-110402000,575394000,242487652,129701000,371543000,849598000,44596000,-718721000,11588000,442000,-457580000,348354000,152536000,118113000,-770036000,-172434348,-102466000,0,478055000,-398493000,849598000,-770036000,-770036000,0.0,8076000.0,73517000.0,110402000.0,,AM,Oil & Gas Midstream,Energy,Energy,2019,12,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2020-12-31,478278000.0,-0.0,344848000,478278000.0,-0.0,-123077000,,86430000,-31208000,-2929000.0,52213000,626543000,673640000.0,147007000,147007000,-122527000,-123077000,-122527000,-122527000,-122527000,-245054000,-147007000,-147007000,824823000,342630085,67612000,558931000,971391000,52213000,-590139000,15399000,550000,-178215000,344848000,179462000,52213000,-676569000,-211411915,-55688000,0,412460000,-117638000,971391000,-676569000,-676569000,,0.0,,,,AM,Oil & Gas Midstream,Energy,Energy,2020,12,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2021-12-31,477270000.0,1.0,84254000,479736000.0,1.0,78626000,,24104000,151568000,0.0,16847000,149904000,3460000.0,44366000,44366000,78626000,78626000,78626000,78626000,78626000,157252000,-44366000,-44366000,335546000,81938205,17924000,131980000,234158000,16847000,19588000,1077000,550000,107202000,84254000,179462000,16847000,-4516000,-1203795,28576000,0,102178000,128520000,234158000,-4516000,-4516000,,0.0,,,1056000.0,AM,Oil & Gas Midstream,Energy,Energy,2021,12,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [9]:
len(df['company'].unique())

848

In [10]:
# Remove sectors, industries and individual companies previously identified for removal.

# Note: as we go through the analysis of the data for each of the financial statements, companies, industries and 
# sectors are identified for removal and stored in drop_data_details. 

for sector in drop_data_details['sector']:
   print(sector)
   df.drop(df[df['yahoo_sector'] == sector].index, inplace=True, axis=0)
    
for industry in drop_data_details['industry']:
   print(industry)
   df.drop(df[df['industry'] == industry].index, inplace=True, axis=0)
    
for company in drop_data_details['companies']:
  print(company)
  df.drop(df[df['company'] == company].index, inplace=True, axis=0)

Real Estate
AMBP
CI
ORCL
LHX
LSXMK
VTS
BF-A
CCCS
AGL
FWONK
NWSA
DRI
AR
FOXA
RPM
FDX
PARAA
LCID
GIS
HUM
RXO
HRB
LBRDK
UAA
BEPC
LSXMA
FYBR
CAG
DNA
ELV
PAYX
MBC
NKE
CASY
EHAB
ZG
ESAB
CXT
GEHC
FWONA
CTAS
CHPT
LW


In [11]:
len(df['company'].unique())

740

In [12]:
list(df.columns.sort_values())

['amortization',
 'amortizationOfIntangiblesIncomeStatement',
 'averageDilutionEarnings',
 'basicAverageShares',
 'basicEPS',
 'company',
 'costOfRevenue',
 'depletionIncomeStatement',
 'depreciationAmortizationDepletionIncomeStatement',
 'depreciationAndAmortizationInIncomeStatement',
 'depreciationIncomeStatement',
 'dilutedAverageShares',
 'dilutedEPS',
 'dilutedNIAvailtoComStockholders',
 'eBITDA',
 'earningsFromEquityInterest',
 'earningsFromEquityInterestNetOfTax',
 'ebit',
 'exciseTaxes',
 'gainOnSaleOfBusiness',
 'gainOnSaleOfPPE',
 'gainOnSaleOfSecurity',
 'generalAndAdministrativeExpense',
 'gics_sector',
 'grossProfit',
 'impairmentOfCapitalAssets',
 'industry',
 'insuranceAndClaims',
 'interestExpense',
 'interestExpenseNonOperating',
 'interestIncome',
 'interestIncomeNonOperating',
 'minorityInterests',
 'netIncome',
 'netIncomeCommonStockholders',
 'netIncomeContinuousOperations',
 'netIncomeDiscontinuousOperations',
 'netIncomeExtraordinary',
 'netIncomeFromContinuingAn

In [13]:
len(df.columns)

84

In [13]:
# Organise columns of interest into sections, to help with the analysis. 
# Also create a mapping of columns so that the final data set has meaningful column names.

is_col_name_mapping = {}

summary = ['company', 'yahoo_sector','industry','st_YR','st_Mnth','st_date',
           'totalRevenue', 'costOfRevenue', 'grossProfit',
           'researchAndDevelopment', 'sellingGeneralAndAdministration',
           'sellingAndMarketingExpense',
           'operatingExpense',
           'operatingIncome',
           'totalOperatingIncomeAsReported',
           'interestExpense',
           'otherNonOperatingIncomeExpenses',
           'netIncome',
           'netIncomeFromContinuingOperationNetMinorityInterest',
           'ebit',
           'eBITDA']

revenue = ['company', 'yahoo_sector','industry','st_date',
           'totalRevenue',
           'costOfRevenue',
           'reconciledCostOfRevenue',
           'operatingRevenue',
           'grossProfit']

income = ['company', 'yahoo_sector','industry','st_date',
          'interestIncome',
          'netInterestIncome',
          'netIncome',
          'operatingIncome',
          'otherIncomeExpense',
          'netIncomeContinuousOperations',
          'netIncomeFromContinuingAndDiscontinuedOperation',
          'netIncomeFromContinuingOperationNetMinorityInterest',
          'totalOperatingIncomeAsReported',
          'pretaxIncome']

expense = ['company', 'yahoo_sector','industry','st_date', 
           'sellingGeneralAndAdministration',
           'generalAndAdministrativeExpense',
           'otherGandA',
           'researchAndDevelopment',
           'sellingAndMarketingExpense',
           'netInterestIncome',
           'interestExpense',
           'interestExpenseNonOperating',
           'taxProvision',
           'otherOperatingExpenses',
           'operatingExpense',
           'totalExpenses']

adhoc = ['company', 'yahoo_sector','industry','st_date',
         'netInterestIncome',
         'interestIncome',
         'interestExpense',
         'interestExpenseNonOperating',
         'interestIncomeNonOperating', 
         'dilutedEPS',
         'basicEPS',
         'ebit',
         'eBITDA']

In [14]:
# Identify companies which have less than 4 years of data

len(df.groupby('company').filter(lambda x: len(x) < 4))

0

In [15]:
# We are epeccting max 4 years of data, so do we have ones with more?

df.groupby('company').filter(lambda x: len(x) > 4).groupby('company').size().sort_values()

company
ZTS      5
MANH     5
MAR      5
MAS      5
MASI     5
MAT      5
MCD      5
MAN      5
MCHP     5
MCW      5
MDB      5
MDLZ     5
MDT      5
MDU      5
META     5
MCK      5
TWLO     5
LYV      5
LYFT     5
LLY      5
LMT      5
LNG      5
LNT      5
LOPE     5
LOW      5
LPX      5
TXG      5
LSCC     5
LSTR     5
LULU     5
LUMN     5
LUV      5
LVS      5
LYB      5
MGM      5
LKQ      5
MHK      5
TWKS     5
TSCO     5
MTCH     5
MTD      5
MTN      5
TRU      5
TRMB     5
MSI      5
NATI     5
NCLH     5
NCNO     5
NCR      5
TRIP     5
NEE      5
NEM      5
NBIX     5
TSLA     5
MSA      5
MRVL     5
MKSI     5
MLM      5
MMM      5
MNST     5
MO       5
MOH      5
MOS      5
MP       5
TTWO     5
MPWR     5
MRK      5
TTD      5
TTC      5
TT       5
MRVI     5
MIDD     5
NET      5
TXN      5
LII      5
IEX      5
IFF      5
ILMN     5
INCY     5
INFA     5
INGR     5
IDXX     5
INTC     5
IONS     5
IP       5
IPG      5
IPGP     5
IQV      5
IR       5
ULTA     5
ID

Ones with 5 include TTM. Need to investigate why we have more than 5 though.

Taking MSFT as an example and comparing with Yahoo website, 2022-12-31 coresponds to TTM.

Looks like 2023-03-31 is spurious quarterly statement.

In [16]:
df[df['company'] == 'MSFT'][summary]

Unnamed: 0,company,yahoo_sector,industry,st_YR,st_Mnth,st_date,totalRevenue,costOfRevenue,grossProfit,researchAndDevelopment,sellingGeneralAndAdministration,sellingAndMarketingExpense,operatingExpense,operatingIncome,totalOperatingIncomeAsReported,interestExpense,otherNonOperatingIncomeExpenses,netIncome,netIncomeFromContinuingOperationNetMinorityInterest,ebit,eBITDA
2856,MSFT,Technology,Software—Infrastructure,2022,12,2022-12-31,204094000000,64984000000,139110000000,26627000000,29668000000,22704000000,56295000000,82815000000,82815000000,1989000000,-103000000,67449000000,67449000000,84577000000,98767000000.0
2857,MSFT,Technology,Software—Infrastructure,2023,3,2023-03-31,207591000000,65497000000,142094000000,27305000000,29986000000,22859000000,57291000000,84803000000,84803000000,1982000000,-185000000,69020000000,69020000000,87053000000,101019000000.0
2858,MSFT,Technology,Software—Infrastructure,2019,6,2019-06-30,125843000000,42910000000,82933000000,16876000000,23098000000,18213000000,39974000000,42959000000,42959000000,2686000000,729000000,39240000000,39240000000,42959000000,
2859,MSFT,Technology,Software—Infrastructure,2020,6,2020-06-30,143015000000,46078000000,96937000000,19269000000,24709000000,19598000000,43978000000,52959000000,52959000000,2591000000,77000000,44281000000,44281000000,52959000000,
2860,MSFT,Technology,Software—Infrastructure,2021,6,2021-06-30,168088000000,52232000000,115856000000,20716000000,25224000000,20117000000,45940000000,69916000000,69916000000,2346000000,1186000000,61271000000,61271000000,69916000000,
2861,MSFT,Technology,Software—Infrastructure,2022,6,2022-06-30,198270000000,62650000000,135620000000,24512000000,27725000000,21825000000,52237000000,83383000000,83383000000,2063000000,333000000,72738000000,72738000000,83383000000,


Remove spurious data where the statement date is not on the usual anniversary. Best way to do that is go through
each company and see which month is most common for the statement date and assume that is the usual month for
annual report.

In [17]:
df_new = pd.DataFrame()

ticker_list = list(df['company'].unique())

for ticker in ticker_list:
  print(ticker)
  df_company = df[df['company'] == ticker].copy()

  # Exract month and store as separate column
    
  df_company['M'] = df_company['st_date'].apply(lambda x: x.split("-")[1])
    
  # Determine the most common month for annual report for the company. Consider reports for any other month 
  # as spurious and can be deleted.
    
  mode_M = df_company['M'].mode()[0]
  df_company.drop(df_company[df_company['M'] != mode_M].index, axis=0, inplace=True)
    
  df_new = pd.concat([df_new,df_company],axis=0)

df = df_new

AM
APA
BKR
LNG
CHK
CVX
COP
CTRA
DVN
FANG
DTM
EVA
EOG
EQT
XOM
HAL
HES
DINO
KMI
MRO
MPC
NFE
NOV
OXY
OKE
OVV
PDCE
PSX
PXD
RRC
SLB
SWN
TRGP
TPL
VLO
WMB
AES
LNT
AEE
AEP
AWK
ATO
AGR
CNP
CMS
ED
CEG
D
DTE
DUK
EIX
ETR
WTRG
EVRG
ES
EXC
FE
HE
IDA
MDU
NFG
NEE
NI
NRG
OGE
PCG
PNW
PPL
PEG
SRE
SO
UGI
VST
WEC
XEL
ATVI
GOOG
ATUS
AMC
T
CABO
CHTR
CMCSA
DISH
DIS
EA
FOX
IAC
IPG
LBRDA
LYV
LUMN
MSGS
MTCH
META
NFLX
NYT
NWS
NXST
OMC
PARA
PINS
PLTK
RBLX
ROKU
SIRI
SPOT
TMUS
TTWO
TRIP
VZ
WBD
WWE
ZI
ADM
ACI
MO
BJ
SAM
BG
CPB
CHD
CLX
KO
CL
STZ
COST
COTY
DAR
DG
DLTR
EL
FLO
FRPT
GO
HSY
HRL
INGR
K
KDP
KMB
KHC
KR
MKC
TAP
MDLZ
MNST
OLPX
PEP
PFGC
PM
PPC
POST
PG
REYN
SEB
SJM
SPB
SYY
TGT
TSN
USFD
WBA
WMT
APD
ALB
AA
AMCR
ATR
ASH
AVY
AXTA
BALL
BERY
CE
CF
CC
CLF
CTVA
CCK
DOW
DD
EXP
EMN
ECL
ESI
FMC
FCX
GPK
HUN
IFF
IP
LIN
LPX
LYB
MLM
MOS
MP
NEU
NEM
NUE
OLN
PKG
PPG
RS
RGLD
SMG
SEE
SHW
SLGN
SON
SCCO
SSRM
STLD
X
VVV
VMC
WLK
WRK
Z
MMM
AYI
ADP
WMS
ACM
AGCO
AL
ALK
ALLE
ALSN
AAL
AME
AWI
CAR
AXON
AZEK
BA
BAH
BR
BLDR
BWXT
CHRW
CACI
CSL
CA

In [18]:
# Let's see if that leaves us with companies with less than 4 years of accounts

df.groupby('company').filter(lambda x: len(x) < 4)[summary]

Unnamed: 0,company,yahoo_sector,industry,st_YR,st_Mnth,st_date,totalRevenue,costOfRevenue,grossProfit,researchAndDevelopment,sellingGeneralAndAdministration,sellingAndMarketingExpense,operatingExpense,operatingIncome,totalOperatingIncomeAsReported,interestExpense,otherNonOperatingIncomeExpenses,netIncome,netIncomeFromContinuingOperationNetMinorityInterest,ebit,eBITDA
25,CHK,Energy,Oil & Gas E&P,2022,12,2022-12-31,11443000000,7744000000,3699000000,,142000000,,214000000,3485000000,3780000000.0,163000000.0,36000000.0,4936000000,4936000000,3814000000,5567000000.0
27,CHK,Energy,Oil & Gas E&P,2019,12,2019-12-31,8489000000,8093000000,396000000,,315000000,,447000000,-51000000,-31000000.0,657000000.0,30000000.0,-308000000,-308000000,18000000,
28,CHK,Energy,Oil & Gas E&P,2020,12,2020-12-31,5210000000,4590000000,620000000,,311000000,,818000000,-198000000,-8703000000.0,418000000.0,16000000.0,-9734000000,-9734000000,-9351000000,
1155,RGLD,Basic Materials,Gold,2019,6,2019-06-30,423056000,244703000,178353000,,30488000,,37646000,140707000,140707000.0,29650000.0,,93825000,93825000,136227000,
1156,RGLD,Basic Materials,Gold,2020,6,2020-06-30,498819000,263148000,235671000,,30195000,,35385000,200286000,198945000.0,9813000.0,,199343000,199343000,202409000,
1157,RGLD,Basic Materials,Gold,2021,6,2021-06-30,615856000,283210000,332646000,,28387000,,28950000,303696000,337602000.0,6419000.0,,302532000,302532000,346062000,
2108,OSK,Industrials,Farm & Heavy Construction Machinery,2019,9,2019-09-30,8382000000,6864600000,1517400000,,683500000,,720400000,797000000,797000000.0,54400000.0,1300000.0,579400000,579400000,805100000,
2109,OSK,Industrials,Farm & Heavy Construction Machinery,2020,9,2020-09-30,6856800000,5736500000,1120300000,,620600000,,631600000,488700000,488700000.0,59300000.0,2200000.0,324500000,324500000,498400000,
2110,OSK,Industrials,Farm & Heavy Construction Machinery,2021,9,2021-09-30,7737300000,6516500000,1220800000,,666500000,,676100000,544700000,544700000.0,48200000.0,-2100000.0,472700000,472700000,546100000,
2487,AZPN,Technology,Software—Application,2019,6,2019-06-30,598345000,57816000,540529000,83122000.0,174605000,111374000.0,257727000,282802000,282802000.0,8733000.0,664000.0,262734000,262734000,311923000,


In [19]:
df.groupby('company').filter(lambda x: len(x) < 4).groupby('company').size()

company
AZPN    3
CHK     3
DH      3
KD      3
OSK     3
PYCR    3
RGLD    3
UA      3
dtype: int64

Go back to unaltered income_sheets which we loaded and have a look at some of the data for a selection of companies.

In [20]:
income_sheets[income_sheets['company'].isin(['CHK','OSK','UA'])][summary] \
                                .sort_values(by=['company','st_date'])

Unnamed: 0,company,yahoo_sector,industry,st_YR,st_Mnth,st_date,totalRevenue,costOfRevenue,grossProfit,researchAndDevelopment,sellingGeneralAndAdministration,sellingAndMarketingExpense,operatingExpense,operatingIncome,totalOperatingIncomeAsReported,interestExpense,otherNonOperatingIncomeExpenses,netIncome,netIncomeFromContinuingOperationNetMinorityInterest,ebit,eBITDA
27,CHK,Energy,Oil & Gas E&P,2019,12,2019-12-31,8489000000,8093000000,396000000,,315000000,,447000000,-51000000,-31000000,657000000.0,30000000,-308000000,-308000000,18000000,
28,CHK,Energy,Oil & Gas E&P,2020,12,2020-12-31,5210000000,4590000000,620000000,,311000000,,818000000,-198000000,-8703000000,418000000.0,16000000,-9734000000,-9734000000,-9351000000,
25,CHK,Energy,Oil & Gas E&P,2022,12,2022-12-31,11443000000,7744000000,3699000000,,142000000,,214000000,3485000000,3780000000,163000000.0,36000000,4936000000,4936000000,3814000000,5567000000.0
26,CHK,Energy,Oil & Gas E&P,2023,3,2023-03-31,13822000000,7574000000,6248000000,,151000000,,205000000,6043000000,6394000000,169000000.0,30000000,7089000000,7089000000,6423000000,8157000000.0
2108,OSK,Industrials,Farm & Heavy Construction Machinery,2019,9,2019-09-30,8382000000,6864600000,1517400000,,683500000,,720400000,797000000,797000000,54400000.0,1300000,579400000,579400000,805100000,
2109,OSK,Industrials,Farm & Heavy Construction Machinery,2020,9,2020-09-30,6856800000,5736500000,1120300000,,620600000,,631600000,488700000,488700000,59300000.0,2200000,324500000,324500000,498400000,
2110,OSK,Industrials,Farm & Heavy Construction Machinery,2021,9,2021-09-30,7737300000,6516500000,1220800000,,666500000,,676100000,544700000,544700000,48200000.0,-2100000,472700000,472700000,546100000,
2111,OSK,Industrials,Farm & Heavy Construction Machinery,2022,12,2022-12-31,8282000000,7227600000,1054400000,,662800000,,674400000,380000000,372300000,53400000.0,-52800000,173900000,173900000,329000000,436600000.0
2112,OSK,Industrials,Farm & Heavy Construction Machinery,2023,3,2023-03-31,8604400000,7420000000,1184400000,,692700000,,705400000,479000000,471300000,54200000.0,-48100000,262600000,262600000,437900000,549600000.0
3749,UA,Consumer Cyclical,Apparel Manufacturing,2019,12,2019-12-31,5267132000,2796599000,2470533000,,2233763000,,2233763000,236770000,236770000,21240000.0,-5688000,92139000,92139000,231082000,


Most recent don't fall on the anniversary so is essentially TTM. Yahoo Finance website checked for above companies 
and they do only have 3 years of annual accounts.

Delete these companies.

In [21]:
ticker_list = list(df.groupby('company').filter(lambda x: len(x) < 4)['company'].unique())
ticker_list

['CHK', 'RGLD', 'OSK', 'AZPN', 'KD', 'PYCR', 'UA', 'DH']

In [22]:
# Drop companies with only 3 yrs of data

drop_data_details['companies'] = drop_data_details['companies'] + ticker_list
for ticker in ticker_list:
   print(ticker)
   df.drop(df[df['company'] == ticker].index, inplace=True, axis=0)

CHK
RGLD
OSK
AZPN
KD
PYCR
UA
DH


In [23]:
len(df['company'].unique())

732

In [24]:
# Sanity check. Do we still have companies with more than 4.

df.groupby('company').filter(lambda x: len(x) > 4)

Unnamed: 0,st_date,basicAverageShares,basicEPS,costOfRevenue,dilutedAverageShares,dilutedEPS,dilutedNIAvailtoComStockholders,eBITDA,earningsFromEquityInterest,ebit,gainOnSaleOfPPE,generalAndAdministrativeExpense,grossProfit,impairmentOfCapitalAssets,interestExpense,interestExpenseNonOperating,netIncome,netIncomeCommonStockholders,netIncomeContinuousOperations,netIncomeFromContinuingAndDiscontinuedOperation,netIncomeFromContinuingOperationNetMinorityInterest,netIncomeIncludingNoncontrollingInterests,netInterestIncome,netNonOperatingInterestIncomeExpense,normalizedEBITDA,normalizedIncome,operatingExpense,operatingIncome,operatingRevenue,otherGandA,otherIncomeExpense,otherOperatingExpenses,preferredStockDividends,pretaxIncome,reconciledCostOfRevenue,reconciledDepreciation,sellingGeneralAndAdministration,specialIncomeCharges,taxEffectOfUnusualItems,taxProvision,taxRateForCalcs,totalExpenses,totalOperatingIncomeAsReported,totalRevenue,totalUnusualItems,totalUnusualItemsExcludingGoodwill,otherunderPreferredStockDividend,restructuringAndMergernAcquisition,salariesAndWages,totalOtherFinanceCost,otherSpecialCharges,company,industry,yahoo_sector,gics_sector,st_YR,st_Mnth,gainOnSaleOfSecurity,minorityInterests,otherNonOperatingIncomeExpenses,sellingAndMarketingExpense,writeOff,gainOnSaleOfBusiness,interestIncome,interestIncomeNonOperating,otherTaxes,averageDilutionEarnings,researchAndDevelopment,amortization,amortizationOfIntangiblesIncomeStatement,depletionIncomeStatement,depreciationAmortizationDepletionIncomeStatement,depreciationAndAmortizationInIncomeStatement,depreciationIncomeStatement,netIncomeDiscontinuousOperations,rentAndLandingFees,rentExpenseSupplemental,securitiesAmortization,earningsFromEquityInterestNetOfTax,provisionForDoubtfulAccounts,insuranceAndClaims,exciseTaxes,netIncomeExtraordinary,netIncomeFromTaxLossCarryforward,M


In [25]:
# Sanity check. Do we still have companies with less than 4.

df.groupby('company').filter(lambda x: len(x) < 4)

Unnamed: 0,st_date,basicAverageShares,basicEPS,costOfRevenue,dilutedAverageShares,dilutedEPS,dilutedNIAvailtoComStockholders,eBITDA,earningsFromEquityInterest,ebit,gainOnSaleOfPPE,generalAndAdministrativeExpense,grossProfit,impairmentOfCapitalAssets,interestExpense,interestExpenseNonOperating,netIncome,netIncomeCommonStockholders,netIncomeContinuousOperations,netIncomeFromContinuingAndDiscontinuedOperation,netIncomeFromContinuingOperationNetMinorityInterest,netIncomeIncludingNoncontrollingInterests,netInterestIncome,netNonOperatingInterestIncomeExpense,normalizedEBITDA,normalizedIncome,operatingExpense,operatingIncome,operatingRevenue,otherGandA,otherIncomeExpense,otherOperatingExpenses,preferredStockDividends,pretaxIncome,reconciledCostOfRevenue,reconciledDepreciation,sellingGeneralAndAdministration,specialIncomeCharges,taxEffectOfUnusualItems,taxProvision,taxRateForCalcs,totalExpenses,totalOperatingIncomeAsReported,totalRevenue,totalUnusualItems,totalUnusualItemsExcludingGoodwill,otherunderPreferredStockDividend,restructuringAndMergernAcquisition,salariesAndWages,totalOtherFinanceCost,otherSpecialCharges,company,industry,yahoo_sector,gics_sector,st_YR,st_Mnth,gainOnSaleOfSecurity,minorityInterests,otherNonOperatingIncomeExpenses,sellingAndMarketingExpense,writeOff,gainOnSaleOfBusiness,interestIncome,interestIncomeNonOperating,otherTaxes,averageDilutionEarnings,researchAndDevelopment,amortization,amortizationOfIntangiblesIncomeStatement,depletionIncomeStatement,depreciationAmortizationDepletionIncomeStatement,depreciationAndAmortizationInIncomeStatement,depreciationIncomeStatement,netIncomeDiscontinuousOperations,rentAndLandingFees,rentExpenseSupplemental,securitiesAmortization,earningsFromEquityInterestNetOfTax,provisionForDoubtfulAccounts,insuranceAndClaims,exciseTaxes,netIncomeExtraordinary,netIncomeFromTaxLossCarryforward,M


In [26]:
# Check count of companies

len(df['company'].unique())

732

## Revenue

In [27]:
df[revenue].isnull().sum().sort_values()

company                     0
yahoo_sector                0
industry                    0
st_date                     0
totalRevenue                0
operatingRevenue            0
costOfRevenue              41
reconciledCostOfRevenue    41
grossProfit                41
dtype: int64

In [28]:
#Are there any differences between total revenue and operating revenue

df_temp = df[df['totalRevenue'] != df['operatingRevenue']][revenue].copy()
df_temp['calc_tot_gross_profit'] = df_temp['totalRevenue'] - df_temp['costOfRevenue']
df_temp['calc_op_gross_profit'] = df_temp['operatingRevenue'] - df_temp['costOfRevenue']
df_temp

Unnamed: 0,company,yahoo_sector,industry,st_date,totalRevenue,costOfRevenue,reconciledCostOfRevenue,operatingRevenue,grossProfit,calc_tot_gross_profit,calc_op_gross_profit
20,LNG,Energy,Oil & Gas Midstream,2022-12-31,33428000000,26751000000.0,26751000000.0,32872000000,6677000000.0,6677000000.0,6121000000.0
22,LNG,Energy,Oil & Gas Midstream,2019-12-31,9730000000,5873000000.0,5873000000.0,9512000000,3857000000.0,3857000000.0,3639000000.0
23,LNG,Energy,Oil & Gas Midstream,2020-12-31,9358000000,5093000000.0,5093000000.0,9193000000,4265000000.0,4265000000.0,4100000000.0
24,LNG,Energy,Oil & Gas Midstream,2021-12-31,15864000000,14784000000.0,14784000000.0,15664000000,1080000000.0,1080000000.0,880000000.0
40,CTRA,Energy,Oil & Gas E&P,2022-12-31,9051000000,3050000000.0,1406000000.0,9449000000,7636000000.0,6001000000.0,6399000000.0
42,CTRA,Energy,Oil & Gas E&P,2019-12-31,2066277000,1057368000.0,1057368000.0,1985240000,1008909000.0,1008909000.0,927872000.0
43,CTRA,Energy,Oil & Gas E&P,2020-12-31,1466624000,1035408000.0,1035408000.0,1404989000,431216000.0,431216000.0,369581000.0
44,CTRA,Energy,Oil & Gas E&P,2021-12-31,3449000000,1512000000.0,1512000000.0,3657000000,1937000000.0,1937000000.0,2145000000.0
50,FANG,Energy,Oil & Gas E&P,2022-12-31,9643000000,2865000000.0,2865000000.0,9566000000,6778000000.0,6778000000.0,6701000000.0
52,FANG,Energy,Oil & Gas E&P,2019-12-31,3964000000,2364000000.0,2364000000.0,3951000000,1600000000.0,1600000000.0,1587000000.0


Verified by spot checks wit Yahoo Finance website and also by comparing calculated gross profit with actual to 
deduce that we can take total revenue as the current sales revenue and ignore operating revenue.

In [29]:
df[df['costOfRevenue'].isnull()].sort_values(by=['industry','company','st_date'])[summary]

Unnamed: 0,company,yahoo_sector,industry,st_YR,st_Mnth,st_date,totalRevenue,costOfRevenue,grossProfit,researchAndDevelopment,sellingGeneralAndAdministration,sellingAndMarketingExpense,operatingExpense,operatingIncome,totalOperatingIncomeAsReported,interestExpense,otherNonOperatingIncomeExpenses,netIncome,netIncomeFromContinuingOperationNetMinorityInterest,ebit,eBITDA
3656,RIVN,Consumer Cyclical,Auto Manufacturers,2020,12,2020-12-31,0,,,766000000.0,255000000.0,,1021000000,-1021000000,-1021000000.0,8000000.0,1000000.0,-1018000000,-1018000000,-1010000000,
3640,QS,Consumer Cyclical,Auto Parts,2019,12,2019-12-31,0,,,45944000.0,9874000.0,,55818000,-55818000,-55818000.0,94000.0,1041000.0,-51283000,-51283000,-51169000,
3641,QS,Consumer Cyclical,Auto Parts,2020,12,2020-12-31,0,,,65103000.0,15918000.0,,81021000,-81021000,-81021000.0,20765000.0,-999227000.0,-1099914000,-1099914000,-1079155000,
3642,QS,Consumer Cyclical,Auto Parts,2021,12,2021-12-31,0,,,151496000.0,63770000.0,,215266000,-215266000,-215266000.0,1419000.0,151000.0,-45966000,-45966000,-44558000,
3638,QS,Consumer Cyclical,Auto Parts,2022,12,2022-12-31,0,,,297435000.0,123183000.0,,420618000,-420618000,-420618000.0,2399000.0,216000.0,-411907000,-411907000,-409497000,-373374000.0
4192,MRNA,Healthcare,Biotechnology,2019,12,2019-12-31,48036000,,,496309000.0,109620000.0,,593756000,-545720000,-545720000.0,6612000.0,-7526000.0,-514021000,-514021000,-545720000,
4186,MRTX,Healthcare,Biotechnology,2019,12,2019-12-31,3335000,,,182866000.0,42573000.0,,225439000,-222104000,-222104000.0,,8848000.0,-212846000,-213256000,-222104000,
4187,MRTX,Healthcare,Biotechnology,2020,12,2020-12-31,13398000,,,299349000.0,83412000.0,,382761000,-369363000,-369363000.0,,11426000.0,-358067000,-357937000,-369363000,
4182,MRTX,Healthcare,Biotechnology,2021,12,2021-12-31,72092000,,,508594000.0,136679000.0,,645273000,-573181000,-573181000.0,,-5304000.0,-582475000,-581784000,-573181000,
4210,NVAX,Healthcare,Biotechnology,2019,12,2019-12-31,18662000,,,113842000.0,34417000.0,,148259000,-129597000,-120581000.0,12188000.0,-13000.0,-132694000,-132694000,-129597000,


In [30]:
df[df['company'].isin(['RIVN','MRNA','MRTX','NVAX','BKI','DNB'])][summary].sort_values(by=['company','st_date'])

Unnamed: 0,company,yahoo_sector,industry,st_YR,st_Mnth,st_date,totalRevenue,costOfRevenue,grossProfit,researchAndDevelopment,sellingGeneralAndAdministration,sellingAndMarketingExpense,operatingExpense,operatingIncome,totalOperatingIncomeAsReported,interestExpense,otherNonOperatingIncomeExpenses,netIncome,netIncomeFromContinuingOperationNetMinorityInterest,ebit,eBITDA
2523,BKI,Technology,Software—Application,2019,12,2019-12-31,1177200000,646000000.0,531200000.0,,,,882200000,295000000,289600000,63500000.0,-1400000,108800000,108800000,288200000,
2524,BKI,Technology,Software—Application,2020,12,2020-12-31,1238500000,,,,,,940300000,298200000,266800000,62900000.0,16400000,264100000,264100000,283200000,
2525,BKI,Technology,Software—Application,2021,12,2021-12-31,1475200000,,,,,,1158900000,316300000,303000000,83600000.0,-6400000,207900000,207900000,296600000,
2521,BKI,Technology,Software—Application,2022,12,2022-12-31,1551900000,,,,,,1241900000,310000000,278200000,100600000.0,-11900000,452500000,452500000,266300000,590900000.0
1809,DNB,Financial Services,Financial Data & Stock Exchanges,2019,12,2019-12-31,1413900000,,,,651200000.0,,1582100000,-168200000,-220000000,303500000.0,17600000,-560000000,-560000000,-372400000,
1810,DNB,Financial Services,Financial Data & Stock Exchanges,2020,12,2020-12-31,1738100000,,,,557800000.0,,1640300000,97800000,63000000,271100000.0,-12000000,-111500000,-111500000,51800000,
1811,DNB,Financial Services,Financial Data & Stock Exchanges,2021,12,2021-12-31,2165600000,664300000.0,1501300000.0,,714700000.0,,1330600000,170700000,145600000,206400000.0,14900000,-71700000,-71700000,161200000,
1807,DNB,Financial Services,Financial Data & Stock Exchanges,2022,12,2022-12-31,2224600000,721400000.0,1503200000.0,,745600000.0,,1332800000,170400000,149900000,193200000.0,30200000,-2300000,-2300000,166000000,753200000.0
4192,MRNA,Healthcare,Biotechnology,2019,12,2019-12-31,48036000,,,496309000.0,109620000.0,,593756000,-545720000,-545720000,6612000.0,-7526000,-514021000,-514021000,-545720000,
4190,MRNA,Healthcare,Biotechnology,2020,12,2020-12-31,274490000,7933000.0,266557000.0,1370339000.0,188267000.0,,1029701000,-763144000,-763144000,9886000.0,-6084000,-747064000,-747064000,-763144000,


With QS and Rivian there were no sales and hence no cost of sales. 

MRNA, MRTX and NVAX we can make a reasonable 
assumption that if not reported, cost of sales (essentially cost of production) is negligible if not reported. 
Note the trend in sales which supports this assumption.

Not sure what to make of BKI as it reports significant cost of sales in one of the four years. Let's just delete
this company.

It's reasonable to assume that FOX and NWS being entertainment companies don't have specific cost of sales. Same 
applies to BKNG as an online hotel booking company. So set cost of sales to 0 for these companies.

Drop companies which come under 'Healthcare Plans' as these are essentially insurance companies.

DNB is services so setting cost of sales to 0 would be a reasonable but cost of sales is reported for the 2 most
recent years. Most likely as a result of acquisitions. Safest is to drop this company.

In [31]:
drop_data_details['companies'] = drop_data_details['companies'] + ['DNB','BKI']
df.drop(df[df['company'] == 'DNB'].index, inplace=True, axis=0)
df.drop(df[df['company'] == 'BKI'].index, inplace=True, axis=0)

drop_data_details['industry'] = drop_data_details['industry'] + ['Healthcare Plans']
df.drop(df[df['industry'] == 'Healthcare Plans'].index, inplace=True, axis=0)

In [32]:
len(df['company'].unique())

726

In [33]:
df[df['costOfRevenue'].isnull()].sort_values(by=['industry','company','st_date'])[revenue]

Unnamed: 0,company,yahoo_sector,industry,st_date,totalRevenue,costOfRevenue,reconciledCostOfRevenue,operatingRevenue,grossProfit
3656,RIVN,Consumer Cyclical,Auto Manufacturers,2020-12-31,0,,,0,
3640,QS,Consumer Cyclical,Auto Parts,2019-12-31,0,,,0,
3641,QS,Consumer Cyclical,Auto Parts,2020-12-31,0,,,0,
3642,QS,Consumer Cyclical,Auto Parts,2021-12-31,0,,,0,
3638,QS,Consumer Cyclical,Auto Parts,2022-12-31,0,,,0,
4192,MRNA,Healthcare,Biotechnology,2019-12-31,48036000,,,48036000,
4186,MRTX,Healthcare,Biotechnology,2019-12-31,3335000,,,3335000,
4187,MRTX,Healthcare,Biotechnology,2020-12-31,13398000,,,13398000,
4182,MRTX,Healthcare,Biotechnology,2021-12-31,72092000,,,72092000,
4210,NVAX,Healthcare,Biotechnology,2019-12-31,18662000,,,18662000,


In [34]:
df['costOfRevenue'] = df['costOfRevenue'].fillna(0)

In [35]:
#Investigate reconciledCostOfRevenue

df[df['costOfRevenue'] != df['reconciledCostOfRevenue']][revenue]

Unnamed: 0,company,yahoo_sector,industry,st_date,totalRevenue,costOfRevenue,reconciledCostOfRevenue,operatingRevenue,grossProfit
40,CTRA,Energy,Oil & Gas E&P,2022-12-31,9051000000,3050000000,1406000000.0,9449000000,7636000000.0
55,DTM,Energy,Oil & Gas Midstream,2022-12-31,920000000,0,,920000000,
57,DTM,Energy,Oil & Gas Midstream,2019-12-31,504000000,0,,504000000,
58,DTM,Energy,Oil & Gas Midstream,2020-12-31,754000000,0,,754000000,
59,DTM,Energy,Oil & Gas Midstream,2021-12-31,840000000,0,,840000000,
63,EVA,Basic Materials,Lumber & Wood Production,2022-12-31,1094276000,927453000,1040630000.0,1094276000,53646000.0
81,HAL,Energy,Oil & Gas Equipment & Services,2022-12-31,20297000000,16984000000,16044000000.0,20297000000,3313000000.0
85,HAL,Energy,Oil & Gas Equipment & Services,2021-12-31,15295000000,13256000000,12388000000.0,15295000000,2039000000.0
113,NFE,Utilities,Utilities—Regulated Gas,2022-12-31,2368272000,1179746000,1178797000.0,2335803000,1188526000.0
115,NFE,Utilities,Utilities—Regulated Gas,2019-12-31,189125000,210258000,209557000.0,145500000,-21133000.0


Checked on Yahoo Finance website for companies EVRG, WHR, SBUX. costOfRevenue holds the correct figure. Not sure
what reconciledCostOfRevenue is but it doesn't match anything displayed on the web site.

In [36]:
df[df['grossProfit'].isnull()].sort_values(by=['industry','company','st_date'])[revenue]

Unnamed: 0,company,yahoo_sector,industry,st_date,totalRevenue,costOfRevenue,reconciledCostOfRevenue,operatingRevenue,grossProfit
3656,RIVN,Consumer Cyclical,Auto Manufacturers,2020-12-31,0,0,,0,
3640,QS,Consumer Cyclical,Auto Parts,2019-12-31,0,0,,0,
3641,QS,Consumer Cyclical,Auto Parts,2020-12-31,0,0,,0,
3642,QS,Consumer Cyclical,Auto Parts,2021-12-31,0,0,,0,
3638,QS,Consumer Cyclical,Auto Parts,2022-12-31,0,0,,0,
4192,MRNA,Healthcare,Biotechnology,2019-12-31,48036000,0,,48036000,
4186,MRTX,Healthcare,Biotechnology,2019-12-31,3335000,0,,3335000,
4187,MRTX,Healthcare,Biotechnology,2020-12-31,13398000,0,,13398000,
4182,MRTX,Healthcare,Biotechnology,2021-12-31,72092000,0,,72092000,
4210,NVAX,Healthcare,Biotechnology,2019-12-31,18662000,0,,18662000,


Set grossProfit to totalRevenue if null given that cost of Revenue is 0. Note, that given the type of companies
shown above, it's feasible that costOfRevenue is reported as 0.

In [37]:
df['grossProfit'] = np.where(df.grossProfit.isnull(), \
                            df.totalRevenue, df.grossProfit)

In [38]:
revenue

['company',
 'yahoo_sector',
 'industry',
 'st_date',
 'totalRevenue',
 'costOfRevenue',
 'reconciledCostOfRevenue',
 'operatingRevenue',
 'grossProfit']

In [39]:
clist= ['totalRevenue',
        'costOfRevenue',
        'reconciledCostOfRevenue',
        'operatingRevenue']

df[clist] = df[clist].fillna(0)

In [40]:
final_revenue = ['totalRevenue','costOfRevenue','grossProfit']

## Income

In [41]:
df[income].isnull().sum().sort_values()

company                                                   0
yahoo_sector                                              0
industry                                                  0
st_date                                                   0
netIncome                                                 0
operatingIncome                                           0
netIncomeContinuousOperations                             0
netIncomeFromContinuingAndDiscontinuedOperation           0
netIncomeFromContinuingOperationNetMinorityInterest       0
pretaxIncome                                              0
netInterestIncome                                        68
otherIncomeExpense                                      140
totalOperatingIncomeAsReported                          353
interestIncome                                         1303
dtype: int64

Investigate netInterestIncome later as most likely this is net of interest paid and received.

In [42]:
df[df['netIncome'] != df['netIncomeFromContinuingOperationNetMinorityInterest']][income].head(500)

Unnamed: 0,company,yahoo_sector,industry,st_date,interestIncome,netInterestIncome,netIncome,operatingIncome,otherIncomeExpense,netIncomeContinuousOperations,netIncomeFromContinuingAndDiscontinuedOperation,netIncomeFromContinuingOperationNetMinorityInterest,totalOperatingIncomeAsReported,pretaxIncome
45,DVN,Energy,Oil & Gas E&P,2019-12-31,,-250000000.0,-355000000,181000000,-40000000,-79000000,-355000000,-81000000,,-109000000
46,DVN,Energy,Oil & Gas E&P,2020-12-31,12000000.0,-270000000.0,-2680000000,-113000000,-2707000000,-2543000000,-2680000000,-2552000000,,-3090000000
89,HES,Energy,Oil & Gas E&P,2022-12-31,,-493000000.0,2223000000,3847000000,192000000,2447000000,2223000000,2096000000,,3681000000
108,MPC,Energy,Oil & Gas Refining & Marketing,2020-12-31,9000000.0,-1365000000.0,-9919000000,-2707000000,-9540000000,-11275000000,-9919000000,-11124000000,-12247000000.0,-13612000000
109,MPC,Energy,Oil & Gas Refining & Marketing,2021-12-31,14000000.0,-1483000000.0,9738000000,3353000000,947000000,2553000000,9738000000,1290000000,4300000000.0,2817000000
110,MPC,Energy,Oil & Gas Refining & Marketing,2022-12-31,221000000.0,-1000000000.0,14516000000,18970000000,2499000000,15978000000,14516000000,14444000000,21469000000.0,20469000000
123,OXY,Energy,Oil & Gas E&P,2019-12-31,217000000.0,-849000000.0,-667000000,2815000000,-1780000000,-507000000,-667000000,-652000000,,186000000
124,OXY,Energy,Oil & Gas E&P,2020-12-31,118000000.0,-1306000000.0,-14831000000,-1258000000,-13141000000,-13533000000,-14831000000,-13533000000,,-15705000000
125,OXY,Energy,Oil & Gas E&P,2021-12-31,166000000.0,-1448000000.0,2322000000,4665000000,488000000,2790000000,2322000000,2790000000,,3705000000
126,OXY,Energy,Oil & Gas E&P,2022-12-31,153000000.0,-877000000.0,12504000000,13665000000,1329000000,13304000000,12504000000,13304000000,,14117000000


In [43]:
df[df['company'].isin(['DIS','PARA','IAC','T'])][income]

Unnamed: 0,company,yahoo_sector,industry,st_date,interestIncome,netInterestIncome,netIncome,operatingIncome,otherIncomeExpense,netIncomeContinuousOperations,netIncomeFromContinuingAndDiscontinuedOperation,netIncomeFromContinuingOperationNetMinorityInterest,totalOperatingIncomeAsReported,pretaxIncome
421,T,Communication Services,Telecom Services,2022-12-31,,-6108000000,-8524000000,22911000000,-19897000000,-6874000000,-8524000000,-8343000000,-4587000000.0,-3094000000
423,T,Communication Services,Telecom Services,2019-12-31,,-8422000000,13903000000,29413000000,-2523000000,14975000000,13903000000,13903000000,27955000000.0,18468000000
424,T,Communication Services,Telecom Services,2020-12-31,,-7925000000,-5176000000,25285000000,-20216000000,-3821000000,-5176000000,-5176000000,6405000000.0,-2856000000
425,T,Communication Services,Telecom Services,2021-12-31,,-6884000000,20081000000,28251000000,5580000000,21479000000,20081000000,20081000000,23347000000.0,26947000000
446,DIS,Communication Services,Entertainment,2019-09-30,268000000.0,-978000000,11054000000,11851000000,3071000000,10913000000,11054000000,10383000000,,13944000000
447,DIS,Communication Services,Entertainment,2020-09-30,156000000.0,-1491000000,-2864000000,3794000000,-4046000000,-2442000000,-2864000000,-2832000000,,-1743000000
448,DIS,Communication Services,Entertainment,2021-09-30,140000000.0,-1406000000,1995000000,3659000000,308000000,2536000000,1995000000,2024000000,,2561000000
449,DIS,Communication Services,Entertainment,2022-09-30,152000000.0,-1397000000,3145000000,6770000000,-88000000,3553000000,3145000000,3193000000,,5285000000
475,IAC,Communication Services,Internet Content & Information,2021-12-31,1351000.0,-34264000,603311000,-129460000,901137000,596580000,603311000,605142000,-129460000.0,737413000
476,IAC,Communication Services,Internet Content & Information,2022-12-31,24916000.0,-110165000,-1170170000,-362018000,-1078969000,-1195149000,-1170170000,-1172864000,-474771000.0,-1526236000


Checking DIS against Yahoo website, following fields match and can be taken in final cut:
    
netIncome, operatingIncome

In [44]:
income

['company',
 'yahoo_sector',
 'industry',
 'st_date',
 'interestIncome',
 'netInterestIncome',
 'netIncome',
 'operatingIncome',
 'otherIncomeExpense',
 'netIncomeContinuousOperations',
 'netIncomeFromContinuingAndDiscontinuedOperation',
 'netIncomeFromContinuingOperationNetMinorityInterest',
 'totalOperatingIncomeAsReported',
 'pretaxIncome']

In [45]:
clist= ['interestIncome',
        'netInterestIncome',
        'netIncome',
        'operatingIncome',
        'otherIncomeExpense',
        'netIncomeContinuousOperations']
      
df[clist] = df[clist].fillna(0)

In [46]:
final_income = ['netIncome','operatingIncome','netIncomeContinuousOperations',
                'netInterestIncome','interestIncome','otherIncomeExpense']

## Expense

In [47]:
df[expense].isnull().sum().sort_values()

company                               0
yahoo_sector                          0
industry                              0
st_date                               0
netInterestIncome                     0
totalExpenses                         0
operatingExpense                      8
taxProvision                         15
sellingGeneralAndAdministration     190
interestExpense                     238
interestExpenseNonOperating         238
researchAndDevelopment             1681
generalAndAdministrativeExpense    1704
otherGandA                         1756
otherOperatingExpenses             2035
sellingAndMarketingExpense         2185
dtype: int64

In [48]:
df[df['operatingExpense'].isnull()].sort_values(by=['industry','company','st_date'])[expense]

Unnamed: 0,company,yahoo_sector,industry,st_date,sellingGeneralAndAdministration,generalAndAdministrativeExpense,otherGandA,researchAndDevelopment,sellingAndMarketingExpense,netInterestIncome,interestExpense,interestExpenseNonOperating,taxProvision,otherOperatingExpenses,operatingExpense,totalExpenses
1693,CACI,Technology,Information Technology Services,2019-06-30,,,,,,-49958000,49958000,49958000,62305000,,,4608474000
1694,CACI,Technology,Information Technology Services,2020-06-30,,,,,,-56059000,56059000,56059000,80157000,,,5262346000
1695,CACI,Technology,Information Technology Services,2021-06-30,,,,,,-39836000,39836000,39836000,42172000,,,5504684000
1696,CACI,Technology,Information Technology Services,2022-06-30,,,,,,-41757000,41757000,41757000,87778000,,,5706588000
1765,CSX,Industrials,Railroads,2019-12-31,,,,,,-737000000,737000000,737000000,985000000,,,7063000000
1766,CSX,Industrials,Railroads,2020-12-31,,,,,,-754000000,754000000,754000000,862000000,,,6221000000
1767,CSX,Industrials,Railroads,2021-12-31,,,,,,-722000000,722000000,722000000,1170000000,,,7382000000
1768,CSX,Industrials,Railroads,2022-12-31,,,,,,-700000000,742000000,742000000,1248000000,,,9068000000


Check Yahoo website. Cost of revenue is equal to totalExpenses for both companies. 

Delete both companies in order to avoid making false assumptions about operating expenses. Note that SGA is not 
populated and it looks like it is all bundled into costOfRevenue.

In [49]:
drop_data_details['companies'] = drop_data_details['companies'] + ['CACI','CSX']
df.drop(df[df['company'] == 'CACI'].index, inplace=True, axis=0)
df.drop(df[df['company'] == 'CSX'].index, inplace=True, axis=0)

In [50]:
len(df['company'].unique())

724

In [51]:
df[df['sellingGeneralAndAdministration'].isnull()].sort_values(by=['industry','company','st_date'])[expense].head(400)

Unnamed: 0,company,yahoo_sector,industry,st_date,sellingGeneralAndAdministration,generalAndAdministrativeExpense,otherGandA,researchAndDevelopment,sellingAndMarketingExpense,netInterestIncome,interestExpense,interestExpenseNonOperating,taxProvision,otherOperatingExpenses,operatingExpense,totalExpenses
2029,LMT,Industrials,Aerospace & Defense,2019-12-31,,,,,,-653000000,653000000.0,653000000.0,1011000000,-178000000.0,-178000000,51445000000
2030,LMT,Industrials,Aerospace & Defense,2020-12-31,,,,,,-591000000,591000000.0,591000000.0,1347000000,10000000.0,10000000,56754000000
2031,LMT,Industrials,Aerospace & Defense,2021-12-31,,,,,,-569000000,569000000.0,569000000.0,1235000000,-62000000.0,-62000000,57921000000
2032,LMT,Industrials,Aerospace & Defense,2022-12-31,,,,,,-623000000,623000000.0,623000000.0,948000000,-61000000.0,-61000000,57636000000
2238,LUV,Industrials,Airlines,2019-12-31,,,,,,8000000,82000000.0,82000000.0,657000000,3026000000.0,3026000000,19471000000
2239,LUV,Industrials,Airlines,2020-12-31,,,,,,-282000000,314000000.0,314000000.0,-1182000000,1926000000.0,1926000000,12864000000
2240,LUV,Industrials,Airlines,2021-12-31,,,,,,-418000000,431000000.0,431000000.0,348000000,2394000000.0,2394000000,14069000000
2236,LUV,Industrials,Airlines,2022-12-31,,,,,,-84000000,301000000.0,301000000.0,189000000,3735000000.0,3735000000,22797000000
3404,GPS,Consumer Cyclical,Apparel Retail,2020-01-31,,,,,,-46000000,76000000.0,76000000.0,177000000,5559000000.0,5559000000,15809000000
3400,GPS,Consumer Cyclical,Apparel Retail,2021-01-31,,,,,,-182000000,192000000.0,192000000.0,-437000000,5567000000.0,5567000000,14662000000


Conclusion: if SGA is not populated then neither are any of the related fields that we can derive from or 
substitute from. 

In [52]:
final_expense = ['operatingExpense','totalExpenses','taxProvision','interestExpense',
                 'sellingGeneralAndAdministration','researchAndDevelopment']

## Ad hoc

In [53]:
df[adhoc].isnull().sum().sort_values()

company                           0
yahoo_sector                      0
industry                          0
st_date                           0
netInterestIncome                 0
interestIncome                    0
ebit                              0
dilutedEPS                       13
basicEPS                         13
interestExpense                 238
interestExpenseNonOperating     238
interestIncomeNonOperating     1299
eBITDA                         2283
dtype: int64

Conclusion, we can use ebit in our performance analysis. Ignore EBITDA.

In [54]:
## Investigate Interest Income and Interest Expense

df[adhoc].head(100)

Unnamed: 0,company,yahoo_sector,industry,st_date,netInterestIncome,interestIncome,interestExpense,interestExpenseNonOperating,interestIncomeNonOperating,dilutedEPS,basicEPS,ebit,eBITDA
0,AM,Energy,Oil & Gas Midstream,2022-12-31,-189948000,0,189948000.0,189948000.0,,1,1,633684000,836118000.0
2,AM,Energy,Oil & Gas Midstream,2019-12-31,-110402000,0,110402000.0,110402000.0,,-1,-1,-347178000,
3,AM,Energy,Oil & Gas Midstream,2020-12-31,-147007000,0,147007000.0,147007000.0,,0,0,-31208000,
4,AM,Energy,Oil & Gas Midstream,2021-12-31,-44366000,0,44366000.0,44366000.0,,1,1,151568000,
10,APA,Energy,Oil & Gas E&P,2019-12-31,-462000000,13000000,462000000.0,462000000.0,13000000.0,-9,-9,-2546000000,
11,APA,Energy,Oil & Gas E&P,2020-12-31,-267000000,7000000,267000000.0,267000000.0,7000000.0,-13,-13,-4573000000,
12,APA,Energy,Oil & Gas E&P,2021-12-31,-514000000,8000000,514000000.0,514000000.0,8000000.0,3,3,2405000000,
13,APA,Energy,Oil & Gas E&P,2022-12-31,-379000000,10000000,314000000.0,379000000.0,10000000.0,11,11,6113000000,7281000000.0
15,BKR,Energy,Oil & Gas Equipment & Services,2019-12-31,-237000000,0,237000000.0,237000000.0,,0,0,990000000,
16,BKR,Energy,Oil & Gas Equipment & Services,2020-12-31,-264000000,0,264000000.0,264000000.0,,-15,-15,-14938000000,


netInterestIncome is the net of InterestIncome and InterestExpense but there are anomalies. For example true for DINO
but not for APA.

Let's look closer where Interest Expense is null.

In [55]:
df[df['interestExpense'].isnull()].sort_values(by=['industry','company','st_date'])[adhoc]

Unnamed: 0,company,yahoo_sector,industry,st_date,netInterestIncome,interestIncome,interestExpense,interestExpenseNonOperating,interestIncomeNonOperating,dilutedEPS,basicEPS,ebit,eBITDA
1648,AXON,Industrials,Aerospace & Defense,2019-12-31,8464000,0,,,,0.0,0.0,-6394000,
1649,AXON,Industrials,Aerospace & Defense,2020-12-31,7859000,7859000,,,7859000.0,-0.0,-0.0,-14150000,
1650,AXON,Industrials,Aerospace & Defense,2021-12-31,26748000,26748000,,,26748000.0,-1.0,-1.0,-168123000,
1651,AXON,Industrials,Aerospace & Defense,2022-12-31,103265000,103265000,,,103265000.0,2.0,2.0,93253000,117634000.0
3324,COLM,Consumer Cyclical,Apparel Manufacturing,2020-12-31,435000,435000,,,435000.0,2.0,2.0,137049000,
3325,COLM,Consumer Cyclical,Apparel Manufacturing,2021-12-31,1380000,1380000,,,1380000.0,5.0,5.0,450504000,
3326,COLM,Consumer Cyclical,Apparel Manufacturing,2022-12-31,2713000,2713000,,,2713000.0,5.0,5.0,428704000,546103000.0
3513,LULU,Consumer Cyclical,Apparel Retail,2020-01-31,0,0,,,,5.0,5.0,889110000,
3514,LULU,Consumer Cyclical,Apparel Retail,2021-01-31,0,0,,,,4.0,5.0,849828000,
3515,LULU,Consumer Cyclical,Apparel Retail,2022-01-31,0,0,,,,7.0,8.0,1374749000,


In [56]:
df[df['interestExpense'].isnull() & df['netInterestIncome'].notnull()].sort_values(by=['industry','company','st_date'])[adhoc]

Unnamed: 0,company,yahoo_sector,industry,st_date,netInterestIncome,interestIncome,interestExpense,interestExpenseNonOperating,interestIncomeNonOperating,dilutedEPS,basicEPS,ebit,eBITDA
1648,AXON,Industrials,Aerospace & Defense,2019-12-31,8464000,0,,,,0.0,0.0,-6394000,
1649,AXON,Industrials,Aerospace & Defense,2020-12-31,7859000,7859000,,,7859000.0,-0.0,-0.0,-14150000,
1650,AXON,Industrials,Aerospace & Defense,2021-12-31,26748000,26748000,,,26748000.0,-1.0,-1.0,-168123000,
1651,AXON,Industrials,Aerospace & Defense,2022-12-31,103265000,103265000,,,103265000.0,2.0,2.0,93253000,117634000.0
3324,COLM,Consumer Cyclical,Apparel Manufacturing,2020-12-31,435000,435000,,,435000.0,2.0,2.0,137049000,
3325,COLM,Consumer Cyclical,Apparel Manufacturing,2021-12-31,1380000,1380000,,,1380000.0,5.0,5.0,450504000,
3326,COLM,Consumer Cyclical,Apparel Manufacturing,2022-12-31,2713000,2713000,,,2713000.0,5.0,5.0,428704000,546103000.0
3513,LULU,Consumer Cyclical,Apparel Retail,2020-01-31,0,0,,,,5.0,5.0,889110000,
3514,LULU,Consumer Cyclical,Apparel Retail,2021-01-31,0,0,,,,4.0,5.0,849828000,
3515,LULU,Consumer Cyclical,Apparel Retail,2022-01-31,0,0,,,,7.0,8.0,1374749000,


If both interestIncome and interestExpense are null then 
   set interestIncome to netInterestIncome if positive
otherwise 
   set interestExpense to -1 * netInterestIncome (reverse sign)

In [57]:
mask = (df['interestExpense'].isnull()) & (df['interestIncome'].isnull()) & (df['netInterestIncome'] > 0)
df.loc[mask,'interestIncome'] = df['netInterestIncome']


In [58]:
mask = (df['interestExpense'].isnull()) & (df['interestIncome'].isnull()) & (df['netInterestIncome'] < 0)
df.loc[mask,'interestExpense'] = df['netInterestIncome'] * -1


In [59]:
df[df['company'] == 'SAM'][adhoc]

Unnamed: 0,company,yahoo_sector,industry,st_date,netInterestIncome,interestIncome,interestExpense,interestExpenseNonOperating,interestIncomeNonOperating,dilutedEPS,basicEPS,ebit,eBITDA
672,SAM,Consumer Defensive,Beverages—Brewers,2019-12-31,647000,647000,,,647000.0,9,9,145823000,
673,SAM,Consumer Defensive,Beverages—Brewers,2020-12-31,-199000,0,,,,16,16,248673000,
674,SAM,Consumer Defensive,Beverages—Brewers,2021-12-31,-110000,0,,,,1,1,57174000,
675,SAM,Consumer Defensive,Beverages—Brewers,2022-12-31,2561000,0,,,,5,5,126052000,207408000.0


In [60]:
df[(df['interestExpense'].isnull()) & (df['netInterestIncome'] < 0)][adhoc]

Unnamed: 0,company,yahoo_sector,industry,st_date,netInterestIncome,interestIncome,interestExpense,interestExpenseNonOperating,interestIncomeNonOperating,dilutedEPS,basicEPS,ebit,eBITDA
96,KMI,Energy,Oil & Gas Midstream,2022-12-31,-1513000000,0,,,,1.0,1.0,4033000000,6219000000.0
99,KMI,Energy,Oil & Gas Midstream,2020-12-31,-1595000000,0,,,,0.0,0.0,3492000000,
100,KMI,Energy,Oil & Gas Midstream,2021-12-31,-1492000000,0,,,,1.0,1.0,4540000000,
172,TRGP,Energy,Oil & Gas Midstream,2020-12-31,-345700000,0,,,,-7.0,-7.0,1253100000,
216,AWK,Utilities,Utilities—Regulated Water,2019-12-31,-382000000,0,,,,3.0,3.0,1204000000,
217,AWK,Utilities,Utilities—Regulated Water,2020-12-31,-395000000,0,,,,4.0,4.0,1248000000,
539,META,Communication Services,Internet Content & Information,2022-12-31,-125000000,276000000,,,276000000.0,9.0,9.0,28944000000,37630000000.0
592,PLTK,Communication Services,Electronic Gaming & Multimedia,2019-12-31,-61100000,0,,,,1.0,1.0,497400000,
673,SAM,Consumer Defensive,Beverages—Brewers,2020-12-31,-199000,0,,,,16.0,16.0,248673000,
674,SAM,Consumer Defensive,Beverages—Brewers,2021-12-31,-110000,0,,,,1.0,1.0,57174000,


In [61]:
mask = (df['interestExpense'].isnull()) & (df['netInterestIncome'] < 0)
df.loc[mask,'interestExpense'] = (df['netInterestIncome'] - df['interestIncome']) * -1


In [62]:
df[df['company'] == 'META'][adhoc]

Unnamed: 0,company,yahoo_sector,industry,st_date,netInterestIncome,interestIncome,interestExpense,interestExpenseNonOperating,interestIncomeNonOperating,dilutedEPS,basicEPS,ebit,eBITDA
539,META,Communication Services,Internet Content & Information,2022-12-31,-125000000,276000000,401000000.0,,276000000,9,9,28944000000,37630000000.0
541,META,Communication Services,Internet Content & Information,2019-12-31,826000000,924000000,20000000.0,20000000.0,924000000,6,6,23986000000,
542,META,Communication Services,Internet Content & Information,2020-12-31,509000000,509000000,,,509000000,10,10,32671000000,
543,META,Communication Services,Internet Content & Information,2021-12-31,531000000,531000000,,,531000000,14,14,46753000000,


In [63]:
# Check EPS

df[df['dilutedEPS'].isnull()].sort_values(by=['industry','company','st_date'])[adhoc]

Unnamed: 0,company,yahoo_sector,industry,st_date,netInterestIncome,interestIncome,interestExpense,interestExpenseNonOperating,interestIncomeNonOperating,dilutedEPS,basicEPS,ebit,eBITDA
3758,VFC,Consumer Cyclical,Apparel Manufacturing,2023-03-31,-164632000,9758000,174390000.0,174390000.0,9758000.0,,,1062702000,480001000
3291,KMX,Consumer Cyclical,Auto & Truck Dealerships,2023-02-28,938402000,1441500000,430698000.0,430698000.0,1441500000.0,,,1067502000,1332726000
2898,NTAP,Technology,Computer Hardware,2023-04-30,0,0,,,,,,1159000000,1407000000
813,KR,Consumer Defensive,Grocery Stores,2023-01-31,-535000000,0,535000000.0,535000000.0,,,,3437000000,7016000000
2644,DXC,Technology,Information Technology Services,2023-03-31,-65000000,135000000,200000000.0,200000000.0,135000000.0,,,-685000000,866000000
3286,CPRI,Consumer Cyclical,Luxury Goods,2023-03-31,-24000000,0,24000000.0,,,,,679000000,1016000000
4170,MDT,Healthcare,Medical Devices,2023-04-30,-636000000,0,636000000.0,636000000.0,,,,6000000000,8697000000
894,SJM,Consumer Defensive,Packaged Foods,2023-04-30,-152000000,0,152000000.0,152000000.0,,,,142800000,579300000
2313,UHAL,Industrials,Rental & Leasing Services,2023-03-31,-223958000,0,223958000.0,223958000.0,,,,1441881000,1928676000
2424,ALGM,Technology,Semiconductors,2023-03-31,-612000,1724000,2336000.0,2336000.0,1724000.0,,,213682000,264490000


In [64]:
pd.options.display.float_format = '{:.2f}'.format
company_list = list(df[df['dilutedEPS'].isnull()].sort_values(by=['industry','company','st_date'])['company'].unique())
df[df['company'].isin(company_list)][['company','st_date','dilutedEPS','basicEPS']]

Unnamed: 0,company,st_date,dilutedEPS,basicEPS
813,KR,2023-01-31,,
814,KR,2020-01-31,2.04,2.05
815,KR,2021-01-31,3.27,3.31
816,KR,2022-01-31,2.17,2.2
891,SJM,2020-04-30,6.84,6.84
892,SJM,2021-04-30,7.79,7.79
893,SJM,2022-04-30,5.83,5.84
894,SJM,2023-04-30,,
2309,UHAL,2020-03-31,2.25,2.25
2310,UHAL,2021-03-31,3.12,3.12


Problem is with 2023 data. 

EPS is a good indicator of performance and we want to consider it part of the labelling strategy. The number of 
companies affected is small so let's just drop these companies.

In [65]:
drop_data_details['companies'] = drop_data_details['companies'] + company_list

for ticker in company_list:
  print(ticker)
  df.drop(df[df['company'] == ticker].index, inplace=True, axis=0)

VFC
KMX
NTAP
KR
DXC
CPRI
MDT
SJM
UHAL
ALGM
DOCU
ESTC
SMAR


In [66]:
len(df['company'].unique())

711

In [67]:
drop_data_details['companies']

['AMBP',
 'CI',
 'ORCL',
 'LHX',
 'LSXMK',
 'VTS',
 'BF-A',
 'CCCS',
 'AGL',
 'FWONK',
 'NWSA',
 'DRI',
 'AR',
 'FOXA',
 'RPM',
 'FDX',
 'PARAA',
 'LCID',
 'GIS',
 'HUM',
 'RXO',
 'HRB',
 'LBRDK',
 'UAA',
 'BEPC',
 'LSXMA',
 'FYBR',
 'CAG',
 'DNA',
 'ELV',
 'PAYX',
 'MBC',
 'NKE',
 'CASY',
 'EHAB',
 'ZG',
 'ESAB',
 'CXT',
 'GEHC',
 'FWONA',
 'CTAS',
 'CHPT',
 'LW',
 'CHK',
 'RGLD',
 'OSK',
 'AZPN',
 'KD',
 'PYCR',
 'UA',
 'DH',
 'DNB',
 'BKI',
 'CACI',
 'CSX',
 'VFC',
 'KMX',
 'NTAP',
 'KR',
 'DXC',
 'CPRI',
 'MDT',
 'SJM',
 'UHAL',
 'ALGM',
 'DOCU',
 'ESTC',
 'SMAR']

In [68]:
final_adhoc = ['ebit','dilutedEPS','basicEPS']

## Save modified data

In [69]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_income_sheets_modified_stage4.pkl')
with open(filepath,'wb') as f:
    pickle.dump(df,f)

In [70]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_drop_data_details_stage4.pkl')
with open(filepath,'wb') as f:
    pickle.dump(drop_data_details,f)

## Build Final Cut for Income Sheets

In [71]:
col_list = ['company', 'yahoo_sector','gics_sector','industry','st_date', 'st_YR','st_Mnth'] + \
           final_revenue + final_income + final_expense + final_adhoc
df_final = df[col_list].copy()
df_final

Unnamed: 0,company,yahoo_sector,gics_sector,industry,st_date,st_YR,st_Mnth,totalRevenue,costOfRevenue,grossProfit,netIncome,operatingIncome,netIncomeContinuousOperations,netInterestIncome,interestIncome,otherIncomeExpense,operatingExpense,totalExpenses,taxProvision,interestExpense,sellingGeneralAndAdministration,researchAndDevelopment,ebit,dilutedEPS,basicEPS
0,AM,Energy,Energy,Oil & Gas Midstream,2022-12-31,2022,12,990657000.00,382688000.00,607969000.00,326242000.00,540917000.00,326242000.00,-189948000.00,0.00,92767000.00,67052000.00,449740000.00,117494000.00,189948000.00,62125000.00,,633684000.00,0.68,0.68
2,AM,Energy,Energy,Oil & Gas Midstream,2019-12-31,2019,12,849598000.00,348354000.00,501244000.00,-355114000.00,371543000.00,-355114000.00,-110402000.00,0.00,-718721000.00,129701000.00,478055000.00,-102466000.00,110402000.00,118113000.00,,-347178000.00,-0.80,-0.80
3,AM,Energy,Energy,Oil & Gas Midstream,2020-12-31,2020,12,971391000.00,344848000.00,626543000.00,-122527000.00,558931000.00,-122527000.00,-147007000.00,0.00,-590139000.00,67612000.00,412460000.00,-55688000.00,147007000.00,52213000.00,,-31208000.00,-0.26,-0.26
4,AM,Energy,Energy,Oil & Gas Midstream,2021-12-31,2021,12,234158000.00,84254000.00,149904000.00,78626000.00,131980000.00,78626000.00,-44366000.00,0.00,19588000.00,17924000.00,102178000.00,28576000.00,44366000.00,16847000.00,,151568000.00,0.69,0.69
10,APA,Energy,Energy,Oil & Gas E&P,2019-12-31,2019,12,6315000000.00,4433000000.00,1882000000.00,-3515000000.00,410000000.00,-3682000000.00,-462000000.00,13000000.00,-2956000000.00,1472000000.00,5905000000.00,674000000.00,462000000.00,406000000.00,,-2546000000.00,-9.43,-9.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4404,ZBH,Healthcare,Health Care,Medical Devices,2022-12-31,2022,12,6939900000.00,2019500000.00,4920400000.00,231400000.00,1192100000.00,291200000.00,-164800000.00,0.00,-623800000.00,3728300000.00,5747800000.00,112300000.00,164800000.00,2761700000.00,406000000.00,568300000.00,1.10,1.10
4406,ZTS,Healthcare,Health Care,Drug Manufacturers—Specialty & Generic,2019-12-31,2019,12,6260000000.00,1992000000.00,4268000000.00,1500000000.00,2018000000.00,1500000000.00,-223000000.00,37000000.00,6000000.00,2250000000.00,4242000000.00,301000000.00,223000000.00,1638000000.00,457000000.00,2024000000.00,3.11,3.14
4407,ZTS,Healthcare,Health Care,Drug Manufacturers—Specialty & Generic,2020-12-31,2020,12,6675000000.00,2057000000.00,4618000000.00,1638000000.00,2269000000.00,1636000000.00,-219000000.00,12000000.00,-54000000.00,2349000000.00,4406000000.00,360000000.00,231000000.00,1726000000.00,463000000.00,2227000000.00,3.42,3.44
4408,ZTS,Healthcare,Health Care,Drug Manufacturers—Specialty & Generic,2021-12-31,2021,12,7776000000.00,2303000000.00,5473000000.00,2037000000.00,2803000000.00,2034000000.00,-224000000.00,6000000.00,-91000000.00,2670000000.00,4973000000.00,454000000.00,224000000.00,2001000000.00,508000000.00,2712000000.00,4.27,4.29


In [72]:
X = list(df_final.columns)
X.sort()
X

['basicEPS',
 'company',
 'costOfRevenue',
 'dilutedEPS',
 'ebit',
 'gics_sector',
 'grossProfit',
 'industry',
 'interestExpense',
 'interestIncome',
 'netIncome',
 'netIncomeContinuousOperations',
 'netInterestIncome',
 'operatingExpense',
 'operatingIncome',
 'otherIncomeExpense',
 'researchAndDevelopment',
 'sellingGeneralAndAdministration',
 'st_Mnth',
 'st_YR',
 'st_date',
 'taxProvision',
 'totalExpenses',
 'totalRevenue',
 'yahoo_sector']

In [73]:
len(df['company'].unique())

711

In [74]:
# Order by company and year.

df_final = df_final.sort_values(by=['company','st_YR'])

In [75]:
# Save final cut

filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_income_sheets_final_stage4.pkl')
with open(filepath,'wb') as f:
    pickle.dump(df_final,f)