## Consolidate Extraction of Data

Save final data sets having ensured that companies and sectors identified for deletion are removed from all
three data sets.

In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
pd.set_option('display.max_rows', 900)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
pd.options.display.float_format = '{:.0f}'.format

In [3]:
DATA_ROOT_DIR='/mnt/data/projects/MD3'
PROJ_ROOT_DIR='/home/priyesh/projects/MD3'

In [4]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_balance_sheets_final_stage3.pkl')
balance_sheets = pd.read_pickle(filepath)
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_income_sheets_final_stage4.pkl')
income_sheets = pd.read_pickle(filepath)
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_cash_sheets_final_stage5.pkl')
cash_sheets = pd.read_pickle(filepath)

In [5]:
list(income_sheets.columns)

['company',
 'yahoo_sector',
 'gics_sector',
 'industry',
 'st_date',
 'st_YR',
 'st_Mnth',
 'totalRevenue',
 'costOfRevenue',
 'grossProfit',
 'netIncome',
 'operatingIncome',
 'netIncomeContinuousOperations',
 'netInterestIncome',
 'interestIncome',
 'otherIncomeExpense',
 'operatingExpense',
 'totalExpenses',
 'taxProvision',
 'interestExpense',
 'sellingGeneralAndAdministration',
 'researchAndDevelopment',
 'ebit',
 'dilutedEPS',
 'basicEPS']

In [6]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_drop_data_details_stage5.pkl')
drop_data_details = pd.read_pickle(filepath)

In [7]:
# Check industries we need to drop

drop_data_details['industry']

['Healthcare Plans']

In [8]:
# Check sectors to drop

drop_data_details['sector']

['Real Estate']

In [9]:
# Remove duplicates from list of companies to drop

drop_companies = list(set(drop_data_details['companies']))
drop_companies

['FOXA',
 'GEHC',
 'LSXMK',
 'CASY',
 'CXT',
 'FWONA',
 'DH',
 'DRI',
 'ELV',
 'ESTC',
 'DOCU',
 'LSXMA',
 'CHK',
 'CI',
 'AMBP',
 'NWSA',
 'HUM',
 'BKI',
 'GIS',
 'AR',
 'RGLD',
 'FWONK',
 'UA',
 'LHX',
 'FDX',
 'VFC',
 'KR',
 'ZG',
 'LCID',
 'OSK',
 'ESAB',
 'CCCS',
 'KD',
 'EHAB',
 'HRB',
 'FYBR',
 'VTS',
 'DNB',
 'CTAS',
 'SJM',
 'LBRDK',
 'UHAL',
 'RPM',
 'RXO',
 'DNA',
 'CHPT',
 'PAYX',
 'CPRI',
 'NTAP',
 'DXC',
 'NKE',
 'AZPN',
 'ALGM',
 'CSX',
 'KMX',
 'BEPC',
 'AGL',
 'LW',
 'UAA',
 'PARAA',
 'MDT',
 'SMAR',
 'CAG',
 'ORCL',
 'BF-A',
 'CACI',
 'PYCR',
 'MBC']

In [10]:
# To ensure consistency across all 3 datasets, drop companies, industries and sectors as specified in
# drop_data_details.

for company in drop_companies:
   print(company)
   balance_sheets.drop(balance_sheets[balance_sheets['company'] == company].index, inplace=True, axis=0)
   income_sheets.drop(income_sheets[income_sheets['company'] == company].index, inplace=True, axis=0)

balance_sheets.drop(balance_sheets[balance_sheets['yahoo_sector'] == 'Real Estate'].index, inplace=True, axis=0)
income_sheets.drop(income_sheets[income_sheets['yahoo_sector'] == 'Real Estate'].index, inplace=True, axis=0)
cash_sheets.drop(cash_sheets[cash_sheets['yahoo_sector'] == 'Real Estate'].index, inplace=True, axis=0)

balance_sheets.drop(balance_sheets[balance_sheets['industry'] == 'Healthcare Plans'].index, inplace=True, axis=0)
income_sheets.drop(income_sheets[income_sheets['industry'] == 'Healthcare Plans'].index, inplace=True, axis=0)
cash_sheets.drop(cash_sheets[cash_sheets['industry'] == 'Healthcare Plans'].index, inplace=True, axis=0)

FOXA
GEHC
LSXMK
CASY
CXT
FWONA
DH
DRI
ELV
ESTC
DOCU
LSXMA
CHK
CI
AMBP
NWSA
HUM
BKI
GIS
AR
RGLD
FWONK
UA
LHX
FDX
VFC
KR
ZG
LCID
OSK
ESAB
CCCS
KD
EHAB
HRB
FYBR
VTS
DNB
CTAS
SJM
LBRDK
UHAL
RPM
RXO
DNA
CHPT
PAYX
CPRI
NTAP
DXC
NKE
AZPN
ALGM
CSX
KMX
BEPC
AGL
LW
UAA
PARAA
MDT
SMAR
CAG
ORCL
BF-A
CACI
PYCR
MBC


In [11]:
bal_tickers = list(balance_sheets['company'].unique())
len(bal_tickers)

711

In [12]:
inc_tickers = list(income_sheets['company'].unique())
len(inc_tickers)

711

In [13]:
cash_tickers = list(cash_sheets['company'].unique())
len(cash_tickers)

711

In [14]:
print("Income sheets: ",len(income_sheets))
print("Cash sheets: ",len(cash_sheets))
print("balance_sheets: ", len(balance_sheets))

Income sheets:  2844
Cash sheets:  2844
balance_sheets:  2844


## Rename Fields 

Rename fields for convenience going forwards. 

In [15]:
list(income_sheets.columns)

['company',
 'yahoo_sector',
 'gics_sector',
 'industry',
 'st_date',
 'st_YR',
 'st_Mnth',
 'totalRevenue',
 'costOfRevenue',
 'grossProfit',
 'netIncome',
 'operatingIncome',
 'netIncomeContinuousOperations',
 'netInterestIncome',
 'interestIncome',
 'otherIncomeExpense',
 'operatingExpense',
 'totalExpenses',
 'taxProvision',
 'interestExpense',
 'sellingGeneralAndAdministration',
 'researchAndDevelopment',
 'ebit',
 'dilutedEPS',
 'basicEPS']

In [16]:
income_sheets.rename(columns={"sellingGeneralAndAdministration": "SGA"},inplace=True)

In [17]:
income_sheets.head()

Unnamed: 0,company,yahoo_sector,gics_sector,industry,st_date,st_YR,st_Mnth,totalRevenue,costOfRevenue,grossProfit,netIncome,operatingIncome,netIncomeContinuousOperations,netInterestIncome,interestIncome,otherIncomeExpense,operatingExpense,totalExpenses,taxProvision,interestExpense,SGA,researchAndDevelopment,ebit,dilutedEPS,basicEPS
3831,A,Healthcare,Health Care,Diagnostics & Research,2019-10-31,2019,10,5163000000,2358000000,2805000000,1071000000,941000000,1071000000,-38000000,36000000,16000000,1864000000,4222000000,-152000000,74000000,1460000000,404000000,993000000,3,3
3832,A,Healthcare,Health Care,Diagnostics & Research,2020-10-31,2020,10,5339000000,2502000000,2837000000,719000000,846000000,719000000,-70000000,8000000,66000000,1991000000,4493000000,123000000,78000000,1496000000,495000000,920000000,2,2
3833,A,Healthcare,Health Care,Diagnostics & Research,2021-10-31,2021,10,6319000000,2912000000,3407000000,1210000000,1347000000,1210000000,-79000000,2000000,92000000,2060000000,4972000000,150000000,81000000,1619000000,441000000,1441000000,4,4
3834,A,Healthcare,Health Care,Diagnostics & Research,2022-10-31,2022,10,6848000000,3126000000,3722000000,1254000000,1618000000,1254000000,-75000000,9000000,-39000000,2104000000,5230000000,250000000,84000000,1637000000,467000000,1588000000,4,4
945,AA,Basic Materials,Materials,Aluminum,2019-12-31,2019,12,10433000000,8537000000,1896000000,-1125000000,876000000,-853000000,-121000000,0,-1193000000,1020000000,9557000000,415000000,121000000,280000000,27000000,-317000000,-6,-6


In [18]:
list(balance_sheets.columns)

['company',
 'yahoo_sector',
 'gics_sector',
 'industry',
 'st_date',
 'st_YR',
 'st_Mnth',
 'cashCashEquivalentsAndShortTermInvestments',
 'receivables',
 'finishedGoods',
 'workInProcess',
 'rawMaterials',
 'otherCurrentAssets',
 'inventory',
 'currentAssets',
 'netPPE',
 'otherNonCurrentAssets',
 'financialAssets',
 'goodwill',
 'goodwillAndOtherIntangibleAssets',
 'otherIntangibleAssets',
 'nonCurrentAccountsReceivable',
 'totalNonCurrentAssets',
 'currentDebtAndCapitalLeaseObligation',
 'payablesAndAccruedExpenses',
 'otherCurrentLiabilities',
 'currentLiabilities',
 'longTermDebtAndCapitalLeaseObligation',
 'otherNonCurrentLiabilities',
 'nonCurrentDeferredLiabilities',
 'nonCurrentDeferredTaxesLiabilities',
 'longTermProvisions',
 'totalNonCurrentLiabilitiesNetMinorityInterest',
 'retainedEarnings',
 'stockholdersEquity']

In [19]:
balance_sheets.rename(columns={"cashCashEquivalentsAndShortTermInvestments": "cashEquivalent",
                               "currentDebtAndCapitalLeaseObligation":"currentDebt",
                               "longTermDebtAndCapitalLeaseObligation":"longTermDebt",
                               "totalNonCurrentLiabilitiesNetMinorityInterest":"totalNonCurrentLiabilities"},inplace=True)

In [20]:
balance_sheets.head()

Unnamed: 0,company,yahoo_sector,gics_sector,industry,st_date,st_YR,st_Mnth,cashEquivalent,receivables,finishedGoods,workInProcess,rawMaterials,otherCurrentAssets,inventory,currentAssets,netPPE,otherNonCurrentAssets,financialAssets,goodwill,goodwillAndOtherIntangibleAssets,otherIntangibleAssets,nonCurrentAccountsReceivable,totalNonCurrentAssets,currentDebt,payablesAndAccruedExpenses,otherCurrentLiabilities,currentLiabilities,longTermDebt,otherNonCurrentLiabilities,nonCurrentDeferredLiabilities,nonCurrentDeferredTaxesLiabilities,longTermProvisions,totalNonCurrentLiabilities,retainedEarnings,stockholdersEquity
2919,A,Healthcare,Health Care,Diagnostics & Research,2019-10-31,2019,10,1382000000,930000000,416000000,0,263000000,198000000,679000000,3189000000,850000000,611000000,0,4700000000,4700000000,1107000000,0,6263000000,616000000,794000000,0,2080000000,1791000000,473000000,0,0,0,2624000000,-18000000,4748000000
2920,A,Healthcare,Health Care,Diagnostics & Research,2020-10-31,2020,10,1441000000,1038000000,417000000,0,303000000,216000000,720000000,3415000000,845000000,776000000,0,4433000000,4433000000,831000000,0,6212000000,75000000,639000000,0,1467000000,2284000000,614000000,0,0,0,3287000000,81000000,4873000000
2921,A,Healthcare,Health Care,Diagnostics & Research,2021-10-31,2021,10,1575000000,1172000000,463000000,0,367000000,222000000,830000000,3799000000,945000000,820000000,0,4956000000,4956000000,981000000,0,6906000000,0,774000000,0,1708000000,2729000000,659000000,0,0,0,3608000000,348000000,5389000000
2922,A,Healthcare,Health Care,Diagnostics & Research,2022-10-31,2022,10,1053000000,1405000000,555000000,0,483000000,282000000,1038000000,3778000000,1100000000,670000000,0,4773000000,4773000000,821000000,0,6738000000,36000000,909000000,0,1861000000,2733000000,536000000,0,0,0,3366000000,324000000,5289000000
724,AA,Basic Materials,Materials,Aluminum,2019-12-31,2019,12,879000000,660000000,305000000,282000000,611000000,288000000,1644000000,3530000000,7916000000,1414000000,18000000,150000000,202000000,52000000,179000000,11110000000,1000000,1588000000,561000000,2563000000,1799000000,371000000,102000000,0,902000000,6221000000,-555000000,4082000000


In [21]:
list(cash_sheets.columns)

['company',
 'yahoo_sector',
 'gics_sector',
 'industry',
 'st_date',
 'st_YR',
 'st_Mnth',
 'netIncome',
 'netIncomeFromContinuingOperations',
 'depreciationAmortizationDepletion',
 'stockBasedCompensation',
 'cashFlowFromContinuingOperatingActivities',
 'capitalExpenditure',
 'saleOfBusiness',
 'purchaseOfBusiness',
 'netBusinessPurchaseAndSale',
 'saleOfPPE',
 'purchaseOfPPE',
 'netPPEPurchaseAndSale',
 'saleOfInvestment',
 'purchaseOfInvestment',
 'netInvestmentPurchaseAndSale',
 'saleOfInvestmentProperties',
 'purchaseOfInvestmentProperties',
 'netInvestmentPropertiesPurchaseAndSale',
 'saleOfIntangibles',
 'purchaseOfIntangibles',
 'netIntangiblesPurchaseAndSale',
 'netOtherInvestingChanges',
 'cashFlowFromContinuingInvestingActivities',
 'netIssuancePaymentsOfDebt',
 'netLongTermDebtIssuance',
 'netShortTermDebtIssuance',
 'commonStockDividendPaid',
 'preferredStockDividendPaid',
 'cashDividendsPaid',
 'netCommonStockIssuance',
 'netPreferredStockIssuance',
 'repurchaseOfCapital

In [22]:
cash_sheets.rename(columns={"depreciationAmortizationDepletion": "depreciation",
                            'cashFlowFromContinuingOperatingActivities':'cashFlowOperatingActivities',
                            'cashFlowFromContinuingInvestingActivities':'cashFlowInvestingActivities',
                            'cashFlowFromContinuingFinancingActivities':'cashFlowFinancingActivities'},inplace=True)
                            

In [23]:
cash_sheets.head()

Unnamed: 0,company,yahoo_sector,gics_sector,industry,st_date,st_YR,st_Mnth,netIncome,netIncomeFromContinuingOperations,depreciation,stockBasedCompensation,cashFlowOperatingActivities,capitalExpenditure,saleOfBusiness,purchaseOfBusiness,netBusinessPurchaseAndSale,saleOfPPE,purchaseOfPPE,netPPEPurchaseAndSale,saleOfInvestment,purchaseOfInvestment,netInvestmentPurchaseAndSale,saleOfInvestmentProperties,purchaseOfInvestmentProperties,netInvestmentPropertiesPurchaseAndSale,saleOfIntangibles,purchaseOfIntangibles,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,cashFlowInvestingActivities,netIssuancePaymentsOfDebt,netLongTermDebtIssuance,netShortTermDebtIssuance,commonStockDividendPaid,preferredStockDividendPaid,cashDividendsPaid,netCommonStockIssuance,netPreferredStockIssuance,repurchaseOfCapitalStock,netOtherFinancingCharges,cashFlowFinancingActivities,freeCashFlow
3881,A,Healthcare,Health Care,Diagnostics & Research,2019-10-31,2019,10,1071000000,1071000000,238000000,72000000,1021000000,-156000000,0,-1408000000,-1408000000,0,-155000000,-155000000,0,-23000000,-23000000,0,0,0,0,-1000000,-1000000,-3000000,-1590000000,600000000,-15000000,615000000,-206000000,0,-206000000,-723000000,0,-723000000,-24000000,-299000000,865000000
3882,A,Healthcare,Health Care,Diagnostics & Research,2020-10-31,2020,10,719000000,719000000,308000000,83000000,921000000,-119000000,0,0,0,1000000,-119000000,-118000000,0,-20000000,-20000000,0,0,0,0,0,0,-9000000,-147000000,-45000000,-918000000,873000000,-222000000,0,-222000000,-469000000,0,-469000000,-41000000,-717000000,802000000
3883,A,Healthcare,Health Care,Diagnostics & Research,2021-10-31,2021,10,1210000000,1210000000,321000000,110000000,1485000000,-189000000,0,-546000000,-546000000,1000000,-188000000,-187000000,12000000,-22000000,-10000000,0,0,0,0,-1000000,-1000000,-5000000,-749000000,356000000,431000000,-75000000,-236000000,0,-236000000,-788000000,0,-788000000,-83000000,-696000000,1296000000
3884,A,Healthcare,Health Care,Diagnostics & Research,2022-10-31,2022,10,1254000000,1254000000,317000000,125000000,1312000000,-291000000,0,-52000000,-52000000,0,-291000000,-291000000,22000000,-13000000,9000000,0,0,0,0,0,0,-4000000,-338000000,26000000,-9000000,35000000,-250000000,0,-250000000,-1139000000,0,-1139000000,-67000000,-1372000000,1021000000
961,AA,Basic Materials,Materials,Aluminum,2019-12-31,2019,12,-1125000000,-853000000,713000000,30000000,686000000,-379000000,0,0,0,0,0,0,0,-112000000,-112000000,0,0,0,0,0,0,23000000,-468000000,-7000000,-7000000,0,0,0,0,0,0,0,-439000000,-444000000,307000000


In [24]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_income_sheets_final_stage6.pkl')
with open(filepath,'wb') as f:
    pickle.dump(income_sheets,f)

In [25]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_balance_sheets_final_stage6.pkl')
with open(filepath,'wb') as f:
    pickle.dump(balance_sheets,f)

In [26]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_cash_sheets_final_stage6.pkl')
with open(filepath,'wb') as f:
    pickle.dump(cash_sheets,f)

## Export Data

In [27]:
filepath=os.path.join(DATA_ROOT_DIR,'data/export','yahoo_balance.csv')
balance_sheets.to_csv(filepath,index=False)

In [28]:
filepath=os.path.join(DATA_ROOT_DIR,'data/export','yahoo_cash.csv')
income_sheets.to_csv(filepath,index=False)

In [29]:
filepath=os.path.join(DATA_ROOT_DIR,'data/export','yahoo_income.csv')
cash_sheets.to_csv(filepath,index=False)