## Consolidate Extraction of Data

Save final data sets having ensured that companies and sectors identified for deletion are removed from all
three data sets.

In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
pd.set_option('display.max_rows', 900)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
pd.options.display.float_format = '{:.0f}'.format

In [3]:
DATA_ROOT_DIR='/mnt/data/projects/MD3'
PROJ_ROOT_DIR='/home/priyesh/projects/MD3'

In [4]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_balance_sheets_final_stage3.pkl')
balance_sheets = pd.read_pickle(filepath)
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_income_sheets_final_stage4.pkl')
income_sheets = pd.read_pickle(filepath)
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_cash_sheets_final_stage5.pkl')
cash_sheets = pd.read_pickle(filepath)

In [5]:
list(income_sheets.columns)

['company',
 'yahoo_sector',
 'gics_sector',
 'industry',
 'st_date',
 'st_YR',
 'st_Mnth',
 'totalRevenue',
 'costOfRevenue',
 'grossProfit',
 'netIncome',
 'operatingIncome',
 'netIncomeContinuousOperations',
 'netInterestIncome',
 'interestIncome',
 'otherIncomeExpense',
 'operatingExpense',
 'totalExpenses',
 'taxProvision',
 'interestExpense',
 'sellingGeneralAndAdministration',
 'researchAndDevelopment',
 'ebit',
 'dilutedEPS',
 'basicEPS']

In [6]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_drop_data_details_stage5.pkl')
drop_data_details = pd.read_pickle(filepath)

In [7]:
# Check industries we need to drop

drop_data_details['industry']

['Healthcare Plans']

In [8]:
# Check sectors to drop

drop_data_details['sector']

['Real Estate']

In [9]:
# Remove duplicates from list of companies to drop

drop_companies = list(set(drop_data_details['companies']))
drop_companies

['HRB',
 'EHAB',
 'CCCS',
 'DH',
 'FWONA',
 'KMX',
 'DOCU',
 'DNB',
 'LHX',
 'VFC',
 'OSK',
 'SMAR',
 'BF-A',
 'DRI',
 'CASY',
 'NTAP',
 'CPRI',
 'AR',
 'CHPT',
 'RXO',
 'FDX',
 'AMBP',
 'PYCR',
 'ESTC',
 'NKE',
 'CI',
 'CTAS',
 'BKI',
 'CACI',
 'CAG',
 'AZPN',
 'ESAB',
 'AGL',
 'RPM',
 'FOXA',
 'GIS',
 'LSXMA',
 'LSXMK',
 'RGLD',
 'UA',
 'MDT',
 'UHAL',
 'CXT',
 'LBRDK',
 'ZG',
 'KR',
 'DXC',
 'LW',
 'GEHC',
 'ALGM',
 'CHK',
 'ELV',
 'NWSA',
 'PARAA',
 'UAA',
 'VTS',
 'DNA',
 'BEPC',
 'FYBR',
 'MBC',
 'KD',
 'LCID',
 'CSX',
 'SJM',
 'HUM',
 'FWONK',
 'ORCL',
 'PAYX']

In [10]:
# To ensure consistency across all 3 datasets, drop companies, industries and sectors as specified in
# drop_data_details.

for company in drop_companies:
   print(company)
   balance_sheets.drop(balance_sheets[balance_sheets['company'] == company].index, inplace=True, axis=0)
   income_sheets.drop(income_sheets[income_sheets['company'] == company].index, inplace=True, axis=0)

balance_sheets.drop(balance_sheets[balance_sheets['yahoo_sector'] == 'Real Estate'].index, inplace=True, axis=0)
income_sheets.drop(income_sheets[income_sheets['yahoo_sector'] == 'Real Estate'].index, inplace=True, axis=0)
cash_sheets.drop(cash_sheets[cash_sheets['yahoo_sector'] == 'Real Estate'].index, inplace=True, axis=0)

balance_sheets.drop(balance_sheets[balance_sheets['industry'] == 'Healthcare Plans'].index, inplace=True, axis=0)
income_sheets.drop(income_sheets[income_sheets['industry'] == 'Healthcare Plans'].index, inplace=True, axis=0)
cash_sheets.drop(cash_sheets[cash_sheets['industry'] == 'Healthcare Plans'].index, inplace=True, axis=0)

HRB
EHAB
CCCS
DH
FWONA
KMX
DOCU
DNB
LHX
VFC
OSK
SMAR
BF-A
DRI
CASY
NTAP
CPRI
AR
CHPT
RXO
FDX
AMBP
PYCR
ESTC
NKE
CI
CTAS
BKI
CACI
CAG
AZPN
ESAB
AGL
RPM
FOXA
GIS
LSXMA
LSXMK
RGLD
UA
MDT
UHAL
CXT
LBRDK
ZG
KR
DXC
LW
GEHC
ALGM
CHK
ELV
NWSA
PARAA
UAA
VTS
DNA
BEPC
FYBR
MBC
KD
LCID
CSX
SJM
HUM
FWONK
ORCL
PAYX


In [11]:
bal_tickers = list(balance_sheets['company'].unique())
len(bal_tickers)

711

In [12]:
inc_tickers = list(income_sheets['company'].unique())
len(inc_tickers)

711

In [13]:
cash_tickers = list(cash_sheets['company'].unique())
len(cash_tickers)

711

In [14]:
print("Income sheets: ",len(income_sheets))
print("Cash sheets: ",len(cash_sheets))
print("balance_sheets: ", len(balance_sheets))

Income sheets:  2844
Cash sheets:  2844
balance_sheets:  2844


## Rename Fields 

Rename fields for convenience going forwards. 

In [15]:
list(income_sheets.columns)

['company',
 'yahoo_sector',
 'gics_sector',
 'industry',
 'st_date',
 'st_YR',
 'st_Mnth',
 'totalRevenue',
 'costOfRevenue',
 'grossProfit',
 'netIncome',
 'operatingIncome',
 'netIncomeContinuousOperations',
 'netInterestIncome',
 'interestIncome',
 'otherIncomeExpense',
 'operatingExpense',
 'totalExpenses',
 'taxProvision',
 'interestExpense',
 'sellingGeneralAndAdministration',
 'researchAndDevelopment',
 'ebit',
 'dilutedEPS',
 'basicEPS']

In [16]:
income_sheets.rename(columns={"sellingGeneralAndAdministration": "SGA"},inplace=True)

In [17]:
income_sheets.head()

Unnamed: 0,company,yahoo_sector,gics_sector,industry,st_date,st_YR,st_Mnth,totalRevenue,costOfRevenue,grossProfit,netIncome,operatingIncome,netIncomeContinuousOperations,netInterestIncome,interestIncome,otherIncomeExpense,operatingExpense,totalExpenses,taxProvision,interestExpense,SGA,researchAndDevelopment,ebit,dilutedEPS,basicEPS
3831,A,Healthcare,Health Care,Diagnostics & Research,2019-10-31,2019,10,5163000000,2358000000,2805000000,1071000000,941000000,1071000000,-38000000,36000000,16000000,1864000000,4222000000,-152000000,74000000,1460000000,404000000,993000000,3,3
3832,A,Healthcare,Health Care,Diagnostics & Research,2020-10-31,2020,10,5339000000,2502000000,2837000000,719000000,846000000,719000000,-70000000,8000000,66000000,1991000000,4493000000,123000000,78000000,1496000000,495000000,920000000,2,2
3833,A,Healthcare,Health Care,Diagnostics & Research,2021-10-31,2021,10,6319000000,2912000000,3407000000,1210000000,1347000000,1210000000,-79000000,2000000,92000000,2060000000,4972000000,150000000,81000000,1619000000,441000000,1441000000,4,4
3834,A,Healthcare,Health Care,Diagnostics & Research,2022-10-31,2022,10,6848000000,3126000000,3722000000,1254000000,1618000000,1254000000,-75000000,9000000,-39000000,2104000000,5230000000,250000000,84000000,1637000000,467000000,1588000000,4,4
945,AA,Basic Materials,Materials,Aluminum,2019-12-31,2019,12,10433000000,8537000000,1896000000,-1125000000,876000000,-853000000,-121000000,0,-1193000000,1020000000,9557000000,415000000,121000000,280000000,27000000,-317000000,-6,-6


In [18]:
list(balance_sheets.columns)

['company',
 'yahoo_sector',
 'gics_sector',
 'industry',
 'st_date',
 'st_YR',
 'st_Mnth',
 'cashCashEquivalentsAndShortTermInvestments',
 'receivables',
 'finishedGoods',
 'workInProcess',
 'rawMaterials',
 'otherCurrentAssets',
 'inventory',
 'currentAssets',
 'netPPE',
 'otherNonCurrentAssets',
 'financialAssets',
 'goodwill',
 'goodwillAndOtherIntangibleAssets',
 'otherIntangibleAssets',
 'nonCurrentAccountsReceivable',
 'totalNonCurrentAssets',
 'currentDebtAndCapitalLeaseObligation',
 'payablesAndAccruedExpenses',
 'otherCurrentLiabilities',
 'currentLiabilities',
 'longTermDebtAndCapitalLeaseObligation',
 'otherNonCurrentLiabilities',
 'nonCurrentDeferredLiabilities',
 'nonCurrentDeferredTaxesLiabilities',
 'longTermProvisions',
 'totalNonCurrentLiabilitiesNetMinorityInterest',
 'retainedEarnings',
 'stockholdersEquity']

In [19]:
balance_sheets.rename(columns={"cashCashEquivalentsAndShortTermInvestments": "cashEquivalent",
                               "currentDebtAndCapitalLeaseObligation":"currentDebt",
                               "longTermDebtAndCapitalLeaseObligation":"longTermDebt",
                               "totalNonCurrentLiabilitiesNetMinorityInterest":"totalNonCurrentLiabilities"},inplace=True)

In [20]:
balance_sheets.head()

Unnamed: 0,company,yahoo_sector,gics_sector,industry,st_date,st_YR,st_Mnth,cashEquivalent,receivables,finishedGoods,workInProcess,rawMaterials,otherCurrentAssets,inventory,currentAssets,netPPE,otherNonCurrentAssets,financialAssets,goodwill,goodwillAndOtherIntangibleAssets,otherIntangibleAssets,nonCurrentAccountsReceivable,totalNonCurrentAssets,currentDebt,payablesAndAccruedExpenses,otherCurrentLiabilities,currentLiabilities,longTermDebt,otherNonCurrentLiabilities,nonCurrentDeferredLiabilities,nonCurrentDeferredTaxesLiabilities,longTermProvisions,totalNonCurrentLiabilities,retainedEarnings,stockholdersEquity
2919,A,Healthcare,Health Care,Diagnostics & Research,2019-10-31,2019,10,1382000000,930000000,416000000,0,263000000,198000000,679000000,3189000000,850000000,611000000,0,4700000000,4700000000,1107000000,0,6263000000,616000000,794000000,0,2080000000,1791000000,473000000,0,0,0,2624000000,-18000000,4748000000
2920,A,Healthcare,Health Care,Diagnostics & Research,2020-10-31,2020,10,1441000000,1038000000,417000000,0,303000000,216000000,720000000,3415000000,845000000,776000000,0,4433000000,4433000000,831000000,0,6212000000,75000000,639000000,0,1467000000,2284000000,614000000,0,0,0,3287000000,81000000,4873000000
2921,A,Healthcare,Health Care,Diagnostics & Research,2021-10-31,2021,10,1575000000,1172000000,463000000,0,367000000,222000000,830000000,3799000000,945000000,820000000,0,4956000000,4956000000,981000000,0,6906000000,0,774000000,0,1708000000,2729000000,659000000,0,0,0,3608000000,348000000,5389000000
2922,A,Healthcare,Health Care,Diagnostics & Research,2022-10-31,2022,10,1053000000,1405000000,555000000,0,483000000,282000000,1038000000,3778000000,1100000000,670000000,0,4773000000,4773000000,821000000,0,6738000000,36000000,909000000,0,1861000000,2733000000,536000000,0,0,0,3366000000,324000000,5289000000
724,AA,Basic Materials,Materials,Aluminum,2019-12-31,2019,12,879000000,660000000,305000000,282000000,611000000,288000000,1644000000,3530000000,7916000000,1414000000,18000000,150000000,202000000,52000000,179000000,11110000000,1000000,1588000000,561000000,2563000000,1799000000,371000000,102000000,0,902000000,6221000000,-555000000,4082000000


In [21]:
list(cash_sheets.columns)

['company',
 'yahoo_sector',
 'gics_sector',
 'industry',
 'st_date',
 'st_YR',
 'st_Mnth',
 'netIncome',
 'netIncomeFromContinuingOperations',
 'depreciationAmortizationDepletion',
 'stockBasedCompensation',
 'cashFlowFromContinuingOperatingActivities',
 'capitalExpenditure',
 'saleOfBusiness',
 'purchaseOfBusiness',
 'netBusinessPurchaseAndSale',
 'saleOfPPE',
 'purchaseOfPPE',
 'netPPEPurchaseAndSale',
 'saleOfInvestment',
 'purchaseOfInvestment',
 'netInvestmentPurchaseAndSale',
 'saleOfInvestmentProperties',
 'purchaseOfInvestmentProperties',
 'netInvestmentPropertiesPurchaseAndSale',
 'saleOfIntangibles',
 'purchaseOfIntangibles',
 'netIntangiblesPurchaseAndSale',
 'netOtherInvestingChanges',
 'cashFlowFromContinuingInvestingActivities',
 'netIssuancePaymentsOfDebt',
 'netLongTermDebtIssuance',
 'netShortTermDebtIssuance',
 'commonStockDividendPaid',
 'preferredStockDividendPaid',
 'cashDividendsPaid',
 'netCommonStockIssuance',
 'netPreferredStockIssuance',
 'repurchaseOfCapital

In [22]:
cash_sheets.rename(columns={"depreciationAmortizationDepletion": "depreciation",
                            'cashFlowFromContinuingOperatingActivities':'cashFlowOperatingActivities',
                            'cashFlowFromContinuingInvestingActivities':'cashFlowInvestingActivities',
                            'cashFlowFromContinuingFinancingActivities':'cashFlowFinancingActivities'},inplace=True)
                            

In [23]:
cash_sheets.head()

Unnamed: 0,company,yahoo_sector,gics_sector,industry,st_date,st_YR,st_Mnth,netIncome,netIncomeFromContinuingOperations,depreciation,stockBasedCompensation,cashFlowOperatingActivities,capitalExpenditure,saleOfBusiness,purchaseOfBusiness,netBusinessPurchaseAndSale,saleOfPPE,purchaseOfPPE,netPPEPurchaseAndSale,saleOfInvestment,purchaseOfInvestment,netInvestmentPurchaseAndSale,saleOfInvestmentProperties,purchaseOfInvestmentProperties,netInvestmentPropertiesPurchaseAndSale,saleOfIntangibles,purchaseOfIntangibles,netIntangiblesPurchaseAndSale,netOtherInvestingChanges,cashFlowInvestingActivities,netIssuancePaymentsOfDebt,netLongTermDebtIssuance,netShortTermDebtIssuance,commonStockDividendPaid,preferredStockDividendPaid,cashDividendsPaid,netCommonStockIssuance,netPreferredStockIssuance,repurchaseOfCapitalStock,netOtherFinancingCharges,cashFlowFinancingActivities,freeCashFlow
3881,A,Healthcare,Health Care,Diagnostics & Research,2019-10-31,2019,10,1071000000,1071000000,238000000,72000000,1021000000,-156000000,0,-1408000000,-1408000000,0,-155000000,-155000000,0,-23000000,-23000000,0,0,0,0,-1000000,-1000000,-3000000,-1590000000,600000000,-15000000,615000000,-206000000,0,-206000000,-723000000,0,-723000000,-24000000,-299000000,865000000
3882,A,Healthcare,Health Care,Diagnostics & Research,2020-10-31,2020,10,719000000,719000000,308000000,83000000,921000000,-119000000,0,0,0,1000000,-119000000,-118000000,0,-20000000,-20000000,0,0,0,0,0,0,-9000000,-147000000,-45000000,-918000000,873000000,-222000000,0,-222000000,-469000000,0,-469000000,-41000000,-717000000,802000000
3883,A,Healthcare,Health Care,Diagnostics & Research,2021-10-31,2021,10,1210000000,1210000000,321000000,110000000,1485000000,-189000000,0,-546000000,-546000000,1000000,-188000000,-187000000,12000000,-22000000,-10000000,0,0,0,0,-1000000,-1000000,-5000000,-749000000,356000000,431000000,-75000000,-236000000,0,-236000000,-788000000,0,-788000000,-83000000,-696000000,1296000000
3884,A,Healthcare,Health Care,Diagnostics & Research,2022-10-31,2022,10,1254000000,1254000000,317000000,125000000,1312000000,-291000000,0,-52000000,-52000000,0,-291000000,-291000000,22000000,-13000000,9000000,0,0,0,0,0,0,-4000000,-338000000,26000000,-9000000,35000000,-250000000,0,-250000000,-1139000000,0,-1139000000,-67000000,-1372000000,1021000000
961,AA,Basic Materials,Materials,Aluminum,2019-12-31,2019,12,-1125000000,-853000000,713000000,30000000,686000000,-379000000,0,0,0,0,0,0,0,-112000000,-112000000,0,0,0,0,0,0,23000000,-468000000,-7000000,-7000000,0,0,0,0,0,0,0,-439000000,-444000000,307000000


In [24]:
balance_sheets.shape

(2844, 35)

In [25]:
income_sheets.shape

(2844, 25)

In [26]:
cash_sheets.shape

(2844, 42)

## Check For Nulls and Fix

In [27]:
balance_sheets.isnull().sum()

company                               0
yahoo_sector                          0
gics_sector                           0
industry                              0
st_date                               0
st_YR                                 0
st_Mnth                               0
cashEquivalent                        0
receivables                           0
finishedGoods                         0
workInProcess                         0
rawMaterials                          0
otherCurrentAssets                    0
inventory                             0
currentAssets                         0
netPPE                                0
otherNonCurrentAssets                 0
financialAssets                       0
goodwill                              0
goodwillAndOtherIntangibleAssets      0
otherIntangibleAssets                 0
nonCurrentAccountsReceivable          0
totalNonCurrentAssets                 0
currentDebt                           0
payablesAndAccruedExpenses            0


In [28]:
income_sheets.isnull().sum()

company                             0
yahoo_sector                        0
gics_sector                         0
industry                            0
st_date                             0
st_YR                               0
st_Mnth                             0
totalRevenue                        0
costOfRevenue                       0
grossProfit                         0
netIncome                           0
operatingIncome                     0
netIncomeContinuousOperations       0
netInterestIncome                   0
interestIncome                      0
otherIncomeExpense                  0
operatingExpense                    0
totalExpenses                       0
taxProvision                       15
interestExpense                   190
SGA                               182
researchAndDevelopment           1645
ebit                                0
dilutedEPS                          0
basicEPS                            0
dtype: int64

In [29]:
mask = income_sheets['SGA'].isnull()
income_sheets[mask].reset_index()

Unnamed: 0,index,company,yahoo_sector,gics_sector,industry,st_date,st_YR,st_Mnth,totalRevenue,costOfRevenue,grossProfit,netIncome,operatingIncome,netIncomeContinuousOperations,netInterestIncome,interestIncome,otherIncomeExpense,operatingExpense,totalExpenses,taxProvision,interestExpense,SGA,researchAndDevelopment,ebit,dilutedEPS,basicEPS
0,203,AEE,Utilities,Utilities,Utilities—Regulated Electric,2019-12-31,2019,12,5910000000,3167000000,2743000000,828000000,1267000000,834000000,-381000000,33000000,130000000,1476000000,4643000000,182000000,381000000.0,,,1397000000,3,3
1,204,AEE,Utilities,Utilities,Utilities—Regulated Electric,2020-12-31,2020,12,5794000000,2936000000,2858000000,871000000,1300000000,877000000,-419000000,0,151000000,1558000000,4494000000,155000000,419000000.0,,,1451000000,4,4
2,205,AEE,Utilities,Utilities,Utilities—Regulated Electric,2021-12-31,2021,12,6394000000,3403000000,2991000000,990000000,1333000000,995000000,-383000000,0,202000000,1658000000,5061000000,157000000,383000000.0,,,1535000000,4,4
3,206,AEE,Utilities,Utilities,Utilities—Regulated Electric,2022-12-31,2022,12,7957000000,4614000000,3343000000,1074000000,1515000000,1079000000,-486000000,35000000,226000000,1828000000,6442000000,176000000,486000000.0,,,1741000000,4,4
4,209,AEP,Utilities,Utilities,Utilities—Regulated Electric,2019-12-31,2019,12,15561400000,6320000000,9241400000,1921100000,2748700000,1919800000,-1072500000,0,158600000,6492700000,12812700000,-12900000,1072500000.0,,,2907300000,4,4
5,210,AEP,Utilities,Utilities,Utilities—Regulated Electric,2020-12-31,2020,12,14918500000,5380100000,9538400000,2200100000,2987700000,2196700000,-1165700000,0,324100000,6550700000,11930800000,40500000,1165700000.0,,,3311800000,4,4
6,211,AEP,Utilities,Utilities,Utilities—Regulated Electric,2021-12-31,2021,12,16792000000,6588100000,10203900000,2488100000,3422900000,2488100000,-1199100000,0,288100000,6781000000,13369100000,115500000,1199100000.0,,,3711000000,5,5
7,212,AEP,Utilities,Utilities,Utilities—Regulated Electric,2022-12-31,2022,12,19639500000,8347300000,11292200000,2307200000,3778500000,2305600000,-1396100000,0,38000000,7513700000,15861000000,5400000,1396100000.0,,,3816500000,4,5
8,227,AGR,Utilities,Utilities,Utilities—Regulated Electric,2019-12-31,2019,12,6338000000,3810000000,2528000000,700000000,1003000000,676000000,-306000000,0,122000000,1525000000,5335000000,143000000,306000000.0,,,1125000000,2,2
9,228,AGR,Utilities,Utilities,Utilities—Regulated Electric,2020-12-31,2020,12,6320000000,3845000000,2475000000,581000000,869000000,539000000,-316000000,0,15000000,1606000000,5451000000,29000000,316000000.0,,,884000000,2,2


In [30]:
# Set SGA to operating expense if it is null. Set ofher fields to 0 where null.

income_sheets['SGA'] = np.where(income_sheets.SGA.isnull(), \
                         income_sheets.operatingExpense, income_sheets.SGA)

income_sheets = income_sheets.fillna(0)
income_sheets.isnull().sum()

company                          0
yahoo_sector                     0
gics_sector                      0
industry                         0
st_date                          0
st_YR                            0
st_Mnth                          0
totalRevenue                     0
costOfRevenue                    0
grossProfit                      0
netIncome                        0
operatingIncome                  0
netIncomeContinuousOperations    0
netInterestIncome                0
interestIncome                   0
otherIncomeExpense               0
operatingExpense                 0
totalExpenses                    0
taxProvision                     0
interestExpense                  0
SGA                              0
researchAndDevelopment           0
ebit                             0
dilutedEPS                       0
basicEPS                         0
dtype: int64

In [31]:
cash_sheets.isnull().sum()

company                                     0
yahoo_sector                                0
gics_sector                                 0
industry                                    0
st_date                                     0
st_YR                                       0
st_Mnth                                     0
netIncome                                   0
netIncomeFromContinuingOperations           0
depreciation                                0
stockBasedCompensation                    308
cashFlowOperatingActivities                 0
capitalExpenditure                          0
saleOfBusiness                              0
purchaseOfBusiness                          0
netBusinessPurchaseAndSale                  0
saleOfPPE                                   0
purchaseOfPPE                               0
netPPEPurchaseAndSale                       0
saleOfInvestment                            0
purchaseOfInvestment                        0
netInvestmentPurchaseAndSale      

In [32]:
# Set fields to 0 where null

cash_sheets = cash_sheets.fillna(0)
cash_sheets.isnull().sum()

company                                   0
yahoo_sector                              0
gics_sector                               0
industry                                  0
st_date                                   0
st_YR                                     0
st_Mnth                                   0
netIncome                                 0
netIncomeFromContinuingOperations         0
depreciation                              0
stockBasedCompensation                    0
cashFlowOperatingActivities               0
capitalExpenditure                        0
saleOfBusiness                            0
purchaseOfBusiness                        0
netBusinessPurchaseAndSale                0
saleOfPPE                                 0
purchaseOfPPE                             0
netPPEPurchaseAndSale                     0
saleOfInvestment                          0
purchaseOfInvestment                      0
netInvestmentPurchaseAndSale              0
saleOfInvestmentProperties      

## Create Synthetic Data

The number of companies with score of 0 is too low, so generate some synthetic data by cloning selection of 
actual companies and setting retainedEarnings to negative and add 50% to long term debt.

In [33]:
# Clone list taken from Yahoo_assign_label notebook where 20 companies with final score of 1 and balance sheet 
# score of 2 are identified. 
# Triple long term debt, Set cash to 1M and fix inventory at 1 billion.

clone_list = ['ALNY','BA','EXAS','FRPT','GH','IONS','JBLU','LVS','NVAX','OSH','PLUG',
 'PTON','RARE','SGEN','TXG','WOLF','LYFT']

df_temp = balance_sheets[balance_sheets['company'].isin(clone_list)].copy()
df_temp['longTermDebt'] = df_temp['longTermDebt'] * 3
df_temp['inventory'] = df_temp['inventory'] * 100
df_temp['currentLiabilities'] = df_temp['currentLiabilities'] * 10
df_temp['cashEquivalent'] = 100000
df_temp['company'] = df_temp['company'] + '_S'
balance_sheets = pd.concat([balance_sheets,df_temp])

In [34]:
balance_sheets.columns

Index(['company', 'yahoo_sector', 'gics_sector', 'industry', 'st_date',
       'st_YR', 'st_Mnth', 'cashEquivalent', 'receivables', 'finishedGoods',
       'workInProcess', 'rawMaterials', 'otherCurrentAssets', 'inventory',
       'currentAssets', 'netPPE', 'otherNonCurrentAssets', 'financialAssets',
       'goodwill', 'goodwillAndOtherIntangibleAssets', 'otherIntangibleAssets',
       'nonCurrentAccountsReceivable', 'totalNonCurrentAssets', 'currentDebt',
       'payablesAndAccruedExpenses', 'otherCurrentLiabilities',
       'currentLiabilities', 'longTermDebt', 'otherNonCurrentLiabilities',
       'nonCurrentDeferredLiabilities', 'nonCurrentDeferredTaxesLiabilities',
       'longTermProvisions', 'totalNonCurrentLiabilities', 'retainedEarnings',
       'stockholdersEquity'],
      dtype='object')

In [36]:
# Now we need to clone the data for income sheets and cash_sheets

df_temp = income_sheets[income_sheets['company'].isin(clone_list)].copy()
df_temp['company'] = df_temp['company'] + '_S'
income_sheets = pd.concat([income_sheets,df_temp])

df_temp = cash_sheets[cash_sheets['company'].isin(clone_list)].copy()
df_temp['company'] = df_temp['company'] + '_S'
cash_sheets = pd.concat([cash_sheets,df_temp])

In [37]:
balance_sheets.shape

(2912, 35)

In [38]:
income_sheets.shape

(2980, 25)

In [39]:
cash_sheets.shape

(2980, 42)

In [41]:
len(cash_sheets['company'].unique())

728

In [43]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_income_sheets_final_stage6.pkl')
with open(filepath,'wb') as f:
    pickle.dump(income_sheets,f)

In [44]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_balance_sheets_final_stage6.pkl')
with open(filepath,'wb') as f:
    pickle.dump(balance_sheets,f)

In [45]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_cash_sheets_final_stage6.pkl')
with open(filepath,'wb') as f:
    pickle.dump(cash_sheets,f)

## Export Data

In [46]:
filepath=os.path.join(DATA_ROOT_DIR,'data/export','yahoo_balance.csv')
balance_sheets.to_csv(filepath,index=False)

In [47]:
filepath=os.path.join(DATA_ROOT_DIR,'data/export','yahoo_cash.csv')
cash_sheets.to_csv(filepath,index=False)

In [48]:
filepath=os.path.join(DATA_ROOT_DIR,'data/export','yahoo_income.csv')
income_sheets.to_csv(filepath,index=False)

In [7]:
filepath=os.path.join(DATA_ROOT_DIR,'data/export','yahoo_income.csv')
income_sheets = pd.read_csv(filepath)