# Extract and Format Data from Downloaded Files

Take the data downloaded from Yahoo and transform it into a single dictionary keyed on ticker symbol.

Assign sector and industry for each company.

Save data as pickle files.

In [1]:
import pandas as pd
import numpy as np
from os import listdir
import re

import csv
import datetime
import pickle
import os

In [2]:
# Notebook-wide pandas display settings: show up to 500 rows and 500 columns,
# use a column width limit of 100 and render floats with two decimal places.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 100
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Root directories used by this notebook. The defaults are the original
# machine-specific locations; set the MD3_DATA_ROOT_DIR / MD3_PROJ_ROOT_DIR
# environment variables to run the notebook elsewhere without editing it.
# DATA_ROOT_DIR: the downloaded Yahoo pickle files are read from below this directory.
DATA_ROOT_DIR = os.environ.get('MD3_DATA_ROOT_DIR', '/mnt/data/projects/MD3')
# PROJ_ROOT_DIR: the output pickle files are written below this directory.
PROJ_ROOT_DIR = os.environ.get('MD3_PROJ_ROOT_DIR', '/home/priyesh/projects/MD3')

In [4]:
def _build_statement(entries):
  """Place the frames built from each entry of one statement side by side."""
  # Index access (rather than plain iteration) mirrors how the entries were
  # originally read, so any integer-indexable container keeps working.
  frames = [pd.DataFrame.from_dict(entries[k]) for k in range(len(entries))]
  # pd.concat rejects an empty list, so fall back to an empty frame.
  return pd.concat(frames, axis=1) if frames else pd.DataFrame()


def import_data(verbose=True):
  """Load the downloaded Yahoo financial statements into a single dictionary.

  Reads Yahoo_<type>_statements_g<N>.pkl (N = 1..4) for the balance, income and
  cash statements from <DATA_ROOT_DIR>/data/R1000/Yahoo. Each file holds a
  dictionary organised by sector and then by ticker.

  NOTE: pd.read_pickle can execute arbitrary code while loading; only point
  this at pickle files you produced yourself.

  Args:
    verbose: when True (the default) print every sector and ticker as it is
      processed; pass False to keep the cell output short.

  Returns:
    dict mapping ticker -> {'balance': DataFrame, 'income': DataFrame,
    'cash': DataFrame}. A ticker only carries the statement types that were
    present in the files. The 'CAT' entry is removed when present, matching
    import_company_profile_data, which ignores CAT as well.
  """

  data_root = os.path.join(DATA_ROOT_DIR,'data/R1000/Yahoo')

  # Short statement name -> key used inside the downloaded dictionaries.
  statement_type = {'balance': 'balanceSheetHistory',
                    'income': 'incomeStatementHistory',
                    'cash': 'cashflowStatementHistory'}

  master_company_data = {}

  # Loop through each statement type (balance,income,cash)
  for stype, sdesc in statement_type.items():

    # Loop through each of the files. We have 4, numbered 1 to 4
    for file_no in range(1,5):

      filename = os.path.join(data_root, 'Yahoo_'+stype+'_statements_g'+str(file_no)+'.pkl')
      statement_dict = pd.read_pickle(filename)

      # Data downloaded from Yahoo is organised into sectors and then ticker.
      for sector in statement_dict:
        if verbose:
          print(sector)

        for ticker, entries in statement_dict[sector][sdesc].items():
          if verbose:
            print(ticker)

          # setdefault creates the per-ticker dictionary on first sight, whatever
          # statement type that happens to be, so a ticker that is missing from
          # the balance files no longer raises KeyError.
          master_company_data.setdefault(ticker, {})[stype] = _build_statement(entries)

  # Tolerate a download that does not contain CAT at all.
  master_company_data.pop('CAT', None)

  return master_company_data

In [5]:
def import_company_profile_data():
  """Gather the Yahoo company profiles from the four group files.

  Each Yahoo_company_profile_g<N>.pkl (N = 1..4) under
  <DATA_ROOT_DIR>/data/R1000/Yahoo is read as a sector -> ticker -> profile
  mapping. Every ticker is printed as it is visited.

  Returns:
    dict mapping ticker -> profile, where each profile additionally gets a
    'gics_sector' key holding the sector it was filed under. 'CAT' is left out.
  """

  profile_dir = os.path.join(DATA_ROOT_DIR,'data/R1000/Yahoo')
  profiles_by_ticker = {}

  for group_no in range(1,5):
    group_file = '{}/Yahoo_company_profile_g{}.pkl'.format(profile_dir, group_no)
    group_profiles = pd.read_pickle(group_file)

    for sector, sector_profiles in group_profiles.items():
      for ticker in sector_profiles:
        print(ticker)

        # We failed to download profile data for CAT so ignore it
        if ticker == 'CAT':
          continue

        profiles_by_ticker[ticker] = sector_profiles[ticker]
        profiles_by_ticker[ticker]['gics_sector'] = sector

  return profiles_by_ticker


In [6]:
# Build the ticker -> statements dictionary from the downloaded Yahoo pickle files.
master_company_data = import_data()

Energy
AM
AR
APA
BKR
LNG
CHK
CVX
COP
CTRA
DVN
FANG
DTM
EVA
EOG
EQT
XOM
HAL
HES
DINO
KMI
MRO
MPC
NFE
NOV
OXY
OKE
OVV
PDCE
PSX
PXD
RRC
SLB
SWN
TRGP
TPL
VLO
VTS
WMB
Utilities
AES
LNT
AEE
AEP
AWK
ATO
AGR
BEPC
CNP
CMS
ED
CEG
D
DTE
DUK
EIX
ETR
WTRG
EVRG
ES
EXC
FE
HE
IDA
MDU
NFG
NEE
NI
NRG
OGE
PCG
PNW
PPL
PEG
SRE
SO
UGI
VST
WEC
XEL
Communication Services
ATVI
GOOG
ATUS
AMC
T
CABO
CHTR
CMCSA
DISH
DIS
EA
FOXA
FOX
FYBR
IAC
IPG
LBRDA
LBRDK
FWONA
FWONK
LSXMA
LSXMK
LYV
LUMN
MSGS
MTCH
META
NFLX
NYT
NWSA
NWS
NXST
OMC
PARAA
PARA
PINS
PLTK
RBLX
ROKU
SIRI
SPOT
TMUS
TTWO
TRIP
VZ
WBD
WWE
ZI
Consumer Staples
ADM
ACI
MO
BJ
SAM
BF-A
BG
CPB
CASY
CHD
CLX
KO
CL
CAG
STZ
COST
COTY
DAR
DG
DLTR
EL
FLO
FRPT
GIS
GO
HSY
HRL
INGR
K
KDP
KMB
KHC
KR
LW
MKC
TAP
MDLZ
MNST
OLPX
PEP
PFGC
PM
PPC
POST
PG
REYN
SEB
SJM
SPB
SYY
TGT
TSN
USFD
WBA
WMT
Materials
APD
ALB
AA
AMCR
ATR
AMBP
ASH
AVY
AXTA
BALL
BERY
CE
CF
CC
CLF
CTVA
CCK
DOW
DD
EXP
EMN
ECL
ESI
FMC
FCX
DNA
GPK
HUN
IFF
IP
LIN
LPX
LYB
MLM
MOS
MP
NEU
NEM
NUE
OLN
PKG
PPG
RS
RGLD


SMG
SEE
SHW
SLGN
SON
SCCO
SSRM
STLD
X
VVV
VMC
WLK
WRK
Real Estate
ARE
AMH
AMT
COLD
AIRC
AVB
BXP
BRX
CPT
CBRE
CUZ
CCI
CUBE
DLR
DEI
EGP
EPR
EQIX
ELS
EQR
ESS
EXR
FRT
FR
GLPI
HR
PEAK
HIW
HST
HHC
INVH
IRM
JBGS
JLL
KRC
KIM
LAMR
LSI
MPW
MAA
NNN
NSA
OHI
OPEN
PK
PLD
PSA
RYN
O
REG
REXR
SBAC
SPG
SLG
SRC
SUI
UDR
VTR
VICI
VNO
WPC
WELL
WE
WY
ZG
Z
Industrials
MMM
AYI
ADP
WMS
ACM
AGCO
AL
ALK
ALLE
ALSN
AAL
AME
AWI
CAR
AXON
AZEK
BA
BAH
BR
BLDR
BWXT
CHRW
CACI
CSL
CARR
CAT
CHPT
CTAS
CLVT
CLH
CPA
CPRT
CNM
CSGP
CR
CXT
CSX
CMI
CW
DE
DAL
DCI
DOV
DRVN
DNB
ETN
EMR
EFX
ESAB
EXPD
FAST
FDX
FLS
FTV
FBIN
FCN
GTES
GE
GNRC
GD
GGG
GWW
GXO
HAYW
HEI
HTZ
HXL
HON
HWM
HUBB
HII
IEX
ITW
IR
ITT
J
JBHT
JBLU
JCI
KBR
KEX
KNX
LHX
LSTR
LDOS
LII
LECO
LMT
LYFT
MAN
MAS
MTZ
MBC
MRCY
MIDD
MSA
MSM
NDSN
NSC
NOC
NVT
ODFL
OSK
OTIS
OC
PCAR
PH
PAYX
PNR
PLUG
PWR
RTX
RRX
RSG
RBA
RHI
ROK
ROL
ROP
RXO
R
SAIC
SNDR
ST
SITE
AOS
SNA
LUV
SPR
SWK
SRCL
RUN
TDY
TTEK
TXT
TKR
TTC
TT
TDG
TRU
TREX
UHAL
UBER
UNP
UAL
UPS
URI
UNVR
VMI
VRSK
VRT
WAB
WM
WSO
WCC
WSC

In [7]:
# Build the ticker -> profile dictionary; each profile also carries its 'gics_sector'.
master_company_profile = import_company_profile_data()

AM
AR
APA
BKR
LNG
CHK
CVX
COP
CTRA
DVN
FANG
DTM
EVA
EOG
EQT
XOM
HAL
HES
DINO
KMI
MRO
MPC
NFE
NOV
OXY
OKE
OVV
PDCE
PSX
PXD
RRC
SLB
SWN
TRGP
TPL
VLO
VTS
WMB
AES
LNT
AEE
AEP
AWK
ATO
AGR
BEPC
CNP
CMS
ED
CEG
D
DTE
DUK
EIX
ETR
WTRG
EVRG
ES
EXC
FE
HE
IDA
MDU
NFG
NEE
NI
NRG
OGE
PCG
PNW
PPL
PEG
SRE
SO
UGI
VST
WEC
XEL
ATVI
GOOG
ATUS
AMC
T
CABO
CHTR
CMCSA
DISH
DIS
EA
FOXA
FOX
FYBR
IAC
IPG
LBRDA
LBRDK
FWONA
FWONK
LSXMA
LSXMK
LYV
LUMN
MSGS
MTCH
META
NFLX
NYT
NWSA
NWS
NXST
OMC
PARAA
PARA
PINS
PLTK
RBLX
ROKU
SIRI
SPOT
TMUS
TTWO
TRIP
VZ
WBD
WWE
ZI
ADM
ACI
MO
BJ
SAM
BF-A
BG
CPB
CASY
CHD
CLX
KO
CL
CAG
STZ
COST
COTY
DAR
DG
DLTR
EL
FLO
FRPT
GIS
GO
HSY
HRL
INGR
K
KDP
KMB
KHC
KR
LW
MKC
TAP
MDLZ
MNST
OLPX
PEP
PFGC
PM
PPC
POST
PG
REYN
SEB
SJM
SPB
SYY
TGT
TSN
USFD
WBA
WMT
APD
ALB
AA
AMCR
ATR
AMBP
ASH
AVY
AXTA
BALL
BERY
CE
CF
CC
CLF
CTVA
CCK
DOW
DD
EXP
EMN
ECL
ESI
FMC
FCX
DNA
GPK
HUN
IFF
IP
LIN
LPX
LYB
MLM
MOS
MP
NEU
NEM
NUE
OLN
PKG
PPG
RS
RGLD
RPM
SMG
SEE
SHW
SLGN
SON
SCCO
SSRM
STLD
X
VVV
VMC
WLK
WRK
ARE
AMH
A

## Assign Sector and Industry

In [8]:
# Copy the classification fields from each company profile onto the matching
# entry in master_company_data. A ticker that has a profile but no statement
# entry is skipped and reported at the end, instead of aborting the loop with a
# KeyError after only some companies have been updated.
tickers_without_statements = []

for ticker, profile in master_company_profile.items():
  print(ticker)

  company = master_company_data.get(ticker)
  if company is None:
    tickers_without_statements.append(ticker)
    continue

  company['industry'] = profile['industry']
  company['yahoo_sector'] = profile['sector']
  company['gics_sector'] = profile['gics_sector']

if tickers_without_statements:
  print('No statement data for:', tickers_without_statements)

AM
AR
APA
BKR
LNG
CHK
CVX
COP
CTRA
DVN
FANG
DTM
EVA
EOG
EQT
XOM
HAL
HES
DINO
KMI
MRO
MPC
NFE
NOV
OXY
OKE
OVV
PDCE
PSX
PXD
RRC
SLB
SWN
TRGP
TPL
VLO
VTS
WMB
AES
LNT
AEE
AEP
AWK
ATO
AGR
BEPC
CNP
CMS
ED
CEG
D
DTE
DUK
EIX
ETR
WTRG
EVRG
ES
EXC
FE
HE
IDA
MDU
NFG
NEE
NI
NRG
OGE
PCG
PNW
PPL
PEG
SRE
SO
UGI
VST
WEC
XEL
ATVI
GOOG
ATUS
AMC
T
CABO
CHTR
CMCSA
DISH
DIS
EA
FOXA
FOX
FYBR
IAC
IPG
LBRDA
LBRDK
FWONA
FWONK
LSXMA
LSXMK
LYV
LUMN
MSGS
MTCH
META
NFLX
NYT
NWSA
NWS
NXST
OMC
PARAA
PARA
PINS
PLTK
RBLX
ROKU
SIRI
SPOT
TMUS
TTWO
TRIP
VZ
WBD
WWE
ZI
ADM
ACI
MO
BJ
SAM
BF-A
BG
CPB
CASY
CHD
CLX
KO
CL
CAG
STZ
COST
COTY
DAR
DG
DLTR
EL
FLO
FRPT
GIS
GO
HSY
HRL
INGR
K
KDP
KMB
KHC
KR
LW
MKC
TAP
MDLZ
MNST
OLPX
PEP
PFGC
PM
PPC
POST
PG
REYN
SEB
SJM
SPB
SYY
TGT
TSN
USFD
WBA
WMT
APD
ALB
AA
AMCR
ATR
AMBP
ASH
AVY
AXTA
BALL
BERY
CE
CF
CC
CLF
CTVA
CCK
DOW
DD
EXP
EMN
ECL
ESI
FMC
FCX
DNA
GPK
HUN
IFF
IP
LIN
LPX
LYB
MLM
MOS
MP
NEU
NEM
NUE
OLN
PKG
PPG
RS
RGLD
RPM
SMG
SEE
SHW
SLGN
SON
SCCO
SSRM
STLD
X
VVV
VMC
WLK
WRK
ARE
AMH
A

## Save Data to Pickle File

In [9]:
# Persist both dictionaries as pickle files under <PROJ_ROOT_DIR>/pickle.
pickle_dir = os.path.join(PROJ_ROOT_DIR,'pickle')

# open(..., 'wb') does not create missing directories, so make sure it exists.
os.makedirs(pickle_dir, exist_ok=True)

for filename, payload in (('yahoo_company_data.pkl', master_company_data),
                          ('yahoo_company_profile.pkl', master_company_profile)):
    filepath = os.path.join(pickle_dir, filename)
    with open(filepath,'wb') as f:
        pickle.dump(payload,f)

In [10]:
# Check data - balance statement: display the table stored for Walmart (WMT)

master_company_data['WMT']['balance']

Unnamed: 0,2020-01-31,2021-01-31,2022-01-31,2023-01-31
accountsPayable,46973000000.0,49141000000.0,55261000000.0,54002000000.0
accountsReceivable,6284000000.0,6516000000.0,8280000000.0,7933000000.0
accumulatedDepreciation,-89820000000.0,-88370000000.0,-94809000000.0,-101610000000.0
additionalPaidInCapital,3247000000.0,3646000000.0,4839000000.0,4969000000.0
buildingsAndImprovements,105674000000.0,97582000000.0,100376000000.0,104554000000.0
capitalLeaseObligations,22782000000.0,18713000000.0,19246000000.0,19711000000.0
capitalStock,284000000.0,282000000.0,276000000.0,269000000.0
cashAndCashEquivalents,9465000000.0,17741000000.0,14760000000.0,8885000000.0
cashCashEquivalentsAndShortTermInvestments,9465000000.0,17741000000.0,14760000000.0,8885000000.0
commonStock,284000000.0,282000000.0,276000000.0,269000000.0


In [11]:
# Check data - income statement: display the table stored for Walmart (WMT)

master_company_data['WMT']['income']

Unnamed: 0,2020-01-31,2021-01-31,2022-01-31,2023-01-31,2023-04-30
basicAverageShares,2850000000.0,2831000000.0,2792000000.0,2724000000.0,
basicEPS,5.22,4.77,4.9,4.29,
costOfRevenue,394605000000.0,420315000000.0,429000000000.0,463721000000.0,472158000000.0
dilutedAverageShares,2868000000.0,2847000000.0,2805000000.0,2734000000.0,
dilutedEPS,5.19,4.75,4.87,4.27,
dilutedNIAvailtoComStockholders,14881000000.0,13510000000.0,13673000000.0,11680000000.0,11299000000.0
ebit,22715000000.0,22879000000.0,20690000000.0,19144000000.0,19140000000.0
grossProfit,129359000000.0,138836000000.0,143754000000.0,147568000000.0,149863000000.0
interestExpense,2599000000.0,2315000000.0,1994000000.0,2128000000.0,2337000000.0
interestExpenseNonOperating,2599000000.0,2315000000.0,1994000000.0,2128000000.0,2337000000.0


In [12]:
# Check data - cash statement: display the table stored for Walmart (WMT)

master_company_data['WMT']['cash']

Unnamed: 0,2020-01-31,2021-01-31,2022-01-31,2023-01-31,2023-04-30
beginningCashPosition,7756000000.0,9515000000.0,17788000000.0,14834000000.0,11882000000.0
capitalExpenditure,-10705000000.0,-10264000000.0,-13106000000.0,-16857000000.0,-17747000000.0
cashDividendsPaid,-6048000000.0,-6116000000.0,-6152000000.0,-6114000000.0,-6109000000.0
cashFlowFromContinuingFinancingActivities,-14299000000.0,-16117000000.0,-22828000000.0,-17039000000.0,-20414000000.0
cashFlowFromContinuingInvestingActivities,-9128000000.0,-10071000000.0,-6015000000.0,-17722000000.0,-18024000000.0
cashFlowFromContinuingOperatingActivities,25255000000.0,36074000000.0,24181000000.0,28841000000.0,37232000000.0
changeInAccountPayable,-274000000.0,6966000000.0,5520000000.0,-1165000000.0,1186000000.0
changeInAccruedExpense,186000000.0,4623000000.0,1404000000.0,4393000000.0,4895000000.0
changeInCashSupplementalAsReported,1759000000.0,10121000000.0,-4802000000.0,-5993000000.0,-1174000000.0
changeInIncomeTaxPayable,-93000000.0,-136000000.0,39000000.0,-127000000.0,384000000.0


In [13]:
# Check data - print the classification fields stored for Walmart (WMT)

wmt_entry = master_company_data['WMT']

print("Walmart:")
for label, field in (("industry: ", 'industry'),
                     ("yahoo_sector:", 'yahoo_sector'),
                     ("gics_sector:", 'gics_sector')):
  print(label, wmt_entry[field])


Walmart:
industry:  Discount Stores
yahoo_sector: Consumer Defensive
gics_sector: Consumer Staples
