## Imports

In [21]:
import yfinance as yf
import pandas as pd

## Read selected stocks

In [22]:
stocks = pd.read_csv('./selected stocks.csv')
stocks.head()

Unnamed: 0,Company,Stock
0,Brunswick,BC
1,NCH,THB
2,Revlon,REV
3,Control Data,CDC
4,Colgate-Palmolive,CL


In [23]:
tickers = stocks['Stock'].tolist()

## Download data

In [24]:
data = yf.download(tickers, interval="1d", start="1990-01-01")

[*********************100%%**********************]  81 of 81 completed


10 Failed downloads:
['NAV', 'DFODQ', 'BGG', 'REV']: Exception('%ticker%: No timezone found, symbol may be delisted')
['JOY', 'GMIL', 'GKIS', 'TSO', 'FCS', 'BS']: Exception('%ticker%: No price data found, symbol may be delisted (1d 1990-01-01 -> 2023-10-26)')





## Drop missing data

In [25]:
data = data.dropna(axis=1)

## Round floating point numbers

In [26]:
df_rounded = data.applymap(lambda x: round(x, 3) if isinstance(x, float) else x)

  df_rounded = data.applymap(lambda x: round(x, 3) if isinstance(x, float) else x)


## Reshape the data

In [27]:
frames = []
for c in df_rounded.columns.get_level_values(0).unique():
    d = df_rounded[c].reset_index()
    d = d.melt(id_vars=['Date'], var_name='stock_ticker', value_name=c)
    frames.append(d)

In [28]:
base = frames[0]

for f in frames[1:]:
    print(f.columns)
    base = base.merge(f, on=['Date', 'stock_ticker'], how='inner')

Index(['Date', 'stock_ticker', 'Close'], dtype='object')
Index(['Date', 'stock_ticker', 'High'], dtype='object')
Index(['Date', 'stock_ticker', 'Low'], dtype='object')
Index(['Date', 'stock_ticker', 'Open'], dtype='object')
Index(['Date', 'stock_ticker', 'Volume'], dtype='object')


In [29]:
tickers = base['stock_ticker'].unique().tolist()
tickers

['AAPL',
 'ADM',
 'AME',
 'AOS',
 'AVY',
 'BA',
 'BC',
 'BDX',
 'BKR',
 'CAG',
 'CAT',
 'CCK',
 'CL',
 'CMI',
 'COP',
 'CVX',
 'DD',
 'DOV',
 'F',
 'GD',
 'GE',
 'GLW',
 'HON',
 'HPQ',
 'IBM',
 'IP',
 'KO',
 'MAT',
 'MDT',
 'MMM',
 'MO',
 'MRK',
 'MSFT',
 'NC',
 'NEM',
 'NUE',
 'NWL',
 'OXY',
 'PEP',
 'PG',
 'PPG',
 'ROK',
 'SPXC',
 'SR',
 'TTC',
 'TXN',
 'VFC',
 'VMC',
 'VMI',
 'WDC',
 'XOM']

## Pick random date for each ticker

In [30]:
mask = (base['Date'] >= '1990-01-01') & (base['Date'] <= '2016-01-01')
unique_valid_dates = base[mask]['Date'].unique()
random_dates = pd.Series(unique_valid_dates).sample(len(tickers), replace=True, random_state=42)
random_dates

860    1993-05-26
5390   2011-05-19
5226   2010-09-24
5191   2010-08-05
3772   2004-12-15
3092   2002-04-05
5734   2012-09-28
6265   2014-11-10
466    1991-11-04
5334   2011-03-01
4426   2007-07-24
5578   2012-02-16
6231   2014-09-23
3444   2003-08-27
3171   2002-07-29
2919   2001-07-23
130    1990-07-09
1685   1996-08-29
769    1993-01-15
2391   1999-06-18
5611   2012-04-04
2433   1999-08-18
5311   2011-01-26
5051   2010-01-14
6420   2015-06-24
1184   1994-09-07
4555   2008-01-28
3385   2003-06-04
4117   2006-05-01
6396   2015-05-20
4843   2009-03-19
2904   2001-06-29
474    1991-11-14
1082   1994-04-12
2558   2000-02-15
2047   1998-02-05
2747   2000-11-13
975    1993-11-08
1806   1997-02-21
189    1990-10-01
2734   2000-10-25
3005   2001-11-28
4658   2008-06-24
1899   1997-07-07
1267   1995-01-05
1528   1996-01-17
3202   2002-09-11
3556   2004-02-06
3890   2005-06-06
646    1992-07-22
6164   2014-06-18
dtype: datetime64[ns]

### Get 4 years of daily data for each ticker starting from it's corresponding random date 

In [31]:
stock_df = []
for t, d in zip(tickers, random_dates):
    subset = base[base['stock_ticker'] == t]
    subset = subset[subset['Date'] >= d]
    subset = subset.iloc[:4*365]
    stock_df.append(subset)

In [32]:
all_ = pd.concat(stock_df)
all_

Unnamed: 0,Date,stock_ticker,Adj Close,Close,High,Low,Open,Volume
860,1993-05-26,AAPL,0.422,0.516,0.516,0.494,0.500,121564800
861,1993-05-27,AAPL,0.421,0.513,0.522,0.511,0.516,197288000
862,1993-05-28,AAPL,0.415,0.506,0.513,0.502,0.509,183948800
863,1993-06-01,AAPL,0.418,0.509,0.516,0.504,0.504,135072000
864,1993-06-02,AAPL,0.418,0.509,0.520,0.500,0.507,200480000
...,...,...,...,...,...,...,...,...
433669,2020-03-30,XOM,31.147,37.500,38.150,35.860,36.230,45835000
433670,2020-03-31,XOM,31.537,37.970,39.450,37.370,38.340,41491600
433671,2020-04-01,XOM,31.172,37.530,38.700,36.340,36.860,36045900
433672,2020-04-02,XOM,33.556,40.400,41.960,37.900,38.740,62000200


In [33]:
all_.to_csv('./daily_stock_prices.csv', index=False)

## Replace date column with epoch

In [34]:
all_['epoch'] = 0
all_.head()

Unnamed: 0,Date,stock_ticker,Adj Close,Close,High,Low,Open,Volume,epoch
860,1993-05-26,AAPL,0.422,0.516,0.516,0.494,0.5,121564800,0
861,1993-05-27,AAPL,0.421,0.513,0.522,0.511,0.516,197288000,0
862,1993-05-28,AAPL,0.415,0.506,0.513,0.502,0.509,183948800,0
863,1993-06-01,AAPL,0.418,0.509,0.516,0.504,0.504,135072000,0
864,1993-06-02,AAPL,0.418,0.509,0.52,0.5,0.507,200480000,0


In [35]:
epochs = []
lengths = []
for t in all_['stock_ticker'].unique():
    length = all_[all_['stock_ticker'] == t].shape[0]
    epochs += list(range(1, length+1))
    lengths.append(length)
all_['epoch'] = epochs

In [36]:
all_

Unnamed: 0,Date,stock_ticker,Adj Close,Close,High,Low,Open,Volume,epoch
860,1993-05-26,AAPL,0.422,0.516,0.516,0.494,0.500,121564800,1
861,1993-05-27,AAPL,0.421,0.513,0.522,0.511,0.516,197288000,2
862,1993-05-28,AAPL,0.415,0.506,0.513,0.502,0.509,183948800,3
863,1993-06-01,AAPL,0.418,0.509,0.516,0.504,0.504,135072000,4
864,1993-06-02,AAPL,0.418,0.509,0.520,0.500,0.507,200480000,5
...,...,...,...,...,...,...,...,...,...
433669,2020-03-30,XOM,31.147,37.500,38.150,35.860,36.230,45835000,1456
433670,2020-03-31,XOM,31.537,37.970,39.450,37.370,38.340,41491600,1457
433671,2020-04-01,XOM,31.172,37.530,38.700,36.340,36.860,36045900,1458
433672,2020-04-02,XOM,33.556,40.400,41.960,37.900,38.740,62000200,1459


## Making sure all stocks have the same number of rows

In [37]:
pd.Series(lengths).unique()

array([1460])

In [38]:
all_['stock_ticker'].value_counts().nunique()

1

## Saving processed data

In [39]:
all_.set_index('epoch').drop(columns='Date').to_csv('../../processed/daily_stock_prices/daily_stock_prices.csv')