In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [2]:
import glob

# Match all stock historical data files using the glob module.
stock_files = glob.glob('../data/raw/stocks/*.csv')

In [3]:
# You can extract the stock names from the file names using split(). No regex required!
stock_symbols = [file.split('../data/raw/stocks/')[1].split('.csv')[0] for file in stock_files]

stock_dataframes = []
for file, symbol in zip(tqdm(stock_files), stock_symbols):
    try:
        df = pd.read_csv(file)
        df['symbol'] = symbol
        stock_dataframes.append(df)
    except(pd.errors.EmptyDataError):
        # Ignore empty files
        pass

HBox(children=(FloatProgress(value=0.0, max=5884.0), HTML(value='')))




In [4]:
all_stocks = pd.concat(stock_dataframes, ignore_index=True)

In [5]:
all_stocks.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,symbol
0,2016-01-08,19.162428,19.162428,19.114038,19.114038,11.411459,10300.0,RIV
1,2016-01-11,19.133394,19.230173,19.017258,19.017258,11.353675,49600.0,RIV
2,2016-01-12,19.11307,19.11307,19.103392,19.103392,11.405104,2300.0,RIV
3,2016-01-13,19.103392,19.103392,19.103392,19.103392,11.405104,0.0,RIV
4,2016-01-14,19.065647,19.385021,18.968868,19.104361,11.405683,26100.0,RIV


In [6]:
all_stocks.columns = ['date', 'open', 'high', 'low', 'close', 'adjusted_close', 'volume', 'symbol']

In [7]:
# With nearly 6,000 stocks, it will be difficult to efficiently run analyses.
len(all_stocks.symbol.unique())

5884

In [8]:
# Taking a naive metric of average volume, I can estimate the most traded stocks.
# As expected, some immediately recognizable names such as Apple and Microsoft are at the top of the list.
top_symbols = all_stocks.groupby('symbol').volume.mean().nlargest(100).index

In [9]:
top_stocks = all_stocks.set_index('symbol').loc[top_symbols]

In [10]:
top_stocks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 564995 entries, FTAI to GOOGL
Data columns (total 7 columns):
date              564995 non-null object
open              564951 non-null float64
high              564951 non-null float64
low               564951 non-null float64
close             564951 non-null float64
adjusted_close    564951 non-null float64
volume            564951 non-null float64
dtypes: float64(6), object(1)
memory usage: 34.5+ MB


In [11]:
all_stocks.shape[0], top_stocks.shape[0]

(24197442, 564995)

In [12]:
#all_stocks.to_csv('../Data/Clean/all_stocks.csv')

In [13]:
top_stocks.to_csv('../data/clean/top_stocks.csv')