##### Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
%matplotlib inline

from pathlib import Path
import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Plot styling and small helpers.
sns.set_style('whitegrid')

# Shorthand for label-based slicing on a MultiIndex.
idx = pd.IndexSlice

# Break points 0.1, 0.2, ..., 0.9, rounded to strip floating-point noise.
deciles = np.round(np.arange(0.1, 1, 0.1), 1)

In [None]:
# Location of the HDF5 store that the cells below read from and write to.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
DATA_STORE = Path('/home/sayem/Desktop/Project/data/assets.h5')

In [None]:
# Print the keys currently available in the HDF5 store.
with pd.HDFStore(DATA_STORE) as hdf:
    stored_keys = hdf.keys()
    print(stored_keys)

In [None]:
# Load the NYSE and NASDAQ price tables and the two metadata columns used below.
METADATA_COLUMNS = ['market cap', 'sector']

with pd.HDFStore(DATA_STORE) as hdf:
    nyse_stocks = hdf.get('/stooq/us/nyse/stocks/prices')
    nasdaq_stocks = hdf.get('/stooq/us/nasdaq/stocks/prices')
    metadata = hdf.get('us_equities/stocks').loc[:, METADATA_COLUMNS]

In [None]:
# Stack the NYSE and NASDAQ price tables into a single frame.
data = pd.concat([nyse_stocks, nasdaq_stocks])

# Drop rows with missing values, then drop repeated index entries.
# `DataFrame.drop_duplicates` ignores the index and compares column values
# only, so it would also discard distinct observations (different ticker or
# date) that merely share the same values. De-duplicating on the index
# removes true repeats and keeps those observations.
data = data.dropna()
data = data[~data.index.duplicated(keep='first')]

# Summarize the cleaned frame. `info` prints its report itself and returns
# None, so it is not wrapped in `print`.
data.info()

In [None]:
# Replace the sector labels with integer codes (only the codes returned by
# factorize are kept; the label lookup is discarded).
# NOTE(review): factorize marks missing labels with -1 rather than NaN, so a
# later dropna on 'sector' will not remove them — confirm this is intended.
sector_codes = pd.factorize(metadata['sector'])[0]
metadata['sector'] = sector_codes
metadata.info()

In [None]:
# Display the metadata table as the cell output.
metadata

In [None]:
# Attach the metadata columns to the price rows, then keep only the rows that
# ended up with a sector value.
data = data.join(metadata)
data = data.dropna(subset=['sector'])


In [None]:
# Report how many distinct tickers and dates are in the index.
n_tickers = len(data.index.unique('ticker'))
n_dates = len(data.index.unique('date'))
print(f"# Tickers: {n_tickers:,.0f} | # Dates: {n_dates:,.0f}")

In [None]:
# Show every row whose index entry occurs more than once.
# `DataFrame.duplicated` compares column values and ignores the index, so the
# index itself has to be checked to find repeated index entries.
duplicates = data[data.index.duplicated(keep=False)]
print(duplicates)

##### Select 500 most-traded stocks

In [None]:
# Dollar volume per row: close multiplied by volume.
dollar_volume = data['close'] * data['volume']

# Keep only the first value for any repeated index entry.
dv = dollar_volume[~dollar_volume.index.duplicated(keep='first')]

In [None]:
# Rank tickers by dollar volume within each date (rank 1 = largest), average
# each ticker's rank across dates, and keep the 500 lowest average ranks.
MIN_OBSERVATIONS = 8 * 252  # non-missing daily ranks a ticker needs to be kept

daily_rank = dv.groupby(level='date').rank(ascending=False)
top500 = (daily_rank
          .unstack(level='ticker')
          .dropna(axis='columns', thresh=MIN_OBSERVATIONS)
          .mean(axis='index')
          .nsmallest(n=500))

##### Visualize the 200 most liquid stocks

In [None]:
# Dollar volume scaled to millions, keeping the first value for any repeated
# index entry.
dv = (data['close'] * data['volume']) / 1e6
dv = dv[~dv.index.duplicated(keep='first')]

# Average per (date, ticker) pair first, then across dates for each ticker.
per_date_ticker = dv.groupby(level=['date', 'ticker']).mean()
avg_dv_per_ticker = per_date_ticker.groupby(level='ticker').mean()

# The 200 tickers with the largest average dollar volume, largest first.
top200_tickers = avg_dv_per_ticker.nlargest(200)

# One horizontal bar chart per block of 50 tickers, side by side with a
# shared, log-scaled x-axis.
cutoffs = [0, 50, 100, 150, 200]
fig, axes = plt.subplots(ncols=4, figsize=(20, 10), sharex=True)
axes = axes.flatten()

for ax, start, stop in zip(axes, cutoffs[:-1], cutoffs[1:]):
    chunk = top200_tickers.iloc[start:stop].sort_values()
    chunk.plot.barh(logx=True, ax=ax)

fig.tight_layout()

In [None]:
# Tickers present in the data that are not in the top-500 selection.
all_tickers = data.index.unique('ticker')
to_drop = all_tickers.difference(top500.index)

In [None]:
# Number of tickers in to_drop, shown as the cell output.
len(to_drop)

In [None]:
# Remove every row belonging to a ticker listed in to_drop.
data = data.drop(index=to_drop, level='ticker')

In [None]:
# Column summary including non-null counts. `null_counts` was deprecated in
# pandas 1.2 and removed in 2.0; `show_counts` is its replacement.
data.info(show_counts=True)

In [None]:
# Report how many distinct tickers and dates remain in the index.
n_tickers = len(data.index.unique('ticker'))
n_dates = len(data.index.unique('date'))
print(f"# Tickers: {n_tickers:,.0f} | # Dates: {n_dates:,.0f}")

##### Remove outlier observations based on daily returns

In [None]:
# Keep only rows whose one-period return (per ticker) lies in [-1, 1].
# Rows with an undefined return, such as each ticker's first observation,
# fail the test and are removed as well.
n_rows_before = len(data)
daily_ret = data.groupby('ticker').close.pct_change()
data = data[daily_ret.between(-1, 1)]
print(f'Dropped {n_rows_before-len(data):,.0f}')

In [None]:
# Print a summary of the frame after the outlier filter above.
data.info()

In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

def test_stationarity(df, ticker):
    """Compare ADF statistics of price, log returns and percentage returns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'close' column from which ``df.loc[ticker]`` selects the
        rows of a single ticker.
    ticker : str
        Label passed to ``df.loc``.

    Returns
    -------
    tuple or str
        ``(most_stationary, results)``, where ``results`` maps 'price',
        'log_ret' and 'pct_ret' to their ADF test statistic and
        ``most_stationary`` is the key with the smallest statistic. When the
        ticker has fewer than ``2 * 252`` rows, a message string is returned
        instead, so callers must check the type before unpacking.
    """
    # Only the close column is needed. Working on it as a Series avoids
    # adding columns to a slice of the caller's frame.
    close = df.loc[ticker]['close']

    # Ensure data covers at least 2 years, otherwise return a message
    if len(close) < 2 * 252:  # Assuming 252 trading days in a year
        return f"Insufficient data for {ticker}. Need at least 2 years of data."

    # Build the three candidate series side by side so they stay aligned.
    series = pd.DataFrame({
        'close': close,
        'log_ret': np.log(close / close.shift(1)),
        'pct_ret': close.pct_change(),
    })

    # Drop the first row (no return yet) and any row with an undefined return.
    # A zero or negative close produces infinite or NaN returns; these are
    # treated as missing. Missing values in unrelated columns of `df` no
    # longer remove rows, because only close-derived columns are considered.
    series = series.replace([np.inf, -np.inf], np.nan).dropna()

    # Element 0 of adfuller's result is the test statistic.
    results = {
        'price': adfuller(series['close'])[0],
        'log_ret': adfuller(series['log_ret'])[0],
        'pct_ret': adfuller(series['pct_ret'])[0],
    }

    # The lowest (most negative) statistic is taken as the most stationary.
    most_stationary = min(results, key=results.get)

    return most_stationary, results

# Example usage:
ticker = 'AAPL'  # Replace with desired ticker
outcome = test_stationarity(data, ticker)

if isinstance(outcome, str):
    # With too little history the function returns a message string instead
    # of a (name, results) tuple; unpacking it would raise, so print it.
    print(outcome)
else:
    most_stationary, results = outcome
    print(f"For {ticker}, the most stationary series is: {most_stationary}.")
    print("ADF Results:", results)

In [None]:
# NOTE(review): no definition of STOP is visible in this notebook, so this line
# presumably raises NameError on purpose to halt a "Run All" here — confirm.
STOP

In [None]:
# Distinct tickers left in the data, plus a count of tickers and dates.
tickers = data.index.unique('ticker')
n_dates = len(data.index.unique('date'))
print(f"# Tickers: {len(tickers):,.0f} | # Dates: {n_dates:,.0f}")

##### Compute returns
- Historical returns

In [None]:
# Look-back lengths, counted in observations per ticker.
T = [1, 2, 3, 4, 5, 10, 21, 42, 63, 126, 252]

by_ticker = data.groupby(level='ticker')

# One column per look-back length: ret_01d, ret_02d, ..., ret_252d.
for lag in T:
    column = f'ret_{lag:02}d'
    data[column] = by_ticker['close'].pct_change(periods=lag)

In [None]:
# Display the frame with the newly added return columns as the cell output.
data

##### Forward returns

In [None]:
# The following observation's one-period return for the same ticker, aligned
# to the current row.
next_ret = by_ticker['ret_01d'].shift(-1)
data['ret_fwd_01d'] = next_ret

# Rows without a forward return (e.g. each ticker's last row) are removed.
data = data.dropna(subset=['ret_fwd_01d'])

In [None]:
# Display the frame including the forward-return column as the cell output.
data

##### Persist results

In [None]:
# Column summary including non-null counts. `null_counts` was deprecated in
# pandas 1.2 and removed in 2.0; `show_counts` is its replacement.
data.info(show_counts=True)

In [None]:
# Write the final dataset to the HDF5 store. `key` is passed by keyword
# because positional arguments after the path are deprecated in recent
# pandas releases.
data.to_hdf(DATA_STORE, key='data/top500_dataset')