Okayyyy, so here's what we're going for:

Null: Fat tails are _uniformly_ or _normally_ distributed.

Alternative: The big moves appear in clusters and, specifically, they whip back and forth (a big move one way is followed by a big move the other way).

Point: If I can prove that a big move (specifically in commodities and volatility) is clustered with/paired with/indicative of a big move in the _opposite_ direction the next day, I stand to make a lot of money.

What I don't want: For it to be a *coin flip*. 

Let's get cracking. 

What we need to know: 
- One: Whether these moves are, in fact, clustered.
- Two: Whether these moves are, in fact, _usually_ opposite.
- Three: Whether these moves are or are not clustered depending on product. 
- Four: Whether high kurtosis has an effect 
    (I know it will affect the frequency of big moves; I don't know about the directionality)

In [14]:
#importing libraries needed
import numpy as np
import pandas as pd
import scipy.stats as sc
import matplotlib.pyplot as plt
pd.core.common.is_list_like = pd.api.types.is_list_like
import pandas_datareader as pdr
from datetime import *
%matplotlib inline

In [15]:
#grabs data. Creates returns, date and 'mover' (sds to be considered a big move) columns
def grab_data(ticker, stdevs, start, end):
    """Download prices for one symbol and flag its outsized open-to-close moves.

    Parameters
    ----------
    ticker : symbol handed to ``pdr.DataReader`` (read from the 'iex' source).
    stdevs : number of standard deviations of the return series that a
        row's absolute return must exceed to be flagged as a big move.
    start, end : date range forwarded unchanged to ``pdr.DataReader``.

    Returns
    -------
    The downloaded frame with three columns added:
    ``date``    - the frame's index parsed with ``pd.to_datetime``;
    ``returns`` - ``(close - open) / open`` for each row;
    ``mover``   - True where ``abs(returns)`` is above ``stdevs`` standard
                  deviations of ``returns``.
    """
    # The symbol and the downloaded frame get separate names; the parameter is
    # no longer rebound to a different kind of object halfway through.
    prices = pdr.DataReader(ticker, 'iex', start, end)
    prices["date"] = pd.to_datetime(prices.index)
    prices['returns'] = (prices.close - prices.open)/prices.open
    thres = stdevs*np.std(prices['returns'])
    # One vectorised comparison instead of a Python loop over every row.
    # A NaN return compares False here, just as `abs(nan) > thres` did.
    prices['mover'] = prices['returns'].abs() > thres
    return prices

#plots every return against its date, then re-plots the flagged big moves in red
def make_graph(ticker):
    """Draw the return series of one frame with its big moves highlighted.

    ``ticker`` is a frame carrying ``date``, ``returns`` and a boolean
    ``mover`` column. The y-axis is pinned to [-0.3, 0.3] so charts of
    different products share one scale. Returns the matplotlib axes.
    """
    big_moves = ticker[ticker['mover']]
    _, axes = plt.subplots()
    axes.plot_date(ticker.date, ticker.returns)
    axes.plot_date(big_moves.date, big_moves.returns, c = 'r')
    axes.grid()
    # keep the automatic x-limits, override only the y-limits
    left, right, _, _ = axes.axis()
    axes.axis([left, right, -.3, .3]) #set to remove scale bias
    return axes

In [25]:
#setting start/end, etf list, making the etf_database (dict style representation)
#date window requested for every symbol
start = datetime(2013, 11, 16)
end = datetime(2018, 11, 15)
#symbols to analyse
etfs = ['EWJ', 'USO']
#maps each symbol to the frame returned by grab_data
etf_database = {}
for i in etfs:
    #3 = number of standard deviations a return must exceed to count as a big move
    etf_database[i] = grab_data(i, 3, start, end)


5y
5y


In [13]:
##etf_database['EWJ'].mover

In [17]:
#the singular name is misleading: this returns all the clusters for a given dataframe
#n.b. you have to give it a pre-selected dataframe, so either
#screen for movers or pass in a sample, with rows in date order
#also, non-clusters are returned as "clusters" of one
def build_cluster(etf_movers_selected, current_cluster=None):
    """Group consecutive rows whose dates are at most 5 days apart.

    Parameters
    ----------
    etf_movers_selected : frame with a ``date`` column of timestamps; rows
        are walked in their existing order, pair by pair.
    current_cluster : optional dates used to seed the first cluster. The
        list is copied, so the caller's object is never modified.

    Returns
    -------
    A list of clusters, each a list of dates. A date that links two
    neighbours appears twice (once as "next", once as "current"), so callers
    that need distinct dates must de-duplicate. An isolated move comes back
    as a one-element cluster. The very last row is only reported when it
    belongs to a cluster that is still open at the end; on its own it is
    skipped, because there is no following move to compare it with.
    """
    # Pull the dates out positionally; the frame's index labels are irrelevant
    # here and integer lookups on a non-integer index are not reliable.
    dates = list(etf_movers_selected['date'])
    # A fresh list per call: a shared default list would carry dates over
    # from one call into the next and alias earlier results.
    current_cluster = [] if current_cluster is None else list(current_cluster)
    cluster_list = []
    for move_date, next_move in zip(dates, dates[1:]):
        current_cluster.append(move_date)
        if (next_move - move_date).days <= 5:
            current_cluster.append(next_move)
        else:
            cluster_list.append(current_cluster)
            current_cluster = []
    # A cluster still open when the data ends is a real cluster; keep it
    # instead of silently discarding it.
    if current_cluster:
        cluster_list.append(current_cluster)
    return cluster_list
            
    
#helper for the cluster database built later on
#turns every date into a day offset from `start` and gets rid of duplicate dates
def get_unique_clusters(etf_cluster, start):
    """Express each cluster as distinct whole-day offsets from ``start``.

    ``etf_cluster`` is a list of clusters (lists of dates); ``start`` is the
    reference date subtracted from each. Returns one list of integer day
    counts per cluster, duplicates removed. The order inside each list is
    whatever the intermediate ``set`` yields, i.e. not guaranteed sorted.
    """
    deduplicated = []
    for cluster in etf_cluster:
        offsets = [(moment - start).days for moment in cluster]
        deduplicated.append(list(set(offsets)))
    return deduplicated


In [6]:
#makes a database storing all the clusters of a given etf_database
def make_cluster_base(etf_database):
    """Build the observed clusters for every frame in ``etf_database``.

    ``etf_database`` maps a name to a frame with ``date`` and boolean
    ``mover`` columns. For each frame the flagged rows are clustered with
    ``build_cluster`` and converted by ``get_unique_clusters`` into day
    offsets measured from the frame's first ``date`` value.

    Returns a dict with the same keys, each holding a list of clusters.
    """
    cluster_base = {}
    for name, frame in etf_database.items():
        movers = frame[frame['mover']] #selecting movers for build cluster
        # .iloc[0] takes the first row by position; a bare [0] is a label
        # lookup and only works by accident on a non-integer index.
        first_date = frame['date'].iloc[0]
        cluster_base[name] = get_unique_clusters(build_cluster(movers), first_date)
    return cluster_base

#returns the fraction of clusters in a list of flagged moves, grouped by distance
def fraction_cluster(cluster_list):
    """Return the share of groups in ``cluster_list`` that are real clusters.

    A group counts as a cluster when its length is anything other than 1
    (a one-element group is an isolated move). An empty ``cluster_list``
    yields 0.0 instead of raising ``ZeroDivisionError``: with no groups at
    all, no clustering was observed.
    """
    total = len(cluster_list)
    if total == 0:
        return 0.0
    cluster_count = sum(1 for cluster in cluster_list if len(cluster) != 1)
    return cluster_count/total

#maps every name in a cluster database to its clustered fraction
#essentially the database of observed values (and, later, of resampled ones)
def get_cluster_fracs(cluster_database):
    """Apply ``fraction_cluster`` to each entry of ``cluster_database``.

    Returns a dict with the same keys whose values are the fractions.
    """
    return {
        name: fraction_cluster(clusters)
        for name, clusters in cluster_database.items()
    }

In [28]:
#slow to run, but that's okay
#grabs you resampled data of some dataframe based on how many movers there were
#so it's set, more or less, at the data grabbing level
#has repetitions and sample size included
def get_resampled_data(etf_database, repetitions = 100, sample_size = 30):
    """Simulate the clustered fraction expected from randomly placed moves.

    Parameters
    ----------
    etf_database : dict mapping a name to a frame with ``date`` and boolean
        ``mover`` columns.
    repetitions : how many simulated values to produce per name.
    sample_size : how many random draws are averaged into each value.

    Returns
    -------
    dict mapping each name to a list of ``repetitions`` means. Every draw
    picks as many random rows as the frame has flagged movers, re-sorts them
    by index, clusters them with ``build_cluster`` and scores the result
    with ``fraction_cluster``.
    """
    resampled_data_dict = {}
    for name, ticker in etf_database.items():
        # Invariant for this frame: count the flagged rows once instead of
        # re-filtering the whole frame on every single draw.
        mover_count = len(ticker[ticker['mover']])
        resampled_data = []
        for _ in np.arange(repetitions):
            sample_clusters_to_mean = []
            for _ in np.arange(sample_size):
                # NOTE(review): sort_index is assumed to restore date order
                # (index labels that sort chronologically) - confirm.
                resampled_df_sorted = ticker.sample(mover_count).sort_index()
                resampled_cluster = build_cluster(resampled_df_sorted)
                sample_clusters_to_mean.append(fraction_cluster(resampled_cluster))
            resampled_data.append(np.mean(sample_clusters_to_mean))
        resampled_data_dict[name] = resampled_data
    return resampled_data_dict

In [29]:
#this takes in a database of etf dataframes as defined far above and a sample data database
#outputs one histogram per etf comparing the observed clustering with the resampled values
#also writes a text report with the numbers
def make_cluster_graph(etf_database, sample_database):
    """Plot and report observed vs. resampled cluster fractions for each ETF.

    Parameters
    ----------
    etf_database : dict mapping a name to a frame with ``date`` and boolean
        ``mover`` columns; used to compute the observed fraction.
    sample_database : dict with the same keys, each holding the resampled
        fractions to compare against.

    Side effects
    ------------
    Saves ``images/<name>clusterchart.png`` for every name and writes a
    one-line summary per name to ``analysis/v_important_cluster_analysis``.
    Neither directory is created here, so both must already exist.
    Each figure is closed after it is saved. Returns None.
    """
    cluster_baser = make_cluster_base(etf_database)
    observed = get_cluster_fracs(cluster_baser)
    # The context manager closes the report even if plotting raises midway.
    with open('analysis/v_important_cluster_analysis', 'w+') as txter:
        for i in etf_database.keys():
            resampled_data = sample_database[i]
            resampled_mean = np.mean(resampled_data)
            # 0.5th and 99.5th percentiles bracket the central 99% of the
            # resampled values
            plt.axvline(np.percentile(resampled_data, .5), color = 'purple', marker = '|')
            plt.axvline(np.percentile(resampled_data, 99.5), color = 'purple', marker = '|')
            plt.axvline(observed[i], color = 'red')
            plt.hist(np.array(resampled_data), bins = 17)
            plt.xlabel('Fraction Clustered')
            plt.ylabel('Count')
            plt.title('Clusters of ' + str(i))
            plt.grid()
            plt.text(0.25, -4, "Purple is 99% confidence at random; Red is the observed.", ha = 'center')
            # Distance of the observed value from the resampled mean, in
            # resampled standard deviations. Dividing the raw observed value
            # by the std (without subtracting the mean) is not a distance
            # "from the mean", which is what both captions claim.
            stds = np.round((observed[i] - resampled_mean)/np.std(resampled_data), 2)
            caption = "That's " + str(stds) + " standard deviations away, in case you were wondering."
            plt.text(0.25, -5.3, caption , ha = 'center')
            clusterimgfilename = ('images/'+ i + 'clusterchart.png')
            plt.savefig(clusterimgfilename, bbox_inches="tight")
            plt.close()
            report_line = "{}: The observed percentage of clusters, {}%, for {} was {} standard deviations from the mean value of {}%.".format(i, 100*np.round(observed[i], 4), i, stds, 100*np.round(resampled_mean, 4))
            txter.write(report_line + "\n\n")

In [30]:
#tying it all together for hypothesis test number one
#builds the resampled cluster fractions for every etf with the default
#repetitions and sample_size; kept in its own cell so the slow resampling
#does not have to be re-run when only the charts change
sample_database_forever = get_resampled_data(etf_database)

In [31]:
#saves one chart per etf plus the text report, comparing the observed
#fractions with the resampled ones computed above
make_cluster_graph(etf_database, sample_database_forever)
#NOTE(review): make_cluster_graph closes each figure after saving it, so this
#call presumably has nothing left to display - confirm
plt.show()