In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
import datetime
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display, HTML


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
#         break

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Output directory: the script at the bottom of this file writes each
# "<name>_hurst.csv" result here and reloads it from here on later runs.
KAGGLE_WORKING_DIR = "/kaggle/working/"

# What is the Hurst Exponent?
The Hurst exponent is used as a measure of long-term memory of time series. It relates to the autocorrelations of the time series and the rate at which these decrease as the lag between pairs of values increases.

### Hurst Value is more than 0.5

A Hurst value above 0.5 indicates a persistent time series (this roughly translates to a trending market).

### Hurst Value is less than 0.5

A Hurst value below 0.5 indicates an anti-persistent (mean-reverting) time series (this roughly translates to a sideways market).

### Hurst Value is 0.5

A Hurst value of 0.5 indicates a random walk, i.e. a market in which the future cannot be predicted from past data.

Reference: https://blog.quantinsti.com/hurst-exponent/

In [None]:
class HurstExponentAnalysis:
    """Rolling Hurst-exponent analysis over weekly windows of OHLC data.

    The input CSV must provide the columns ``time`` (Unix timestamp in
    milliseconds), ``open``, ``high``, ``low`` and ``close``. Rows are grouped
    into runs of consecutive records sharing the same ISO week number, and a
    Hurst exponent is estimated for every trailing window of ``n`` weeks.

    Args:
        file_path: Path of the CSV file to analyse.
        n_weeks: Window sizes, in weeks, to compute rolling exponents for.
        trending_market_limit: Stored on the instance; not used by any method
            of this class.
        mean_reverting_limit: Stored on the instance; not used by any method
            of this class.
    """

    def __init__(self, file_path: str, n_weeks: list, trending_market_limit: float, mean_reverting_limit: float):
        self.file_path = file_path
        self.n_weeks = n_weeks
        self.trending_market_limit = trending_market_limit
        self.mean_reverting_limit = mean_reverting_limit
        # Populated by run(); None until then or when the input has no rows.
        self.df = None

    def readFileAsDataFrame(self, file_path: str):
        """Load the CSV and derive the ``time`` and ``week`` columns.

        Returns:
            The DataFrame with ``time`` converted from Unix milliseconds to
            datetimes and a ``week`` column holding the ISO week number, or
            ``None`` when the file contains no data rows.
        """
        df = pd.read_csv(file_path)
        if len(df) < 1:
            return None
        df["time"] = [self.unixTimeToDatetime(dt) for dt in df["time"]]
        df["week"] = [dt.isocalendar()[1] for dt in df["time"]]
        return df

    @staticmethod
    def averageOHLC(record):
        """Return the mean of the open, close, high and low prices.

        ``record`` may be a single row mapping or a whole DataFrame; in the
        latter case the result is a column of per-row averages.
        """
        return (record["open"] + record["close"] + record["high"] + record["low"]) / 4

    def getWeeklySplitTimeseriesDataSplit(self, df: pd.DataFrame):
        """Split the per-row OHLC averages into one list per week.

        A new week starts whenever the ``week`` value differs from the one of
        the previous row, so rows are expected to be in chronological order.

        Returns:
            A list of lists of averaged prices, one inner list per run of
            consecutive rows sharing a week number. Empty when ``df`` is
            ``None`` or has no rows.
        """
        if df is None or len(df) < 1:
            return []
        averages = self.averageOHLC(df).tolist()
        weeks = df["week"].tolist()
        treated_data = []
        previous_week = None
        for week, average in zip(weeks, averages):
            # Open a new bucket for the first row and at every week change.
            if not treated_data or week != previous_week:
                treated_data.append([])
                previous_week = week
            treated_data[-1].append(average)
        return treated_data

    @staticmethod
    def unixTimeToDatetime(unix_time: int):
        """Convert a Unix timestamp in milliseconds to a naive UTC datetime."""
        # utcfromtimestamp() is deprecated since Python 3.12; build an aware
        # UTC datetime and drop the tzinfo to keep returning a naive value.
        aware = datetime.datetime.fromtimestamp(unix_time / 1000, tz=datetime.timezone.utc)
        return aware.replace(tzinfo=None)

    @staticmethod
    def hurst_exponent(time_series, max_lag=100):
        """Estimate the Hurst exponent of a price series.

        For every lag the square root of the standard deviation of the lagged
        differences is computed; the exponent is twice the slope of a straight
        line fitted to log(tau) against log(lag).

        Args:
            time_series: Flat sequence of values, or a list of per-week lists
                which is flattened first.
            max_lag: Exclusive upper bound of the lags tried (lags start at 2).
                Lags that do not fit into the series are skipped.

        Returns:
            ``None`` for a ``None`` or empty input, ``np.nan`` when fewer than
            two usable lags exist (series too short, or lagged differences
            with zero spread), otherwise the estimated exponent.
        """
        if time_series is None or len(time_series) < 1:
            return None
        if isinstance(time_series[0], list):
            # flatten 2d list
            time_series = [data for week in time_series for data in week]
        series = np.asarray(time_series, dtype=float)
        # A lag must be smaller than the series length, otherwise the lagged
        # slices are empty and the standard deviation is undefined.
        lags = np.arange(2, min(max_lag, series.size))
        tau = np.array(
            [np.sqrt(np.std(series[lag:] - series[:-lag])) for lag in lags],
            dtype=float,
        )
        # log(0) and NaN would poison the least-squares fit, so drop them.
        usable = np.isfinite(tau) & (tau > 0)
        if usable.sum() < 2:
            return np.nan
        poly = np.polyfit(np.log(lags[usable]), np.log(tau[usable]), 1)
        return poly[0] * 2.0

    def getRollingHurstExponentResult(self, weekly_splited_data, week_period):
        """Compute the Hurst exponent over a trailing window of weeks.

        Returns:
            One value per week. The first ``week_period - 1`` entries are
            ``np.nan`` because the window is not yet full.
        """
        weekly_hurst_exp = []
        for i in range(len(weekly_splited_data)):
            if i < week_period - 1:
                weekly_hurst_exp.append(np.nan)
            else:
                window = weekly_splited_data[i - week_period + 1:i + 1]
                weekly_hurst_exp.append(self.hurst_exponent(window))
        return weekly_hurst_exp

    def run(self):
        """Read the input file and build the table of rolling exponents.

        Returns:
            A DataFrame with one ``"<n>_week"`` column per entry of
            ``n_weeks`` and one row per week; it has no rows when the input
            file contains no data.
        """
        self.df = self.readFileAsDataFrame(self.file_path)
        colnames = ["%s_week" % n_week for n_week in self.n_weeks]
        if self.df is None:
            # Nothing to analyse: keep the column layout but return no rows.
            return pd.DataFrame(columns=colnames)
        weekly_splited_data = self.getWeeklySplitTimeseriesDataSplit(self.df)
        records = dict()
        for colname, n_week in zip(colnames, self.n_weeks):
            records[colname] = self.getRollingHurstExponentResult(weekly_splited_data, n_week)
        df = pd.DataFrame.from_dict(records)[colnames]
        return df


# Rolling window sizes, in weeks, for which a Hurst exponent column is produced.
N_WEEKS = [1, 2, 5, 8, 13, 26, 52]
# Limits handed to HurstExponentAnalysis, which only stores them.
TRENDING_MARKET_LIMIT = 0.6
MEAN_REVERTING_MARKET_LIMIT = 0.4
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Only CSV inputs can be analysed; anything else is skipped instead of
        # being passed to read_csv and written back under its original name.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".csv":
            continue
        file_path = os.path.join(dirname, filename)
        # Derive the output name from the stem so only the suffix is changed,
        # even when ".csv" also occurs elsewhere in the file name.
        target_file_path = os.path.join(KAGGLE_WORKING_DIR, stem + "_hurst.csv")
        if not os.path.isfile(target_file_path):
            he_analysis = HurstExponentAnalysis(
                file_path, N_WEEKS, TRENDING_MARKET_LIMIT, MEAN_REVERTING_MARKET_LIMIT
            )
            resultTable = he_analysis.run()
            resultTable.to_csv(target_file_path, index=False)
        else:
            # Reuse the result written by an earlier run.
            resultTable = pd.read_csv(target_file_path)
        with pd.option_context('display.max_rows', 1000, 'display.max_columns', 20):
            display(resultTable)
        resultTable.plot()
        # Leaves the inner loop only: at most one CSV per directory is handled.
        break