In [None]:
import pandas as pd
import datetime as dt
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import sklearn

In [None]:
from typing import Callable
from typeguard import typechecked
import re
import plotly.io as pio
import plotly.express as px
import plotly.offline as py

# Initialise plotly's offline notebook mode before any figure is shown.
py.init_notebook_mode()

# Synthetic sample frame: a 'vals' column plus one 'slip_<s>' column for each
# s in (1, 5, 60), every column drawn independently from a normal distribution
# with mean 10.
N = 50_000
df = pd.DataFrame({'vals': np.random.normal(10, size=N)})
for slip_s in (1, 5, 60):
    df[f'slip_{slip_s}'] = np.random.normal(loc=10, size=N)

def get_qcut(
        focus: str,
        n_bins: int = 10
    ):
    """Group the module-level ``df`` by quantile bins of one column.

    The column ``focus`` is replaced, on a copy of ``df``, by its ``pd.qcut``
    bin (at most ``n_bins`` bins; duplicate bin edges are dropped), and the
    copy is grouped on that binned column.

    Returns the resulting groupby object; ``df`` itself is not modified.
    """
    binned_focus = pd.qcut(df[focus], n_bins, duplicates='drop')
    return df.assign(**{focus: binned_focus}).groupby(focus)

def slippnls_g(
        focus: str,
        n_bins: int = 10,
        slips: list[int] = [1,5,60],
        func: Callable = np.mean
    ):
    """Aggregate several ``slip_<s>`` columns per quantile bin of ``focus``.

    Parameters
    ----------
    focus : column whose quantile bins (via ``get_qcut``) define the groups.
    n_bins : requested number of quantile bins.
    slips : the ``s`` values selecting the ``slip_<s>`` columns to aggregate.
    func : aggregation passed to ``.agg`` (defaults to ``np.mean``).

    Returns one row per bin and one column per requested slip column.
    """
    slip_columns = []
    for period in slips:
        slip_columns.append(f'slip_{period}')
    per_bin = get_qcut(focus, n_bins)
    return per_bin[slip_columns].agg(func)
    
def slipsum_g(
        focus: str,
        n_bins: int = 10,
        slip: int = 5,
    ):
    """Summarise one ``slip_<slip>`` column per quantile bin of ``focus``.

    Groups via ``get_qcut(focus, n_bins)`` and returns a frame with the
    columns ``mean``, ``sum`` and ``count`` computed over ``slip_<slip>``.
    """
    column = f'slip_{slip}'
    # Named aggregation: output column name -> (source column, aggregator).
    summary_spec = {
        'mean': (column, np.mean),
        'sum': (column, np.sum),
        'count': (column, lambda x: x.count()),
    }
    return get_qcut(focus, n_bins).agg(**summary_spec)

def ctab(
        col1: str,
        col2: str,
        n_bins: int = 10,
        f: str = '5slip',
        how: str = 'mean'
    ):
    """Cross-tabulate one column of the module-level ``df`` over quantile bins.

    Rows are the quantile bins of ``col1`` and columns the quantile bins of
    ``col2`` (``pd.qcut`` with ``n_bins`` bins, duplicate edges dropped).
    Each cell aggregates the column selected by ``f``.

    Parameters
    ----------
    col1, col2 : columns of ``df`` to bin.
    n_bins : requested number of quantile bins for both columns.
    f : value-column spec made of a number and a lowercase word, e.g.
        ``'5slip'`` -> column ``slip_5``, ``'5pnl'`` -> column ``slippnl_5``.
    how : ``'mean'``, ``'sum'`` or ``'count'``.

    Returns
    -------
    The crosstab frame, or ``None`` (after printing ``'invalid input'``) when
    ``f`` cannot be interpreted.

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported aggregations.
    """
    period_match = re.search('[0-9]+', f)
    kind_match = re.search('[a-z]+', f)
    # A spec without a number or without a word used to crash on
    # ``None.group()``; treat it like any other unrecognised spec.
    if period_match is None or kind_match is None:
        print('invalid input')
        return
    slip_period = period_match.group()
    kind = kind_match.group()
    if kind == 'slip':
        choice = f'slip_{slip_period}'
    elif kind == 'pnl':
        choice = f'slippnl_{slip_period}'
    else:
        print('invalid input')
        return

    # Kept separate from the ``f`` parameter so an unknown ``how`` can never
    # leak the spec string into ``pd.crosstab`` as the aggregation function.
    aggfuncs = {
        'mean': np.mean,
        'sum': np.sum,
        'count': lambda x: x.count(),
    }
    if how not in aggfuncs:
        raise ValueError(
            f"how must be one of {sorted(aggfuncs)}, got {how!r}"
        )

    return pd.crosstab(
        pd.qcut(df[col1], n_bins, duplicates='drop'),
        pd.qcut(df[col2], n_bins, duplicates='drop'),
        values=df[choice],
        aggfunc=aggfuncs[how]
    )

def agg_qt(
        cols: list[str],
        var: str = 'value',
        qt: tuple[int, Callable] | int = 10,
        labels: bool = False,
        df: pd.DataFrame = df,
        name: str | None = None
    ):
    """Aggregate one column of ``df`` per quantile bin of the ``cols`` columns.

    Parameters
    ----------
    cols : columns to replace by their ``pd.qcut`` bins and to group on.
    var : column to aggregate.
    qt : either the number of quantile bins (aggregation defaults to
        ``np.mean``) or a ``(n_bins, aggfunc)`` tuple.
    labels : ``False`` gives integer bin indicators; ``True`` gives the
        interval labels produced by ``pd.qcut``.
    df : source frame; it is copied, never modified. The default is whatever
        ``df`` was bound to when this function was defined.
    name : output column name for the aggregate; defaults to ``var``.

    Returns
    -------
    A frame with one row per bin combination, holding the group keys, the
    aggregated column and a ``count`` column (non-null ``var`` values).
    """
    if isinstance(qt, tuple):
        n_bins, af = qt
    else:
        n_bins, af = qt, np.mean

    # pd.qcut only accepts False, None or an explicit list of labels; a
    # literal True raises, so translate it to None (interval labels).
    qcut_labels = None if labels is True else labels

    tgt = var if name is None else name
    df_out = df.copy()
    for col in cols:
        df_out[col] = pd.qcut(
            df_out[col], n_bins, duplicates='drop', labels=qcut_labels
        )
    return (
        df_out
        .groupby(cols)
        .agg(**{
            tgt: (var, af),
            'count': (var, lambda x: x.count())
        })
        .reset_index()
    )
            

def plot_scatter_3d(
        c1: str,
        c2: str,
        val: str,
        qt: tuple[int, Callable] | int | None = 10,
        samples: int | None = None,
        df: pd.DataFrame = df,
        name: str | None = None
    ):
    """3-D scatter of ``val`` against the columns ``c1`` and ``c2``.

    Parameters
    ----------
    c1, c2 : columns for the first two axes.
    val : column for the third axis.
    qt : when not ``None``, forwarded to ``agg_qt`` so that ``c1``/``c2`` are
        quantile-binned and ``val`` aggregated per bin; marker size and
        colour then encode the per-bin ``count``. ``None`` plots raw rows.
    samples : when given, that many rows are sampled before anything else.
    df : source frame (copied, not modified).
    name : optional name for the plotted value column; defaults to ``val``.

    Returns the figure produced by ``px.scatter_3d``.
    """
    frame = df.copy()
    if samples is not None:
        frame = frame.sample(samples)
    z_column = val if name is None else name

    if qt is None:
        # Raw points: expose the value column under the requested name.
        frame[z_column] = frame[val]
        return px.scatter_3d(frame, c1, c2, z_column)

    binned = agg_qt([c1, c2], df=frame, qt=qt, name=name, var=val)
    return px.scatter_3d(binned, c1, c2, z_column, size='count', color='count')

def plot_scatter_2d(
        c: str,
        val: str,
        qt: tuple[int, Callable] | int | None = 10,
        samples: int | None = None,
        df: pd.DataFrame = df,
        name: str | None = None
    ):
    """2-D scatter of ``val`` against the column ``c``.

    Parameters
    ----------
    c : column for the x axis.
    val : column for the y axis.
    qt : when not ``None``, forwarded to ``agg_qt`` so that ``c`` is
        quantile-binned and ``val`` aggregated per bin; marker size and
        colour then encode the per-bin ``count``. ``None`` plots raw rows.
    samples : when given, that many rows are sampled before anything else.
    df : source frame (copied, not modified).
    name : optional name for the plotted value column; defaults to ``val``.

    Returns the figure produced by ``px.scatter``.
    """
    frame = df.copy()
    y_column = val if name is None else name
    if samples is not None:
        frame = frame.sample(samples)

    if qt is None:
        # Raw points: expose the value column under the requested name.
        frame[y_column] = frame[val]
        return px.scatter(frame, c, y_column)

    binned = agg_qt([c], df=frame, qt=qt, name=name, var=val)
    return px.scatter(binned, c, y_column, size='count', color='count')


# Smoke-run the helpers on the synthetic frame.
ctab('vals', 'slip_1', n_bins=3, how='mean')

# Mean slip_5 per (vals, slip_1) quantile bin, pivoted into a table.
adf = agg_qt(['vals', 'slip_1'], var='slip_5', qt=20)
print(
    pd.crosstab(
        index=adf['vals'],
        columns=adf['slip_1'],
        values=adf['slip_5'],
        aggfunc=np.mean,
    )
)

# Binned 3-D view on a 90-row sample, value column renamed to 'slips5'.
fig = plot_scatter_3d('vals', 'slip_1', 'slip_5', qt=6, samples=90, name='slips5')
fig.show(renderer='iframe')

# Binned 2-D view on a 5000-row sample, with markers forced to size 3.
fig = plot_scatter_2d('vals', 'slip_1', samples=5000, qt=6)
fig.update_traces(marker_size=3)
fig.show(renderer='iframe')

#import plotly.graph_objects as go
#fig_widget = go.FigureWidget(fig)
#fig_widget

#print(re.search('[0-9]+', 'slip_55').group())

In [None]:
# Synthetic per-minute timestamps (second fixed at 14) for days 1-12 of
# months 11, 12 and 1.
# NOTE(review): every month uses year=2023, so the January rows precede the
# November/December rows in time order - confirm that is intended.
times = [
    dt.datetime(
        year=2023,
        month=month,
        day=day,
        hour=hour,
        minute=minute,
        second=14
    )
    for month in [11, 12, 1]
    for hour in range(24)
    for minute in range(60)
    for day in range(1, 13)
]

# One copy of the timeline per station.
df = pd.DataFrame({'time': times})
df_ol = df.assign(station='olympic')
df_air = df.assign(station='airport')

# Random integer readings; each range is (inclusive low, exclusive high).
colnames = ['airTemperature', 'mystery', 'humidity', 'windspeed']
ranges = [(20, 35), (19, 23), (20, 50), (5, 30)]
# The bounds are deliberately not named min/max: as loop targets those names
# would replace the builtins for the rest of the kernel session.
for colname, (low, high) in zip(colnames, ranges):
    for df_to_use in [df_ol, df_air]:
        df_to_use[colname] = np.random.randint(low, high, size=len(df_to_use))
df = pd.concat([df_ol, df_air], ignore_index=True)

df = df.drop_duplicates(['time', 'station'])
df.head()

In [None]:
# Relationship between the per-station humidity change and the temperature slip:
#   temp_slip       = airTemperature minus the NEXT reading (diff(-1))
#   humidity_change = humidity minus the PREVIOUS reading (diff(1))
# NOTE(review): the original header mentioned a "myst-temp spread", but no
# spread against the 'mystery' column is computed here - confirm the intent.
data = df.sort_values(['station', 'time'])
data['temp_slip'] = data.groupby('station')['airTemperature'].diff(-1)
data['humidity_change'] = data.groupby('station')['humidity'].diff(1)
# dropna returns a new frame; the result has to be kept for the rows at each
# station's boundaries (where one of the diffs is NaN) to actually go away.
data = data.dropna(subset=['temp_slip', 'humidity_change'])
data.plot.scatter(x='humidity_change', y='temp_slip')
plt.show()

In [None]:
# work out 4:30 temp for each day
hour = 16
minute = 30

def preprocess_data(df):
    """Return a copy of ``df`` with the derived feature columns added.

    Expects a datetime ``time`` column and a ``station`` column. Adds:

    * ``date``        - calendar date of ``time``
    * ``is_olympic``  - 1 where ``station == 'olympic'``, else 0
    * ``time_to_430`` - minutes from ``time`` to the module-level
                        ``hour``:``minute`` on the same date (negative once
                        that moment has passed)
    * ``hour``, ``minute`` - components of ``time``

    The input frame is left untouched.
    """
    df_out = df.copy()
    df_out['date'] = df_out['time'].dt.date
    df_out['is_olympic'] = np.where(df_out['station'] == 'olympic', 1, 0)
    # Target moment on each row's own date: midnight plus the offset.
    exps = df_out['time'].dt.normalize() + pd.Timedelta(hours=hour, minutes=minute)
    # Use this frame's own timestamps, not whatever the notebook-level
    # ``data`` happens to hold when the function is called.
    df_out['time_to_430'] = (exps - df_out['time']).dt.total_seconds() / 60
    df_out['hour'] = df_out['time'].dt.hour
    df_out['minute'] = df_out['time'].dt.minute
    return df_out

def prep_data(df):
    """Placeholder preparation step: hands back the very frame it was given."""
    return df

# Feature frame, built on a copy so the shared ``df`` keeps its raw columns.
data = preprocess_data(df.copy())

# Reading taken at hour:minute for every (date, station); this is the target.
at_target_time = (data['time'].dt.hour == hour) & (data['time'].dt.minute == minute)
vals_exp = (
    data.loc[at_target_time, ['station', 'airTemperature', 'date', 'time']]
    .rename(columns={'airTemperature': 'airTemp_430', 'time': 'time_temp'})
)
data = prep_data(data)
# Attach that day's target reading to every row of the same date and station.
data = pd.merge(data, vals_exp, on=['date', 'station'])
print(data.head())

features = ['is_olympic', 'airTemperature', 'mystery', 'humidity', 'windspeed', 'hour', 'minute']

X = data[features]
y = data['airTemp_430']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

dtrain_reg = xgb.DMatrix(X_train, y_train)
dtest_reg = xgb.DMatrix(X_test, y_test)

params = {"objective": "reg:squarederror"}
n = 4

# xgboost monitors the LAST entry of ``evals`` for early stopping, so the
# held-out set has to come last; with the training set last, stopping would
# track the training error instead.
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   early_stopping_rounds=2,
   verbose_eval=2
)

# Sanity check: predictions for index labels 0..5 of the raw frame
# (``.loc`` slicing is label-based and inclusive).
X_pred = preprocess_data(df.copy()).loc[:5, features]
X_pred = xgb.DMatrix(X_pred)
model.predict(X_pred)

In [None]:
# Averages
# For the last 50 rows (by time) plot, per station, the raw series, the
# model output and the rolling means for both plotted variables.

data = (
    df.sort_values('time')
    .tail(50)
    .sort_values(['station', 'time'])
)
windows = [2, 5, 8]
# Named plot_vars rather than ``vars`` so the builtin stays usable afterwards.
plot_vars = ['airTemperature', 'mystery']
stations = ['airport', 'olympic']

fig, axs = plt.subplots(2, 2, figsize=(12, 12))
plt.subplots_adjust(wspace=.3, hspace=.51)

for var in plot_vars:
    # NOTE(review): the model was fitted on 'airTemp_430', so the same
    # prediction is stored under every '<var>_pred' - confirm that plotting it
    # against 'mystery' is intended.
    data[f'{var}_pred'] = model.predict(xgb.DMatrix(preprocess_data(data)[features]))
    for window in windows:
        # Rolling mean per station, indexed by (station, time), merged back
        # as the column '<var>_<window>m'.
        avgs = (
            data.set_index('time')
            .groupby('station')
            .rolling(window)[var]
            .mean()
            .rename(f'{var}_{window}m')
        )
        data = pd.merge(data, avgs, on=['station', 'time'])

for var_idx, var in enumerate(plot_vars):
    for station_idx, station in enumerate(stations):
        ax = axs[var_idx, station_idx]
        # Only the 10 most recent rows of the station are drawn.
        station_dat = data[data['station'] == station][-10:]
        ax.plot(station_dat['time'], station_dat[var], 'o-')
        ax.plot(station_dat['time'], station_dat[f'{var}_pred'], 'o--')
        # Rotate the time labels. The previous set_xticklabels('', rotation=50)
        # supplied an empty label set, so the rotation never became visible.
        ax.tick_params(axis='x', labelrotation=50)
        ax.set_title(f'{var}: {station}')
        for window in windows:
            ax.plot(station_dat['time'], station_dat[f'{var}_{window}m'], 'v-')
        ax.legend(['current', 'predicted'] + [f'rolling_{window}' for window in windows])

In [None]:
from collections import defaultdict
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)


def _simulate_day(day_frame, prop, barrier):
    """Return ``day_frame`` with the trade columns for one (prop, barrier) pair.

    A row trades when ``|temp_change| > barrier``. Added columns:

    * ``trade_price``      - ``airTemperature - temp_change * (1 - prop)`` on
                             traded rows, 0 otherwise
    * ``trade_side``       - +1 where ``temp_change > 0``, else -1
    * ``rolling_position`` - running sum of ``trade_side`` over traded rows
    * ``cash_balance``     - running sum of ``-trade_price * trade_side``
    * ``settlement``       - the next row's ``airTemperature``
    * ``PnL``              - ``trade_side * (settlement - trade_price)`` on
                             traded rows, 0 otherwise

    The input frame is not modified.
    """
    temperature = day_frame['airTemperature']
    change = day_frame['temp_change']
    traded = (change.abs() > barrier).to_numpy()
    side = np.where(change > 0, 1, -1)
    price = np.where(traded, temperature - change * (1 - prop), 0)
    settlement = temperature.shift(-1)
    return day_frame.assign(
        trade_price=price,
        trade_side=side,
        rolling_position=np.cumsum(np.where(traded, side, 0)),
        cash_balance=np.cumsum(-price * side),
        settlement=settlement,
        # Gate on the trade itself: trade_side is always +/-1, so testing it
        # against 0 would book +/-settlement as PnL on rows that never traded.
        PnL=np.where(traded, side * (settlement - price), 0),
    )


data = df.sort_values(['station', 'time'])
data['temp_change'] = data.groupby('station')['airTemperature'].diff(1)
data['date'] = data['time'].dt.date

# NOTE(review): grouping by date alone puts both stations into one group, so
# the running sums and the next-row settlement cross the station boundary -
# confirm whether the grouping should be per (station, date).
# The groups do not depend on prop/barrier, so split once and reuse.
daily_frames = list(data.groupby('date'))

res = defaultdict(list)
for x in range(1, 10):
    for y in range(5, 15):
        prop, barrier = x / 10, y / 10
        for day, to_use in daily_frames:
            # Traded rows without a following reading have NaN PnL; drop them.
            out = _simulate_day(to_use, prop, barrier).dropna(subset=['PnL'])
            running_pnl = out['PnL'].cumsum()

            res['prop'].append(prop)
            res['barrier'].append(barrier)
            res['day'].append(day)
            res['acc'].append(out['cash_balance'].iloc[-1])
            res['loss'].append(out['cash_balance'].min())
            res['win'].append(running_pnl.iloc[-1])
            res['fail'].append(running_pnl.min())

# Average final PnL ('win') and worst running PnL ('fail') across days.
results = (
    pd.DataFrame(res)
    .groupby(['prop', 'barrier'])[['win', 'fail']]
    .mean()
    .reset_index()
)
results.sort_values('win', ascending=False)

In [None]:
# NOTE(review): unfinished fragment. The assignment to `s_b` below has no
# right-hand side, and `self`, `book` and `s` are not defined anywhere in the
# visible notebook, so this cell cannot run as written.
# Visible intent: for every active book on the same station as `book`, add
# 1 - |s_b - s| * (1 - cor_hr) / 3600 to `pos`.
# presumably s and s_b are in seconds and cor_hr is a one-hour factor
# (because of the division by 3600) - TODO confirm
cor_hr = 0.7

pos = 0
for b in self.active_books():
    # Only books whose definition shares the stationId of `book` contribute.
    if b.definition.stationId == book.definition.stationId:
        # TODO: the value for s_b is missing.
        s_b = 
        pos += 1 - abs(s_b-s)*(1-cor_hr)/3600