In [None]:
import duckdb as db
import polars as pl
import pandas as pd
from pathlib import Path

In [None]:
# Directory holding the parquet part-files that together make up the dataset.
data_dir = Path("/home/sean/Projects/streambt/full_df_2_exit.parquet/")

# Sort the part-files so the row order of `full_df` is identical on every run;
# Path.glob does not promise any particular ordering.
parquet_files = sorted(data_dir.glob('*.parquet'))
if not parquet_files:
    # pd.concat on an empty iterable fails with "No objects to concatenate",
    # which hides the real problem (wrong or empty directory).
    raise FileNotFoundError(f"No *.parquet files found in {data_dir}")

# Stack all part-files into a single frame (each file's own index is kept).
full_df = pd.concat(
    pd.read_parquet(parquet_file)
    for parquet_file in parquet_files
)

In [None]:
# Show the column names of the loaded frame.
full_df.columns

# PCA check

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
import plotly.express as px

# Identifier and label columns that must not be fed into the PCA.
label_columns = ['Date', 'Ticker', 'target', 'target2', 'gain_loss_ratio2']

# Feature/label relation. It does not depend on the year, so it is built once
# instead of on every loop iteration. The rolling std_vol window is evaluated
# inside the CTE, i.e. before the signal/range filters are applied.
feature_target = db.sql(
    """
    with stg as (
    select 
        TMF_w
        , TMF_4w_min
        , TMF_4w_min_dd
        , TMF_26w_min
        , TMF_26w_min_dd
        , TMF_4w_min_dd_qtl_50
        , TMF_4w_min_dd_qtl_50_alt
        , TMF_26w_min_dd_qtl_50
        , TMF_26w_min_dd_qtl_50_alt 
        , TMF_Simple_Signal
        , macd_hof
        , macd_signal_hof
        , stddev_samp(log(Volume+1)) over (partition by Ticker order by Date rows between 5*52 preceding and current row) as std_vol
        , log(Volume + 1)
        --, macd_w_hof
        --, macd_w_signal_hof
        --, Volume
        , dayofweek(Date) as day_of_week
        , Date
        , Ticker
        , gain_loss_ratio > 1.01 as target
        , case 
            when gain_loss_ratio2 > 1.1 then 2 
            when gain_loss_ratio2 > 1.05 then 1 
            when gain_loss_ratio2 > 1 then 0
            else -1 
        end as target2
        , gain_loss_ratio2
    from full_df
    )
    select *
    from stg
    where TMF_Simple_Signal = 1
    and TMF_w < 1 and TMF_w > -1
    and macd_hof < 1 and macd_hof > -1
    and std_vol > 0
    --and Ticker in string 
    """
)

# One PCA fit and one scatter plot per calendar year.
for year in range(2010, 2022):
    train = db.sql(f"select * from feature_target where year(Date) in ({year})").df().dropna()
    if len(train) < 2:
        # Fewer than two rows cannot yield the two components plotted below.
        print(f"{year}: only {len(train)} usable rows, skipping")
        continue

    # NOTE(review): Normalizer rescales each *row* to unit norm; it does not
    # standardise columns. If per-feature scaling was intended, StandardScaler
    # would be the usual choice - confirm.
    f = Pipeline(steps=[('normalize', Normalizer()), ('pca', PCA())])
    train_input = train.drop(columns=label_columns)
    res = f.fit_transform(train_input)

    # Keep the first two components and attach the labels *by position*.
    # `train` still carries its pre-dropna() index while `focus` gets a fresh
    # 0..n-1 index, so a plain Series assignment would align on index and
    # misassign (or NaN-out) labels whenever dropna() removed rows.
    focus = pd.DataFrame(res[:, :2], columns=['pc1', 'pc2'])
    focus['target'] = train['target'].to_numpy()
    focus['target2'] = train['target2'].to_numpy()
    focus['gain_loss_ratio2'] = train['gain_loss_ratio2'].to_numpy()

    ax = px.scatter(focus, x='pc1', y='pc2', color='target2', title=str(year))
    ax.show()

# Clustering

In [None]:
from sklearn.cluster import HDBSCAN

# Feature/label relation. It does not depend on the year, so it is built once
# instead of on every loop iteration. The rolling std_vol window is evaluated
# inside the CTE, i.e. before the signal/range filters are applied.
feature_target = db.sql(
    """
    with stg as (
    select 
        TMF_w
        , TMF_4w_min
        , TMF_4w_min_dd
        , TMF_26w_min
        , TMF_26w_min_dd
        , TMF_4w_min_dd_qtl_50
        , TMF_4w_min_dd_qtl_50_alt
        , TMF_26w_min_dd_qtl_50
        , TMF_26w_min_dd_qtl_50_alt 
        , TMF_Simple_Signal
        , macd_hof
        , macd_signal_hof
        , stddev_samp(log(Volume+1)) over (partition by Ticker order by Date rows between 5*52 preceding and current row) as std_vol
        , log(Volume + 1)
        --, macd_w_hof
        --, macd_w_signal_hof
        --, Volume
        , dayofweek(Date) as day_of_week
        , Date
        , Ticker
        , gain_loss_ratio > 1.01 as target
        , case 
            when gain_loss_ratio2 > 1.1 then 2 
            when gain_loss_ratio2 > 1.05 then 1 
            when gain_loss_ratio2 > 1 then 0
            else -1 
        end as target2
        , gain_loss_ratio2
    from full_df
    )
    select *
    from stg
    where TMF_Simple_Signal = 1
    and TMF_w < 1 and TMF_w > -1
    and macd_hof < 1 and macd_hof > -1
    and std_vol > 0
    --and Ticker in string 
    """
)

# Cluster each year's samples and tabulate cluster membership against target2.
for year in range(2010, 2011):
    train = db.sql(f"select * from feature_target where year(Date) in ({year})").df().dropna()
    if train.empty:
        # Nothing to cluster for this year.
        print(f"{year}: no usable rows, skipping")
        continue

    f = HDBSCAN()
    train_input = train.drop(columns=['Date', 'Ticker', 'target', 'target2', 'gain_loss_ratio2'])
    res = f.fit_predict(train_input)

    # Attach the labels to a *copy*: `focus = train` would only alias the frame,
    # so adding the column would also add `cluster` to `train` and leak it into
    # any later cell that reuses `train` as model input.
    focus = train.assign(cluster=res)
    display(db.sql("select cluster, target2, count(*) as cnt from focus group by cluster, target2 order by cluster, cnt desc").df())

# Drop outliers as well before PCA?

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline

# Fit normalise -> PCA on `train`, i.e. on whatever frame the most recently
# executed loop cell left behind.
f = Pipeline(steps=[('normalize', Normalizer()), ('pca', PCA())])

# Remove identifiers and every label-derived column before fitting.
# 'gain_loss_ratio2' is the source of target2 and is dropped by the loop cells
# too; 'cluster' is present when the clustering cell has added it to `train`.
# errors='ignore' keeps the cell runnable whether or not 'cluster' exists.
train_input = train.drop(
    columns=['Date', 'Ticker', 'target', 'target2', 'gain_loss_ratio2', 'cluster'],
    errors='ignore',
)
res = f.fit_transform(train_input)

In [None]:
# Variance explained by each principal component of the fitted pipeline `f`.
f.named_steps['pca'].explained_variance_

In [None]:
import plotly.express as px

# First two principal components of the most recent PCA fit.
focus = pd.DataFrame(res[:, :2], columns=['pc1', 'pc2'])

# Attach the labels by position. `train` keeps its post-dropna() index while
# `focus` has a fresh 0..n-1 index, so assigning the Series directly would
# align on index and misassign (or NaN-out) labels whenever rows were dropped.
focus['target'] = train['target'].to_numpy()
focus['target2'] = train['target2'].to_numpy()

# One scatter per label definition.
ax = px.scatter(focus, x='pc1', y='pc2', color='target2')
ax.show()
ax = px.scatter(focus, x='pc1', y='pc2', color='target')
ax.show()


In [None]:
import plotly.express as px

# Pull every row into pandas together with its calendar year.
stg1 = db.sql("select *, year(Date) as year from full_df").df()

# Keep non-zero TMF_w values lying strictly inside (-1, 1).
in_range = stg1['TMF_w'].ne(0) & stg1['TMF_w'].lt(1) & stg1['TMF_w'].gt(-1)
subset = stg1.loc[in_range]

# Per-year box plot of TMF_w, without individual points.
px.box(subset, x='year', y='TMF_w', points=False)

In [None]:
import plotly.express as px
# 1st/99th and 5th/95th percentiles of macd_hof over the whole of full_df,
# returned as a one-row DataFrame.
var = db.sql("select quantile(macd_hof,0.01), quantile(macd_hof,0.99), quantile(macd_hof,0.05), quantile(macd_hof,0.95) from full_df").df()
var
#subset = stg1
#px.histogram(subset, x = 'macd_hof')
#px.box(subset, x = 'year', y = 'macd_hof', points = False)

In [None]:
#subset = db.sql("select * from full_df where macd_hof > -0.16 and macd_hof < 0.13").df()
# Histogram of macd_hof restricted to a hard-coded open interval.
# NOTE(review): the bounds look like they were copied from the output of a
# quantile query - confirm, and consider deriving them in code instead.
subset = db.sql("select * from full_df where macd_hof > -2.73 and macd_hof < 1.158").df()
px.histogram(subset, x = 'macd_hof')

In [None]:
# Row count of the current `subset` frame.
len(subset)

In [None]:
import plotly.express as px

# All rows for ticker CBA.AX, with the calendar year attached.
stg1 = db.sql("select *, year(Date) as year from full_df where Ticker = 'CBA.AX'").df()

# Only exact zeros are removed here; no clipping to (-1, 1) is applied.
subset = stg1.loc[stg1['TMF_w'].ne(0)]

# Per-year box plot of TMF_w, without individual points.
px.box(subset, x='year', y='TMF_w', points=False)

In [None]:
import plotly.express as px

# All rows for ticker MEZ.AX, with the calendar year attached.
stg1 = db.sql("select *, year(Date) as year from full_df where Ticker = 'MEZ.AX'").df()

# Only exact zeros are removed here; no clipping to (-1, 1) is applied.
subset = stg1.loc[stg1['TMF_w'].ne(0)]

# Per-year box plot of TMF_w, without individual points.
px.box(subset, x='year', y='TMF_w', points=False)

In [None]:
# Row count of full_df.
# The query previously read `select count`, which SQL treats as a reference to
# a column literally named "count" rather than as the aggregate; it is
# presumably an unfinished count(*) - confirm the intent.
stg1 = db.sql(
"""
select count(*) as n_rows
from full_df
""")

In [None]:
import plotly.express as px

# Yearly max/min of TMF_w per ticker. The result is materialised as a pandas
# DataFrame (.df()), like every other query in this notebook, instead of
# handing plotly a lazy DuckDB relation.
agg_by_ticker = db.sql("select max(TMF_w) as max_c, min(TMF_w) as min_c, year(Date) as year, Ticker from full_df group by year(Date), Ticker").df()

# One point per (ticker, year): x = yearly max, y = yearly min.
ax = px.scatter(agg_by_ticker, x='max_c', y='min_c', color='year', hover_name='Ticker')
ax.show()


In [None]:
import plotly.express as px

# MEZ.AX rows for 2014, sorted by date. px.line joins points in row order and
# a query without ORDER BY guarantees none, so the explicit sort keeps the
# line chronological.
focus = db.sql(
"""
select *
from full_df 
where Ticker = 'MEZ.AX'
and year(Date) = 2014
order by Date
""").df()

# NOTE(review): Close and Volume share a single y-axis here; if their scales
# differ a lot the smaller series will look flat - consider a secondary axis.
px.line(focus, x = 'Date', y=['Close','Volume'])

In [None]:
# Summary statistics of tmf_w for ticker MEZ.AX in 2014: max, min, median and
# the 10th, 20th, 30th and 40th percentiles.
db.sql(
"""
select max(tmf_w), min(tmf_w), median(tmf_w)
, quantile(tmf_w,0.1)
, quantile(tmf_w,0.2)
, quantile(tmf_w,0.3)
, quantile(tmf_w,0.4)
from full_df 
where Ticker = 'MEZ.AX'
and year(Date) = 2014
""")
