## measure the trade efficiency of the clusters
After clustering, this notebook computes per-cluster statistics that describe how well each cluster finishes a trade.

In [3]:
# The star import stays first so the explicit imports below take precedence
# over any names it exports.
import pycaret
from pycaret.clustering import *

import datetime
import os

import pandas as pd

# Path of the clustering result to evaluate. It is set by hand here; point it
# at a different file in `clustered/` to score another clustering run.
# NOTE(review): the space before the underscore is reproduced as-is and is
# presumably part of the on-disk file name -- confirm.
fn = 'clustered/clustered_kmeans _2023-09-02 16:21:50.csv'

# Load the clustered rows; the 'Cluster' column is used by the cells below.
cluster = pd.read_csv(fn)

# Preview the first five rows.
cluster.head(5)

Unnamed: 0.1,Unnamed: 0,group,time,s_MP,change,type,length,sum_change,area,surge_area,...,type.1,p_MP,precursor_buy_cap_pct_change,precursor_ask_cap_pct_change,p_totalBidVol,p_totalAskVol,length.1,sum_change.1,area.1,Cluster
0,0,1,1660222000000.0,30.0,0.505364,surge,1,0.505364,0.505364,0.505364,...,precursor,29.98,-0.000618,-1.7e-05,-0.00021,-0.002384,6.0,-0.005009518,-0.030057,Cluster 2
1,1,3,1660222000000.0,29.86,0.00067,surge,1,0.00067,0.00067,0.00067,...,precursor,29.89,-0.002358,-2e-06,-0.000818,-0.000333,1.0,0.000134564,0.000135,Cluster 2
2,2,5,1660222000000.0,29.88,0.001273,surge,2,0.001808,0.003615,0.003615,...,precursor,29.94,0.00475,-3.6e-05,0.001573,-0.004835,3.0,-0.003410602,-0.010232,Cluster 2
3,3,7,1660222000000.0,29.8,0.000873,surge,2,0.002114,0.004229,0.004229,...,precursor,29.87,-0.000987,2e-06,-0.000398,6.2e-05,2.0,8.855895e-07,2e-06,Cluster 2
4,4,9,1660223000000.0,29.9,0.001305,surge,1,0.001305,0.001305,0.001305,...,precursor,29.95,0.001531,1e-05,0.000334,0.001094,7.0,-0.01377036,-0.096393,Cluster 2


### what is the efficiency of each cluster?
Which variables help describe efficiency? `sum_change` and `length` summarize the sloped area, so each row's efficiency is taken as their product, `length * sum_change`.

In [4]:

# Per-row efficiency = length * sum_change, listed cluster by cluster.
#
# The product is computed with plain vectorized arithmetic rather than
# groupby('Cluster').apply(...): apply() returning a Series per group changes
# its result shape with the data (a single cluster yields a wide DataFrame),
# which makes the frame construction below fail.

# Skip rows without a cluster label (groupby would also drop them) and order
# the rows by cluster; mergesort is stable, so rows keep their original
# relative order inside each cluster.
labelled = cluster.dropna(subset=['Cluster']).sort_values('Cluster', kind='mergesort')

# Series of per-row products, indexed by (Cluster, original row label).
grouped = labelled['length'] * labelled['sum_change']
grouped.index = pd.MultiIndex.from_arrays([labelled['Cluster'], labelled.index])

# Flat two-column frame (cluster label, efficiency) with a fresh 0..n-1 index.
cluster_efficiency = pd.DataFrame({
    'Cluster': grouped.index.get_level_values(0),
    'efficiency': grouped.values
})

print(cluster_efficiency)

        Cluster  efficiency
0     Cluster 0    0.033538
1     Cluster 0    0.005959
2     Cluster 0    0.075429
3     Cluster 0    0.000965
4     Cluster 0    0.022446
...         ...         ...
5632  Cluster 3    0.001297
5633  Cluster 3    0.109437
5634  Cluster 3    0.000599
5635  Cluster 3    0.000898
5636  Cluster 3    0.000699

[5637 rows x 2 columns]


In [5]:
# Mean efficiency per cluster: one row per 'Cluster' label, a single
# 'efficiency' column holding the average of that cluster's row values.
aggregated = cluster_efficiency.groupby('Cluster')[['efficiency']].mean()

print(aggregated)

           efficiency
Cluster              
Cluster 0    0.035652
Cluster 1    0.031833
Cluster 2    0.031037
Cluster 3    0.025109


In [6]:
# Render the per-cluster mean as a percentage string with two decimals
# (e.g. 0.035652 -> '3.57%'). After this cell the column holds strings, not
# numbers.
#
# The dtype guard makes the cell safe to re-run: formatting an already
# formatted string with '{:.2%}' would raise ValueError.
if pd.api.types.is_numeric_dtype(aggregated['efficiency']):
    aggregated['efficiency'] = aggregated['efficiency'].map('{:.2%}'.format)

print(aggregated)

          efficiency
Cluster             
Cluster 0      3.57%
Cluster 1      3.18%
Cluster 2      3.10%
Cluster 3      2.51%


## Attach each cluster's efficiency metric to every row in that cluster

In [7]:
# Join the per-cluster efficiency onto the row-level data, matching on the
# 'Cluster' label so every row carries its cluster's efficiency value.
merged = cluster.merge(aggregated, on='Cluster')

print(merged)

      Unnamed: 0  group          time   s_MP    change   type  length  \
0              0      1  1.660222e+12  30.00  0.505364  surge       1   
1              1      3  1.660222e+12  29.86  0.000670  surge       1   
2              2      5  1.660222e+12  29.88  0.001273  surge       2   
3              3      7  1.660222e+12  29.80  0.000873  surge       2   
4              4      9  1.660223e+12  29.90  0.001305  surge       1   
...          ...    ...           ...    ...       ...    ...     ...   
5632        5631  11263  1.693062e+12  10.05  0.001297  surge       1   
5633        5632  11265  1.693072e+12  10.02  0.109437  surge       1   
5634        5633  11267  1.693073e+12  10.02  0.000599  surge       1   
5635        5634  11269  1.693073e+12  10.03  0.000898  surge       1   
5636        5635  11271  1.693076e+12  10.02  0.000699  surge       1   

      sum_change      area  surge_area  ...   p_MP  \
0       0.505364  0.505364    0.505364  ...  29.98   
1       0.00067

### CSV out: categorical efficiency, per cluster

In [9]:
# Write the merged frame to a timestamped CSV under efficiency_metric/.

# Timestamp of this run; it is printed and also used as the file name.
now = datetime.datetime.now()
run_time = now.strftime("%Y-%m-%d %H:%M:%S")
print(run_time)

file_name = 'efficiency_metric/' +run_time+ '.csv'
print(file_name)

# to_csv does not create missing folders, so make sure the target exists
# before writing (no-op when it is already there).
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# NOTE(review): the row index is written as an unnamed first column, which
# shows up as 'Unnamed: 0' when the file is read back -- confirm downstream
# readers expect that before switching to index=False.
# NOTE(review): the file name contains a space and colons, which some file
# systems (e.g. Windows) reject.
merged.to_csv(file_name)

2023-09-02 16:24:56
efficiency_metric/2023-09-02 16:24:56.csv
