In [1]:
import pycaret
from pycaret.clustering import *
import pandas as pd
import datetime

# Path of the clustering/efficiency export to analyse. The file is picked by
# name: nothing here looks up the most recent export, so edit `fn` by hand
# when a newer run should be loaded.
fn = 'efficiency_metric/2023-08-14 10:37:50.csv'
# NOTE(review): the preview of this frame shows several 'Unnamed: 0*' columns,
# which looks like a row index being re-saved on every write — confirm upstream.
ec = pd.read_csv(fn)
# Preview the first rows as the cell output.
ec.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,group,time,s_MP,change,type,length,sum_change,s_area,...,p_MP,p_buyCap,p_askCap,p_totalBidVol,p_totalAskVol,p_length,p_sum_change,p_area,Cluster,efficiency
0,0,0,1,1660222000000.0,29.99,0.317588,surge,1,0.317588,0.317588,...,30.0,7349599.5,1229796000.0,716681.94,314676.4,6.0,-0.004943,-0.029656,Cluster 2,3.25%
1,1,1,3,1660222000000.0,29.85,0.000469,surge,1,0.000469,0.000469,...,29.9,7307990.0,1229792000.0,715352.9,314642.84,1.0,0.000402,0.000402,Cluster 2,3.25%
2,2,2,5,1660222000000.0,29.87,0.001105,surge,2,0.001607,0.003214,...,29.92,7285245.0,1229754000.0,714476.1,313576.0,4.0,-0.002773,-0.01109,Cluster 2,3.25%
3,3,3,7,1660222000000.0,29.89,0.00161,surge,1,0.00161,0.00161,...,29.88,7318707.0,1229744000.0,715730.56,313546.6,2.0,-0.0002,-0.0004,Cluster 2,3.25%
4,4,4,9,1660223000000.0,29.9,0.001372,surge,1,0.001372,0.001372,...,29.93,7280582.0,1229794000.0,714140.1,314889.38,7.0,-0.01367,-0.09569,Cluster 2,3.25%


In [2]:
import altair as alt
from vega_datasets import data

# Movies dataset referenced by URL; Release_Date is parsed as a date so that
# year() can be applied to it in the calculated fields below.
movies = alt.UrlData(
    url=data.movies.url,
    format=alt.DataFormat(parse={"Release_Date": "date"}),
)

# MPAA ratings kept in the charts (also the options of the radio buttons).
ratings = [
    'G',
    'NC-17',
    'PG',
    'PG-13',
    'R',
]

# Genres offered by the dropdown filter.
genres = [
    'Action',
    'Adventure',
    'Black Comedy',
    'Comedy',
    'Concert/Performance',
    'Documentary',
    'Drama',
    'Horror',
    'Musical',
    'Romantic Comedy',
    'Thriller/Suspense',
    'Western',
]

# Shared scatter plot (worldwide gross vs IMDB rating) that every interactive
# variant below builds on.
base = (
    alt.Chart(movies, width=200, height=200)
    .mark_point(filled=True)
    .transform_calculate(
        Rounded_IMDB_Rating="floor(datum.IMDB_Rating)",
        Hundred_Million_Production="datum.Production_Budget > 100000000.0 ? 100 : 10",
        Release_Year="year(datum.Release_Date)",
    )
    # Keep only rows with a positive IMDB rating and one of the listed MPAA ratings.
    .transform_filter(alt.datum.IMDB_Rating > 0)
    .transform_filter(alt.FieldOneOfPredicate(field='MPAA_Rating', oneOf=ratings))
    .encode(
        x=alt.X(
            'Worldwide_Gross:Q',
            scale=alt.Scale(domain=(100000, 10**9), clamp=True),
        ),
        y='IMDB_Rating:Q',
        tooltip="Title:N",
    )
)

# Slider filter: a range input bound to the calculated Release_Year field.
year_slider = alt.binding_range(name="Release Year", min=1969, max=2018, step=1)
slider_selection = alt.selection_point(fields=['Release_Year'], bind=year_slider)

# The selection is both registered on the chart and used as its filter predicate.
filter_year = (
    base
    .add_params(slider_selection)
    .transform_filter(slider_selection)
    .properties(title="Slider Filtering")
)

# Dropdown filter: a select input bound to the Major_Genre field.
genre_dropdown = alt.binding_select(name="Genre", options=genres)
genre_select = alt.selection_point(bind=genre_dropdown, fields=['Major_Genre'])

# The selection is both registered on the chart and used as its filter predicate.
filter_genres = (
    base
    .add_params(genre_select)
    .transform_filter(genre_select)
    .properties(title="Dropdown Filtering")
)

# Radio-button highlighting: marks matching the chosen MPAA rating keep their
# rating colour, every other mark is drawn light gray.
rating_radio = alt.binding_radio(name="Rating", options=ratings)
rating_select = alt.selection_point(bind=rating_radio, fields=['MPAA_Rating'])

rating_color_condition = alt.condition(
    rating_select,
    alt.Color('MPAA_Rating:N', legend=None),
    alt.value('lightgray'),
)

highlight_ratings = (
    base
    .add_params(rating_select)
    .encode(color=rating_color_condition)
    .properties(title="Radio Button Highlighting")
)

# Checkbox formatting: when ticked, mark size follows the calculated
# Hundred_Million_Production field; otherwise every mark uses size 25.
input_checkbox = alt.binding_checkbox(name="Big Budget Films ")
checkbox_selection = alt.param(bind=input_checkbox)

size_checkbox_condition = alt.condition(
    checkbox_selection,
    alt.Size('Hundred_Million_Production:Q'),
    alt.SizeValue(25),
)

budget_sizing = (
    base
    .add_params(checkbox_selection)
    .encode(size=size_checkbox_condition)
    .properties(title="Checkbox Formatting")
)

# 2x2 dashboard: filters on the top row, formatting variants on the bottom row.
(filter_year | filter_genres) & (highlight_ratings | budget_sizing)

In [3]:
# import altair as alt
# import pandas as pd

# # Create sample data
# data = pd.DataFrame({
#     'Cluster': ['A', 'B', 'C', 'D', 'E'],
#     'surge_area': [1, 2, 3, 4, 5],
#     'tbv': [10, 20, 30, 40, 50],
#     'tav': [100, 200, 300, 400, 500],
#     'ac': [1000, 2000, 3000, 4000, 5000],
#     'bc': [10000, 20000, 30000, 40000, 50000]
# })

# # Create scatter plot
# scatter = alt.Chart(data).mark_circle().encode(
#     x='tbv',
#     y='tav',
#     color='Cluster'
# )

# # Create histogram
# hist = alt.Chart(data).transform_fold(
#     ['ac', 'bc', 'tav', 'tbv'],
#     as_=['Column', 'Value']
# ).mark_bar().encode(
#     x=alt.X('Value:Q'),
#     y=alt.Y('count()', title='Number of Records'),
#     color='Column:N'
# ).properties(
#     width=300,
#     height=300
# )

# # Combine charts
# scatter & hist.properties(title='Distribution of ac,bc,tav,tbv')


In [4]:
# import altair as alt
# import pandas as pd

# # Create sample data
# data = pd.DataFrame({
#     'Cluster': ['A', 'B', 'C', 'D', 'E'],
#     'surge_area': [1, 2, 3, 4, 5],
#     'tbv': [10, 20, 30, 40, 50],
#     'tav': [100, 200, 300, 400, 500],
#     'ac': [1000, 2000, 3000, 4000, 5000],
#     'bc': [10000, 20000, 30000, 40000, 50000]
# })

# # Create scatter plot
# scatter = alt.Chart(data).mark_circle().encode(
#     x='tbv',
#     y='tav',
#     color='Cluster'
# ).properties(
#     selection={
#         'Cluster': alt.selection_point(on='click', fields=['Cluster'])
#     }
# )

# # Create histogram
# hist = alt.Chart(data).transform_fold(
#     ['ac', 'bc', 'tav', 'tbv'],
#     as_=['Column', 'Value']
# ).mark_bar().encode(
#     x=alt.X('Value:Q'),
#     y=alt.Y('count()', title='Number of Records'),
#     color='Column:N'
# ).properties(
#     width=300,
#     height=300
# ).transform_filter(
#     alt.datum.Cluster == alt.value('A')
# )

# # Combine charts
# scatter & hist.properties(title='Distribution of ac,bc,tav,tbv')
# # alt.vconcat(
# #     scatter,
# #     hist
# # ).properties(title='Distribution of ac,bc,tav,tbv')



In [5]:
# Show the column labels of the loaded frame as the cell output.
ec.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'group', 'time', 's_MP', 'change', 'type',
       'length', 'sum_change', 's_area', 'surge_area', 'p_group', 'p_time',
       'p_change', 'p_type', 'p_MP', 'p_buyCap', 'p_askCap', 'p_totalBidVol',
       'p_totalAskVol', 'p_length', 'p_sum_change', 'p_area', 'Cluster',
       'efficiency'],
      dtype='object')

In [6]:
# Distinct values of the 'Cluster' column, kept in `arr` and displayed.
arr = ec['Cluster'].unique()
arr

array(['Cluster 2', 'Cluster 0', 'Cluster 1', 'Cluster 3'], dtype=object)

In [7]:
# Frame plotted by default by the comparison helper below; swap in a slice
# (e.g. ec[:4999]) to plot fewer rows while experimenting.
used = ec  # [:4999]
def compareEfficiencyAndVolume(df=None):
    """Efficiency scatter next to histograms of precursor bid/ask volume.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to plot. It must provide the columns referenced below:
        'surge_area', 'efficiency', 'Cluster', 'p_totalBidVol' and
        'p_totalAskVol'. Defaults to the module-level ``used`` frame.

    Returns
    -------
    Altair chart
        The scatter plot on the left, concatenated horizontally with the
        bid-volume and ask-volume histograms layered on top of each other.

    Notes
    -----
    Calling this disables Altair's row limit globally via
    ``alt.data_transformers.disable_max_rows()``.
    """
    if df is None:
        df = used
    alt.data_transformers.disable_max_rows()

    def _cluster_histogram(column, opacity, **properties):
        # Histogram of `column` with bars stacked and coloured by cluster.
        #
        # The frame is already long-form (one 'Cluster' label per row), so it
        # is used directly. Folding over the cluster labels, as was done
        # before, treats them as column names: every row gets emitted once per
        # label with 'Cluster' overwritten, which makes all clusters show the
        # same whole-dataset distribution.
        return alt.Chart(df).mark_bar(
            opacity=opacity,
            binSpacing=0
        ).encode(
            alt.X(f'{column}:Q').bin(maxbins=100),
            alt.Y('count()').stack(True),
            alt.Color('Cluster:N'),
            tooltip=[
                'Cluster',
                # Bin the tooltip field exactly like x: an un-aggregated raw
                # field here would become an extra group-by key and split
                # each bin into one piece per distinct value.
                alt.Tooltip(f'{column}:Q', bin=alt.Bin(maxbins=100)),
                'count()'
            ]
        ).properties(height=300, width=400, **properties)

    # NOTE(review): 'efficiency' has no explicit type, so Altair infers it from
    # the column dtype; the previewed values look like strings ('3.25%'), which
    # would give a categorical axis — confirm that is intended.
    eff = alt.Chart(df).mark_circle().encode(
        x='surge_area:Q',
        y='efficiency',
        color='Cluster:N',
        size='surge_area:Q',
        tooltip=['surge_area', 'Cluster']
    ).properties(height=300, width=400, title='Efficiency, per Cluster')

    tbv = _cluster_histogram(
        'p_totalBidVol', 0.3, title='Precursor Ask Volume vs Bid Volume'
    )
    tav = _cluster_histogram('p_totalAskVol', 0.7)

    # `+` binds tighter than `|`: layer the two histograms first, then place
    # the layered chart to the right of the scatter plot.
    return eff | (tbv + tav)

In [8]:
# Frame plotted by default by the comparison helper below; swap in a slice
# (e.g. ec[:4999]) to plot fewer rows while experimenting.
used = ec  # [:4999]
def compareEfficiencyAndCaps(df=None):
    """Efficiency scatter next to histograms of precursor buy/ask capitalization.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to plot. It must provide the columns referenced below:
        'surge_area', 'efficiency', 'Cluster', 'p_buyCap' and 'p_askCap'.
        Defaults to the module-level ``used`` frame.

    Returns
    -------
    Altair chart
        The scatter plot on the left, concatenated horizontally with the
        buy-cap and ask-cap histograms layered on top of each other.

    Notes
    -----
    Calling this disables Altair's row limit globally via
    ``alt.data_transformers.disable_max_rows()``.
    """
    if df is None:
        df = used
    alt.data_transformers.disable_max_rows()

    def _cluster_histogram(column, opacity, **properties):
        # Histogram of `column` with bars stacked and coloured by cluster.
        #
        # The frame is already long-form (one 'Cluster' label per row), so it
        # is used directly. Folding over the cluster labels, as was done
        # before, treats them as column names: every row gets emitted once per
        # label with 'Cluster' overwritten, which makes all clusters show the
        # same whole-dataset distribution.
        return alt.Chart(df).mark_bar(
            opacity=opacity,
            binSpacing=0
        ).encode(
            alt.X(f'{column}:Q').bin(maxbins=100),
            alt.Y('count()').stack(True),
            alt.Color('Cluster:N'),
            tooltip=[
                'Cluster',
                # Bin the tooltip field exactly like x: an un-aggregated raw
                # field here would become an extra group-by key and split
                # each bin into one piece per distinct value.
                alt.Tooltip(f'{column}:Q', bin=alt.Bin(maxbins=100)),
                'count()'
            ]
        ).properties(height=300, width=400, **properties)

    # NOTE(review): 'efficiency' has no explicit type, so Altair infers it from
    # the column dtype; the previewed values look like strings ('3.25%'), which
    # would give a categorical axis — confirm that is intended.
    eff = alt.Chart(df).mark_circle().encode(
        x='surge_area:Q',
        y='efficiency',
        color='Cluster:N',
        size='surge_area:Q',
        tooltip=['surge_area', 'Cluster']
    ).properties(height=300, width=400, title='Efficiency, per Cluster')

    # Title typo fixed ("Capitaliztion" -> "Capitalization").
    tbv = _cluster_histogram(
        'p_buyCap', 0.3, title='Precursor Ask Capitalization vs Bid'
    )
    tav = _cluster_histogram('p_askCap', 0.7)

    # `+` binds tighter than `|`: layer the two histograms first, then place
    # the layered chart to the right of the scatter plot.
    return eff | (tbv + tav)

In [9]:
def buildMultiline():
    """Line chart of 'p_buyCap' and 'p_askCap' from ``ec`` against 'time'.

    The two columns are melted into long form ('variable' / 'value') so that
    each one is drawn as its own coloured line.

    Returns
    -------
    alt.Chart
        A 600x400 line chart with 'time' on a temporal x axis and a y axis
        that is not forced to start at zero.

    Notes
    -----
    Calling this disables Altair's row limit globally via
    ``alt.data_transformers.disable_max_rows()``.
    """
    # The melt below doubles the number of rows. Lift Altair's row limit here,
    # as the sibling chart builders do, so this function does not rely on one
    # of them having been called earlier in the session.
    alt.data_transformers.disable_max_rows()

    # Local name deliberately differs from `data` (the vega_datasets import).
    caps_long = ec[['time', 'p_buyCap', 'p_askCap']].melt(
        'time', var_name='variable', value_name='value'
    )

    chart = alt.Chart(caps_long).mark_line().encode(
        x='time:T',
        y=alt.Y('value:Q', scale=alt.Scale(zero=False)),
        color='variable:N'
    ).properties(
        width=600,
        height=400,
        title='Capitalization of Buy Orders vs Sell'
    )
    return chart

In [10]:
# Render the efficiency scatter next to the precursor bid/ask volume histograms.
compareEfficiencyAndVolume()


In [11]:
# Render the efficiency scatter next to the precursor buy/ask capitalization histograms.
compareEfficiencyAndCaps()

In [13]:
# Render the buy-cap vs ask-cap line chart over time.
buildMultiline()