## premise: Grid search for clustering solution

via skleanr pipeline and grid search

In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
import sklearn.metrics
import numpy as np
from sklearn import preprocessing

## using a discretizer on our target

In [12]:
#load pipeline 1 csv and prep for clustering
m2_pipeline = pd.read_csv('pipeline1.csv')
#change is surge price rate of change per observation, change.1 is precursor
#sum_change is surge sum_change per surge, and surge_area is surge alone
keepable = ['precursor_buy_cap_pct_change', 
            'precursor_ask_cap_pct_change',
            'precursor_bid_vol_pct_change', 
            'precursor_ask_vol_pct_change', 'change.1',
            'surge_targets_met_pct']

# Normalize the 'surge_targets_met_pct' column
x = m2_pipeline[['surge_targets_met_pct']].values.astype(float)
m2_pipeline = m2_pipeline[keepable]
m2_pipeline = m2_pipeline.dropna()
print(m2_pipeline.isna().sum(axis=1).astype(bool).sum())
m2_pipeline = m2_pipeline.astype('float')
m2_pipeline.dtypes

0


precursor_buy_cap_pct_change    float64
precursor_ask_cap_pct_change    float64
precursor_bid_vol_pct_change    float64
precursor_ask_vol_pct_change    float64
change.1                        float64
surge_targets_met_pct           float64
dtype: object

## using predefined bins for discrete clustering

In [13]:
# Define the maximum and minimum values
maximum = 10
minimum = -10

# Create a range of 20 values between the maximum and minimum
values = np.linspace(minimum, maximum, num=20)

# Round each value to two decimal places
rounded_values = np.round(values, decimals=2)

# Print the resulting array
print(rounded_values)


[-10.    -8.95  -7.89  -6.84  -5.79  -4.74  -3.68  -2.63  -1.58  -0.53
   0.53   1.58   2.63   3.68   4.74   5.79   6.84   7.89   8.95  10.  ]


In [16]:
#model 

# Bin the values in the 'surge_targets_met_pct' column
# bins = [-float('inf'), -8.5, -5.64, 0, 0.25, 0.35, 0.4, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0, 5.0, 5.64, float('inf')]
bins = rounded_values
labels = rounded_values #['< -8.5', '-8.5 to -5.64', '-5.64 to 0', '0 to 0.25', '0.25 to 0.35', '0.35 to 0.4', '0.4 to 0.5', '0.5 to 0.75', '0.75 to 1', '1 to 2', '2 to 3', '3 to 4', '4 to 5', '5 to 5.64', '> 5.64']
# m2_pipeline['binned'] = pd.cut(m2_pipeline['surge_targets_met_pct'], bins=bins  )#, labels=labels)

# Display the binned data
# print(m2_pipeline['binned'].value_counts())
#implement
transformer = preprocessing.FunctionTransformer(
    pd.cut, kw_args={'bins': bins, 'retbins': False}  #'labels': labels, 
)
X = np.array([0.2, 2, 15, 25, 97])
transformer.fit_transform(X)

[(-0.53, 0.53], (1.58, 2.63], NaN, NaN, NaN]
Categories (19, interval[float64, right]): [(-10.0, -8.95] < (-8.95, -7.89] < (-7.89, -6.84] < (-6.84, -5.79] ... (5.79, 6.84] < (6.84, 7.89] < (7.89, 8.95] < (8.95, 10.0]]

In [17]:
transformer

In [11]:
m2_pipeline['binned']

1       (-0.53, 0.53]
2       (-0.53, 0.53]
3       (-0.53, 0.53]
4       (-0.53, 0.53]
5       (-0.53, 0.53]
            ...      
5631    (-0.53, 0.53]
5632    (-0.53, 0.53]
5633    (-0.53, 0.53]
5634    (-0.53, 0.53]
5635    (-0.53, 0.53]
Name: binned, Length: 5634, dtype: category
Categories (19, interval[float64, right]): [(-10.0, -8.95] < (-8.95, -7.89] < (-7.89, -6.84] < (-6.84, -5.79] ... (5.79, 6.84] < (6.84, 7.89] < (7.89, 8.95] < (8.95, 10.0]]

In [None]:
## you must cluster on the column 'surge_targets_met_pct' like so
import pandas as pd
from sklearn.cluster import KMeans
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
m2_pipeline['surge_targets_met_pct_normalized'] = pd.DataFrame(x_scaled)

# Assuming your dataframe is named 'm2_pipeline'
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(m2_pipeline[['surge_targets_met_pct_normalized']])


### clustering types

(https://scikit-learn.org/stable/modules/clustering.html#mean-shift)


In [4]:
## alternative: specifying cluster value
import pandas as pd 
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering,\
MeanShift, AffinityPropagation, DBSCAN, OPTICS, Birch
from sklearn.metrics import silhouette_score 

data =  m2_pipeline
scores = []  # feed in experiments as a dictionary of parameters and scores

csize = [4,5,6,7,8,9,10, 11, 12]
for i in range(len(csize)):
    c= csize[i]
    print("for cluster size",c)
    kmeans = KMeans(n_clusters=c) #
    hierarchical = AgglomerativeClustering(n_clusters=c) 
    spectral = SpectralClustering(n_clusters=c) 
    birch = Birch(n_clusters=c)
    algorithms = [(kmeans, 'KMeans'), (hierarchical, 'Agglomerative'), (spectral, 'Spectral'), (birch, 'Birch')] 
    # algorithms = [kmeans, hierarchical, spectral, birch] 

    # for algorithm, name in algorithms: # Loop over the algorithms and calculate the silhouette score for each
    for algorithm, name in algorithms: # Loop over the algorithms and calculate the silhouette score for each
        algorithm.fit(data) 
        score= silhouette_score(data, algorithm.labels_) 
        fin = {"algo":name, "score":score, "clusters":c}
        print(fin)
        scores.append(fin) 

results = pd.DataFrame(scores)  #{'Algorithm': [name for _, name in algorithms], 'Silhouette Score': scores}) 
print(results) 

for cluster size 4




{'algo': 'KMeans', 'score': 0.6468266676607601, 'clusters': 4}
{'algo': 'Agglomerative', 'score': 0.5544786252177756, 'clusters': 4}




{'algo': 'Spectral', 'score': 0.8894194271443359, 'clusters': 4}
{'algo': 'Birch', 'score': 0.8963065534883663, 'clusters': 4}
for cluster size 5




{'algo': 'KMeans', 'score': 0.5593983370450872, 'clusters': 5}
{'algo': 'Agglomerative', 'score': 0.5643203301653003, 'clusters': 5}




{'algo': 'Spectral', 'score': 0.8680072361397677, 'clusters': 5}
{'algo': 'Birch', 'score': 0.8897806396215054, 'clusters': 5}
for cluster size 6




{'algo': 'KMeans', 'score': 0.48139647422184684, 'clusters': 6}
{'algo': 'Agglomerative', 'score': 0.381872491666767, 'clusters': 6}




{'algo': 'Spectral', 'score': 0.860072471203043, 'clusters': 6}
{'algo': 'Birch', 'score': 0.6744131267918048, 'clusters': 6}
for cluster size 7




{'algo': 'KMeans', 'score': 0.5266775562128754, 'clusters': 7}
{'algo': 'Agglomerative', 'score': 0.3822319644010064, 'clusters': 7}




{'algo': 'Spectral', 'score': 0.8529175760916948, 'clusters': 7}
{'algo': 'Birch', 'score': 0.674899790548584, 'clusters': 7}
for cluster size 8




{'algo': 'KMeans', 'score': 0.5249708561440117, 'clusters': 8}
{'algo': 'Agglomerative', 'score': 0.38847626939917296, 'clusters': 8}




{'algo': 'Spectral', 'score': 0.7678599844306472, 'clusters': 8}
{'algo': 'Birch', 'score': 0.6739887885839931, 'clusters': 8}
for cluster size 9




{'algo': 'KMeans', 'score': 0.4537747745020753, 'clusters': 9}
{'algo': 'Agglomerative', 'score': 0.3889831387281811, 'clusters': 9}




{'algo': 'Spectral', 'score': 0.7654156402145649, 'clusters': 9}
{'algo': 'Birch', 'score': 0.673894150359468, 'clusters': 9}
for cluster size 10




{'algo': 'KMeans', 'score': 0.4836865690827746, 'clusters': 10}
{'algo': 'Agglomerative', 'score': 0.41540364706459565, 'clusters': 10}




{'algo': 'Spectral', 'score': 0.7557128742777597, 'clusters': 10}
{'algo': 'Birch', 'score': 0.6470633893096103, 'clusters': 10}
for cluster size 11




{'algo': 'KMeans', 'score': 0.4737651766143858, 'clusters': 11}
{'algo': 'Agglomerative', 'score': 0.4244966269048636, 'clusters': 11}




{'algo': 'Spectral', 'score': 0.7622392704381659, 'clusters': 11}
{'algo': 'Birch', 'score': 0.6450014158194325, 'clusters': 11}
for cluster size 12




{'algo': 'KMeans', 'score': 0.46565127223594627, 'clusters': 12}
{'algo': 'Agglomerative', 'score': 0.4222297884377017, 'clusters': 12}




{'algo': 'Spectral', 'score': 0.7633259097943303, 'clusters': 12}
{'algo': 'Birch', 'score': 0.6443349201464048, 'clusters': 12}
             algo     score  clusters
0          KMeans  0.646827         4
1   Agglomerative  0.554479         4
2        Spectral  0.889419         4
3           Birch  0.896307         4
4          KMeans  0.559398         5
5   Agglomerative  0.564320         5
6        Spectral  0.868007         5
7           Birch  0.889781         5
8          KMeans  0.481396         6
9   Agglomerative  0.381872         6
10       Spectral  0.860072         6
11          Birch  0.674413         6
12         KMeans  0.526678         7
13  Agglomerative  0.382232         7
14       Spectral  0.852918         7
15          Birch  0.674900         7
16         KMeans  0.524971         8
17  Agglomerative  0.388476         8
18       Spectral  0.767860         8
19          Birch  0.673989         8
20         KMeans  0.453775         9
21  Agglomerative  0.388983        

## do non cluster specified clustering

In [5]:
meanshift = MeanShift(*, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300)
affinity = AffinityPropagation(*, damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False, random_state=42)
dbscan = DBSCAN(eps=0.5, *, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)
optics = OPTICS(*, min_samples=5, max_eps=inf, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, memory=None, n_jobs=None)

data = m2_pipeline
# scores2 = []  # feed in experiments as a dictionary of parameters and scores
algorithms2 = [(meanshift, 'Meanshift'), (affinity, 'Affinity'), (dbscan, 'DBSCAN'), (optics, 'OPTICS')]
for algorithm, name in algorithms2: # Loop over the algorithms and calculate the silhouette score for each
        algorithm.fit(data) 
        score= silhouette_score(data, algorithm.labels_) 
        fin = {"algo":name, "score":score, "clusters":0} 
        print(fin)
        scores.append(fin) 
# Create a dataframe to store the results 
results = pd.DataFrame(scores) #{'Algorithm': [name for _, name in algorithms2], 'Silhouette Score': scores2}) 
print(results)

{'algo': 'Meanshift', 'score': 0.5060122726757966, 'clusters': 0}




{'algo': 'Affinity', 'score': 0.20066580030525197, 'clusters': 0}
{'algo': 'DBSCAN', 'score': 0.9626815572732176, 'clusters': 0}
{'algo': 'OPTICS', 'score': -0.6003076524523103, 'clusters': 0}
             algo     score  clusters
0          KMeans  0.646827         4
1   Agglomerative  0.554479         4
2        Spectral  0.889419         4
3           Birch  0.896307         4
4          KMeans  0.559398         5
5   Agglomerative  0.564320         5
6        Spectral  0.868007         5
7           Birch  0.889781         5
8          KMeans  0.481396         6
9   Agglomerative  0.381872         6
10       Spectral  0.860072         6
11          Birch  0.674413         6
12         KMeans  0.526678         7
13  Agglomerative  0.382232         7
14       Spectral  0.852918         7
15          Birch  0.674900         7
16         KMeans  0.524971         8
17  Agglomerative  0.388476         8
18       Spectral  0.767860         8
19          Birch  0.673989         8
20       

In [6]:
cm = 'ranked_classifier_search.csv'  #clustered methods, specified num clusters
results.to_csv(cm, index=False)

## revive the test from stored memory
via csv, then get top n, n=10

In [7]:
revive = pd.read_csv(cm)
print(revive.head(2))
print(revive.columns)

            algo     score  clusters
0         KMeans  0.646827         4
1  Agglomerative  0.554479         4
Index(['algo', 'score', 'clusters'], dtype='object')


In [8]:
topn = revive.sort_values('score', ascending=False).head(10)

In [9]:
topn

Unnamed: 0,algo,score,clusters
38,DBSCAN,0.962682,0
3,Birch,0.896307,4
7,Birch,0.889781,5
2,Spectral,0.889419,4
6,Spectral,0.868007,5
10,Spectral,0.860072,6
14,Spectral,0.852918,7
18,Spectral,0.76786,8
22,Spectral,0.765416,9
34,Spectral,0.763326,12


In [10]:
topn.columns

Index(['algo', 'score', 'clusters'], dtype='object')

## cluster and back test the cluster as trade strategy


In [12]:
## cluster range evaluation
# Bin the values in the 'surge_targets_met_pct' column after grouped by 'cluster'
# bins = [-float('inf'), -8.5, -5.64, 0, 0.25, 0.35, 0.4, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0, 5.0, 5.64, float('inf')]
# labels = ['< -8.5', '-8.5 to -5.64', '-5.64 to 0', '0 to 0.25', '0.25 to 0.35', '0.35 to 0.4', '0.4 to 0.5', '0.5 to 0.75', '0.75 to 1', '1 to 2', '2 to 3', '3 to 4', '4 to 5', '5 to 5.64', '> 5.64']
# lc['binned'] = pd.cut(lc['surge_targets_met_pct'], bins=bins, labels=labels)


In [18]:
from sklearn.cluster import KMeans

scoring = [] #algo, cluster quality metrics, cluster sizes, from topn
for index, row in topn.iterrows():
    data = m2_pipeline #need a temporary version of the master data

    algo = row['algo']   # Extract the algorithm and cluster from the current row
    c = row['clusters']
    score = row['score']
    kmeans = KMeans(n_clusters=c) #establish a means to build the cluster algo, cluster or no NEED SEEDING 
    hierarchical = AgglomerativeClustering(n_clusters=c) 
    spectral = SpectralClustering(n_clusters=c) 
    birch = Birch(n_clusters=c)
    meanshift = MeanShift()
    affinity = AffinityPropagation()
    dbscan = DBSCAN()
    optics = OPTICS()
    algorithms = [(kmeans, 'KMeans'), (hierarchical, 'Agglomerative'), (spectral, 'Spectral'), (birch, 'Birch')\
                  ,(meanshift, 'Meanshift'), (affinity, 'Affinity'), (dbscan, 'DBSCAN'), (optics, 'OPTICS')]
   
    method = [t for t in algorithms if algo in t][0][0] #read the algo type from the topn list/df
    print("handle: ",algo, c, score, method) #verify it is reading it ok
    # method.fit(m2_pipeline)
    predict = method.fit_predict(data) #fit predict, derive cluster labeling
    data['cluster'] = pd.Series(predict, index=data.index) # Add a new column to the filtered dataframe with the predicted cluster labels
    # group by cluster identifier, define ranges of values, number of clusters and their ranged values
    #NEED A NEW ALGORITHM TO EVALUATE EACH CLUSTER, BY BUSINESS RULE
    # means = data.groupby('Cluster')['surge_targets_met_pct'].mean() ##group by then aggregate
    min_max_count = data.groupby('cluster')['surge_targets_met_pct'].agg(['min', 'max', 'count'])
    print(type(min_max_count))
#cluster analytics: 
    
    fin = {"algo":method, "score":score, "clusters":c, "metrics":min_max_count}
    print(fin)
    scoring.append(fin)

analytic = pd.DataFrame(scoring)
print(analytic)


handle:  DBSCAN 0 0.9626815572732176 DBSCAN()
<class 'pandas.core.frame.DataFrame'>
{'algo': DBSCAN(), 'score': 0.9626815572732176, 'clusters': 0, 'metrics':                min       max  count
cluster                            
-1      -16.194332  5.642857     32
 0       -3.313392  2.158611   5554
 1       -4.078550 -3.355705     28
 2       -7.132802 -6.415279      7
 3       -5.163992 -4.180985     13}
handle:  Birch 4 0.8963065534883663 Birch(n_clusters=4)
<class 'pandas.core.frame.DataFrame'>
{'algo': Birch(n_clusters=4), 'score': 0.8963065534883663, 'clusters': 4, 'metrics':                min        max  count
cluster                             
0        -1.424149   5.642857   5277
1       -16.194332 -16.194332      1
2       -11.006585  -1.432408    355
3        -0.167973  -0.167973      1}
handle:  Birch 5 0.8897806396215054 Birch(n_clusters=5)
<class 'pandas.core.frame.DataFrame'>
{'algo': Birch(n_clusters=5), 'score': 0.8897806396215054, 'clusters': 5, 'metrics':         



<class 'pandas.core.frame.DataFrame'>
{'algo': SpectralClustering(n_clusters=4), 'score': 0.8894194271443359, 'clusters': 4, 'metrics':                min        max  count
cluster                             
0        -3.701528   5.642857   5587
1       -16.194332 -11.006585      2
2        -8.590806  -6.118776     14
3        -5.864198  -3.823280     31}
handle:  Spectral 5 0.8680072361397677 SpectralClustering(n_clusters=5)




<class 'pandas.core.frame.DataFrame'>
{'algo': SpectralClustering(n_clusters=5), 'score': 0.8680072361397677, 'clusters': 5, 'metrics':                min        max  count
cluster                             
0        -3.701528   5.642857   5587
1       -16.194332 -11.006585      2
2        -8.590806  -6.118776     14
3        -4.716981  -3.823280     22
4        -5.864198  -5.041152      9}
handle:  Spectral 6 0.860072471203043 SpectralClustering(n_clusters=6)




<class 'pandas.core.frame.DataFrame'>
{'algo': SpectralClustering(n_clusters=6), 'score': 0.860072471203043, 'clusters': 6, 'metrics':                min        max  count
cluster                             
0        -4.716981   5.642857   5609
1       -16.194332 -16.194332      1
2       -11.006585 -11.006585      1
3        -8.590806  -7.540799      3
4        -5.864198  -5.041152      9
5        -7.343550  -6.118776     11}
handle:  Spectral 7 0.8529175760916948 SpectralClustering(n_clusters=7)




<class 'pandas.core.frame.DataFrame'>
{'algo': SpectralClustering(n_clusters=7), 'score': 0.8529175760916948, 'clusters': 7, 'metrics':                min        max  count
cluster                             
0        -2.892562   2.838164   5550
1        -8.590806  -7.540799      3
2        -5.864198  -5.041152      9
3        -7.343550  -6.118776     11
4        -4.716981  -4.059963     13
5       -16.194332 -11.006585      2
6        -3.972950   5.642857     46}
handle:  Spectral 8 0.7678599844306472 SpectralClustering()




<class 'pandas.core.frame.DataFrame'>
{'algo': SpectralClustering(), 'score': 0.7678599844306472, 'clusters': 8, 'metrics':                min        max  count
cluster                             
0        -2.892562   2.838164   5550
1        -8.590806  -7.540799      3
2        -7.343550  -6.118776     11
3        -5.864198  -5.041152      9
4        -4.716981  -4.059963     13
5        -3.972950  -2.929687     45
6       -16.194332 -11.006585      2
7         5.642857   5.642857      1}
handle:  Spectral 9 0.7654156402145649 SpectralClustering(n_clusters=9)




<class 'pandas.core.frame.DataFrame'>
{'algo': SpectralClustering(n_clusters=9), 'score': 0.7654156402145649, 'clusters': 9, 'metrics':                min        max  count
cluster                             
0        -2.892562   5.642857   5551
1       -11.006585 -11.006585      1
2       -16.194332 -16.194332      1
3        -7.787810  -7.540799      2
4        -5.864198  -5.041152      9
5        -7.343550  -6.118776     11
6        -4.716981  -4.059963     13
7        -8.590806  -8.590806      1
8        -3.972950  -2.929687     45}
handle:  Spectral 12 0.7633259097943303 SpectralClustering(n_clusters=12)




<class 'pandas.core.frame.DataFrame'>
{'algo': SpectralClustering(n_clusters=12), 'score': 0.7633259097943303, 'clusters': 12, 'metrics':                min        max  count
cluster                             
0        -2.892562   2.355623   5548
1       -11.006585  -7.540799      3
2         5.642857   5.642857      1
3        -8.590806  -8.590806      1
4       -16.194332 -16.194332      1
5        -7.343550  -6.732026      6
6        -5.864198  -5.041152      9
7        -4.716981  -4.059963     13
8        -6.605114  -6.118776      5
9         2.615279   2.838164      2
10       -3.972950  -3.408211     25
11       -3.355705  -2.929687     20}
                                algo     score  clusters  \
0                           DBSCAN()  0.962682         0   
1                Birch(n_clusters=4)  0.896307         4   
2                Birch(n_clusters=5)  0.889781         5   
3   SpectralClustering(n_clusters=4)  0.889419         4   
4   SpectralClustering(n_clusters=5)  0.868

In [16]:
analytic

Unnamed: 0,algo,score,clusters,histogram
0,OPTICS,0.962682,0,min max count cluster ...
1,OPTICS,0.896307,4,min max count cluster ...
2,OPTICS,0.889781,5,min max count cluster ...
3,OPTICS,0.889419,4,min max count cluster ...
4,OPTICS,0.868007,5,min max count cluster ...
5,OPTICS,0.860072,6,min max count cluster ...
6,OPTICS,0.852918,7,min max count cluster ...
7,OPTICS,0.76786,8,min max count cluster ...
8,OPTICS,0.765416,9,min max count cluster ...
9,OPTICS,0.763326,12,min max count cluster ...


### take top performing cluster techniques, then ...

attache labels to each and analyze the clustering profiles for efficiency, profit

## notes 

1. grid search notes on [sklearn](https://scikit-learn.org/stable/modules/grid_search.html#grid-search)
2. sklearn pipeline [guide](https://scikit-learn.org/stable/modules/compose.html#pipeline)