## premise: Grid search for clustering solution

via skleanr pipeline and grid search

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
import sklearn.metrics
from sklearn import preprocessing

In [18]:
#load pipeline 1 csv and prep for clustering
m2_pipeline = pd.read_csv('pipeline1.csv')
#change is surge price rate of change per observation, change.1 is precursor
#sum_change is surge sum_change per surge, and surge_area is surge alone
keepable = ['precursor_buy_cap_pct_change', 
            'precursor_ask_cap_pct_change',
            'precursor_bid_vol_pct_change', 
            'surge_targets_met_pct']

# Normalize the 'surge_targets_met_pct' column
x = m2_pipeline[['surge_targets_met_pct']].values.astype(float)
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
m2_pipeline['surge_targets_met_pct_normalized'] = pd.DataFrame(x_scaled)

m2_pipeline = m2_pipeline[keepable]
m2_pipeline = m2_pipeline.dropna()
print(m2_pipeline.isna().sum(axis=1).astype(bool).sum())
m2_pipeline = m2_pipeline.astype('float')
m2_pipeline.dtypes

0


precursor_buy_cap_pct_change    float64
precursor_ask_cap_pct_change    float64
precursor_bid_vol_pct_change    float64
surge_targets_met_pct           float64
dtype: object

In [3]:
# print(sklearn.metrics.get_scorer_names())

In [6]:
## IF you must cluster on the column 'surge_targets_met_pct' like so
# import pandas as pd
# from sklearn.cluster import KMeans

# # Assuming your dataframe is named 'm2_pipeline'
# kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
# kmeans.fit(m2_pipeline[['surge_targets_met_pct_normalized']])


### clustering types

K-Means
Affinity propagation
Mean Shift
Spectral Clustering
Agglomerative Clustering
DBSCAN
OPTICS
Birch
Gaussian Mixture
MiniBatch K-Means
Affinity Propagation

In [8]:
## alternative: specifying cluster value
import pandas as pd 
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering,\
MeanShift, AffinityPropagation, DBSCAN, OPTICS, Birch
from sklearn.metrics import silhouette_score 

data =  m2_pipeline
scores = []  # feed in experiments as a dictionary of parameters and scores

csize = [4,5,6,7,8,9,10, 11, 12]
for i in range(len(csize)):
    c= csize[i]
    print("for cluster size",c)
    kmeans = KMeans(n_clusters=c) #
    hierarchical = AgglomerativeClustering(n_clusters=c) 
    spectral = SpectralClustering(n_clusters=c) 
    birch = Birch(n_clusters=c)
    algorithms = [(kmeans, 'KMeans'), (hierarchical, 'Agglomerative'), (spectral, 'Spectral'), (birch, 'Birch')] 
    # algorithms = [kmeans, hierarchical, spectral, birch] 

    # for algorithm, name in algorithms: # Loop over the algorithms and calculate the silhouette score for each
    for algorithm, name in algorithms: # Loop over the algorithms and calculate the silhouette score for each
        algorithm.fit(data) 
        score= silhouette_score(data, algorithm.labels_) 
        fin = {"algo":name, "score":score, "clusters":c}
        print(fin)
        scores.append(fin) 

results = pd.DataFrame(scores)  #{'Algorithm': [name for _, name in algorithms], 'Silhouette Score': scores}) 
print(results) 

for cluster size 4




{'algo': 'KMeans', 'score': 0.6468266676607601, 'clusters': 4}
{'algo': 'Agglomerative', 'score': 0.5544786252177756, 'clusters': 4}




{'algo': 'Spectral', 'score': 0.8894194271443359, 'clusters': 4}
{'algo': 'Birch', 'score': 0.8963065534883663, 'clusters': 4}
for cluster size 5




{'algo': 'KMeans', 'score': 0.5581666769405856, 'clusters': 5}
{'algo': 'Agglomerative', 'score': 0.5643203301653003, 'clusters': 5}




{'algo': 'Spectral', 'score': 0.8650954651734716, 'clusters': 5}
{'algo': 'Birch', 'score': 0.8897806396215054, 'clusters': 5}
for cluster size 6




{'algo': 'KMeans', 'score': 0.4831680743383329, 'clusters': 6}
{'algo': 'Agglomerative', 'score': 0.381872491666767, 'clusters': 6}




{'algo': 'Spectral', 'score': 0.856710543020676, 'clusters': 6}
{'algo': 'Birch', 'score': 0.6744131267918048, 'clusters': 6}
for cluster size 7




{'algo': 'KMeans', 'score': 0.5341001325910059, 'clusters': 7}
{'algo': 'Agglomerative', 'score': 0.3822319644010064, 'clusters': 7}




{'algo': 'Spectral', 'score': 0.8529175760916948, 'clusters': 7}
{'algo': 'Birch', 'score': 0.674899790548584, 'clusters': 7}
for cluster size 8




{'algo': 'KMeans', 'score': 0.5359595860538194, 'clusters': 8}
{'algo': 'Agglomerative', 'score': 0.38847626939917296, 'clusters': 8}




{'algo': 'Spectral', 'score': 0.7678599844306472, 'clusters': 8}
{'algo': 'Birch', 'score': 0.6739887885839931, 'clusters': 8}
for cluster size 9




{'algo': 'KMeans', 'score': 0.5152722010245488, 'clusters': 9}
{'algo': 'Agglomerative', 'score': 0.3889831387281811, 'clusters': 9}




{'algo': 'Spectral', 'score': 0.7654156402145649, 'clusters': 9}
{'algo': 'Birch', 'score': 0.673894150359468, 'clusters': 9}
for cluster size 10




{'algo': 'KMeans', 'score': 0.4838389663418717, 'clusters': 10}
{'algo': 'Agglomerative', 'score': 0.41540364706459565, 'clusters': 10}




{'algo': 'Spectral', 'score': 0.7547513837605193, 'clusters': 10}
{'algo': 'Birch', 'score': 0.6470633893096103, 'clusters': 10}
for cluster size 11




{'algo': 'KMeans', 'score': 0.47802644331595184, 'clusters': 11}
{'algo': 'Agglomerative', 'score': 0.4244966269048636, 'clusters': 11}




{'algo': 'Spectral', 'score': 0.7630498667870546, 'clusters': 11}
{'algo': 'Birch', 'score': 0.6450014158194325, 'clusters': 11}
for cluster size 12




{'algo': 'KMeans', 'score': 0.47089265291265997, 'clusters': 12}
{'algo': 'Agglomerative', 'score': 0.4222297884377017, 'clusters': 12}




{'algo': 'Spectral', 'score': 0.7567062612423767, 'clusters': 12}
{'algo': 'Birch', 'score': 0.6443349201464048, 'clusters': 12}
             algo     score  clusters
0          KMeans  0.646827         4
1   Agglomerative  0.554479         4
2        Spectral  0.889419         4
3           Birch  0.896307         4
4          KMeans  0.558167         5
5   Agglomerative  0.564320         5
6        Spectral  0.865095         5
7           Birch  0.889781         5
8          KMeans  0.483168         6
9   Agglomerative  0.381872         6
10       Spectral  0.856711         6
11          Birch  0.674413         6
12         KMeans  0.534100         7
13  Agglomerative  0.382232         7
14       Spectral  0.852918         7
15          Birch  0.674900         7
16         KMeans  0.535960         8
17  Agglomerative  0.388476         8
18       Spectral  0.767860         8
19          Birch  0.673989         8
20         KMeans  0.515272         9
21  Agglomerative  0.388983        

## do non cluster specified clustering

In [12]:
meanshift = MeanShift( bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300)
affinity = AffinityPropagation( damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False, random_state=None)
dbscan = DBSCAN(eps=0.5,  min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)
optics = OPTICS( min_samples=5, max_eps=3, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, memory=None, n_jobs=None)


data = m2_pipeline
# scores2 = []  # feed in experiments as a dictionary of parameters and scores
algorithms2 = [(meanshift, 'Meanshift'), (affinity, 'Affinity'), (dbscan, 'DBSCAN'), (optics, 'OPTICS')]
for algorithm, name in algorithms2: # Loop over the algorithms and calculate the silhouette score for each
        algorithm.fit(data) 
        score= silhouette_score(data, algorithm.labels_) 
        fin = {"algo":name, "score":score, "clusters":0} 
        print(fin)
        scores.append(fin) 
# Create a dataframe to store the results 
results = pd.DataFrame(scores) #{'Algorithm': [name for _, name in algorithms2], 'Silhouette Score': scores2}) 
print(results)

{'algo': 'Meanshift', 'score': 0.5060122726757966, 'clusters': 0}




{'algo': 'Affinity', 'score': 0.312711138107636, 'clusters': 0}
{'algo': 'DBSCAN', 'score': 0.9626815572732176, 'clusters': 0}
{'algo': 'OPTICS', 'score': -0.6003076524523103, 'clusters': 0}
             algo     score  clusters
0          KMeans  0.646827         4
1   Agglomerative  0.554479         4
2        Spectral  0.889419         4
3           Birch  0.896307         4
4          KMeans  0.558167         5
5   Agglomerative  0.564320         5
6        Spectral  0.865095         5
7           Birch  0.889781         5
8          KMeans  0.483168         6
9   Agglomerative  0.381872         6
10       Spectral  0.856711         6
11          Birch  0.674413         6
12         KMeans  0.534100         7
13  Agglomerative  0.382232         7
14       Spectral  0.852918         7
15          Birch  0.674900         7
16         KMeans  0.535960         8
17  Agglomerative  0.388476         8
18       Spectral  0.767860         8
19          Birch  0.673989         8
20         

In [13]:
cm = 'ranked_classifier_search.csv'  #clustered methods, specified num clusters
results.to_csv(cm, index=False)

## revive the test from stored memory
via csv, then get top n, n=10

In [14]:
revive = pd.read_csv(cm)
print(revive.head(2))
print(revive.columns)

            algo     score  clusters
0         KMeans  0.646827         4
1  Agglomerative  0.554479         4
Index(['algo', 'score', 'clusters'], dtype='object')


In [15]:
topn = revive.sort_values('score', ascending=False).head(10)

In [16]:
topn

Unnamed: 0,algo,score,clusters
38,DBSCAN,0.962682,0
3,Birch,0.896307,4
7,Birch,0.889781,5
2,Spectral,0.889419,4
6,Spectral,0.865095,5
10,Spectral,0.856711,6
14,Spectral,0.852918,7
18,Spectral,0.76786,8
22,Spectral,0.765416,9
30,Spectral,0.76305,11


In [17]:
topn.columns

Index(['algo', 'score', 'clusters'], dtype='object')

## cluster and back test the cluster as trade strategy


In [None]:
## cluster range evaluation
# Bin the values in the 'surge_targets_met_pct' column after grouped by 'cluster'
# bins = [-float('inf'), -8.5, -5.64, 0, 0.25, 0.35, 0.4, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0, 5.0, 5.64, float('inf')]
# labels = ['< -8.5', '-8.5 to -5.64', '-5.64 to 0', '0 to 0.25', '0.25 to 0.35', '0.35 to 0.4', '0.4 to 0.5', '0.5 to 0.75', '0.75 to 1', '1 to 2', '2 to 3', '3 to 4', '4 to 5', '5 to 5.64', '> 5.64']
# lc['binned'] = pd.cut(lc['surge_targets_met_pct'], bins=bins, labels=labels)


In [None]:
from sklearn.cluster import KMeans

scoring = [] #algo, cluster quality metrics, cluster sizes, from topn
for index, row in topn.iterrows():
    data = m2_pipeline #need a temporary version of the master data

    algo = row['algo']   # Extract the algorithm and cluster from the current row
    c = row['clusters']
    score = row['score']
    kmeans = KMeans(n_clusters=c) #establish a means to build the cluster algo, cluster or no NEED SEEDING 
    hierarchical = AgglomerativeClustering(n_clusters=c) 
    spectral = SpectralClustering(n_clusters=c) 
    birch = Birch(n_clusters=c)
    meanshift = MeanShift()
    affinity = AffinityPropagation()
    dbscan = DBSCAN()
    optics = OPTICS()
    algorithms = [(kmeans, 'KMeans'), (hierarchical, 'Agglomerative'), (spectral, 'Spectral'), (birch, 'Birch')\
                  ,(meanshift, 'Meanshift'), (affinity, 'Affinity'), (dbscan, 'DBSCAN'), (optics, 'OPTICS')]
   
    method = [t for t in algorithms if algo in t][0][0] #read the algo type from the topn list/df
    print("handle: ",algo, c, score, method) #verify it is reading it ok
    # method.fit(m2_pipeline)
    predict = method.fit_predict(data) #fit predict, derive cluster labeling
    data['cluster'] = pd.Series(predict, index=data.index) # Add a new column to the filtered dataframe with the predicted cluster labels
    # group by cluster identifier, define ranges of values, number of clusters and their ranged values
    #NEED A NEW ALGORITHM TO EVALUATE EACH CLUSTER, BY BUSINESS RULE
    # means = data.groupby('Cluster')['surge_targets_met_pct'].mean() ##group by then aggregate
    min_max_count = data.groupby('cluster')['surge_targets_met_pct'].agg(['min', 'max', 'count'])
    print(type(min_max_count))
#cluster analytics: 
    
    fin = {"algo":method, "score":score, "clusters":c, "metrics":min_max_count}
    print(fin)
    scoring.append(fin)

analytic = pd.DataFrame(scoring)
print(analytic)


In [None]:
analytic

### take top performing cluster techniques, then ...

attache labels to each and analyze the clustering profiles for efficiency, profit

## notes 

1. grid search notes on [sklearn](https://scikit-learn.org/stable/modules/grid_search.html#grid-search)
2. sklearn pipeline [guide](https://scikit-learn.org/stable/modules/compose.html#pipeline)