## premise: Grid search for clustering solution

via skleanr pipeline and grid search

In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
import sklearn.metrics
import numpy as np
from sklearn import preprocessing

## using a discretizer on our target

In [3]:
#load pipeline 1 csv and prep for clustering
m2_pipeline = pd.read_csv('pipeline1.csv')
#change is surge price rate of change per observation, change.1 is precursor
#sum_change is surge sum_change per surge, and surge_area is surge alone
keepable = ['precursor_buy_cap_pct_change', 
            'precursor_ask_cap_pct_change',
            'precursor_bid_vol_pct_change', 
            'precursor_ask_vol_pct_change', 'change.1',
            'surge_targets_met_pct']

# Normalize the 'surge_targets_met_pct' column
x = m2_pipeline[['surge_targets_met_pct']].values.astype(float)
m2_pipeline = m2_pipeline[keepable]
m2_pipeline = m2_pipeline.dropna()
print(m2_pipeline.isna().sum(axis=1).astype(bool).sum())
m2_pipeline = m2_pipeline.astype('float')
m2_pipeline.dtypes

0


precursor_buy_cap_pct_change    float64
precursor_ask_cap_pct_change    float64
precursor_bid_vol_pct_change    float64
precursor_ask_vol_pct_change    float64
change.1                        float64
surge_targets_met_pct           float64
dtype: object

## using predefined bins for discrete clustering

In [4]:
# Define the maximum and minimum values
maximum = 10
minimum = -10

# Create a range of 20 values between the maximum and minimum
values = np.linspace(minimum, maximum, num=20)

# Round each value to two decimal places
rounded_values = np.round(values, decimals=2)

# Print the resulting array
print(rounded_values)


[-10.    -8.95  -7.89  -6.84  -5.79  -4.74  -3.68  -2.63  -1.58  -0.53
   0.53   1.58   2.63   3.68   4.74   5.79   6.84   7.89   8.95  10.  ]


In [5]:
#model 

# Bin the values in the 'surge_targets_met_pct' column
# bins = [-float('inf'), -8.5, -5.64, 0, 0.25, 0.35, 0.4, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0, 5.0, 5.64, float('inf')]
bins = rounded_values
labels = rounded_values #['< -8.5', '-8.5 to -5.64', '-5.64 to 0', '0 to 0.25', '0.25 to 0.35', '0.35 to 0.4', '0.4 to 0.5', '0.5 to 0.75', '0.75 to 1', '1 to 2', '2 to 3', '3 to 4', '4 to 5', '5 to 5.64', '> 5.64']
# m2_pipeline['binned'] = pd.cut(m2_pipeline['surge_targets_met_pct'], bins=bins  )#, labels=labels)

# Display the binned data
# print(m2_pipeline['binned'].value_counts())
#implement
transformer = preprocessing.FunctionTransformer(
    pd.cut, kw_args={'bins': bins, 'retbins': False}  #'labels': labels, 
)
transformer.fit_transform(x)

ValueError: Input array must be 1 dimensional

In [None]:
transformer

In [None]:
m2_pipeline['binned']

In [None]:
## you must cluster on the column 'surge_targets_met_pct' like so
import pandas as pd
from sklearn.cluster import KMeans
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
m2_pipeline['surge_targets_met_pct_normalized'] = pd.DataFrame(x_scaled)

# Assuming your dataframe is named 'm2_pipeline'
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(m2_pipeline[['surge_targets_met_pct_normalized']])


### clustering types

(https://scikit-learn.org/stable/modules/clustering.html#mean-shift)


In [None]:
## alternative: specifying cluster value
import pandas as pd 
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering,\
MeanShift, AffinityPropagation, DBSCAN, OPTICS, Birch
from sklearn.metrics import silhouette_score 

data =  m2_pipeline
scores = []  # feed in experiments as a dictionary of parameters and scores

csize = [4,5,6,7,8,9,10, 11, 12]
for i in range(len(csize)):
    c= csize[i]
    print("for cluster size",c)
    kmeans = KMeans(n_clusters=c) #
    hierarchical = AgglomerativeClustering(n_clusters=c) 
    spectral = SpectralClustering(n_clusters=c) 
    birch = Birch(n_clusters=c)
    algorithms = [(kmeans, 'KMeans'), (hierarchical, 'Agglomerative'), (spectral, 'Spectral'), (birch, 'Birch')] 
    # algorithms = [kmeans, hierarchical, spectral, birch] 

    # for algorithm, name in algorithms: # Loop over the algorithms and calculate the silhouette score for each
    for algorithm, name in algorithms: # Loop over the algorithms and calculate the silhouette score for each
        algorithm.fit(data) 
        score= silhouette_score(data, algorithm.labels_) 
        fin = {"algo":name, "score":score, "clusters":c}
        print(fin)
        scores.append(fin) 

results = pd.DataFrame(scores)  #{'Algorithm': [name for _, name in algorithms], 'Silhouette Score': scores}) 
print(results) 

## do non cluster specified clustering

In [None]:
meanshift = MeanShift(*, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300)
affinity = AffinityPropagation(*, damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False, random_state=42)
dbscan = DBSCAN(eps=0.5, *, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)
optics = OPTICS(*, min_samples=5, max_eps=inf, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, memory=None, n_jobs=None)

data = m2_pipeline
# scores2 = []  # feed in experiments as a dictionary of parameters and scores
algorithms2 = [(meanshift, 'Meanshift'), (affinity, 'Affinity'), (dbscan, 'DBSCAN'), (optics, 'OPTICS')]
for algorithm, name in algorithms2: # Loop over the algorithms and calculate the silhouette score for each
        algorithm.fit(data) 
        score= silhouette_score(data, algorithm.labels_) 
        fin = {"algo":name, "score":score, "clusters":0} 
        print(fin)
        scores.append(fin) 
# Create a dataframe to store the results 
results = pd.DataFrame(scores) #{'Algorithm': [name for _, name in algorithms2], 'Silhouette Score': scores2}) 
print(results)

In [None]:
cm = 'ranked_classifier_search.csv'  #clustered methods, specified num clusters
results.to_csv(cm, index=False)

## revive the test from stored memory
via csv, then get top n, n=10

In [None]:
revive = pd.read_csv(cm)
print(revive.head(2))
print(revive.columns)

In [None]:
topn = revive.sort_values('score', ascending=False).head(10)

In [None]:
topn

In [None]:
topn.columns

## cluster and back test the cluster as trade strategy


In [None]:
## cluster range evaluation
# Bin the values in the 'surge_targets_met_pct' column after grouped by 'cluster'
# bins = [-float('inf'), -8.5, -5.64, 0, 0.25, 0.35, 0.4, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0, 5.0, 5.64, float('inf')]
# labels = ['< -8.5', '-8.5 to -5.64', '-5.64 to 0', '0 to 0.25', '0.25 to 0.35', '0.35 to 0.4', '0.4 to 0.5', '0.5 to 0.75', '0.75 to 1', '1 to 2', '2 to 3', '3 to 4', '4 to 5', '5 to 5.64', '> 5.64']
# lc['binned'] = pd.cut(lc['surge_targets_met_pct'], bins=bins, labels=labels)


In [None]:
from sklearn.cluster import KMeans

scoring = [] #algo, cluster quality metrics, cluster sizes, from topn
for index, row in topn.iterrows():
    data = m2_pipeline #need a temporary version of the master data

    algo = row['algo']   # Extract the algorithm and cluster from the current row
    c = row['clusters']
    score = row['score']
    kmeans = KMeans(n_clusters=c) #establish a means to build the cluster algo, cluster or no NEED SEEDING 
    hierarchical = AgglomerativeClustering(n_clusters=c) 
    spectral = SpectralClustering(n_clusters=c) 
    birch = Birch(n_clusters=c)
    meanshift = MeanShift()
    affinity = AffinityPropagation()
    dbscan = DBSCAN()
    optics = OPTICS()
    algorithms = [(kmeans, 'KMeans'), (hierarchical, 'Agglomerative'), (spectral, 'Spectral'), (birch, 'Birch')\
                  ,(meanshift, 'Meanshift'), (affinity, 'Affinity'), (dbscan, 'DBSCAN'), (optics, 'OPTICS')]
   
    method = [t for t in algorithms if algo in t][0][0] #read the algo type from the topn list/df
    print("handle: ",algo, c, score, method) #verify it is reading it ok
    # method.fit(m2_pipeline)
    predict = method.fit_predict(data) #fit predict, derive cluster labeling
    data['cluster'] = pd.Series(predict, index=data.index) # Add a new column to the filtered dataframe with the predicted cluster labels
    # group by cluster identifier, define ranges of values, number of clusters and their ranged values
    #NEED A NEW ALGORITHM TO EVALUATE EACH CLUSTER, BY BUSINESS RULE
    # means = data.groupby('Cluster')['surge_targets_met_pct'].mean() ##group by then aggregate
    min_max_count = data.groupby('cluster')['surge_targets_met_pct'].agg(['min', 'max', 'count'])
    print(type(min_max_count))
#cluster analytics: 
    
    fin = {"algo":method, "score":score, "clusters":c, "metrics":min_max_count}
    print(fin)
    scoring.append(fin)

analytic = pd.DataFrame(scoring)
print(analytic)


In [None]:
analytic

### take top performing cluster techniques, then ...

attache labels to each and analyze the clustering profiles for efficiency, profit

## notes 

1. grid search notes on [sklearn](https://scikit-learn.org/stable/modules/grid_search.html#grid-search)
2. sklearn pipeline [guide](https://scikit-learn.org/stable/modules/compose.html#pipeline)