## premise: Grid search for clustering solution

via skleanr pipeline and grid search

In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
import sklearn.metrics
from sklearn import preprocessing

In [3]:
#load pipeline 1 csv and prep for clustering
m2_pipeline = pd.read_csv('pipeline1.csv')
#change is surge price rate of change per observation, change.1 is precursor
#sum_change is surge sum_change per surge, and surge_area is surge alone
keepable = ['precursor_buy_cap_pct_change', 
            'precursor_ask_cap_pct_change',
            'precursor_bid_vol_pct_change', 
            'precursor_ask_vol_pct_change', 'change.1',
            'surge_targets_met_pct']
# Load the data into a pandas dataframe
# df = pd.read_csv('data.csv')

# Normalize the 'surge_targets_met_pct' column
x = m2_pipeline[['surge_targets_met_pct']].values.astype(float)
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
m2_pipeline['surge_targets_met_pct_normalized'] = pd.DataFrame(x_scaled)


m2_pipeline = m2_pipeline[keepable]
m2_pipeline = m2_pipeline.dropna()
print(m2_pipeline.isna().sum(axis=1).astype(bool).sum())
m2_pipeline = m2_pipeline.astype('float')
m2_pipeline.dtypes

0


precursor_buy_cap_pct_change    float64
precursor_ask_cap_pct_change    float64
precursor_bid_vol_pct_change    float64
precursor_ask_vol_pct_change    float64
change.1                        float64
surge_targets_met_pct           float64
dtype: object

In [12]:
print(sklearn.metrics.get_scorer_names())

['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'matthews_corrcoef', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_negative_likelihood_ratio', 'neg_root_mean_squared_error', 'normalized_mutual_info_score', 'positive_likelihood_ratio', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weight

In [19]:

# # Load the data into a pandas dataframe
# # df = pd.read_csv('data.csv')

# # Define the estimator you want to use for clustering
# estimator = KMeans()

# # Define the parameter grid that you want to search over
# param_grid = {'n_clusters': [2, 3, 4, 5]}

# # Define the scoring metric you want to use
# scoring = 'silhouette'

# # Define the GridSearchCV object with the estimator, parameter grid, and scoring metric
# grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring=scoring)

# # Fit the GridSearchCV object to your data
# grid_search.fit(m2_pipeline)


### clustering types

K-Means
Affinity propagation
Mean Shift
Spectral Clustering
Agglomerative Clustering
DBSCAN
OPTICS
Birch
Gaussian Mixture
MiniBatch K-Means
Affinity Propagation

In [5]:
## alternative: specifying cluster value
import pandas as pd 
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering,\
MeanShift, AffinityPropagation, DBSCAN, OPTICS, Birch
from sklearn.metrics import silhouette_score 

data =  m2_pipeline
scores = []  # feed in experiments as a dictionary of parameters and scores

csize = [4,5,6,7,8,9,10, 11, 12]
for i in range(len(csize)):
    c= csize[i]
    print("for cluster size",c)
    kmeans = KMeans(n_clusters=c) #
    hierarchical = AgglomerativeClustering(n_clusters=c) 
    spectral = SpectralClustering(n_clusters=c) 
    birch = Birch(n_clusters=c)
    algorithms = [(kmeans, 'KMeans'), (hierarchical, 'Agglomerative'), (spectral, 'Spectral'), (birch, 'Birch')] 
    for algorithm, name in algorithms: # Loop over the algorithms and calculate the silhouette score for each
        algorithm.fit(data) 
        
        score= silhouette_score(data, algorithm.labels_) 
        fin = {"algo":name, "score":score, "clusters":c}
        print(fin)
        scores.append(fin) 

results = pd.DataFrame(scores)  #{'Algorithm': [name for _, name in algorithms], 'Silhouette Score': scores}) 
print(results) 

for cluster size 4




{'algo': 'KMeans', 'score': 0.6455332908395774, 'clusters': 4}
{'algo': 'Agglomerative', 'score': 0.5544786252177756, 'clusters': 4}




{'algo': 'Spectral', 'score': 0.8894194271443359, 'clusters': 4}
{'algo': 'Birch', 'score': 0.8963065534883663, 'clusters': 4}
for cluster size 5




{'algo': 'KMeans', 'score': 0.559216943942274, 'clusters': 5}
{'algo': 'Agglomerative', 'score': 0.5643203301653003, 'clusters': 5}




{'algo': 'Spectral', 'score': 0.8650954651734716, 'clusters': 5}
{'algo': 'Birch', 'score': 0.8897806396215054, 'clusters': 5}
for cluster size 6




{'algo': 'KMeans', 'score': 0.4713397557069962, 'clusters': 6}
{'algo': 'Agglomerative', 'score': 0.381872491666767, 'clusters': 6}




{'algo': 'Spectral', 'score': 0.860072471203043, 'clusters': 6}
{'algo': 'Birch', 'score': 0.6744131267918048, 'clusters': 6}
for cluster size 7




{'algo': 'KMeans', 'score': 0.5319741457108857, 'clusters': 7}
{'algo': 'Agglomerative', 'score': 0.3822319644010064, 'clusters': 7}




{'algo': 'Spectral', 'score': 0.8529175760916948, 'clusters': 7}
{'algo': 'Birch', 'score': 0.674899790548584, 'clusters': 7}
for cluster size 8




{'algo': 'KMeans', 'score': 0.526166002404663, 'clusters': 8}
{'algo': 'Agglomerative', 'score': 0.38847626939917296, 'clusters': 8}




{'algo': 'Spectral', 'score': 0.7763038471019854, 'clusters': 8}
{'algo': 'Birch', 'score': 0.6739887885839931, 'clusters': 8}
for cluster size 9




{'algo': 'KMeans', 'score': 0.5290761952686733, 'clusters': 9}
{'algo': 'Agglomerative', 'score': 0.3889831387281811, 'clusters': 9}




{'algo': 'Spectral', 'score': 0.7733864804657862, 'clusters': 9}
{'algo': 'Birch', 'score': 0.673894150359468, 'clusters': 9}
for cluster size 10




{'algo': 'KMeans', 'score': 0.48534213588569053, 'clusters': 10}
{'algo': 'Agglomerative', 'score': 0.41540364706459565, 'clusters': 10}




{'algo': 'Spectral', 'score': 0.7547513837605193, 'clusters': 10}
{'algo': 'Birch', 'score': 0.6470633893096103, 'clusters': 10}
for cluster size 11




{'algo': 'KMeans', 'score': 0.4778905428239072, 'clusters': 11}
{'algo': 'Agglomerative', 'score': 0.4244966269048636, 'clusters': 11}




{'algo': 'Spectral', 'score': 0.7622392704381659, 'clusters': 11}
{'algo': 'Birch', 'score': 0.6450014158194325, 'clusters': 11}
for cluster size 12




{'algo': 'KMeans', 'score': 0.46414071698499537, 'clusters': 12}
{'algo': 'Agglomerative', 'score': 0.4222297884377017, 'clusters': 12}




{'algo': 'Spectral', 'score': 0.7721629675285338, 'clusters': 12}
{'algo': 'Birch', 'score': 0.6443349201464048, 'clusters': 12}
             algo     score  clusters
0          KMeans  0.645533         4
1   Agglomerative  0.554479         4
2        Spectral  0.889419         4
3           Birch  0.896307         4
4          KMeans  0.559217         5
5   Agglomerative  0.564320         5
6        Spectral  0.865095         5
7           Birch  0.889781         5
8          KMeans  0.471340         6
9   Agglomerative  0.381872         6
10       Spectral  0.860072         6
11          Birch  0.674413         6
12         KMeans  0.531974         7
13  Agglomerative  0.382232         7
14       Spectral  0.852918         7
15          Birch  0.674900         7
16         KMeans  0.526166         8
17  Agglomerative  0.388476         8
18       Spectral  0.776304         8
19          Birch  0.673989         8
20         KMeans  0.529076         9
21  Agglomerative  0.388983        

In [6]:
results.to_csv('ranked_classifier_search.csv', index=False)

In [42]:
meanshift = MeanShift()
affinity = AffinityPropagation()
dbscan = DBSCAN()
optics = OPTICS()
scores2 = []  # feed in experiments as a dictionary of parameters and scores
algorithms2 = [(meanshift, 'Meanshift'), (affinity, 'Affinity'), (dbscan, 'DBSCAN'), (optics, 'OPTICS')]
for algorithm, name in algorithms2: # Loop over the algorithms and calculate the silhouette score for each
        algorithm.fit(data) 
        score= silhouette_score(data, algorithm.labels_) 
        fin = {"algo":algorithm.labels_, "score":score, "size":c} 
        scores2.append(fin) 
# Create a dataframe to store the results 
results2 = pd.DataFrame(scores2) #{'Algorithm': [name for _, name in algorithms2], 'Silhouette Score': scores2}) 
print(results2)



ValueError: All arrays must be of the same length

In [None]:
# append cluster to each row in your data source, example

from sklearn.cluster import SpectralClustering

# Assuming 'df' is a pandas dataframe
# Replace 'affinity' with the appropriate affinity matrix
# Replace 'n_clusters' with the desired number of clusters
clustering = SpectralClustering(affinity='...', n_clusters=..., random_state=0).fit(df)

# Add a new column to the dataframe with the cluster names
df['cluster_name'] = clustering.labels_


## notes 

1. grid search notes on [sklearn](https://scikit-learn.org/stable/modules/grid_search.html#grid-search)
2. sklearn pipeline [guide](https://scikit-learn.org/stable/modules/compose.html#pipeline)

In [None]:
# USE A PIPELINE ONCE YOU IDENTIFY A CLUSTER/ CLASSIFY SET

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN #cluster types
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define the pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('cluster', KMeans())
])

# Define the parameter grid
param_grid = {
    'cluster': [KMeans(), AgglomerativeClustering(), DBSCAN()],
    'cluster__n_clusters': range(2, 11),
    'cluster__eps': [0.1, 0.5, 1],
    'cluster__min_samples': [2, 5, 10]
}

# Perform the grid search
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid_search.fit(X)

# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)
