## Cluster Technique Search and Accuracy Test

In [1]:
import pandas as pd
# from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
import sklearn.metrics
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import datetime

#SPECIFIED CLUSTER DISCOVERY
# from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# from sklearn.model_selection import GridSearchCV
# from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering, Birch
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering,\
MeanShift, AffinityPropagation, DBSCAN, OPTICS, Birch
from sklearn.metrics import silhouette_score

In [2]:
# BIN TECHNIQUE
# load pipeline 1 csv and prep for clustering
m2_pipeline = pd.read_csv('pipeline1.csv')
# change is surge price rate of change per observation, change.1 is precursor
# sum_change is surge sum_change per surge, and surge_area is surge alone
keepable = ['precursor_buy_cap_pct_change', 
            'precursor_ask_cap_pct_change',
            'precursor_bid_vol_pct_change', 
            'precursor_ask_vol_pct_change', 'change.1',
            'surge_targets_met_pct']

# Normalize the 'surge_targets_met_pct' column
x = m2_pipeline[['surge_targets_met_pct']].values.astype(float)
m2_pipeline = m2_pipeline[keepable]
print(m2_pipeline.isna().sum(axis=1).astype(bool).sum())
m2_pipeline = m2_pipeline.astype('float')
m2_pipeline.dtypes

# # bins = [float(f"{x:.2f}") for x in range(-10, 11)]
# bins = [x * 0.2 for x in range(-10, 11)]
# print(bins)
# #model 
# m2_pipeline['bin'] = pd.cut(m2_pipeline['surge_targets_met_pct'], bins=bins  )#, labels=labels)
# # Display the binned data
# print(m2_pipeline['bin'].value_counts())
# m2_pipeline['bin']
# m2_pipeline['bin'] = m2_pipeline['bin'].astype('category')

0


precursor_buy_cap_pct_change    float64
precursor_ask_cap_pct_change    float64
precursor_bid_vol_pct_change    float64
precursor_ask_vol_pct_change    float64
change.1                        float64
surge_targets_met_pct           float64
dtype: object

In [3]:
bins = [
    m2_pipeline['surge_targets_met_pct'].min() -1,  # Min value
    -4/3*2,  # -4 to 0 divided into three equal parts
    -4/3,  # Second bin edge for negative values
    0,  # Zero separating negative and positive values
    0.25, 0.5, 0.75, 1,  # Four bins between 0 and 1
    2,  # One bin between 1 and 2
    m2_pipeline['surge_targets_met_pct'].max() + 1]
bins

[-4.24232081911261,
 -2.6666666666666665,
 -1.3333333333333333,
 0,
 0.25,
 0.5,
 0.75,
 1,
 2,
 5.322377307519139]

In [4]:
m2_pipeline['label'] = pd.cut(m2_pipeline['surge_targets_met_pct'], bins=bins, labels=list(range(1, len(bins))))

In [5]:
m2_pipeline.columns

Index(['precursor_buy_cap_pct_change', 'precursor_ask_cap_pct_change',
       'precursor_bid_vol_pct_change', 'precursor_ask_vol_pct_change',
       'change.1', 'surge_targets_met_pct', 'label'],
      dtype='object')

In [6]:
m2_pipeline.to_csv('binned_pipeline.csv')

In [7]:
global_test_results = [] #capture each test summary here, make df on this later
profitability_analytics =[] #capture the value of each cluster, bin x price avg

In [8]:
def get_cluster_profit(cluster):
    print(cluster) #method, cluster count, silhouette score
    # for the dataframe m2_pipeline, group by 'cluster' then 'bin', multiply bin as number by the average price by bin

### standardize all features pre train/test

In [9]:

m2_pipeline = m2_pipeline.dropna()

# Splitting the dataframe into features and labels
X = m2_pipeline.drop(columns=['bin'])
y = m2_pipeline['bin']

# Performing the test/train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

#DO STANDARDIZATION ONCE YOU DO TEST TRAIN NOT BEFORE 
def silhouette_scorer(estimator, X, y=None):
    labels = estimator.fit_predict(X)
    score = silhouette_score(X, labels)
    return score
# Defining the parameter grid for GridSearchCV
param_grid = {'n_clusters': [2,3,4,5,6,7,8,9,10]}  #'algorithm': ['auto', 'full', 'elkan']

clustering_models = [
    ('KMeans', KMeans()),
    # ('SpectralClustering', SpectralClustering(eigen_solver=None, n_components=None, random_state=42, n_init=10, gamma=1.0, affinity='rbf',\
    #                   n_neighbors=10, eigen_tol='auto', assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=None, verbose=False)),
    ('Birch', Birch()),
    ('Hierarchical',AgglomerativeClustering())]
# Performing GridSearchCV for each clustering model
for model_name, model in clustering_models:
    grid_search = GridSearchCV(model, param_grid, scoring=silhouette_scorer)
    grid_search.fit(X_train_scaled, y_train)
    # Evaluating the best model based on silhouette score
    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_
    print(f'{model_name}:')
    print(f'Best parameters: {grid_search.best_params_}')
    print(f'Silhouette score (train): {best_score:.4f}')
    print(f'Silhouette score (test): {best_score:.4f}')
    best_params = grid_search.best_params_
    big_dict = grid_search.cv_results_
    pkg = {"algo": model_name, "best_params": grid_search.best_params_,
        "best_estimator": grid_search.best_estimator_,
        "best_score": best_score, "all_results": big_dict} 
    # # Add the cluster label as a new column
    method_df = X_train_scaled #as it stands, then append a new column to it, write to csv
    method_df['cluster_label'] = best_model.labels_

    now = datetime.datetime.now()
    print("Current date and time: ")
    print(now.strftime("%Y-%m-%d %H:%M:%S"))
    run_time = now.strftime("%Y-%m-%d %H:%M:%S")
    file_name = 'clustered/'+ 'clustered_'+ model_name+' _'+ run_time+ '.csv'
    print(file_name)
    method_df.to_csv(file_name)
    # print(pkg)
    global_test_results.append(pkg)

KeyError: "['bin'] not found in axis"

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering, Birch
from sklearn.cluster import MeanShift, AffinityPropagation, DBSCAN, OPTICS
from sklearn.metrics import silhouette_score
# Splitting the dataframe into features and labels
X = m2_pipeline.drop(columns=['bin'])
y = m2_pipeline['bin']
# Performing the test/train split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.fit_transform(X_test)

def silhouette_scorer(estimator, X, y=None):
    labels = estimator.fit_predict(X)
    score = silhouette_score(X, labels)
    return score

param_grid = {}  #'algorithm': ['auto', 'full', 'elkan']
clustering_models = [
    ('Meanshift',  MeanShift( bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300)),
    ('AffinityPropagation', AffinityPropagation( damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False, random_state=42)),
    ('DBSCAN', DBSCAN(eps=0.5,  min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)),
('OPTICS', OPTICS( min_samples=5, max_eps=3, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, memory=None, n_jobs=None))]
# Performing GridSearchCV for each clustering model
for model_name, model in clustering_models:
    grid_search = GridSearchCV(model, param_grid, scoring=silhouette_scorer)
    grid_search.fit(X_train_scaled, y_train)
    # Evaluating the best model based on silhouette score
    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_
    print(f'{model_name}:')
    # print(f'Best parameters: {grid_search.best_params_}')
    print(f'Silhouette score (train): {best_score:.4f}')
    print(f'Silhouette score (test): {best_score:.4f}')
    big_dict = grid_search.cv_results_
    pkg = {"algo":model_name, "best_params":grid_search.best_params_, "best_estimator":grid_search.best_estimator_,\
            "best_score":best_score, "all_results":big_dict}
    # print(pkg)
    global_test_results.append(pkg)

In [None]:
# from sklearn.cluster import SpectralClustering
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_clusters': [2,3,4,5,6,7,8,9,10],
#     'affinity': ['nearest_neighbors', 'rbf'],
#     'gamma': [0.1, 1.0, 10.0]
# }
# def silhouette_scorer(estimator, X, y=None):
#     labels = estimator.fit_predict(X)
#     score = silhouette_score(X, labels)
#     return score
# # ('SpectralClustering', SpectralClustering(eigen_solver=None, n_components=None, random_state=42, n_init=10, gamma=1.0, affinity='rbf',\
#     #                   n_neighbors=10, eigen_tol='auto', assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=None, verbose=False)),
# clustering = SpectralClustering(eigen_solver=None, n_components=None, random_state=42, n_init=10, gamma=1.0, affinity='rbf',\
#                       n_neighbors=10, eigen_tol='auto', assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=None, verbose=False)
# grid_search = GridSearchCV(clustering, param_grid, scoring=silhouette_scorer)
# grid_search.fit(X_train_scaled, y_train)
# # Evaluating the best model based on silhouette score
# best_model = grid_search.best_estimator_
# best_score = grid_search.best_score_
# print(f'{model_name}:')
# # print(f'Best parameters: {grid_search.best_params_}')
# print(f'Silhouette score (train): {best_score:.4f}')
# print(f'Silhouette score (test): {best_score:.4f}')
# big_dict = grid_search.cv_results_
# pkg = {"algo":model_name, "best_params":grid_search.best_params_, "best_estimator":grid_search.best_estimator_,\
#         "best_score":best_score, "all_results":big_dict}
# # print(pkg)
# global_test_results.append(pkg)

## explore clustering methods by profitability

apply the cluster id to the dataframe, group by, then summarize value by bin x price range

## explore top results by cluster quality scoring

use silhouette viz

then davies

then kolmogorov

In [None]:
result_df = pd.DataFrame(global_test_results)
result_df

In [None]:
#charting silhouettes
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score

# Calculate the silhouette scores for each sample
silhouette_values = silhouette_samples(X, labels)

# Calculate the average silhouette score
average_score = silhouette_score(X, labels)

# Plot the silhouette chart
fig, ax = plt.subplots()
y_lower = 10

for i in range(n_clusters):
    # Aggregate the silhouette scores for samples in cluster i and sort them
    ith_cluster_silhouette_values = silhouette_values[labels == i]
    ith_cluster_silhouette_values.sort()

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    # Fill the silhouette chart with the corresponding color
    color = cm.nipy_spectral(float(i) / n_clusters)
    ax.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7)

    # Label each cluster with its silhouette score
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    
    # Compute the new y_lower for next plot
    y_lower = y_upper + 10

ax.set_xlabel("Silhouette coefficient values")
ax.set_ylabel("Cluster label")

# The vertical line indicates the average silhouette score
ax.axvline(x=average_score, color="red", linestyle="--")

plt.show()


## top method hyperparameter optimization
take the top technique by silhouette then dive into a fuller exploration of its specific hyperparameters

In [None]:
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'n_clusters': [2, 3, 4],
    'affinity': ['nearest_neighbors', 'rbf'],
    'gamma': [0.1, 1.0, 10.0]
}

clustering = SpectralClustering()
random_search = RandomizedSearchCV(clustering, param_distributions)
random_search.fit(X)


## estimate the model accuracy, givne best hyperparameters

In [None]:
#use best model:
from sklearn.model_selection import train_test_split
from sklearn.cluster import SpectralClustering
from sklearn.metrics import accuracy_score
X = m2_pipeline.drop(columns=['bin'])
y = m2_pipeline['bin']
# Splitting the dataset into train and test sets

#USE STANDARD SCALING ONCE YOU DIVDE TEST AND TRAIN NOT BEFORE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating an instance of SpectralClustering
model = SpectralClustering(n_clusters=7)

# Fitting the model to the training data
model.fit_predict(X_train)

# Predicting labels for the testing data
y_pred = model.predict(X_test)

# Calculating the accuracy score
accuracy = accuracy_score(y_test, y_pred)
