# Clustering Pipelines

In [None]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

from sklearn.metrics.cluster import calinski_harabaz_score, adjusted_rand_score

import matplotlib.pyplot as plt
%matplotlib inline

> Let's create 2 pipelines: one for K-Means Clustering, one for Hierarchical Agglomerative Clustering

In [None]:
# Tunable settings, left as None in this template.
c = None      # number of PCA components kept by both pipelines
k = None      # number of clusters handed to both clusterers
k_max = None  # maximum no. of clusters we will evaluate

# K-Means pipeline: standardise -> PCA -> K-Means.
pipe_kmeans = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=c)),
    ('clf', KMeans(n_clusters=k)),
])

# HAC pipeline: same preprocessing, agglomerative clustering as the final step.
pipe_HAC = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=c)),
    ('clf', AgglomerativeClustering(n_clusters=k)),
])

> Now let's pass our data through the K-means pipeline, visually graph clusters, and interpret results

In [None]:
 def processKmeans(k_max, df):
     """Fit K-Means for k = 2..k_max, plot each solution, summarise the last one.

     For every k the shared ``pipe_kmeans`` pipeline is re-configured, fitted
     on ``df`` and scored, and the clusters are drawn on the Frequency /
     MonetaryValue plane in their own figure. After the loop the labels of the
     final (k = k_max) solution are written to
     ``customer_level_data_raw['Cluster']`` and per-cluster summary statistics
     are printed.

     Args:
         k_max: Largest number of clusters to evaluate (inclusive).
         df: Feature table the pipeline is fitted on. Assumed to be a pandas
             DataFrame with ``Frequency`` and ``MonetaryValue`` columns whose
             rows line up with ``customer_level_data_raw`` -- TODO confirm
             against the caller.

     Raises:
         ValueError: If ``k_max`` is smaller than 2.
     """
     # The Calinski-Harabasz score is only defined for two or more clusters.
     if k_max < 2:
         raise ValueError('k_max must be at least 2 to evaluate a clustering')

     # Positions of the plotted features inside df, used to pick the matching
     # coordinates of the back-projected cluster centres.
     freq_col = df.columns.get_loc('Frequency')
     monetary_col = df.columns.get_loc('MonetaryValue')

     # fit and predict clusters
     for k in range(2, k_max + 1):
         # Without this the pipeline keeps whatever n_clusters it was built with.
         pipe_kmeans.set_params(clf__n_clusters=k)
         pipe_kmeans.fit(df)
         cluster_preds = pipe_kmeans.predict(df)
         print(f'K-Means Solution with {k} clusters')
         # NOTE: newer scikit-learn releases spell this metric
         # calinski_harabasz_score; the notebook's import must match the
         # installed version.
         print('Calinski-Harabaz (CA) cluster variance score: ', calinski_harabaz_score(df, cluster_preds))
         print('\n')

         # One figure per k so successive solutions are not drawn on top of
         # each other.
         plt.figure()
         plt.title(f'K-Means Solution with {k} clusters')
         plt.scatter(df.Frequency, df.MonetaryValue, c=cluster_preds, s=10)

         # The centres live on the final estimator, in scaled + PCA space;
         # undo PCA and scaling so they share the axes of the scatter above.
         steps = pipe_kmeans.named_steps
         cl_centers = steps['scl'].inverse_transform(
             steps['pca'].inverse_transform(steps['clf'].cluster_centers_))
         plt.scatter(cl_centers[:, freq_col], cl_centers[:, monetary_col], c='black', s=30)
         plt.show()

     # merge cluster predictions (from the last, k = k_max, solution) to the
     # unlogged, unscaled df and print summary stats by cluster
     customer_level_data_raw['Cluster'] = cluster_preds
     total_revenue = customer_level_data_raw.MonetaryValue.sum()
     for cluster in sorted(set(cluster_preds)):
         members = customer_level_data_raw[customer_level_data_raw['Cluster'] == cluster]
         print(f'Summary Stats for cluster # {cluster}')
         # describe() only returns a frame; print it so it is actually shown.
         print(members.describe())
         pop_perc = (len(members) / len(customer_level_data_raw)) * 100
         total_spend = members.MonetaryValue.sum()
         rev_perc = (total_spend / total_revenue) * 100
         print(f'This cluster represents {pop_perc}% of the population and {rev_perc}% of Total Revenue, with a total spend of {total_spend}')
         print('\n\n\n')
        

> Now let's pass our data through the HAC pipeline, visually graph clusters, and interpret results