# NOTEBOOK 1:  CLUSTERING

This notebook creates clusters of MIGROS buildings according to their energy consumption patterns, using unsupervised machine-learning clustering methods.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

%matplotlib inline

# Global plotting defaults: seaborn "talk" context with enlarged fonts, then the "white" style with the grid off.
sns.set(style="ticks", font_scale=1.2, context="talk")
sns.set_style("white", {'axes.grid' : False})
# Default figure size (width, height); later cells overwrite this setting.
plt.rcParams['figure.figsize'] = (18, 7)

### 1. Data preparation

In [None]:
# Load the raw export; the first line of the file provides the column names.
migros_FULL_data = pd.read_csv(
    "data/191126_E3M_LESO_GMOSRawValuesWithRanking.csv",
    delimiter=",",
    header=0,
    encoding="ISO-8859-1",
)

# === Creating electricity time series =========
# Row 0 is compared against "Stromverbrauch Gesamt" (= total electricity consumption):
# keep the columns where it matches, with the time-stamp column 'DPName' in front.
lol = migros_FULL_data.loc[0, :]
_is_total_electricity = lol == "Stromverbrauch Gesamt"
good_columns = ['DPName'] + list(lol[_is_total_electricity].index)
migros_FULL_data_elec = migros_FULL_data[good_columns]

# Forward-filling of the NaNs is currently disabled:
#migros_FULL_data_elec = migros_FULL_data_elec.fillna(method="ffill")

In [None]:
# Row 2 of the electricity frame, as an array; a later cell uses it as the location name of each column.
Station_locations_MIGROS = np.array(migros_FULL_data_elec.iloc[2,:])

In [None]:
# Separate the series only (rows 3 onwards) and set the time stamps as the index.
# An explicit copy is taken so that assigning the index does not write into a slice
# of migros_FULL_data_elec (which triggers pandas' SettingWithCopyWarning).
migros_FRAME = migros_FULL_data_elec.iloc[3:, :].copy()
migros_FRAME.index = pd.DatetimeIndex(migros_FRAME['DPName'])

# To keep track of the location names while columns are dropped, prepend them
# to the frame as a temporary first row (its index label is 0).
locations_frame = pd.DataFrame(Station_locations_MIGROS).T
locations_frame.columns = migros_FRAME.columns
migros_FRAME2 = pd.concat([locations_frame, migros_FRAME])

# Drop the old time-stamp column and the columns that hold names instead of values.
# `columns=` is used because the positional axis argument (`drop(label, 1)`) was
# removed in pandas 2.0 and raises a TypeError there.
_columns_to_drop = [
    'DPName',
    'LESO_WTWU_EV_ELE',
    'WTOW_EV_ELE',
    'LESO_APPE_EV_ELE',
    'LESO_CHFT_EV_ELE',
    'LESO_ILAN_EV_ELE',
]
migros_FRAME2 = migros_FRAME2.drop(columns=_columns_to_drop)

# Read the location names back now that they line up with the remaining columns
Station_locations_MIGROS2 = list(migros_FRAME2.iloc[0])

# Remove the temporary location row (index label 0) again
migros_FRAME = migros_FRAME2.drop([0])

# The concat above mixed the label 0 with time stamps, so rebuild a proper DatetimeIndex
migros_FRAME.index = pd.DatetimeIndex(migros_FRAME.index)

In [None]:
# Replace the column labels by the location names extracted from the frame.
migros_FRAME.columns = Station_locations_MIGROS2

In [None]:
migros_FRAME

In [None]:
# NOTE(review): this bare tuple has no effect other than being displayed. Presumably these
# are name fragments to look for in the location names - confirm the intent or delete the cell.
"OBI", "MFIT","MFit","Fitnesspark", "NEU","Neu" 

In [None]:
Station_locations_MIGROS2

In [None]:
## Transform the strings into numbers: DataFrame.apply runs pd.to_numeric on every column.
## The frame is converted as a whole instead of through `frame[frame.columns] = ...`:
## selecting by label multiplies columns whose label is repeated, so that assignment
## fails as soon as two columns carry the same location name.
## pd.to_numeric raises on any value it cannot parse.
migros_FRAME = migros_FRAME.apply(pd.to_numeric)

In [None]:
migros_FRAME

In [None]:
# Write the cleaned numeric frame to a CSV file (path is relative to the working directory).
migros_FRAME.to_csv("Migros_data_clean.csv")

### 2. Fourier features

In [None]:
## Sum the readings within every hour, then average those hourly totals over every day
_hourly_totals = migros_FRAME.resample("1H").sum()
migros_FRAME_daily = _hourly_totals.resample("1D").mean()
migros_FRAME_daily.head(5)

In [None]:
def ComputeAndPlotFourier(time_series,WePlot=True):
    '''
    Extract Fourier features from a time series and optionally plot the analysis.

    The series is transformed with the FFT. Amplitudes are normalised as
    2*|Y|/len(series) and phases are the angles of the complex coefficients.
    The three harmonics with the largest amplitude are then selected among
    indices 1 .. len(series)//2 - 1, i.e. the first half of the spectrum
    without the constant (index 0) term.

    Parameters
    ----------
    time_series : 1-D array-like of numbers
        The samples to analyse. They are treated as equally spaced.
    WePlot : bool, default True
        When true, draw a 2x2 figure: the series, the amplitude spectrum, the
        phase spectrum, and the three main harmonics with their sum overlaid
        on the series.

    Returns
    -------
    list
        Ten values: [mean of the series,
                     index of the 1st, 2nd, 3rd main harmonic (largest first),
                     their three amplitudes,
                     their three phases].

    Raises
    ------
    ValueError
        If the series has fewer than 8 samples, in which case three harmonics
        cannot be selected.
    '''
    samples = np.asarray(time_series)
    n_samples = len(samples)

    # For a real signal the second half of the spectrum mirrors the first one,
    # so only the first N = n_samples // 2 frequencies are considered.
    N = n_samples // 2
    if N < 4:
        raise ValueError(
            "ComputeAndPlotFourier needs at least 8 samples to select three harmonics, got %d"
            % n_samples
        )

    Y = np.fft.fft(samples)
    # Factor 2 because only half of the frequencies are kept; division normalises by the sample count.
    F_amplitudes = np.abs(Y) * 2 / n_samples
    F_phases = np.angle(Y)

    # Mean of the series; it equals F_amplitudes[0] / 2.
    T0 = np.mean(samples)

    # Rank the harmonics by amplitude. Index 0 (the mean) is excluded explicitly
    # rather than assumed to be the largest entry: for a series whose mean is small
    # that assumption would discard the strongest real harmonic instead.
    ranked = sorted(range(1, N), key=lambda k: F_amplitudes[k])
    mainHarmIn = ranked[-3:][::-1]

    if WePlot:
        _plot_fourier_analysis(samples, F_amplitudes, F_phases, T0, mainHarmIn)

    # Main Fourier features: mean, harmonic indices, then their amplitudes and phases
    return ([T0] + mainHarmIn
            + [F_amplitudes[k] for k in mainHarmIn]
            + [F_phases[k] for k in mainHarmIn])


def _harmonic_component(k, amplitudes, phases, n_samples):
    '''Return harmonic k of the FFT evaluated at every sample position 0 .. n_samples-1.'''
    t = np.arange(n_samples)
    # FFT bin k completes k cycles over the whole series, so the fundamental period is
    # the series length itself (not a fixed number of days).
    return amplitudes[k] * np.cos(2 * np.pi * k * t / n_samples + phases[k])


def _plot_fourier_analysis(samples, amplitudes, phases, mean_value, harmonics):
    '''Draw the 2x2 overview figure (series, amplitudes, phases, harmonic approximation).'''
    n_samples = len(samples)
    n_half = n_samples // 2
    positions = np.arange(n_samples)
    fig = plt.figure()

    # ---- (top left) the series itself
    ax1 = fig.add_subplot(2, 2, 1)
    ax1.plot(positions, samples, color="orange")
    ax1.set_title("Time domain (Daily mean consumption)")
    ax1.set_xlabel("days")
    ax1.set_ylabel("Consumption")

    # ---- (top right) amplitudes of the first half of the spectrum, zoomed on the lowest indices
    ax2 = fig.add_subplot(2, 2, 2)
    ax2.stem(range(n_half), amplitudes[:n_half])
    ax2.set_title("Frequency domain")
    ax2.set_ylabel("Amplitude")
    ax2.set_xlabel(r"$(b)$")
    ax2.set_xlim(-1, 30)

    # ---- (bottom right) phases of the first 30 frequencies, y axis labelled in fractions of pi
    ax4 = fig.add_subplot(2, 2, 4)
    ax4.stem(phases[:30])
    ax4.set_xlim(-1, 30)
    ax4.set_ylim(-3.5, 3.5)
    ax4.set_xlabel(r"$(d)$")
    ax4.set_ylabel("Phase")
    my_yticks = np.array([-np.pi, -0.75*np.pi, -0.5*np.pi, -0.25*np.pi,0, 0.25*np.pi, 0.5*np.pi, 0.75*np.pi, np.pi])
    y_label = [r"$-\pi$",r"$-\frac{3\pi}{4}$",r"$-\frac{\pi}{2}$", r"$-\frac{\pi}{4}$", r"$0$", r"$\frac{\pi}{4}$", r"$\frac{\pi}{2}$", r"$\frac{3\pi}{4}$",r"$\pi$"]
    ax4.set_yticks(my_yticks)
    ax4.set_yticklabels(y_label, fontsize=20)

    # ---- (bottom left) each main harmonic, their sum, and the original series
    ax3 = fig.add_subplot(2, 2, 3)
    components = [_harmonic_component(k, amplitudes, phases, n_samples) for k in harmonics]
    harmonic_lines = [
        ax3.plot(positions, mean_value + component, linestyle='--')[0]
        for component in components
    ]
    fourier_line, = ax3.plot(positions, mean_value + np.sum(components, axis=0), color='red')
    real_line, = ax3.plot(positions, samples, color="orange", linestyle='-')

    # Two legends on one axes: the first has to be re-added as an artist because
    # creating the second one replaces it as the axes' legend.
    legend1 = ax3.legend(harmonic_lines, [''.join([r"$n=$",str(k)]) for k in harmonics], loc=2, frameon=True)
    ax3.add_artist(legend1)
    ax3.legend([fourier_line, real_line], ["Fourier", "Real"], loc=1, frameon=True)
    ax3.set_ylabel("Consumption")
    

In [None]:
# Default figure size for the figures created from here on.
plt.rcParams['figure.figsize'] = (13, 7)

In [None]:
# Plot the daily series of the column at position 10, used as an example location.
migros_FRAME_daily.iloc[:,10].plot()

In [None]:
## Two very high electricity peaks "kill the shape" of the loads and change the
## Fourier features dramatically, so the days around them are cut out:
## 2018-09-10 to 2018-09-12 and 2019-02-07 to 2019-02-08 (bounds included).
_days = migros_FRAME_daily.index
_before_first_peak = _days < "2018-09-10"
_between_peaks = (_days > "2018-09-12") & (_days < "2019-02-07")
_after_second_peak = _days > "2019-02-08"
migros_FRAME_daily_GOOD = migros_FRAME_daily[_before_first_peak | _between_peaks | _after_second_peak]
migros_FRAME_daily_GOOD

In [None]:
# Same example column (position 10) after the peak days were removed.
migros_FRAME_daily_GOOD.iloc[:,10].plot()

In [None]:
# Zoom on the example column between 2018-04-01 and 2018-05-01.
migros_FRAME_daily_GOOD.iloc[:,10]["2018-04-01":"2018-05-01"].plot()

In [None]:
# Run the Fourier analysis, with plots, on the column at position i;
# the returned feature list is displayed as the cell output.
i=10
ComputeAndPlotFourier(np.array(migros_FRAME_daily_GOOD.iloc[:,i]),WePlot=True)

#### Fourier Features extraction

In [None]:
len(Station_locations_MIGROS)

In [None]:
# One row of daily values per column of the cleaned frame
_daily_by_column = np.array(migros_FRAME_daily_GOOD).T

# Compute the Fourier feature list of every row, without plotting
Fourier_features = []
for i, _column_series in enumerate(_daily_by_column):
    features = ComputeAndPlotFourier(_column_series, WePlot=False)
    Fourier_features.append(features)

Fourier_features = np.array(Fourier_features)

# Same features as a frame, indexed by the column labels of the daily frame
Fourier_features_FRAME = pd.DataFrame(Fourier_features, index=migros_FRAME_daily_GOOD.columns)
Fourier_features_FRAME

### 3. Applying PCA over selected Fourier features

In [None]:
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every Fourier feature
scaler = StandardScaler()
Fourier_features_std = scaler.fit_transform(Fourier_features)

# Fit a 5-component PCA on the standardised features and project them
pca = PCA(n_components=5).fit(Fourier_features_std)
Fourier_features_std_pca = pca.transform(Fourier_features_std)

# Check how much variance each of the 5 components explains, and how much they keep in total
_explained = pca.explained_variance_ratio_
print("Explained variance ratio for each direction:",_explained)
print("TOTAL explained variance:",sum(_explained))

In [None]:
## Question to Dan: why not use only the 5 features? I think the students reduced them to 2, if I'm not mistaken.

### 4. Apply a clustering algorithm (Nearest neighbors, DBSCAN and OPTICS)

In [None]:
from sklearn.cluster import DBSCAN,OPTICS
from sklearn.neighbors import NearestNeighbors
# Smaller default figure size and a visible grid for the figures of this section.
plt.rcParams['figure.figsize'] = (8,5)
sns.set_style("white", {'axes.grid' : True})

#### Nearest-neighbors

In [None]:
# Distance from every point to its nearest neighbour, on the standardised features.
# Two neighbours are requested and column 1 is kept: the query points are the fitted
# points themselves, so column 0 is presumably each point matched with itself.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(Fourier_features_std)
distances, indices = nbrs.kneighbors(Fourier_features_std)

plt.figure()
# A plain matplotlib histogram replaces sns.distplot(..., kde=False): distplot is
# deprecated in seaborn and scheduled for removal, and without the KDE it only drew this histogram.
ax = plt.gca()
ax.hist(distances[:,1], bins=[i*1 for i in range(10)], alpha=0.4)
plt.xlim(0,50)
plt.xlabel("Distance to NN")

In [None]:
# Distance from every point to its nearest neighbour, on the raw (unscaled) features.
# Two neighbours are requested and column 1 is kept: the query points are the fitted
# points themselves, so column 0 is presumably each point matched with itself.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(Fourier_features)
distances, indices = nbrs.kneighbors(Fourier_features)

plt.figure()
# A plain matplotlib histogram replaces sns.distplot(..., kde=False): distplot is
# deprecated in seaborn and scheduled for removal, and without the KDE it only drew this histogram.
ax = plt.gca()
ax.hist(distances[:,1], bins=[i*180 for i in range(40)], alpha=0.4)
plt.xlim(0,2000)
plt.xlabel("Distance to NN")

In [None]:
# Sorted nearest-neighbour distances on the standardised features (k-distance curve)
nbrs = neigh = NearestNeighbors(n_neighbors=2).fit(Fourier_features_std)
distances, indices = nbrs.kneighbors(Fourier_features_std)
# keep column 1 of the result and order it from smallest to largest
distances = np.sort(distances[:, 1])
plt.figure()
plt.plot(distances)
plt.ylim(0,4)

In [None]:
# Sorted nearest-neighbour distances on the raw features (k-distance curve)
nbrs = neigh = NearestNeighbors(n_neighbors=2).fit(Fourier_features)
distances, indices = nbrs.kneighbors(Fourier_features)
# keep column 1 of the result and order it from smallest to largest
distances = np.sort(distances[:, 1])
plt.figure()
plt.plot(distances)
plt.ylim(0,800)

#### DBSCAN

Parameters:

* MinPts: As a rule of thumb, a minimum minPts can be derived from the number of dimensions D in the data set, as minPts ≥ D + 1. The low value of minPts = 1 does not make sense, as then every point on its own will already be a cluster. With minPts ≤ 2, the result will be the same as that of hierarchical clustering with the single-link metric, with the dendrogram cut at height ε. Therefore, minPts must be chosen to be at least 3. However, larger values are usually better for data sets with noise and will yield more significant clusters. As a rule of thumb, minPts = 2·dim can be used, but it may be necessary to choose larger values for very large data, for noisy data or for data that contains many duplicates.

* ε: The value for ε can then be chosen by using a k-distance graph, plotting the distance to the k = minPts − 1 nearest neighbor, ordered from the largest to the smallest value. Good values of ε are where this plot shows an "elbow": if ε is chosen much too small, a large part of the data will not be clustered, whereas for a too high value of ε, clusters will merge and the majority of objects will be in the same cluster. In general, small values of ε are preferable, and as a rule of thumb only a small fraction of points should be within this distance of each other. Alternatively, an OPTICS plot can be used to choose ε, but then the OPTICS algorithm itself can be used to cluster the data.

============================

eps(float, default=0.5)

    The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.

min_samples(int, default=5)

    The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself.
    


In [None]:
## Question to Dan: the dataset dimension is D = 10 (5 if PCA is applied), so why not choose minPts >= 2*D?

#### Scanning the epsilon parameter

In [None]:
# Sweep eps from 0.2 in steps of 3 (60 values) on the raw features and report,
# for every eps, how many points fall under each DBSCAN label.
for el in [0.2 + i*3 for i in range(60)]:
    clustering = DBSCAN(eps=el, min_samples=3).fit(np.array(Fourier_features))

    # labels of the current run
    DBSCAN_labels = clustering.labels_

    # "label:count" for every distinct label
    _ids, _sizes = np.unique(DBSCAN_labels, return_counts=True)
    points_per_class = ["{}:{}".format(_id, _size) for _id, _size in zip(_ids, _sizes)]
    print("eps =",round(el,1))
    print(points_per_class)

#### Scanning the epsilon parameter (with Fourier_features_std)

In [None]:
# Sweep eps from 0.2 in steps of 0.1 (30 values) on the standardised features and report,
# for every eps, how many points fall under each DBSCAN label.
for el in [0.2 + i*0.1 for i in range(30)]:
    clustering = DBSCAN(eps=el, min_samples=10,metric="euclidean").fit(np.array(Fourier_features_std))

    # labels of the current run
    DBSCAN_labels = clustering.labels_

    # "label:count" for every distinct label
    _ids, _sizes = np.unique(DBSCAN_labels, return_counts=True)
    points_per_class = ["{}:{}".format(_id, _size) for _id, _size in zip(_ids, _sizes)]
    print("eps =",round(el,1))
    print(points_per_class)

#### TEST 1: eps=180,min_samples=2, metric=default (euclidean)

In [None]:
# DBSCAN on the raw features: eps=180, min_samples=2, default metric
clust_DB = DBSCAN(eps=180, min_samples=2).fit(Fourier_features)
DBSCAN_labels = clust_DB.labels_
# "label:count" for every distinct label of this run
_ids, _sizes = np.unique(DBSCAN_labels, return_counts=True)
points_per_class_DB = ["{}:{}".format(_id, _size) for _id, _size in zip(_ids, _sizes)]
print(points_per_class_DB)

#### TEST 2: eps=180,min_samples=2, metric=l1

In [None]:
# DBSCAN on the raw features: eps=180, min_samples=2, l1 metric
clust_DB = DBSCAN(eps=180, min_samples=2, metric="l1").fit(Fourier_features)
DBSCAN_labels = clust_DB.labels_
# "label:count" for every distinct label of this run
_ids, _sizes = np.unique(DBSCAN_labels, return_counts=True)
points_per_class_DB = ["{}:{}".format(_id, _size) for _id, _size in zip(_ids, _sizes)]
print(points_per_class_DB)

#### TEST 3: eps=180,min_samples=3, metric=l1

In [None]:
# DBSCAN on the raw features: eps=180, min_samples=3, l1 metric
clust_DB = DBSCAN(eps=180, min_samples=3, metric="l1").fit(Fourier_features)
DBSCAN_labels = clust_DB.labels_
# "label:count" for every distinct label of this run
_ids, _sizes = np.unique(DBSCAN_labels, return_counts=True)
points_per_class_DB = ["{}:{}".format(_id, _size) for _id, _size in zip(_ids, _sizes)]
print(points_per_class_DB)

#### TEST 4: eps=1.5,min_samples=5, metric=euclidean

In [None]:
# DBSCAN on the standardised features: eps=1.5, min_samples=5, euclidean metric
clust_DB = DBSCAN(eps=1.5, min_samples=5, metric="euclidean").fit(Fourier_features_std)
DBSCAN_labels = clust_DB.labels_
# "label:count" for every distinct label of this run
_ids, _sizes = np.unique(DBSCAN_labels, return_counts=True)
points_per_class_DB = ["{}:{}".format(_id, _size) for _id, _size in zip(_ids, _sizes)]
print(points_per_class_DB)

#### TEST 5: eps=1.7,min_samples=10, metric=euclidean

In [None]:
# DBSCAN on the standardised features: eps=1.7, min_samples=10, euclidean metric
clust_DB = DBSCAN(eps=1.7, min_samples=10, metric="euclidean").fit(Fourier_features_std)
DBSCAN_labels = clust_DB.labels_
# "label:count" for every distinct label of this run
_ids, _sizes = np.unique(DBSCAN_labels, return_counts=True)
points_per_class_DB = ["{}:{}".format(_id, _size) for _id, _size in zip(_ids, _sizes)]
print(points_per_class_DB)

In [None]:
# Transpose the daily frame (one row per location) and append the current content of
# DBSCAN_labels, i.e. the labels of whichever DBSCAN cell was executed last, as a column.
migros_FRAME_daily_GOOD_2 = migros_FRAME_daily_GOOD.T.assign(
    ClusterDBSCAN=np.array(DBSCAN_labels)
)

# put back location names as index
migros_FRAME_daily_GOOD_2.index = Station_locations_MIGROS2
migros_FRAME_daily_GOOD_2

In [None]:
def plotClusteredTimeSeries(cluster_labels,ClusterName,Save=False,frame=None):
    """
    Plot the time series of every cluster in its own subplot, with the cluster mean on top.

    Parameters
    ----------
    cluster_labels : array-like
        Cluster label of every series; only its distinct values are used,
        one subplot is drawn per distinct value.
    ClusterName : str
        Name of the column of `frame` that holds the cluster labels.
    Save : bool, default False
        When true, also write the figure to "SwisscomTimeSeries_<ClusterName>.pdf".
    frame : pandas.DataFrame, optional
        One row per series plus the `ClusterName` column. Defaults to the
        notebook-level `migros_FRAME_daily_GOOD_2`.
    """
    if frame is None:
        frame = migros_FRAME_daily_GOOD_2

    unique_labels = np.unique(cluster_labels)
    n_cols = 3
    # matplotlib requires an integer number of rows (np.ceil returns a float)
    n_rows = int(np.ceil(len(unique_labels) / n_cols))

    fig = plt.figure()
    # Subplot positions are counted from 1 over the labels actually present, so the
    # grid is filled correctly whether or not a noise label (-1) exists.
    for position, label in enumerate(unique_labels, start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)

        # Series of the current cluster. The label column is removed so that the
        # label value is not plotted as if it were the last sample of every series.
        cluster_frame = frame[frame[ClusterName] == label].drop(columns=[ClusterName])
        for series in np.array(cluster_frame):
            ax.plot(range(len(series)), series, linewidth=2)

        # mean series of the current cluster
        mean_series = np.array(cluster_frame.mean())
        ax.plot(range(len(mean_series)), mean_series, color="yellow")

        ax.tick_params(
            axis='x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom=False,      # ticks along the bottom edge are off
            top=False,         # ticks along the top edge are off
            labelbottom=False)

        ax.set_title("".join([str(len(cluster_frame)), " series"]))

    # Save before showing: some backends release the figure once it has been shown.
    if Save:
        # NOTE(review): the "Swisscom" prefix looks inherited from another project; kept so
        # that existing output paths do not change. Rename if nothing depends on it.
        fig.savefig("".join(["SwisscomTimeSeries_",ClusterName,".pdf"]),dpi=300,bbox_inches="tight")
    plt.show()
        

In [None]:
# Larger default figure size for the multi-panel cluster figure.
plt.rcParams['figure.figsize'] = (18,10)

In [None]:
# Plot the series grouped by the "ClusterDBSCAN" column; Save=True also writes the figure to a PDF file.
plotClusteredTimeSeries(DBSCAN_labels,"ClusterDBSCAN",Save=True)

### Check whether the locations differ between clusters, and whether the clusters make sense (needs MIGROS experts' input).

eps=180, minSamples=2, metric=l1

In [None]:
# Recompute the labels for the configuration named in the heading above
# (eps=180, min_samples=2, metric=l1). Reading the shared ClusterDBSCAN column instead
# would list whichever DBSCAN run was stored last, not the one this section describes.
_labels = DBSCAN(eps=180, min_samples=2, metric="l1").fit(Fourier_features).labels_

# Rows of Fourier_features follow the column order of the daily frame, whose labels are
# the names in Station_locations_MIGROS2.
print("Outliers:")
print([_name for _name, _label in zip(Station_locations_MIGROS2, _labels) if _label == -1])

# List every cluster that actually exists instead of a hard-coded range of ids
for _cluster in np.unique(_labels[_labels >= 0]):
    print("cluster{}:".format(_cluster))
    print([_name for _name, _label in zip(Station_locations_MIGROS2, _labels) if _label == _cluster])

###### Comment: We have fitness locations in the last cluster!

eps=180, minSamples=3, metric=l1

In [None]:
# Recompute the labels for the configuration named in the heading above
# (eps=180, min_samples=3, metric=l1). Reading the shared ClusterDBSCAN column instead
# would list whichever DBSCAN run was stored last, not the one this section describes.
_labels = DBSCAN(eps=180, min_samples=3, metric="l1").fit(Fourier_features).labels_

# Rows of Fourier_features follow the column order of the daily frame, whose labels are
# the names in Station_locations_MIGROS2.
print("Outliers:")
print([_name for _name, _label in zip(Station_locations_MIGROS2, _labels) if _label == -1])

# List every cluster that actually exists instead of a hard-coded range of ids
for _cluster in np.unique(_labels[_labels >= 0]):
    print("cluster{}:".format(_cluster))
    print([_name for _name, _label in zip(Station_locations_MIGROS2, _labels) if _label == _cluster])


eps=180, minSamples=2, metric=euclidean

In [None]:
# Recompute the labels for the configuration named in the heading above
# (eps=180, min_samples=2, metric=euclidean). Reading the shared ClusterDBSCAN column instead
# would list whichever DBSCAN run was stored last, not the one this section describes.
_labels = DBSCAN(eps=180, min_samples=2, metric="euclidean").fit(Fourier_features).labels_

# Rows of Fourier_features follow the column order of the daily frame, whose labels are
# the names in Station_locations_MIGROS2.
print("Outliers:")
print([_name for _name, _label in zip(Station_locations_MIGROS2, _labels) if _label == -1])

# List every cluster that actually exists instead of a hard-coded range of ids
for _cluster in np.unique(_labels[_labels >= 0]):
    print("cluster{}:".format(_cluster))
    print([_name for _name, _label in zip(Station_locations_MIGROS2, _labels) if _label == _cluster])

##### OPTICS

In [None]:
# Sweep the OPTICS min_samples parameter on the raw features and report cluster sizes.
# The sweep starts at 2: scikit-learn documents min_samples as "int > 1 or float between
# 0 and 1", and current releases reject the integer 1 with a parameter-validation error.
# NOTE(review): older releases appear to read values <= 1 as a fraction of the data set,
# so 1 would not have meant "one sample" there either - confirm if old results matter.
for mSamples in range(2,20):
    clust = OPTICS(min_samples=mSamples,cluster_method="xi",min_cluster_size=.1).fit(Fourier_features)
    # labels of the current run
    OPTICS_labels = clust.labels_
    # "label:count" for every distinct label
    points_per_class = [ "".join([ str(el),":",str(np.sum(1*(OPTICS_labels==el))) ]) for el in np.unique(OPTICS_labels)]
    print("min_samples = ",mSamples)
    print(points_per_class)

In [None]:
# OPTICS (xi method) on the raw features: min_samples=3, euclidean metric
clust_OP = OPTICS(min_samples=3, cluster_method="xi", metric="euclidean").fit(Fourier_features)
OPTICS_labels = clust_OP.labels_
# "label:count" for every distinct label of this run
_ids, _sizes = np.unique(OPTICS_labels, return_counts=True)
points_per_class_OP = ["{}:{}".format(_id, _size) for _id, _size in zip(_ids, _sizes)]
print(points_per_class_OP)

In [None]:
# OPTICS (xi method) on the raw features: min_samples=5, l1 metric
clust_OP = OPTICS(min_samples=5, cluster_method="xi", metric="l1").fit(Fourier_features)
OPTICS_labels = clust_OP.labels_
# "label:count" for every distinct label of this run
_ids, _sizes = np.unique(OPTICS_labels, return_counts=True)
points_per_class_OP = ["{}:{}".format(_id, _size) for _id, _size in zip(_ids, _sizes)]
print(points_per_class_OP)