## Load Data

In [45]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
from sklearn.metrics import rand_score, adjusted_rand_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
try:
    table = pd.read_pickle("data/table.pkl")
except:
    table = pd.read_excel("data/association_table_corrected_index.xlsx")
    table.to_pickle("data/table.pkl")

In [30]:
from transformer import Event
from subsampler import balanced_subsampler

NUMBER_OF_SAMPLES_PER_EVENT = 100

In [None]:

tablerich = Event().fit_transform(table)

In [31]:
balanced_table = balanced_subsampler(tablerich, NUMBER_OF_SAMPLES_PER_EVENT)
balanced_table.event.value_counts()

LP     100
TOR    100
TR     100
VT     100
PIS    100
HIB    100
EXP    100
Name: event, dtype: int64

In [32]:
from transformer import Transformer

df = Transformer("data/Extracted/").fit_transform(balanced_table)

In [33]:
df

Unnamed: 0,Index,Event,Duration,Event start,Event stop,Sampling rate,Overlap,npts,energy,path,event,variance,mean,median,maximum,amplitude
60021,60021,LP1,41,2007-03-22 09:09:09.560000+00:00,2007-03-22 09:09:50.560000+00:00,40.0,False,143999.0,0.00858,data/Extracted/LP/LP_60021.npy,LP,5025.824219,-42.770733,-43.000000,317.000000,690.000000
121684,121684,TOR,46,2011-06-01 13:45:10.510000+00:00,2011-06-01 13:45:56.510000+00:00,100.0,False,360001.0,1.65191,data/Extracted/TOR/TOR_121684.npy,TOR,89.271751,0.058913,0.022933,34.215183,66.605408
3430,3430,TR2,585,2006-05-30 10:08:55.830000+00:00,2006-05-30 10:18:40.830000+00:00,50.0,False,180000.0,0.55054,data/Extracted/TR/TR_3430.npy,TR,8848.485352,283.737854,283.224243,739.492432,958.898010
116704,116704,VT1,15,2009-03-25 03:20:13.900000+00:00,2009-03-25 03:20:28.900000+00:00,50.0,False,180001.0,3.37127,data/Extracted/VT/VT_116704.npy,VT,221090.375000,74.813332,70.000000,2680.000000,4350.000000
113183,113183,VT1,20,2008-09-25 10:28:58.940000+00:00,2008-09-25 10:29:18.940000+00:00,50.0,False,180001.0,0.14110,data/Extracted/VT/VT_113183.npy,VT,6950.770996,61.595001,61.000000,507.000000,954.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23588,23588,TR2,89,2006-07-30 22:52:50.450000+00:00,2006-07-30 22:54:19.450000+00:00,40.0,False,143999.0,0.18380,data/Extracted/TR/TR_23588.npy,TR,49823.167969,-114.280899,-116.000000,859.000000,1951.000000
113771,113771,EXP,25,2008-10-14 14:38:29.020000+00:00,2008-10-14 14:38:54.020000+00:00,50.0,False,180001.0,1.98882,data/Extracted/EXP/EXP_113771.npy,EXP,78114.031250,64.496002,63.000000,1078.000000,2114.000000
99399,99399,PIS,31,2008-07-02 18:18:58.390000+00:00,2008-07-02 18:19:29.390000+00:00,50.0,False,180001.0,0.21969,data/Extracted/PIS/PIS_99399.npy,PIS,22.048834,67.111610,67.000000,89.000000,39.000000
109903,109903,VT1,28,2008-07-18 14:28:33.260000+00:00,2008-07-18 14:29:01.260000+00:00,50.0,False,180001.0,0.30168,data/Extracted/VT/VT_109903.npy,VT,10621.668945,56.142143,58.000000,636.000000,1084.000000


In [8]:
features = ["variance", "mean", "median", "maximum", "amplitude", "Duration", "energy"]

In [9]:
df[features]

Unnamed: 0,variance,mean,median,maximum,amplitude,Duration,energy
120791,3.737696e+02,68.153000,69.000000,153.000000,223.000000,20,0.01106
112247,1.719263e+04,61.473061,63.000000,560.000000,1111.000000,49,0.85581
68819,1.085370e+04,-181.736984,-181.000000,503.000000,1258.000000,96,0.04361
81379,1.145074e+04,-176.205765,-176.000000,287.000000,938.000000,33,0.03921
105861,3.632470e+04,65.935486,64.500000,841.000000,1638.000000,31,2.10690
...,...,...,...,...,...,...,...
68295,2.546701e+04,-168.690109,-170.000000,408.000000,1349.000000,48,0.05448
120714,2.135279e+02,0.162082,0.484489,31.466145,65.403915,17,0.00358
74417,1.430122e+07,-242.838440,-198.500000,21338.000000,38853.000000,64,94.78396
99339,1.213186e+01,70.803871,71.000000,82.000000,23.000000,31,0.21075


## K-Means Clustering

In [34]:
from sklearn.cluster import KMeans

kmeans = KMeans(7)

kmeans.fit(df[features])

KMeans(n_clusters=7)

### Imbalance measure

In [36]:
from gini import gini

In [37]:
gini(kmeans.labels_)

0.9545574057843996

In [38]:
from sklearn.preprocessing import LabelEncoder

real_labels = LabelEncoder().fit_transform(balanced_table.event.values)

In [40]:
gini(real_labels)

0.38095238095238093

### Rand Index

In [41]:
rand_score(kmeans.labels_, real_labels)

0.239922338033926

In [46]:
adjusted_rand_score(kmeans.labels_, real_labels)

0.009958308705330524

## OPTICS

In [43]:
from sklearn.cluster import OPTICS

optics = OPTICS()

optics.fit(df[features])

OPTICS()

In [44]:
rand_score(optics.labels_, real_labels)

0.7235561005518087

In [47]:
adjusted_rand_score(optics.labels_, real_labels)

0.02350357359167249

## Affinity propagation

In [48]:
from sklearn.cluster import AffinityPropagation

aff = AffinityPropagation()

aff.fit(df[features])

print(adjusted_rand_score(aff.labels_, real_labels))

0.0


