## Load Data

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os

In [3]:
try:
    table = pd.read_pickle("data/table.pkl")
except:
    table = pd.read_excel("data/association_table_corrected_index.xlsx")
    table.to_pickle("data/table.pkl")

In [4]:
from transformer import Event
from subsampler import balanced_subsampler

NUMBER_OF_SAMPLES_PER_EVENT = 20

tablerich = Event().fit_transform(table)


In [5]:
balanced_table = balanced_subsampler(tablerich, NUMBER_OF_SAMPLES_PER_EVENT)
balanced_table.event.value_counts()

TOR    20
PIS    20
TR     20
LP     20
HIB    20
EXP    20
VT     20
Name: event, dtype: int64

In [6]:
from transformer import Transformer

df = Transformer("data/Extracted/").fit_transform(balanced_table)

## K-Means clustering

In [7]:
df

Unnamed: 0,Index,Event,Duration,Event start,Event stop,Sampling rate,Overlap,npts,energy,path,event,variance,mean,median,maximum,amplitude
120791,120791,TOR,20,2010-07-26 16:14:04.560000+00:00,2010-07-26 16:14:24.560000+00:00,50.0,False,180001.0,0.01106,data/Extracted/TOR/TOR_120791.npy,TOR,3.737696e+02,68.153000,69.000000,153.000000,223.000000
112247,112247,PIS,49,2008-09-14 08:06:04.950000+00:00,2008-09-14 08:06:53.950000+00:00,50.0,False,180001.0,0.85581,data/Extracted/PIS/PIS_112247.npy,PIS,1.719263e+04,61.473061,63.000000,560.000000,1111.000000
68819,68819,TR2,96,2007-04-25 09:16:33.840000+00:00,2007-04-25 09:18:09.840000+00:00,40.0,False,319890.0,0.04361,data/Extracted/TR/TR_68819.npy,TR,1.085370e+04,-181.736984,-181.000000,503.000000,1258.000000
81379,81379,LP1,33,2007-06-20 13:28:11.680000+00:00,2007-06-20 13:28:44.680000+00:00,100.0,False,360001.0,0.03921,data/Extracted/LP/LP_81379.npy,LP,1.145074e+04,-176.205765,-176.000000,287.000000,938.000000
105861,105861,PIS,31,2008-04-17 19:26:08.180000+00:00,2008-04-17 19:26:39.180000+00:00,50.0,False,180001.0,2.10690,data/Extracted/PIS/PIS_105861.npy,PIS,3.632470e+04,65.935486,64.500000,841.000000,1638.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68295,68295,LP1,48,2007-04-23 21:08:28.190000+00:00,2007-04-23 21:09:16.190000+00:00,40.0,False,143999.0,0.05448,data/Extracted/LP/LP_68295.npy,LP,2.546701e+04,-168.690109,-170.000000,408.000000,1349.000000
120714,120714,TOR,17,2010-10-07 03:16:38.970000+00:00,2010-10-07 03:16:55.970000+00:00,50.0,False,180001.0,0.00358,data/Extracted/TOR/TOR_120714.npy,TOR,2.135279e+02,0.162082,0.484489,31.466145,65.403915
74417,74417,EXP,64,2007-05-24 06:32:32.350000+00:00,2007-05-24 06:33:36.350000+00:00,100.0,False,360001.0,94.78396,data/Extracted/EXP/EXP_74417.npy,EXP,1.430122e+07,-242.838440,-198.500000,21338.000000,38853.000000
99339,99339,PIS,31,2008-06-02 20:40:11.180000+00:00,2008-06-02 20:40:42.180000+00:00,50.0,False,180001.0,0.21075,data/Extracted/PIS/PIS_99339.npy,PIS,1.213186e+01,70.803871,71.000000,82.000000,23.000000


In [8]:
features = ["variance", "mean", "median", "maximum", "amplitude", "Duration", "energy"]

In [9]:
df[features]

Unnamed: 0,variance,mean,median,maximum,amplitude,Duration,energy
120791,3.737696e+02,68.153000,69.000000,153.000000,223.000000,20,0.01106
112247,1.719263e+04,61.473061,63.000000,560.000000,1111.000000,49,0.85581
68819,1.085370e+04,-181.736984,-181.000000,503.000000,1258.000000,96,0.04361
81379,1.145074e+04,-176.205765,-176.000000,287.000000,938.000000,33,0.03921
105861,3.632470e+04,65.935486,64.500000,841.000000,1638.000000,31,2.10690
...,...,...,...,...,...,...,...
68295,2.546701e+04,-168.690109,-170.000000,408.000000,1349.000000,48,0.05448
120714,2.135279e+02,0.162082,0.484489,31.466145,65.403915,17,0.00358
74417,1.430122e+07,-242.838440,-198.500000,21338.000000,38853.000000,64,94.78396
99339,1.213186e+01,70.803871,71.000000,82.000000,23.000000,31,0.21075


In [10]:
from sklearn.cluster import KMeans

kmeans = KMeans(7)

kmeans.fit(df[features])

KMeans(n_clusters=7)

In [11]:
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0,
       5, 0, 0, 0, 0, 6, 0, 0], dtype=int32)

### Imbalance measure

In [12]:
df.event

120791    TOR
112247    PIS
68819      TR
81379      LP
105861    PIS
         ... 
68295      LP
120714    TOR
74417     EXP
99339     PIS
113058    PIS
Name: event, Length: 140, dtype: object

In [16]:
from gini import gini

In [17]:
gini(kmeans.labels_)

0.9402597402597402

In [22]:
from sklearn.preprocessing import LabelEncoder

real_labels = LabelEncoder().fit_transform(balanced_table.event.values)

In [23]:
real_labels

array([4, 3, 5, 2, 3, 1, 3, 0, 4, 5, 6, 1, 4, 5, 3, 6, 2, 6, 0, 4, 2, 6,
       5, 2, 0, 2, 6, 0, 0, 1, 1, 3, 4, 2, 6, 6, 4, 0, 5, 2, 0, 6, 3, 6,
       5, 2, 6, 4, 4, 3, 4, 3, 4, 3, 1, 0, 1, 6, 0, 6, 0, 1, 4, 2, 1, 6,
       2, 2, 5, 4, 3, 5, 4, 1, 3, 3, 4, 0, 3, 1, 5, 6, 0, 6, 1, 5, 2, 4,
       2, 3, 5, 1, 1, 6, 3, 5, 6, 5, 2, 0, 5, 6, 2, 2, 0, 1, 4, 0, 1, 0,
       1, 5, 2, 4, 4, 2, 5, 3, 0, 2, 1, 3, 6, 5, 1, 0, 3, 5, 5, 6, 4, 5,
       0, 1, 1, 2, 4, 0, 3, 3])

In [24]:
gini(real_labels)

0.38095238095238093