In [10]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, DBSCAN
from sklearn.metrics import adjusted_rand_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [58]:
df = pd.read_pickle('crowd_train_all_data_embedded.pkl')


def get_dataframe(df, min_count, max_count):
    df_temp = df.copy()

    freq = df_temp.source_id.value_counts()
    freq = freq[min_count <= freq]
    df_temp = df_temp[df.source_id.isin(freq.index)]

    df_temp = (
        df_temp.groupby('source_id')
        .apply(lambda x: x.nlargest(max_count, 'duration'))
        .reset_index(drop=True)
        .drop(
            columns=[
                'duration',
                'hash_id',
                'annotator_emo',
                'golden_emo',
                'annotator_id',
                'speaker_text',
                'speaker_emo',
            ]
        )
    )

    df_temp.dropna(inplace=True)

    print(df_temp.source_id.nunique())

    return df_temp


df_t = get_dataframe(df, 350, 350)

8


In [59]:
X = np.vstack(df_t.audio_feature.to_numpy()).reshape(df_t.shape[0], -1)
le = LabelEncoder()
y = le.fit_transform(df_t.source_id)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape, len(y_train[y_train == 4]), len(y_test[y_test == 4])

((2240, 498), (560, 498), 280, 70)

In [60]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [61]:
params = {
    "n_clusters": len(set(y)),
    "random_state": 42,
    # "max_iter": 10_000,
    # "algorithm": "elkan",
    # "init": "k-means++",
    # "tol": 0.01,
}
sk_kmeans = KMeans(**params)
sk_kmeans.fit(X_train_scaled)

  super()._check_params_vs_input(X, default_n_init=10)


In [62]:
preds_train = sk_kmeans.predict(X_train_scaled)
preds_test = sk_kmeans.predict(X_test_scaled)
kmeans_f1_w_train = f1_score(y_train, preds_train, average='weighted')
kmeans_f1_w_test = f1_score(y_test, preds_test, average='weighted')
kmeans_f1_w_train, kmeans_f1_w_test

(0.02143987781202623, 0.024519291361576812)

In [None]:
from esoinn import ESoinn

s = ESoinn(iteration_threshold=300, plt_in_fit=False)
s.fit(X)

# s.fit(X, iters=1_000, reset=False)
# s.fit(X, iters=1_000, reset=False)
# s.fit(X, iters=1_000, reset=False)

# # nodes = s.nodes

# # print(len(nodes))

# # print("end")

In [17]:
essoin_preds = s.predict(X)
essoin_score = adjusted_rand_score(y, essoin_preds)
essoin_score

Number of classes： 174


100%|██████████| 5639/5639 [00:06<00:00, 857.21it/s]


0.02239257341062798

In [7]:
sk_kmeans = KMeans(n_clusters=len(set(y)), n_init='auto', random_state=0)
sk_kmeans.fit(X)

In [8]:
sk_kmeans_pred_res = sk_kmeans.predict(X)
sk_kmeans_ari = adjusted_rand_score(y, sk_kmeans_pred_res)
sk_kmeans_centroinds = sk_kmeans.cluster_centers_
print(f'Adjusted Rand Score for sk KMeans: {sk_kmeans_ari}', '', sep='\n')
print(sk_kmeans_centroinds, '', sep='\n')
print('prediction', sk_kmeans_pred_res, sep='\n')

Adjusted Rand Score for sk KMeans: 0.028899193321984266

[[0.38986363 0.39170796 0.40495995 ... 0.40756367 0.13941199 0.15164149]
 [0.33985543 0.36217529 0.35772014 ... 0.33941419 0.08055199 0.11999184]
 [0.36440136 0.33980225 0.32937995 ... 0.4480504  0.12651256 0.16391591]
 ...
 [0.33642276 0.33196179 0.35390949 ... 0.3720779  0.13663802 0.1316373 ]
 [0.32933408 0.34497613 0.42107573 ... 0.43799211 0.07595324 0.24247997]
 [0.47509672 0.38872761 0.32449922 ... 0.33633232 0.15845421 0.1881055 ]]

prediction
[253 242 140 ... 190 190  93]


In [9]:
sk_ac = AgglomerativeClustering(n_clusters=len(set(y)), linkage='ward')
sk_ac_pred_res = sk_ac.fit_predict(X)
sk_ac_ari = adjusted_rand_score(y, sk_ac_pred_res)
print(f'Adjusted Rand Score for sk AgglomerativeClustering: {sk_ac_ari}', '', sep='\n')
print('prediction', sk_ac_pred_res, sep='\n')

Adjusted Rand Score for sk AgglomerativeClustering: 0.02797180762776233

prediction
[125 125  41 ... 171 342 171]


In [10]:
sk_sc = SpectralClustering(n_clusters=len(set(y)), gamma=10, random_state=0)
sk_sc_pred_res = sk_sc.fit_predict(X)
sk_sc_ari = adjusted_rand_score(y, sk_sc_pred_res)
print(f'Adjusted Rand Score for sk SpectralClustering: {sk_sc_ari}', '', sep='\n')
print('prediction', sk_sc_pred_res, sep='\n')



KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 177, in where
KeyboardInterrupt: 


In [15]:
sk_dbscan = DBSCAN(eps=0.3, min_samples=3)
sk_dbscan_pred_res = sk_dbscan.fit_predict(X)
sk_dbscan_ari = adjusted_rand_score(y, sk_dbscan_pred_res)
print(f'Adjusted Rand Score for sk DBSCAN: {sk_dbscan_ari}', '', sep='\n')
print('prediction', sk_dbscan_pred_res, sep='\n')

Adjusted Rand Score for sk DBSCAN: 0.0

prediction
[-1 -1 -1 ... -1 -1 -1]
