In [1]:
import pandas as pd
import numpy as np

## SEP-28k: inter-annotator agreement and train/val/test split

In [2]:
df = pd.read_csv('../datasets/sep28k/SEP-28k_labels_new.csv')

In [3]:
from statsmodels.stats import inter_rater as irr
# Label columns: everything from column position 5 up to (but excluding) the last column.
label_df = df.iloc[:, 5:-1]
def calculate_agreement(label_df, n_raters=3):
    """Print Fleiss' kappa for every column of ``label_df``.

    Each column is treated as an independent binary rating task: the cell
    value is used as the number of positive votes for that row and
    ``n_raters - value`` as the number of negative votes, giving a
    two-category count table that is passed to
    ``statsmodels.stats.inter_rater.fleiss_kappa``.

    Parameters
    ----------
    label_df : pandas.DataFrame
        One column per label category; values are per-row positive-vote counts.
    n_raters : int, default 3
        Number of raters per row. The default matches the previously
        hard-coded value, so existing calls behave the same.

    Raises
    ------
    ValueError
        If a column holds a count below 0 or above ``n_raters``; such values
        would put negative cells in the count table and make kappa meaningless.

    Notes
    -----
    When all votes of a column fall into a single category the chance
    agreement equals 1 and kappa is 0/0. That case is reported explicitly
    instead of letting the division emit a RuntimeWarning and print a bare
    ``nan``.
    """
    for col in label_df.columns:
        pos = label_df[col]
        neg = n_raters - pos  # vectorised; no per-element lambda needed

        if ((pos < 0) | (neg < 0)).any():
            raise ValueError(
                f"Column {col!r} contains vote counts outside [0, {n_raters}]"
            )

        # Counts are non-negative here, so a zero sum means the category is never used.
        if pos.sum() == 0 or neg.sum() == 0:
            print(f"Fleiss' Kappa for category {col}: nan "
                  f"(undefined: all ratings fall in a single category)")
            continue

        table = pd.DataFrame({'pos': pos, 'neg': neg})
        # compute Fleiss' Kappa for the current category
        fleiss_kappa = irr.fleiss_kappa(table.values, method='fleiss')
        print(f"Fleiss' Kappa for category {col}: {fleiss_kappa:.2f}")
calculate_agreement(label_df)

Fleiss' Kappa for category Unsure: 0.02
Fleiss' Kappa for category PoorAudioQuality: 0.19
Fleiss' Kappa for category Prolongation: 0.24
Fleiss' Kappa for category Block: 0.12
Fleiss' Kappa for category SoundRep: 0.41
Fleiss' Kappa for category WordRep: 0.64
Fleiss' Kappa for category DifficultToUnderstand: 0.16
Fleiss' Kappa for category Interjection: 0.57
Fleiss' Kappa for category NoStutteredWords: 0.40
Fleiss' Kappa for category NaturalPause: 0.09
Fleiss' Kappa for category Music: 0.80


In [4]:
from sklearn.model_selection import train_test_split

# Seeded, row-level random split.
# Step 1: reserve 11% of the rows as a hold-out pool; everything else stays 'train'.
holdout_idx = train_test_split(df.index, test_size=0.11, random_state=42)[1]
df['split'] = 'train'
df.loc[holdout_idx, 'split'] = 'temp'

# Step 2: 70% of the pool becomes 'val'; the remaining 30% is relabelled 'test'
# (i.e. roughly 7.7% val and 3.3% test of all rows).
pool_idx = df.index[df['split'] == 'temp']
val_idx = train_test_split(pool_idx, test_size=0.7, random_state=42)[1]
df.loc[val_idx, 'split'] = 'val'
df['split'] = df['split'].replace('temp', 'test')

# Show how many rows landed in each partition.
df['split'].value_counts()

split
train    19451
val       1684
test       721
Name: count, dtype: int64

In [5]:
# Write the labels together with the new 'split' column; the index is not saved.
sep28k_split_csv = '../outputs/sep28k/SEP-28k_labels_new_split.csv'
df.to_csv(sep28k_split_csv, index=False)

## FluencyBank: inter-annotator agreement and train/val/test split

In [10]:
# read the data
import pandas as pd
import numpy as np
df = pd.read_csv('../datasets/fluencybank/fluencybank_labels_new.csv')

# Per-category agreement on the label columns
# (column position 5 up to, but excluding, the last column).
fb_label_cols = df.iloc[:, 5:-1]
calculate_agreement(fb_label_cols)

# NOTE: an episode-level split (shuffling the unique 'EpId' values and assigning
# whole episodes to train/val/test) was prototyped here and left disabled;
# the split below is row-level.

# Step 1: reserve 15% of the rows as a hold-out pool; everything else stays 'train'.
fb_holdout_idx = train_test_split(df.index, test_size=0.15, random_state=42)[1]
df['split'] = 'train'
df.loc[fb_holdout_idx, 'split'] = 'temp'

# Step 2: half of the pool becomes 'val', the other half is relabelled 'test'.
fb_pool_idx = df.index[df['split'] == 'temp']
fb_val_idx = train_test_split(fb_pool_idx, test_size=0.5, random_state=42)[1]
df.loc[fb_val_idx, 'split'] = 'val'
df['split'] = df['split'].replace('temp', 'test')

# Show how many rows landed in each partition.
df['split'].value_counts()

Fleiss' Kappa for category Unsure: 0.07
Fleiss' Kappa for category PoorAudioQuality: nan
Fleiss' Kappa for category Prolongation: 0.35
Fleiss' Kappa for category Block: 0.15
Fleiss' Kappa for category SoundRep: 0.39
Fleiss' Kappa for category WordRep: 0.50
Fleiss' Kappa for category DifficultToUnderstand: 0.23
Fleiss' Kappa for category Interjection: 0.58
Fleiss' Kappa for category NoStutteredWords: 0.54
Fleiss' Kappa for category NaturalPause: 0.05
Fleiss' Kappa for category Music: nan


  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)
  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)


split
train    3354
val       297
test      296
Name: count, dtype: int64

In [11]:
# Write the labels together with the new 'split' column; the index is not saved.
fluencybank_split_csv = '../outputs/fluencybank/fluencybank_labels_new_split.csv'
df.to_csv(fluencybank_split_csv, index=False)

In [2]:
# read temp labels and organize them
import pandas as pd
# NOTE: this rebinds `df`, replacing whatever frame the name held before this cell.
temp_labels_csv = '../datasets/fluencybank/labels_temp.csv'
df = pd.read_csv(temp_labels_csv)
# Preview the first rows.
df.head()

Unnamed: 0,annotation_id,start_time,end_time,RM,FP,SR,ISR,MUR,P,B,V,NV,T,annotator_id,media_file
0,a58,1520,2360,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,I,24ma.wav
1,a59,2660,3440,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,I,24ma.wav
2,a73,4240,5920,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,I,24ma.wav
3,a74,13680,14300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,I,24ma.wav
4,a61,23880,24340,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,I,24ma.wav


In [3]:
# Order the annotations by start time, breaking ties by end time.
# NOTE(review): 'media_file' is not part of the sort key; if the CSV holds more than
# one recording their rows will interleave here. Confirm this is intended.
sort_keys = ['start_time', 'end_time']
df = df.sort_values(by=sort_keys)
df.head()

Unnamed: 0,annotation_id,start_time,end_time,RM,FP,SR,ISR,MUR,P,B,V,NV,T,annotator_id,media_file
0,a58,1520,2360,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,I,24ma.wav
37,a58,1750,2400,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,R,24ma.wav
15,a58,1800,2390,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,A,24ma.wav
1,a59,2660,3440,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,I,24ma.wav
38,a59,2680,3480,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,R,24ma.wav
