Loading the dataset
At the moment we need to load the annotations directly from the database. We can replace this code with loading the csv file once the challenge is released

In [24]:
import numpy as np
import pandas as pd
from hcai_datasets.hcai_nova_dynamic.nova_db_handler import NovaDBHandler
from utils.config_utils import read_configs

# Database Config file
db_cfg_path = '../../.configs/nova/nova_db.cfg'

# Sets
sessions_test = [f'recording{i:02d}' for i in range(1,8)]
sessions_train = [f'recording{i:02d}' for i in range(8,29)]

# Annotators
annotators = ['liechtensteinaugust','reineckermarcleon', 'fabianmaurer']

# Roles
roles = ['subjectPos1', 'subjectPos2', 'subjectPos3', 'subjectPos4']

# Schemes
scheme_consensus = 'backchannel_consensuality'
scheme_nonverbal = 'backchannel_nonverbal'
scheme_paraverbal = 'backchannel_paraverbal'
scheme_verbal = 'backchannel_verbal'

# Database handler
db_handler = NovaDBHandler(db_cfg_path)

# Annos
consensus = {}
nonverbal = {}
paraverbal = {}
verbal = {}


# Loading
for s in sessions_train + sessions_test:
    for r in roles:
        for a in annotators:
            print(f'Loading anno {s} - {r} - {a}')
            consensus[(s, r, a)] = db_handler.get_annos(dataset='mpiiemo', scheme=scheme_consensus, session=s,annotator=a,roles=r)
            nonverbal[(s, r, a)] = db_handler.get_annos(dataset='mpiiemo', scheme=scheme_nonverbal, session=s,annotator=a,roles=r)
            paraverbal[(s, r, a)] = db_handler.get_annos(dataset='mpiiemo', scheme=scheme_paraverbal, session=s,annotator=a,roles=r)
            verbal[(s, r, a)] = db_handler.get_annos(dataset='mpiiemo', scheme=scheme_verbal, session=s,annotator=a,roles=r)

Loaded config from ../../.configs/nova/nova_db.cfg:
---------------------
DB
	ip : 137.250.171.233
	port : 37317
	user : schildom
---------------------
Loading anno recording08 - subjectPos1 - liechtensteinaugust
No annotions found for 
	-annotator: liechtensteinaugust
	-scheme: backchannel_nonverbal
	-session: recording08
	-role: subjectPos1
No annotions found for 
	-annotator: liechtensteinaugust
	-scheme: backchannel_paraverbal
	-session: recording08
	-role: subjectPos1
No annotions found for 
	-annotator: liechtensteinaugust
	-scheme: backchannel_verbal
	-session: recording08
	-role: subjectPos1
Loading anno recording08 - subjectPos1 - reineckermarcleon
Loading anno recording08 - subjectPos1 - fabianmaurer
No annotions found for 
	-annotator: fabianmaurer
	-scheme: backchannel_nonverbal
	-session: recording08
	-role: subjectPos1
No annotions found for 
	-annotator: fabianmaurer
	-scheme: backchannel_paraverbal
	-session: recording08
	-role: subjectPos1
No annotions found for 
	-ann

In [209]:
# 0: consensus, 1: nonverbal, 2: paraverbal, 3: verbal
data_header = ['consensus', 'non_verbal', 'para_verbal', 'verbal']

# Dropping empty annos and convert rest to pandas dataframes
annos = [
    {key:pd.DataFrame(val) for key, val in consensus.items() if val != -1},
    {key:pd.DataFrame(val) for key, val in nonverbal.items() if val != -1},
    {key:pd.DataFrame(val) for key, val in paraverbal.items() if val != -1},
    {key:pd.DataFrame(val) for key, val in verbal.items() if val != -1}
]

# Add Partition to index
annos = [ {(('train',)+key if key[0] in sessions_train else ('test',) + key) : val for key, val in x.items() } for x in annos]

In [210]:
def df_for_scheme(annos):
    df = pd.concat(annos)
    df = df.drop(columns=['conf', 'meta'])
    df.index.names = ['set', 'session', 'subject', 'annotator', 'id']
    df['dur'] = df['to'] - df['from']
    return df

annos_stats = {n : df_for_scheme(annos[i]) for i,n in enumerate(data_header)}

# Map consensus label id to float value
annos_stats['consensus'] = annos_stats['consensus'].astype(float).replace(
    [0,1,2,3,4,5,6,7,8,9],
    [-1,-0.75,-0.5,-0.25,0,0.25,0.5,0.75,1,np.nan]
)

print('Annos have been generated. Read for analysis')

Annos have been generated. Read for analysis


Let`s generate some statistics...
We start with the gneral description of the dataframes

In [211]:
for k,v in annos_stats.items():
    print('----------------')
    print(k)
    print(v.describe())

----------------
consensus
              from           to           id          dur
count  6039.000000  6039.000000  5969.000000  6039.000000
mean    584.105859   585.792416     0.289873     1.605211
std     350.037287   350.114822     0.411309     1.623509
min       0.200000     1.600000    -1.000000    -1.000000
25%     272.840000   273.960000     0.000000     0.720000
50%     580.720000   582.520000     0.250000     1.240000
75%     879.340000   881.360000     0.500000     1.960000
max    1301.440000  1301.880000     1.000000    25.080000
----------------
non_verbal
              from           to      id          dur
count  4328.000000  4328.000000  4328.0  4328.000000
mean    592.173013   594.015102     0.0     1.842089
std     353.044153   353.127587     0.0     1.506278
min       0.200000     1.640000     0.0     0.000000
25%     282.670000   284.320000     0.0     0.960000
50%     585.700000   587.100000     0.0     1.440000
75%     901.300000   903.540000     0.0     2.200000

In [214]:
# example
annos_stats['consensus'].loc['train','recording08','subjectPos1','reineckermarcleon']
#annos_stats['consensus'].droplevel('id').drop(columns=['from', 'to']).groupby(level=['set','session']).describe()

  annos_stats['consensus'].loc['train','recording08','subjectPos1','reineckermarcleon']


Unnamed: 0_level_0,from,to,id,dur
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,35.0,35.4,,0.4
1,35.92,37.68,,1.76
2,49.68,50.64,0.5,0.96
3,121.08,121.76,0.0,0.68
4,124.48,124.88,1.0,0.4
5,242.36,242.88,0.5,0.52
6,328.36,329.36,0.0,-0.75
7,354.68,355.64,0.5,0.96
8,459.24,460.28,0.25,1.04
9,483.64,484.4,0.0,0.76
