In [10]:
import re
import pickle 
import os 
import glob
from tqdm import tqdm
import pympi.Elan 
import pandas as pd
from sklearn.preprocessing import StandardScaler

import torch 
from transformers import BertModel 
from transformers import BertJapaneseTokenizer

import utils

from transformers import logging
logging.set_verbosity_error()


# Use the first CUDA device when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Directory prefix used when the feature pickles are written.
feature_path = '../data/Hazumi_features/'

# Checkpoint name shared by the tokenizer and the encoder.
BERT_CHECKPOINT = "cl-tohoku/bert-base-japanese-whole-word-masking"

# Load the tokenizer.
tokenizer = BertJapaneseTokenizer.from_pretrained(BERT_CHECKPOINT)

# Load the pretrained model and move it to the selected device.
model = BertModel.from_pretrained(BERT_CHECKPOINT)
model = model.to(device)


def preprocess(text):
    """
    Strip annotation markup from a transcription string.

    Every parenthesised span such as ``(F ...)`` (filler marks) is removed,
    together with all ``|`` characters.

    Args:
        text: transcription string.

    Returns:
        The string with parenthesised spans and ``|`` removed.
    """
    # Raw string: '\(' in a normal string literal is an invalid escape sequence
    # (DeprecationWarning, and SyntaxWarning on Python 3.12+).
    text = re.sub(r'\([^)]*\)', '', text)
    text = text.replace('|', '')

    return text


def embedding(sentences):
    """
    Encode sentences with the module-level BERT model and return their [CLS] vectors.

    Each sentence is cleaned with ``preprocess``, tokenized as one padded batch,
    and run through ``model`` without gradient tracking.

    Args:
        sentences: list of transcription strings.

    Returns:
        A list with one vector (list of floats) per input sentence, taken from
        position 0 of the last hidden state. An empty input yields ``[]``.
    """
    # Preprocessing
    sentences = [preprocess(text) for text in sentences]
    if not sentences:
        return []

    # BERT tokenization. Truncating inside the tokenizer (instead of slicing the
    # id tensor afterwards) keeps the closing special token on over-long inputs.
    encoded = tokenizer.batch_encode_plus(
        sentences,
        padding=True,
        truncation=True,
        max_length=512,
        add_special_tokens=True,
    )

    # Token ids plus the mask that tells the model which positions are padding.
    input_ids = torch.tensor(encoded["input_ids"], device=device)
    attention_mask = torch.tensor(encoded["attention_mask"], device=device)

    with torch.no_grad():
        # Without the mask, pad tokens of shorter sentences would be attended to
        # and would change the vectors of a mixed-length batch.
        outputs = model(input_ids, attention_mask=attention_mask)

    # Hidden states of the final layer
    last_hidden_states = outputs[0]

    # Vector at the [CLS] position of every sentence
    vecs = last_hidden_states[:, 0, :]

    return vecs.tolist()


def eaf_to_df( eaf: pympi.Elan.Eaf ) -> pd.DataFrame:
    """
    Flatten every tier of an ELAN file into one DataFrame sorted by start time.

    The result has the columns ``ID`` ("<tier name>-<row number within the tier>"),
    ``start``, ``end`` and ``transcription``. ``start`` and ``end`` are the
    referenced timeslot values divided by 1000 (presumably milliseconds to
    seconds -- TODO confirm against the pympi documentation).
    """
    def to_seconds( slot_id: str ) -> float:
        # Resolve a timeslot id through the file's timeslot table.
        return eaf.timeslots[ slot_id ] / 1000

    def tier_frame( tier_name: str, annotations: dict ) -> pd.DataFrame:
        # One row per annotation: its key followed by all but the last field of its value.
        rows = []
        for ann_id, ann in annotations.items():
            rows.append( (ann_id,) + ann[:-1] )
        frame = pd.DataFrame( rows, columns=[ "id", "start", "end", "transcription"] )

        frame["start"] = frame["start"].apply( to_seconds )
        frame["end"] = frame["end"].apply( to_seconds )
        # row.name is the positional row label, so IDs are unique within a tier.
        frame["ID"] = frame.apply( lambda row: f"{tier_name}-{row.name}", axis=1 )

        return frame.reindex( columns=["ID", "start", "end", "transcription"] )

    frames = []
    for tier_name in list( eaf.tiers.keys() ):
        # Element 0 of a tier entry is used as the annotation dict
        # (presumably pympi's aligned annotations -- TODO confirm).
        frames.append( tier_frame( tier_name=tier_name, annotations=eaf.tiers[tier_name][0] ) )

    return pd.concat( frames ).sort_values( "start" ).reset_index( drop=True )

def extract_sentence(version, filename, start):
    """
    Return one BERT [CLS] vector per exchange start time for the user's utterance.

    The ELAN file ``../data/Hazumi/Hazumi{version}/elan/{filename}.eaf`` is read
    and, for every value in ``start``, the rows whose ID contains ``'user'`` and
    whose start time equals that value are embedded. When nothing matches, the
    empty string is embedded instead. If several rows match, only the vector of
    the first one is kept.

    Args:
        version: Hazumi version string used in the directory name (e.g. "1911").
        filename: base name of the ELAN file, without extension.
        start: iterable of start times, compared against integer start times
            derived from the ELAN file (presumably milliseconds -- TODO confirm).

    Returns:
        A list of vectors (lists of floats), one per entry of ``start``.
    """
    src = f'../data/Hazumi/Hazumi{version}/elan/' + filename + '.eaf'

    eaf = pympi.Elan.Eaf(src)
    df = eaf_to_df(eaf)

    # eaf_to_df divides by 1000; multiplying back can land just below the integer
    # (e.g. 4.035 * 1000 == 4034.9999999999995). Round before the int cast so the
    # equality test below does not silently miss the utterance.
    df['start'] = (df['start'] * 1000).round().astype(int)

    # The tier filter does not depend on the start time, so compute it once.
    user_rows = df[df['ID'].str.contains('user')]

    # Only the user-side vectors are returned, so system utterances are not
    # embedded (that work was previously discarded).
    res = []
    for time in start:
        sentence = user_rows.loc[user_rows['start'] == time, 'transcription'].tolist()
        if len(sentence) == 0:
            sentence = ['']
        res.append(embedding(sentence)[0])

    return res


def personality_binarization(x, vid):
    """
    Binarize personality trait scores into two classes.

    Args:
        x: dict mapping an id to its list of scores.
        vid: ids to include in the result, in output order.

    Returns:
        A dict mapping each id in ``vid`` to a list of 0/1 flags, where 1 means
        the corresponding score is >= 0.
    """
    scores = pd.DataFrame.from_dict(x, orient='index')
    flags = (scores >= 0).astype(int)
    return {session: flags.loc[session, :].tolist() for session in vid}

def get_personality(hazumiversion, filename):
    """
    Compute five personality trait scores from questionnaire answers.

    The row labelled ``filename`` is read from the questionnaire workbook of the
    given Hazumi version. Each score combines one answer with the reversed
    (``8 - answer``) answer five columns further on; for the second score the
    earlier answer is the reversed one. (Presumably 7-point items scored in the
    TIPI style -- TODO confirm.)

    Args:
        hazumiversion: "1911", "2010" or "2012".
        filename: row label of the participant in the workbook.

    Returns:
        A list of five scores.

    Raises:
        ValueError: if ``hazumiversion`` has no questionnaire workbook configured
            (previously this surfaced as an UnboundLocalError).
    """
    # Workbook path and sheet index per supported version.
    sources = {
        "1911": ("../data/Hazumi/Hazumi1911/questionnaire/1911questionnaires.xlsx", 4),
        "2010": ("../data/Hazumi/Hazumi2010/questionnaire/questionnaires.xlsx", 3),
        "2012": ("../data/Hazumi/Hazumi2012/questionnaire/questionnaires.xlsx", 3),
    }
    if hazumiversion not in sources:
        raise ValueError(
            f"unsupported Hazumi version {hazumiversion!r}; expected one of {sorted(sources)}"
        )

    filepath, sheet = sources[hazumiversion]
    df = pd.read_excel(filepath, sheet_name=sheet, header=1, index_col=0)
    data = df.loc[filename].values.tolist()
    return [data[0] + (8 - data[5]), (8-data[1]) + data[6], data[2] + (8-data[7]), data[3] + (8-data[8]), data[4] + (8-data[9])]

def get_third_personality(version, filename):
    """
    Read five personality trait scores from the "thirdbigfive" workbook.

    The row labelled ``filename`` is read from sheet index 5 of the workbook for
    the given version, and the values at positions 5, 13, 21, 29 and 37 of that
    row are returned (presumably one aggregate column per trait -- TODO confirm
    against the workbook layout).

    Args:
        version: "1911" or "2010".
        filename: row label of the participant in the workbook.

    Returns:
        A list of five scores.

    Raises:
        ValueError: if ``version`` has no workbook configured (previously this
            surfaced as an UnboundLocalError).
    """
    workbooks = {
        "1911": '../data/Hazumi/Hazumi1911/questionnaire/220818thirdbigfive-Hazumi1911.xlsx',
        "2010": '../data/Hazumi/Hazumi2010/questionnaire/220822thirdbigfive-Hazumi2010.xlsx',
    }
    if version not in workbooks:
        raise ValueError(
            f"unsupported Hazumi version {version!r}; expected one of {sorted(workbooks)}"
        )

    file_path = workbooks[version]
    df = pd.read_excel(file_path, sheet_name=5, header=1, index_col=0)
    data = df.loc[filename].values.tolist()
    return [data[5], data[13], data[21], data[29], data[37]]

In [17]:
# Hazumi1712: build per-file text / audio / visual features and TS labels, then pickle them.
version = "1712"
dumpfile_path = f'../data/Hazumi/Hazumi{version}/dumpfiles/*'
files = glob.glob(dumpfile_path)

# Feature containers keyed by the dump file's base name.
text, audio, visual = {}, {}, {}
# SS, SP and TP are not filled for this version; they are pickled as empty dicts.
SS, SP, TP = {}, {}, {}
TS_raw, TS_binary, TS_ternary = {}, {}, {}
vid = []

for file_path in tqdm(sorted(files)):
    filename = os.path.basename(file_path).split('.', 1)[0]
    df = pd.read_csv(file_path)
    start = df['start(exchange)[ms]'].values.tolist()

    vid.append(filename)
    text[filename] = extract_sentence(version, filename, start)
    audio[filename] = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis'].values.tolist()
    visual[filename] = df.loc[:, '17_acceleration_max':'AU45_c_mean'].values.tolist()

    # Row-wise mean over the TS1..TS5 columns, shared by the raw and binary labels.
    ts_mean = df.loc[:, 'TS1':'TS5'].mean(axis='columns')

    TS_raw[filename] = ts_mean.values.tolist()                     # raw mean
    TS_binary[filename] = (ts_mean >= 4.0).astype(int).values.tolist()  # 1 when mean >= 4.0
    TS_ternary[filename] = df.loc[:, 'TS_ternary'].astype(int).values.tolist()

# (The utils-based SP standardization / binarization / clustering step is disabled.)
# One pickle per label kind; everything except the TS labels is shared.
label_sets = (('raw', TS_raw), ('binary', TS_binary), ('ternary', TS_ternary))
for label_kind, ts_labels in label_sets:
    with open(feature_path + f'Hazumi{version}_features_{label_kind}.pkl', mode='wb') as f:
        pickle.dump((SS, ts_labels, SP, TP, text, audio, visual, vid), f)

100%|██████████| 27/27 [00:34<00:00,  1.29s/it]


In [18]:
# Hazumi1902: build per-file text / audio / visual features and SS / TS labels, then pickle them.
version = "1902"
dumpfile_path = f'../data/Hazumi/Hazumi{version}/dumpfiles/*'
files = glob.glob(dumpfile_path)

# Feature containers keyed by the dump file's base name.
text, audio, visual = {}, {}, {}
# SP and TP are not filled for this version; they are pickled as empty dicts.
SP, TP = {}, {}
SS_raw, TS_raw = {}, {}
SS_binary, TS_binary = {}, {}
SS_ternary, TS_ternary = {}, {}
vid = []

for file_path in tqdm(sorted(files)):
    filename = os.path.basename(file_path).split('.', 1)[0]
    df = pd.read_csv(file_path)
    start = df['start(exchange)[ms]'].values.tolist()

    vid.append(filename)
    text[filename] = extract_sentence(version, filename, start)
    audio[filename] = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis'].values.tolist()
    visual[filename] = df.loc[:, '17_acceleration_max':'AU45_c_mean'].values.tolist()

    ss = df.loc[:, 'SS']
    # Row-wise mean over the TS1..TS5 columns, shared by the raw and binary labels.
    ts_mean = df.loc[:, 'TS1':'TS5'].mean(axis='columns')

    # raw
    SS_raw[filename] = ss.astype(int).values.tolist()
    TS_raw[filename] = ts_mean.values.tolist()
    # binary: 1 when the value is >= 4.0
    SS_binary[filename] = (ss >= 4.0).astype(int).values.tolist()
    TS_binary[filename] = (ts_mean >= 4.0).astype(int).values.tolist()
    # ternary: taken from the precomputed columns
    SS_ternary[filename] = df.loc[:, 'SS_ternary'].astype(int).values.tolist()
    TS_ternary[filename] = df.loc[:, 'TS_ternary'].astype(int).values.tolist()

# (The utils-based SP standardization / binarization / clustering step is disabled.)
# One pickle per label kind; everything except the SS / TS labels is shared.
label_sets = (
    ('raw', SS_raw, TS_raw),
    ('binary', SS_binary, TS_binary),
    ('ternary', SS_ternary, TS_ternary),
)
for label_kind, ss_labels, ts_labels in label_sets:
    with open(feature_path + f'Hazumi{version}_features_{label_kind}.pkl', mode='wb') as f:
        pickle.dump((ss_labels, ts_labels, SP, TP, text, audio, visual, vid), f)

100%|██████████| 28/28 [00:36<00:00,  1.30s/it]


In [4]:
# Hazumi1911: build per-file features, personality scores and SS / TS labels, then pickle them.
version = "1911"
dumpfile_path = f'../data/Hazumi/Hazumi{version}/dumpfiles/*'
files = glob.glob(dumpfile_path)

# Feature containers keyed by the dump file's base name.
text, audio, visual = {}, {}, {}
SP, TP = {}, {}
SS_raw, TS_raw = {}, {}
SS_binary, TS_binary = {}, {}
SS_ternary, TS_ternary = {}, {}
vid = []

for file_path in tqdm(sorted(files)):
    filename = os.path.basename(file_path).split('.', 1)[0]
    df = pd.read_csv(file_path)
    start = df['start(exchange)[ms]'].values.tolist()

    vid.append(filename)
    text[filename] = extract_sentence(version, filename, start)
    audio[filename] = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis'].values.tolist()
    visual[filename] = df.loc[:, '17_acceleration_max':'AU45_c_mean'].values.tolist()

    # Personality scores from the two questionnaire workbooks.
    SP[filename] = get_personality(version, filename)
    TP[filename] = get_third_personality(version, filename)

    ss = df.loc[:, 'SS']
    # Row-wise mean over the TS1..TS5 columns, shared by the raw and binary labels.
    ts_mean = df.loc[:, 'TS1':'TS5'].mean(axis='columns')

    # raw
    SS_raw[filename] = ss.astype(int).values.tolist()
    TS_raw[filename] = ts_mean.values.tolist()
    # binary: 1 when the value is >= 4.0
    SS_binary[filename] = (ss >= 4.0).astype(int).values.tolist()
    TS_binary[filename] = (ts_mean >= 4.0).astype(int).values.tolist()
    # ternary: taken from the precomputed columns
    SS_ternary[filename] = df.loc[:, 'SS_ternary'].astype(int).values.tolist()
    TS_ternary[filename] = df.loc[:, 'TS_ternary'].astype(int).values.tolist()

# (The utils-based SP standardization / binarization / clustering step is disabled.)
# One pickle per label kind; everything except the SS / TS labels is shared.
label_sets = (
    ('raw', SS_raw, TS_raw),
    ('binary', SS_binary, TS_binary),
    ('ternary', SS_ternary, TS_ternary),
)
for label_kind, ss_labels, ts_labels in label_sets:
    with open(feature_path + f'Hazumi{version}_features_{label_kind}.pkl', mode='wb') as f:
        pickle.dump((ss_labels, ts_labels, SP, TP, text, audio, visual, vid), f)

  0%|          | 0/26 [00:00<?, ?it/s]

100%|██████████| 26/26 [00:38<00:00,  1.48s/it]


In [5]:
# Hazumi2010: build per-file features, personality scores and TS labels (raw / binary), then pickle them.
version = "2010"
dumpfile_path = f'../data/Hazumi/Hazumi{version}/dumpfiles/*'
files = glob.glob(dumpfile_path)

# Feature containers keyed by the dump file's base name.
text, audio, visual = {}, {}, {}
SP, TP = {}, {}
# SS is not filled for this version; it is pickled as an empty dict.
SS = {}
TS_raw, TS_binary = {}, {}
vid = []

for file_path in tqdm(sorted(files)):
    filename = os.path.basename(file_path).split('.', 1)[0]
    df = pd.read_csv(file_path)
    start = df['start(exchange)'].values.tolist()

    vid.append(filename)
    text[filename] = extract_sentence(version, filename, start)
    # The end label of the audio slice carries a trailing newline, presumably
    # matching the header of these dump files -- TODO confirm.
    audio[filename] = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis\n'].values.tolist()
    visual[filename] = df.loc[:, '17_acceleration_max':'AU45_c_mean'].values.tolist()

    # Personality scores from the two questionnaire workbooks.
    SP[filename] = get_personality(version, filename)
    TP[filename] = get_third_personality(version, filename)

    # Row-wise mean over the TS1..TS3 columns, shared by the raw and binary labels.
    ts_mean = df.loc[:, 'TS1':'TS3'].mean(axis='columns')

    TS_raw[filename] = ts_mean.values.tolist()                     # raw mean
    TS_binary[filename] = (ts_mean >= 4.0).astype(int).values.tolist()  # 1 when mean >= 4.0
    # Ternary labels are not produced for this version (that step is disabled).

# (The utils-based SP standardization / binarization / clustering step is disabled.)
# One pickle per label kind; everything except the TS labels is shared.
label_sets = (('raw', TS_raw), ('binary', TS_binary))
for label_kind, ts_labels in label_sets:
    with open(feature_path + f'Hazumi{version}_features_{label_kind}.pkl', mode='wb') as f:
        pickle.dump((SS, ts_labels, SP, TP, text, audio, visual, vid), f)

  0%|          | 0/33 [00:00<?, ?it/s]

100%|██████████| 33/33 [00:42<00:00,  1.28s/it]


In [6]:
# Show the dict currently bound to TS_raw (per-file lists of row-wise TS means).
# NOTE(review): this renders the whole dict; consider previewing a few entries instead.
TS_raw

{'2010F2002': [4.0,
  4.0,
  5.0,
  5.666666666666667,
  4.333333333333333,
  4.0,
  4.0,
  5.333333333333333,
  5.333333333333333,
  5.666666666666667,
  5.666666666666667,
  4.0,
  5.333333333333333,
  5.333333333333333,
  4.0,
  4.333333333333333,
  4.666666666666667,
  5.333333333333333,
  4.0,
  5.666666666666667,
  6.0,
  6.333333333333333,
  4.666666666666667,
  6.0,
  5.0,
  3.6666666666666665,
  5.666666666666667,
  3.0,
  3.0,
  5.333333333333333,
  5.666666666666667,
  6.0,
  5.333333333333333,
  4.0,
  4.333333333333333,
  5.0,
  4.0,
  5.0,
  3.0,
  3.0,
  6.0,
  5.666666666666667,
  5.666666666666667,
  6.0,
  5.666666666666667,
  4.666666666666667,
  4.333333333333333,
  5.333333333333333,
  6.333333333333333,
  5.666666666666667,
  5.666666666666667,
  3.0,
  6.0,
  5.333333333333333,
  5.333333333333333,
  5.333333333333333,
  5.333333333333333,
  4.666666666666667,
  5.333333333333333,
  6.0,
  5.333333333333333,
  4.333333333333333,
  4.0,
  4.0,
  4.0,
  5.0,
  5.66

In [7]:
# Hazumi2012: build per-file features, personality scores and TS labels (raw / binary), then pickle them.
version = "2012"
dumpfile_path = f'../data/Hazumi/Hazumi{version}/dumpfiles/*'
files = glob.glob(dumpfile_path)

# Feature containers keyed by the dump file's base name.
text = {}
audio = {}
visual = {}

SP = {}
TP = {}

# SS is not filled for this version; it is pickled as an empty dict.
SS = {}
TS_raw = {}
TS_binary = {}

vid = []

for file_path in tqdm(sorted(files)):
    filename = os.path.basename(file_path).split('.', 1)[0]
    df = pd.read_csv(file_path)
    start = df['start(exchange)'].values.tolist()

    vid.append(filename)
    text[filename] = extract_sentence(version, filename, start)
    # The end label of the audio slice carries a trailing newline, presumably
    # matching the header of these dump files -- TODO confirm.
    audio[filename] = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis\n'].values.tolist()
    visual[filename] = df.loc[:, '17_acceleration_max':'AU45_c_mean'].values.tolist()

    SP[filename] = get_personality(version, filename)
    # No third-party personality workbook is configured for this version.
    TP[filename] = None

    # Row-wise mean over the TS1..TS3 columns, computed once for both label kinds.
    ts_mean = df.loc[:, 'TS1':'TS3'].mean(axis='columns')

    # raw: keep the mean unrounded, as the other Hazumi version cells do, so the
    # merged raw-label file does not mix rounded and full-precision values.
    TS_raw[filename] = ts_mean.values.tolist()
    # binary: 1 when the mean is >= 4.0
    TS_binary[filename] = (ts_mean >= 4.0).astype(int).values.tolist()
    # Ternary labels are not produced for this version (that step is disabled).

# (The utils-based SP standardization / binarization / clustering step is disabled.)
with open(feature_path + f'Hazumi{version}_features_raw.pkl', mode='wb') as f:
    pickle.dump((SS, TS_raw, SP, TP, text, audio, visual, vid), f)

with open(feature_path + f'Hazumi{version}_features_binary.pkl', mode='wb') as f:
    pickle.dump((SS, TS_binary, SP, TP, text, audio, visual, vid), f)

  0%|          | 0/63 [00:00<?, ?it/s]

100%|██████████| 63/63 [01:24<00:00,  1.33s/it]


In [20]:
# All (raw): merge the per-version raw-label pickles into a single feature file.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
data = {}
for hazumi_version in ('1712', '1902', '1911', '2010', '2012'):
    with open(f'../data/Hazumi_features/Hazumi{hazumi_version}_features_raw.pkl', mode='rb') as f:
        data[hazumi_version] = pickle.load(f, encoding='utf-8')

# Keep the per-version names bound as well (dicts preserve insertion order).
data1712, data1902, data1911, data2010, data2012 = data.values()


def data_augmentation(versions):
    """
    Merge the feature tuples of the given versions, read from the global ``data``.

    Each tuple is (SS, TS, SP, TP, text, audio, visual, vid): the seven dicts are
    combined with ``dict.update`` in the order of ``versions`` and the id lists
    are concatenated.

    Returns:
        The merged (SS, TS, SP, TP, text, audio, visual, vid) tuple.
    """
    merged = [{} for _ in range(7)]
    vid = []
    for version in versions:
        *parts, ids = data[version]
        for target, part in zip(merged, parts):
            target.update(part)
        vid.extend(ids)
    SS, TS, SP, TP, text, audio, visual = merged
    return SS, TS, SP, TP, text, audio, visual, vid


SS, TS, SP, TP, text, audio, visual, vid = data_augmentation(['1712', '1902', '1911', '2010', '2012'])

with open(feature_path + 'Hazumiall_features_raw.pkl', mode='wb') as f:
    pickle.dump((SS, TS, SP, TP, text, audio, visual, vid), f)

In [16]:
# All (binary): merge per-version binary-label pickles into a single feature file.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
data = {}
for hazumi_version in ('1712', '1902', '1911', '2010', '2012'):
    with open(f'../data/Hazumi_features/Hazumi{hazumi_version}_features_binary.pkl', mode='rb') as f:
        data[hazumi_version] = pickle.load(f, encoding='utf-8')

# Keep the per-version names bound as well (dicts preserve insertion order).
data1712, data1902, data1911, data2010, data2012 = data.values()


def data_augmentation(versions):
    """
    Merge the feature tuples of the given versions, read from the global ``data``.

    Each tuple is (SS, TS, SP, TP, text, audio, visual, vid): the seven dicts are
    combined with ``dict.update`` in the order of ``versions`` and the id lists
    are concatenated.

    Returns:
        The merged (SS, TS, SP, TP, text, audio, visual, vid) tuple.
    """
    merged = [{} for _ in range(7)]
    vid = []
    for version in versions:
        *parts, ids = data[version]
        for target, part in zip(merged, parts):
            target.update(part)
        vid.extend(ids)
    SS, TS, SP, TP, text, audio, visual = merged
    return SS, TS, SP, TP, text, audio, visual, vid


# NOTE(review): five versions are loaded above but only three are merged into the
# "all" file here -- confirm whether 2010 and 2012 are left out on purpose.
SS, TS, SP, TP, text, audio, visual, vid = data_augmentation(['1712', '1902', '1911'])

with open(feature_path + 'Hazumiall_features_binary.pkl', mode='wb') as f:
    pickle.dump((SS, TS, SP, TP, text, audio, visual, vid), f)

In [5]:
# Tally the labels in TS: count[k] is the number of entries equal to k (0 or 1).
count = [0, 0]
for labels in TS.values():
    for label in labels:
        count[label] += 1
count

[2022, 13147]

In [15]:
# All (ternary): merge the per-version ternary-label pickles into a single feature file.
# Only 1712, 1902 and 1911 are loaded; the 2010 / 2012 loads are disabled.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
data = {}
for hazumi_version in ('1712', '1902', '1911'):
    with open(f'../data/Hazumi_features/Hazumi{hazumi_version}_features_ternary.pkl', mode='rb') as f:
        data[hazumi_version] = pickle.load(f, encoding='utf-8')

# Keep the per-version names bound as well (dicts preserve insertion order).
data1712, data1902, data1911 = data.values()


def data_augmentation(versions):
    """
    Merge the feature tuples of the given versions, read from the global ``data``.

    Each tuple is (SS, TS, SP, TP, text, audio, visual, vid): the seven dicts are
    combined with ``dict.update`` in the order of ``versions`` and the id lists
    are concatenated.

    Returns:
        The merged (SS, TS, SP, TP, text, audio, visual, vid) tuple.
    """
    merged = [{} for _ in range(7)]
    vid = []
    for version in versions:
        *parts, ids = data[version]
        for target, part in zip(merged, parts):
            target.update(part)
        vid.extend(ids)
    SS, TS, SP, TP, text, audio, visual = merged
    return SS, TS, SP, TP, text, audio, visual, vid


SS, TS, SP, TP, text, audio, visual, vid = data_augmentation(['1712', '1902', '1911'])

with open(feature_path + 'Hazumiall_features_ternary.pkl', mode='wb') as f:
    pickle.dump((SS, TS, SP, TP, text, audio, visual, vid), f)

In [4]:
# Tally the labels in TS: count[k] is the number of entries equal to k (0, 1 or 2).
count = [0, 0, 0]
for labels in TS.values():
    for label in labels:
        count[label] += 1
count

[772, 2735, 3530]