In [23]:
import re
import pickle 
import os 
import glob
from tqdm import tqdm
import pympi.Elan 
import pandas as pd
from sklearn.preprocessing import StandardScaler

import torch 
from transformers import BertModel 
from transformers import BertJapaneseTokenizer

import utils

from transformers import logging
logging.set_verbosity_error()


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 

# オークナイザーの読み込み
tokenizer = BertJapaneseTokenizer.from_pretrained(
    "cl-tohoku/bert-base-japanese-whole-word-masking"
)

# 学習済みモデルの読み込み
model = BertModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking").to(device)


def preprocess(text):
    """ 
    フィラー記号(F *)，|の除去
    """
    text = re.sub('\([^)]*\)', '', text)
    text = text.replace('|', '')

    return text


def embedding(sentences):
    # 前処理

    sentences = [preprocess(text) for text in sentences]

    # BERTトークン化
    encoded = tokenizer.batch_encode_plus(
        sentences, padding=True, add_special_tokens=True
    )


    # BERTトークンID列を抽出
    input_ids = torch.tensor(encoded["input_ids"], device=device) 

    # BERTの最大許容トークン数が512なので超える場合は切り詰める
    input_ids = input_ids[:, :512] 

    with torch.no_grad():
        outputs = model(input_ids) 

    # 最終層の隠れ状態ベクトルを取得
    last_hidden_states = outputs[0] 

    # [CLS]トークンの単語ベクトルを抽出
    vecs = last_hidden_states[:, 0, :]

    return vecs.tolist()


def eaf_to_df( eaf: pympi.Elan.Eaf ) -> pd.DataFrame:
    tier_names = list( eaf.tiers.keys() )

    def timeslotid_to_time( timeslotid: str ) -> float:
        return eaf.timeslots[ timeslotid ] / 1000

    def parse( tier_name: str, tier: dict ) -> pd.DataFrame:
        values = [ (key,) + value[:-1] for key, value in tier.items() ]
        df = pd.DataFrame( values, columns=[ "id", "start", "end", "transcription"] )

        df["start"] = df["start"].apply( timeslotid_to_time )
        df["end"] = df["end"].apply( timeslotid_to_time )
        df["ID"] = df.apply( lambda x: f"{tier_name}-{x.name}", axis=1 )
        df = df.reindex( columns=["ID", "start", "end", "transcription"] )

        return df

    dfs = [ parse(tier_name=name, tier=eaf.tiers[name][0]) for name in tier_names ]
    df = pd.concat( dfs )
    df = df.sort_values( "start" )
    df = df.reset_index( drop=True )
    return df

def extract_sentence(version, filename, start):
    res = []
    src = f'../data/Hazumi/Hazumi{version}/elan/' + filename + '.eaf' 

    eaf = pympi.Elan.Eaf(src) 
    df = eaf_to_df(eaf) 

    df['start'] = (df['start'] * 1000).astype(int)

    for time in start:
        sentence = df[(df['start'] == time) & (df['ID'].str.contains('user'))]['transcription'].values.tolist()
        if len(sentence) == 0:
            sentence = ['']
        sentence = embedding(sentence)
        res.append(sentence[0])

    return res


def personality_binarization(x, vid):
    """
    性格特性スコアを2クラスに分類
    """
    binary = {}
    df = pd.DataFrame.from_dict(x, orient='index')

    df = (df >= 0) * 1

    for id in vid:
        binary[id] = df.loc[id, :].tolist()

    return binary

# def calc_Big5(version, filename, third=False):
#     """
#     アノテータの質問回答から性格特性スコアを算出
#     """
#     df = pd.read_excel(f'../data/Hazumi{version}/questionnaire/220818thirdbigfive-Hazumi1911.xlsx', sheet_name=5, header=1, index_col=0)
#     data = df.loc[filename].values.tolist()
#     if third:
#         res = [data[5], data[13], data[21], data[29], data[37]]
#     else:
#         res = [data[7], data[15], data[23], data[31], data[39]]
#     return res

def calc_Big5(hazumiversion, filename):
    """
    アノテータの質問回答から性格特性スコアを算出
    """
    if hazumiversion == "1911":
        filepath = "../data/Hazumi/Hazumi1911/questionnaire/1911questionnaires.xlsx"
        df = pd.read_excel(filepath, sheet_name=4, header=1, index_col=0)
    elif hazumiversion == "2010":
        filepath = "../data/Hazumi/Hazumi2010/questionnaire/questionnaires.xlsx"
        df = pd.read_excel(filepath, sheet_name=3, header=1, index_col=0)
    elif hazumiversion == "2012":
        filepath = "../data/Hazumi/Hazumi2012/questionnaire/questionnaires.xlsx"
        df = pd.read_excel(filepath, sheet_name=3, header=1, index_col=0)
    data = df.loc[filename].values.tolist()
    return [data[0] + (8 - data[5]), (8-data[1]) + data[6], data[2] + (8-data[7]), data[3] + (8-data[8]), data[4] + (8-data[9])]

In [None]:
# 第三者アノテーション
text = {}
bert_text = {}
audio = {}
visual = {} 

TP = {}
TP_binary = None
TP_cluster = None

TS_ternary = {}
TS = {}

vid = []

for file_path in tqdm(sorted(files)):
    filename = os.path.basename(file_path).split('.', 1)[0]
    df = pd.read_csv(file_path)
    start = df['start(exchange)[ms]'].values.tolist()

    vid.append(filename)
    bert_text[filename] = extract_sentence(version, filename, start)
    text[filename] = df.loc[:, 'word#0001':'su'].values.tolist()
    audio[filename] = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis'].values.tolist()
    visual[filename] = df.loc[:, '17_acceleration_max':'AU45_c_mean'].values.tolist()

    TP[filename] = calc_Big5(filename, third=True)
    TS_ternary[filename] = df.loc[:, 'TS_ternary'].astype(int).values.tolist()
    TS[filename] = df.loc[:, 'TS1':'TS5'].mean(axis='columns').values.tolist()

TP_scaled = utils.dict_standardize(TP, vid)
TP_binary = personality_binarization(TP_scaled, vid)

TP_cluster = utils.clustering(TP_scaled, vid, n_clusters=4)

with open(feature_path + 'Hazumi1911_features.pkl', mode='wb') as f:
    pickle.dump((TS, TS_ternary, TP_scaled, TP_binary, TP_cluster, bert_text, audio, visual, vid), f)

In [None]:
# 1911
version = "1911"
dumpfile_path = f'../data/Hazumi/Hazumi{version}/dumpfiles/*'
files = glob.glob(dumpfile_path)
feature_path = '../data/Hazumi_features/'

# text = {}
bert_text = {}
audio = {}
visual = {} 

SP = {}
SP_binary = None
SP_cluster = None

TS_ternary = {}
TS = {}

vid = []

for file_path in tqdm(sorted(files)):
    filename = os.path.basename(file_path).split('.', 1)[0]
    df = pd.read_csv(file_path)
    start = df['start(exchange)[ms]'].values.tolist()

    vid.append(filename)
    bert_text[filename] = extract_sentence(version, filename, start)
    # text[filename] = df.loc[:, 'word#0001':'su'].values.tolist()
    audio[filename] = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis'].values.tolist()
    visual[filename] = df.loc[:, '17_acceleration_max':'AU45_c_mean'].values.tolist()

    SP[filename] = calc_Big5(version, filename)
    # SS_ternary[filename] = df.loc[:, 'SS_ternary'].astype(int).values.tolist()
    TS[filename] = df.loc[:, 'TS1':'TS5'].mean(axis='columns').values.tolist()
    

SP_scaled = utils.dict_standardize(SP, vid)
SP_binary = personality_binarization(SP_scaled, vid)

SP_cluster = utils.clustering(SP_scaled, vid, n_clusters=4)

with open(feature_path + f'Hazumi{version}_features_self.pkl', mode='wb') as f:
    pickle.dump((SS, SS_ternary, SP_scaled, SP_binary, SP_cluster, bert_text, audio, visual, vid), f)

In [36]:
# 2010
version = "2010"
dumpfile_path = f'../data/Hazumi/Hazumi{version}/dumpfiles/*'
files = glob.glob(dumpfile_path)
feature_path = '../data/Hazumi_features/'

# text = {}
bert_text = {}
audio = {}
visual = {} 

SP = {}
SP_binary = None
SP_cluster = None

# TS_ternary = {}
TS = {}

vid = []

for file_path in tqdm(sorted(files)):
    filename = os.path.basename(file_path).split('.', 1)[0]
    df = pd.read_csv(file_path)
    start = df['start(exchange)'].values.tolist()

    vid.append(filename)
    bert_text[filename] = extract_sentence(version, filename, start)
    # text[filename] = df.loc[:, 'word#0001':'su'].values.tolist()
    # audio[filename] = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis'].values.tolist()
    visual[filename] = df.loc[:, '17_acceleration_max':'AU45_c_mean'].values.tolist()

    SP[filename] = calc_Big5(version, filename)
    TS[filename] = df.loc[:, 'TS1':'TS3'].mean(axis='columns').values.tolist()
    

SP_scaled = utils.dict_standardize(SP, vid)
SP_binary = personality_binarization(SP_scaled, vid)

SP_cluster = utils.clustering(SP_scaled, vid, n_clusters=4)

with open(feature_path + f'Hazumi{version}_features_self.pkl', mode='wb') as f:
    pickle.dump((TS, SP_scaled, SP_binary, SP_cluster, bert_text, audio, visual, vid), f)

100%|██████████| 33/33 [00:25<00:00,  1.31it/s]


In [39]:
# 2012
version = "2012"
dumpfile_path = f'../data/Hazumi/Hazumi{version}/dumpfiles/*'
files = glob.glob(dumpfile_path)
feature_path = '../data/Hazumi_features/'

# text = {}
bert_text = {}
audio = {}
visual = {} 

SP = {}
SP_binary = None
SP_cluster = None

TS_ternary = {}
TS = {}

vid = []

for file_path in tqdm(sorted(files)):
    filename = os.path.basename(file_path).split('.', 1)[0]
    df = pd.read_csv(file_path)
    start = df['start(exchange)'].values.tolist()

    vid.append(filename)
    bert_text[filename] = extract_sentence(version, filename, start)
    # text[filename] = df.loc[:, 'word#0001':'su'].values.tolist()
    audio[filename] = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis'].values.tolist()
    visual[filename] = df.loc[:, '17_acceleration_max':'AU45_c_mean'].values.tolist()

    SP[filename] = calc_Big5(version, filename)
    TS[filename] = df.loc[:, 'TS1':'TS3'].mean(axis='columns').values.tolist()
    

SP_scaled = utils.dict_standardize(SP, vid)
SP_binary = personality_binarization(SP_scaled, vid)

SP_cluster = utils.clustering(SP_scaled, vid, n_clusters=4)

with open(feature_path + f'Hazumi{version}_features_self.pkl', mode='wb') as f:
    pickle.dump((TS, SP_scaled, SP_binary, SP_cluster, bert_text, audio, visual, vid), f)

  0%|          | 0/63 [00:00<?, ?it/s]


KeyError: 'F0_sma_de_kurtosis'