# 特徴量作成

In [6]:
import pandas as pd
import numpy as np
import pickle 
import matplotlib.pyplot as plt
import os 
import glob
import pympi.Elan 


# Personality-trait scores computed from the participant's own questionnaire answers
def calc_persona(filename, normalized=False):
    """Return the five self-reported trait scores for one participant.

    The row labelled ``filename`` is read from the questionnaire workbook
    (sheet at zero-based position 4, second spreadsheet row as header, first
    column as index). Its first ten values are combined pairwise: value ``k``
    is paired with value ``k + 5``, and one value of each pair is mirrored as
    ``8 - value`` before the two are added.

    Args:
        filename: Index label of the participant row in the sheet.
        normalized: When true, each score ``s`` is returned as ``(s - 2) / 12``.
            This maps to [0, 1] if every answer lies in 1..7 -- presumably the
            questionnaire scale; confirm against the workbook.

    Returns:
        A list with five scores, in the order the pairs appear in the row.
    """
    sheet = pd.read_excel(
        '../../data/Hazumi1911/questionnaire/1911questionnaires.xlsx',
        sheet_name=4,
        index_col=0,
        header=1,
    )
    answers = sheet.loc[filename, :].values.tolist()

    def flipped(position):
        # Mirrored answer for the pair member that is scored in reverse.
        return 8 - answers[position]

    scores = [
        answers[0] + flipped(5),
        flipped(1) + answers[6],  # the only pair whose first member is mirrored
        answers[2] + flipped(7),
        answers[3] + flipped(8),
        answers[4] + flipped(9),
    ]
    if not normalized:
        return scores
    return [(score - 2) / 12 for score in scores]

# Personality-trait scores taken from third-party annotations
def calc_thirdpersona(filename, normalized=False):
    """Return the five third-party trait scores for one participant.

    The row labelled ``filename`` is read from the third-party workbook
    (sheet at zero-based position 5, second spreadsheet row as header, first
    column as index). The values at row positions 5, 13, 21, 29 and 37 are
    returned as they are stored.

    Args:
        filename: Index label of the participant row in the sheet.
        normalized: When true, each score ``s`` is returned as ``(s - 2) / 12``.

    Returns:
        A list with five scores, in ascending column position.
    """
    # NOTE(review): the (s - 2) / 12 scaling is the one used for the
    # self-reported scores; confirm these columns share that value range.
    score_positions = (5, 13, 21, 29, 37)

    sheet = pd.read_excel(
        '../../data/Hazumi1911/questionnaire/220818thirdbigfive-Hazumi1911.xlsx',
        sheet_name=5,
        header=1,
        index_col=0,
    )
    row = sheet.loc[filename].values.tolist()
    scores = [row[position] for position in score_positions]
    if not normalized:
        return scores
    return [(score - 2) / 12 for score in scores]

def eaf_to_df(eaf: "pympi.Elan.Eaf") -> pd.DataFrame:
    """Flatten the aligned annotations of every tier into one table.

    For each tier in ``eaf.tiers`` the first element of the tier entry (a
    mapping of annotation id to a tuple whose last field is dropped and whose
    remaining fields are begin slot, end slot and text) is turned into rows.
    Slot ids are resolved through ``eaf.timeslots`` and divided by 1000.

    Args:
        eaf: Parsed ELAN document exposing ``tiers`` and ``timeslots``.

    Returns:
        A DataFrame with columns ``ID`` (``"<tier name>-<position in tier>"``),
        ``start``, ``end`` and ``transcription``, sorted by ``start`` and
        re-indexed from 0. Tiers without aligned annotations contribute no
        rows; a document with no annotations at all yields an empty frame
        with the same columns.
    """
    columns = ["ID", "start", "end", "transcription"]

    def slot_to_seconds(slot_id: str) -> float:
        # A slot may exist without a time value; report it as NaN rather
        # than failing on ``None / 1000``.
        raw = eaf.timeslots[slot_id]
        return float("nan") if raw is None else raw / 1000

    def parse(tier_name: str, annotations: dict) -> pd.DataFrame:
        # IDs are built while enumerating, so an empty tier simply produces
        # an empty frame instead of going through a row-wise ``apply``.
        rows = [
            (
                f"{tier_name}-{position}",
                slot_to_seconds(begin),
                slot_to_seconds(end),
                text,
            )
            for position, (begin, end, text) in enumerate(
                annotation[:-1] for annotation in annotations.values()
            )
        ]
        return pd.DataFrame(rows, columns=columns)

    frames = [parse(name, tier[0]) for name, tier in eaf.tiers.items()]
    # Empty frames carry no rows and would only disturb dtype inference.
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        # ``pd.concat`` refuses an empty list.
        return pd.DataFrame(columns=columns)

    merged = pd.concat(frames)
    return merged.sort_values("start").reset_index(drop=True)

In [12]:
# Demo: parse one ELAN file and print the flattened annotation table.
# NOTE(review): `src` is an absolute path on one machine; adjust before re-running.
src = r'/home/ryoyanagimoto/persona/data/1911F2001_elan.eaf' 
 
eaf = pympi.Elan.Eaf(src) 
df = eaf_to_df(eaf) 

# pandas abbreviates the printed frame when it is long or wide.
print(df)

           ID     start       end  \
0    Tier-0-0   255.000   262.660   
1    Tier-0-1   262.660   277.106   
2    Tier-0-2   277.106   301.465   
3    Tier-0-3   301.465   304.248   
4    Tier-0-4   304.248   333.039   
..        ...       ...       ...   
68  Tier-0-68  1381.872  1405.293   
69  Tier-0-69  1405.293  1439.194   
70  Tier-0-70  1439.194  1449.703   
71  Tier-0-71  1449.703  1467.870   
72  Tier-0-72  1467.870  1474.156   

                                        transcription  
0                     やば|(F はい)|(F んん)(F ん)(F あ)(F あ)  
1                               (F あ)|なるほど|よろしくお願いします  
2   はい|実は私はここまで自転車で来ました|めっちゃ(D は)|遠かったかな|でもなんかすごい|...  
3                                                      
4   (F あ)迷いました|(F あのー)なんか最初|阪大の門から入ったんですけど|なんかバス|な...  
..                                                ...  
68  はい|もうなんかほんとに|そういうＡＩ|の(F あの)だらけの世界になっていくんじゃないかな...  
69  (F うん)|(F えー)どんなとき(F えっと)なんか|(F あのー)|(F うーん)場所...  
70                     (F うん)|はい|あとなんかちょっと悩みとか相談する時

### ダンプファイル

In [3]:
# Per-video containers, each keyed by the dump-file stem.
text, audio, visual = {}, {}, {}
persona, third_persona = {}, {}
SS_ternary, TS_ternary = {}, {}
sentiment, third_sentiment = {}, {}

# Video ids in the (sorted) order they are processed.
vid = []

path = '../../data/Hazumi1911/dumpfiles/*'
files = glob.glob(path)

for file_path in sorted(files):
    # Video id: base name of the dump file up to its first dot.
    filename = os.path.basename(file_path).partition('.')[0]
    df = pd.read_csv(file_path)
    vid.append(filename)

    # Feature groups are contiguous column ranges; label slices include both ends.
    text[filename] = df.loc[:, 'word#0001':'su'].to_numpy().tolist()
    audio[filename] = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis'].to_numpy().tolist()
    visual[filename] = df.loc[:, '17_acceleration_max':'AU45_c_mean'].to_numpy().tolist()

    # One trait-score list per video.
    persona[filename] = calc_persona(filename)
    third_persona[filename] = calc_thirdpersona(filename)

    # One label per row of the dump file.
    TS_ternary[filename] = df['TS_ternary'].to_numpy().tolist()
    SS_ternary[filename] = df['SS_ternary'].to_numpy().tolist()
    # Row-wise mean over the columns from TS1 through TS5.
    third_sentiment[filename] = df.loc[:, 'TS1':'TS5'].mean(axis=1).to_numpy().tolist()
    sentiment[filename] = df['SS'].to_numpy().tolist()

# Write everything to disk as one tuple; readers must unpack in this order.
with open('../../data/Hazumi_features/Hazumi1911_features.pkl', 'wb') as f:
    pickle.dump(
        (SS_ternary, TS_ternary, sentiment, third_sentiment, persona, third_persona, text, audio, visual, vid),
        f,
    )

In [None]:
# This cell performs the same extraction as the earlier dump cell and
# overwrites the same pickle file.

# Per-video containers, each keyed by the dump-file stem.
text, audio, visual = {}, {}, {}
persona, third_persona = {}, {}
SS_ternary, TS_ternary = {}, {}
sentiment, third_sentiment = {}, {}

# Video ids in the (sorted) order they are processed.
vid = []

path = '../../data/Hazumi1911/dumpfiles/*'
files = glob.glob(path)

for file_path in sorted(files):
    # Video id: base name of the dump file up to its first dot.
    filename = os.path.basename(file_path).partition('.')[0]
    df = pd.read_csv(file_path)
    vid.append(filename)

    # Feature groups are contiguous column ranges; label slices include both ends.
    text[filename] = df.loc[:, 'word#0001':'su'].to_numpy().tolist()
    audio[filename] = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis'].to_numpy().tolist()
    visual[filename] = df.loc[:, '17_acceleration_max':'AU45_c_mean'].to_numpy().tolist()

    # One trait-score list per video.
    persona[filename] = calc_persona(filename)
    third_persona[filename] = calc_thirdpersona(filename)

    # One label per row of the dump file.
    TS_ternary[filename] = df['TS_ternary'].to_numpy().tolist()
    SS_ternary[filename] = df['SS_ternary'].to_numpy().tolist()
    # Row-wise mean over the columns from TS1 through TS5.
    third_sentiment[filename] = df.loc[:, 'TS1':'TS5'].mean(axis=1).to_numpy().tolist()
    sentiment[filename] = df['SS'].to_numpy().tolist()

# Write everything to disk as one tuple; readers must unpack in this order.
with open('../../data/Hazumi_features/Hazumi1911_features.pkl', 'wb') as f:
    pickle.dump(
        (SS_ternary, TS_ternary, sentiment, third_sentiment, persona, third_persona, text, audio, visual, vid),
        f,
    )

In [7]:
def _count_ternary(labels_by_video):
    """Tally ternary labels across all videos.

    Args:
        labels_by_video: Mapping of video id to a list of labels.

    Returns:
        A ``(neg, neu, pos)`` tuple: labels equal to 0, labels equal to 1,
        and every remaining label (any value other than 0 or 1 is counted
        as positive).
    """
    neg = neu = pos = 0
    for labels in labels_by_video.values():
        for label in labels:
            if label == 0:
                neg += 1
            elif label == 1:
                neu += 1
            else:
                pos += 1
    return neg, neu, pos


# The grand total is kept in `total`, not `sum`, so the builtin `sum`
# stays usable in later cells.
neg, neu, pos = _count_ternary(TS_ternary)
total = neg + neu + pos
print('-----TS_ternary-----')
print(neg, neu, pos, total)
# Majority baseline: share of the most frequent class (NaN when there are no labels).
majo = max(neg / total, neu / total, pos / total) if total else float('nan')
print(f'マジョリティベースライン：{majo:.3}')

neg, neu, pos = _count_ternary(SS_ternary)
total = neg + neu + pos
print('-----SS_ternary-----')
print(neg, neu, pos, total)
majo = max(neg / total, neu / total, pos / total) if total else float('nan')
print(f'マジョリティベースライン(SS_ternary)：{majo:.3}')

-----TS_ternary-----
178 881 1380 2439
マジョリティベースライン：0.566
-----SS_ternary-----
485 848 1106 2439
マジョリティベースライン(SS_ternary)：0.453
