# 特徴量作成

In [7]:
import pandas as pd
import numpy as np
import pickle 
import matplotlib.pyplot as plt
import os 
import glob
import pprint
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [5]:
# Compute the self-reported (first-person) annotation scores.
def make_label(filename):
    """Return five questionnaire-derived scores for one participant.

    Loads sheet index 4 of the 1911 questionnaire workbook, looks up the row
    whose index equals *filename*, and combines the first ten answers
    pairwise (answer k with answer k+5). In every pair one answer is used
    as given and the other is reversed as ``8 - answer``.

    NOTE(review): a later cell labels the five values as extraversion,
    agreeableness, conscientiousness, neuroticism and openness, and the
    ``8 - x`` reversal presumably assumes a 1-7 answer scale -- confirm
    against the questionnaire sheet.

    Args:
        filename: Row label to look up in the sheet (the script passes the
            dump-file base name).

    Returns:
        A list with the five summed scores.

    Raises:
        KeyError: If *filename* is not present in the sheet's index.
    """
    sheet = pd.read_excel(
        '../../data/Hazumi1911/questionnaire/1911questionnaires.xlsx',
        sheet_name=4,
        index_col=0,
        header=1,
    )
    answers = sheet.loc[filename, :].values.tolist()

    def reverse(score):
        # Flip an answer that is scored in the opposite direction.
        return 8 - score

    return [
        answers[0] + reverse(answers[5]),
        reverse(answers[1]) + answers[6],
        answers[2] + reverse(answers[7]),
        answers[3] + reverse(answers[8]),
        answers[4] + reverse(answers[9]),
    ]

# Compute the third-party annotation scores.
def make_label_thirdbigfive(filename):
    """Return five third-party scores for one participant.

    Loads sheet index 5 of the third-party workbook, looks up the row whose
    index equals *filename*, and picks the values at positions 5, 13, 21, 29
    and 37 of that row (every eighth value, starting at position 5).

    NOTE(review): presumably each picked position holds the aggregated
    annotator score for one trait -- confirm against the sheet layout.

    Args:
        filename: Row label to look up in the sheet (the script passes the
            dump-file base name).

    Returns:
        A list with the five selected values, in column order.

    Raises:
        KeyError: If *filename* is not present in the sheet's index.
    """
    sheet = pd.read_excel(
        '../../data/Hazumi1911/questionnaire/220818thirdbigfive-Hazumi1911.xlsx',
        sheet_name=5,
        header=1,
        index_col=0,
    )
    row = sheet.loc[filename].values.tolist()
    picked_positions = (5, 13, 21, 29, 37)
    return [row[position] for position in picked_positions]

# Per-video containers. Every dict is keyed by the video ID, i.e. the dump
# file's base name up to its first '.'.
videoIDs = {}        # created here but not filled in this cell
videoAudio = {}
videoText = {}
videoVisual = {}
videoSentence = {}

videoThirdPersona = {}
videoPersona = {}
videoSentiment = {}

# Video IDs in processing (sorted-path) order.
Vid = []

path = '../../data/Hazumi1911/dumpfiles/*'

files = glob.glob(path)

for file_path in sorted(files):
    # Text before the first '.' of the base name is the video ID.
    filename = os.path.basename(file_path).partition('.')[0]
    df = pd.read_csv(file_path)

    # Slice each modality out of the dump by its first/last column label.
    text = df.loc[:, 'word#0001':'su'].values.tolist()
    audio = df.loc[:, 'pcm_RMSenergy_sma_max':'F0_sma_de_kurtosis'].values.tolist()
    visual = df.loc[:, '17_acceleration_max':'AU45_c_mean'].values.tolist()
    # One 'TS_ternary' value per row of the dump.
    label = df.loc[:, 'TS_ternary'].values.tolist()

    Vid.append(filename)
    videoText[filename] = text
    videoAudio[filename] = audio
    videoVisual[filename] = visual
    videoSentence[filename] = []  # placeholder; no sentence data is stored

    # Participant-level scores come from the questionnaire workbooks.
    videoThirdPersona[filename] = make_label_thirdbigfive(filename)
    videoPersona[filename] = make_label(filename)
    videoSentiment[filename] = label

# Write everything to a single pickle file.
with open('../../data/Hazumi_features/Hazumi1911_features.pkl', mode='wb') as f:
    pickle.dump(
        (
            videoSentiment,
            videoPersona,
            videoThirdPersona,
            videoText,
            videoAudio,
            videoVisual,
            videoSentence,
            Vid,
        ),
        f,
    )

# Read the file back (same element order as written above).
with open('../../data/Hazumi_features/Hazumi1911_features.pkl', mode='rb') as f:
    (
        videoSentiment,
        videoPersona,
        videoThirdPersona,
        videoText,
        videoAudio,
        videoVisual,
        videoSentence,
        Vid,
    ) = pickle.load(f, encoding='utf-8')

In [10]:
# Add a cluster number to every video.

# Standardise the five self-reported scores and split the participants into
# four K-means clusters (fixed seed for reproducibility).
# NOTE(review): 'agreauleness' looks like a typo for 'agreeableness'. It is
# left unchanged because other cells may reference the column by this name.
df = pd.DataFrame.from_dict(videoPersona, orient="index", columns=['extraversion', 'agreauleness', 'conscientiousness', 'neuroticism', 'openness'])
sc = StandardScaler()
df_sc = sc.fit_transform(df)
df_sc = pd.DataFrame(df_sc, columns=df.columns)
model = KMeans(n_clusters=4, random_state=1)
model.fit(df_sc)
cluster = model.labels_

# model.labels_ is ordered like the rows of df, whose index holds the video
# IDs taken from videoPersona. Pairing through df.index ties every label to
# the row it was computed for, instead of relying on sorted(files) listing the
# same videos in the same order. The dump CSVs are not needed here, so they
# are no longer re-read (which also used to overwrite df).
videoLabel = dict(zip(df.index, cluster))

# Write the pickle again, now with videoLabel as the first element.
with open('../../data/Hazumi_features/Hazumi1911_features.pkl', mode='wb') as f:
    pickle.dump((videoLabel, videoSentiment, videoPersona, videoThirdPersona, videoText, videoAudio, videoVisual, videoSentence, Vid), f)


In [14]:
# Collect the features cluster by cluster.

def make_feature(i):
    """Print every video ID stored in the module-level ``videoLabel`` dict.

    The loop in this function had no body, which made the cell a syntax
    error. The output recorded for this cell lists every video ID once per
    call, so that behaviour is restored here.

    Args:
        i: Cluster number. Currently unused.

    Returns:
        None. The IDs are written to standard output.
    """
    # TODO: gather the features of the videos whose videoLabel value equals
    # `i`; the grouping logic has not been written yet.
    for key in videoLabel:
        print(key)


# Reload the pickle written by the clustering cell; its first element is the
# video-ID -> cluster-number mapping.
with open('../../data/Hazumi_features/Hazumi1911_features.pkl', mode='rb') as f:
    (
        videoLabel,
        videoSentiment,
        videoPersona,
        videoThirdPersona,
        videoText,
        videoAudio,
        videoVisual,
        videoSentence,
        Vid,
    ) = pickle.load(f, encoding='utf-8')

# One call per cluster number (0 to 3).
for i in range(4):
    make_feature(i)

1911F2001
1911F2002
1911F3001
1911F3002
1911F3003
1911F4001
1911F4002
1911F4003
1911F5001
1911F5002
1911F6001
1911F6002
1911F6003
1911F7002
1911M2001
1911M2002
1911M2003
1911M4001
1911M4002
1911M5001
1911M5002
1911M6001
1911M6002
1911M6003
1911M7001
1911M7002
1911F2001
1911F2002
1911F3001
1911F3002
1911F3003
1911F4001
1911F4002
1911F4003
1911F5001
1911F5002
1911F6001
1911F6002
1911F6003
1911F7002
1911M2001
1911M2002
1911M2003
1911M4001
1911M4002
1911M5001
1911M5002
1911M6001
1911M6002
1911M6003
1911M7001
1911M7002
1911F2001
1911F2002
1911F3001
1911F3002
1911F3003
1911F4001
1911F4002
1911F4003
1911F5001
1911F5002
1911F6001
1911F6002
1911F6003
1911F7002
1911M2001
1911M2002
1911M2003
1911M4001
1911M4002
1911M5001
1911M5002
1911M6001
1911M6002
1911M6003
1911M7001
1911M7002
1911F2001
1911F2002
1911F3001
1911F3002
1911F3003
1911F4001
1911F4002
1911F4003
1911F5001
1911F5002
1911F6001
1911F6002
1911F6003
1911F7002
1911M2001
1911M2002
1911M2003
1911M4001
1911M4002
1911M5001
1911M5002
1911M6001


# 特徴量集計

In [19]:
# Summarise the distribution of each modality's features.
# For every modality the rows of all videos are pooled into one flat list,
# so describe() reports statistics over all rows of all videos.
text = [row for rows in videoText.values() for row in np.array(rows)]

df = pd.DataFrame(text)
print('----text----')
print(df.describe())

audio = [row for rows in videoAudio.values() for row in np.array(rows)]

df = pd.DataFrame(audio)
print('----audio----')
print(df.describe())

visual = [row for rows in videoVisual.values() for row in np.array(rows)]

print('----visual----')
df = pd.DataFrame(visual)
print(df.describe())

----text----
              0            1            2            3            4     \
count  2439.000000  2439.000000  2439.000000  2439.000000  2439.000000   
mean      0.060271     0.305043     0.000410     0.158672     0.018450   
std       0.259472     0.488190     0.020249     0.510591     0.137614   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.000000     0.000000     0.000000     0.000000     0.000000   
50%       0.000000     0.000000     0.000000     0.000000     0.000000   
75%       0.000000     1.000000     0.000000     0.000000     0.000000   
max       3.000000     3.000000     1.000000     7.000000     2.000000   

              5            6            7            8            9     ...  \
count  2439.000000  2439.000000  2439.000000  2439.000000  2439.000000  ...   
mean      0.191062     0.004510     0.010250     0.008610     0.006150  ...   
std       0.512744     0.088164     0.100743     0.096746     0.083277  ...   
min 