In [1]:
import pandas as pd
import numpy as np
import contextlib
import wave
import os

In [2]:
# Path of the diarization result file (whitespace-delimited text, no header row).
diari_file = 'diarization'

# Raw string: '\s' is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning before the regex ever reaches pandas.
df_diari = pd.read_csv(diari_file, sep=r'\s+', header=None)
# Name the columns y1..yN by position so later cells can refer to them by name.
df_diari.columns = ['y%s' % i for i in range(1, df_diari.shape[1] + 1)]

In [3]:
# Columns of df_diari that the rest of this notebook reads:
# y2: audio file name (e.g. a_Gong_10_0.wav)
# y4: segment start time
# y5: segment duration (presumably seconds, since it is later divided by the
#     WAV length in seconds -- TODO confirm)
# y8: speaker id
df_diari.head(5)

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10
0,SPEAKER,a_Gong_10_0.wav,1,0.01,0.65,,,0,,
1,SPEAKER,a_Gong_10_0.wav,1,7.39,4.85,,,0,,
2,SPEAKER,a_Gong_10_0.wav,1,19.67,1.89,,,0,,
3,SPEAKER,a_Gong_10_0.wav,1,25.0,7.03,,,0,,
4,SPEAKER,a_Gong_10_0.wav,1,36.65,2.52,,,0,,


In [4]:
import matplotlib.pyplot as plt

In [8]:
def get_rename_id_dict(rename_path='all_cilp_rename.npy'):
    '''Build a mapping from original clip ids to renamed clip ids.

    The file at ``rename_path`` must contain a pickled dict that maps an
    original name to its renamed name.  The last four characters of every
    key and every value are dropped (presumably a file extension such as
    ``.wav`` -- confirm against the data), giving entries of the form
    ``'Gong_8_4': 'Gong_8_4_ES'``.

    Parameters
    ----------
    rename_path : str, optional
        Location of the ``.npy`` file.  Defaults to ``'all_cilp_rename.npy'``
        in the current working directory.

    Returns
    -------
    dict
        ``{original_id: renamed_id}`` with the four-character suffix removed
        from both sides.
    '''
    # SECURITY: allow_pickle=True unpickles arbitrary Python objects, which can
    # execute code.  Only load rename files that come from a trusted source.
    rename_dict = np.load(rename_path, allow_pickle=True).item()
    return {
        old_name[:-4]: new_name[:-4]
        for old_name, new_name in rename_dict.items()
    }
def get_diari_dict(df_diari,rename_id_dict):
    '''Group diarization segments by renamed clip id.

    Parameters
    ----------
    df_diari : pandas.DataFrame
        Diarization table with at least the columns ``y2`` (audio file name),
        ``y4`` (start time), ``y5`` (duration) and ``y8`` (speaker id).
    rename_id_dict : dict
        Maps a clip id to its renamed id.  The clip id of a row is its ``y2``
        value without the first two and the last four characters
        (``'a_Gong_10_0.wav'`` -> ``'Gong_10_0'``).

    Returns
    -------
    dict
        ``{renamed_id: [[start, duration, speaker_id], ...]}`` with segments
        in the row order of ``df_diari``.  Rows whose clip id is missing from
        ``rename_id_dict`` are skipped.
    '''
    diari_dict = {}
    # Iterate the column arrays positionally: indexing with df['y2'][i] is
    # label-based and fails on any frame whose index is not 0..n-1
    # (for example after filtering or concatenation).
    rows = zip(
        df_diari['y2'].values,
        df_diari['y4'].values,
        df_diari['y5'].values,
        df_diari['y8'].values,
    )
    for audio_name, start, duration, speaker in rows:
        video_id = audio_name[2:-4]
        if video_id not in rename_id_dict:
            continue
        renamed_id = rename_id_dict[video_id]
        diari_dict.setdefault(renamed_id, []).append([start, duration, speaker])
    return diari_dict

In [9]:
def get_audio_duration(file_path):
    '''Return the length of the WAV file at ``file_path`` in seconds.

    The length is the number of frames divided by the frame rate.
    '''
    wav_reader = wave.open(file_path, 'r')
    # closing() guarantees the reader is closed once the header has been read.
    with contextlib.closing(wav_reader):
        n_frames = wav_reader.getnframes()
        frame_rate = wav_reader.getframerate()
    return n_frames / float(frame_rate)

def get_speak_time(diari_dict):
    '''
    Summarise how much of each clip is covered by diarized segments.

    diari_dict maps a clip key to a list of segments whose element at
    index 1 is the segment duration.  The result maps the same key to
    [speak_duration, whole_time, speak_percent], where speak_duration and
    whole_time are rounded to 3 decimals and speak_percent (their ratio)
    to 4 decimals.

    Reads the module-level name root_path to locate the audio files.
    '''
    speak_dict = {}
    for clip_key, segments in diari_dict.items():
        talk_time = sum(segment[1] for segment in segments)

        # Audio location: <root_path>/<first token of key>-a/a_<key minus last 3 chars>.wav
        name_parts = clip_key.split('_')
        wav_path = os.path.join(
            root_path,
            name_parts[0]+'-a',
            'a_'+clip_key[:-3]+'.wav',
        )
        clip_length = get_audio_duration(wav_path)

        ratio = round((talk_time/clip_length),4)
        speak_dict[clip_key] = [round(talk_time,3),round(clip_length,3),ratio]
    return speak_dict

In [12]:
# Example of the diari_dict structure built below:
# renamed clip id -> list of [start, duration, speaker id] segments.
'''
{'LiuY_9_4_FP': [[0.0, 12.88, 0],
  [16.9, 10.13, 0],
  [30.0, 138.81, 0],

'''

# Map original clip ids to renamed ids, then group the diarization rows by renamed id.
rename_id_dict = get_rename_id_dict()
diari_dict = get_diari_dict(df_diari,rename_id_dict)

# get_speak_time reads root_path as a module-level global, so it has to be
# assigned before the call on the next line.
# NOTE(review): get_total_time hard-codes a different root
# ('/Teamwork-clip-audio') -- confirm which directory is the right one.
root_path = '/AsdData/DMHXM/Teamwork-clip-audio'
speak_dict = get_speak_time(diari_dict)

In [14]:
import numpy as np
import os
import contextlib
import wave

In [15]:
def get_turns(ls):
    '''Count speaker changes in a sequence of speaker ids.

    A turn is counted every time an element differs from the element
    immediately before it, e.g. ``[0, 0, 1, 1, 0]`` contains 2 turns.

    Parameters
    ----------
    ls : iterable
        Speaker ids in chronological order.

    Returns
    -------
    int
        Number of changes between consecutive elements; 0 for an empty or
        single-element input.
    '''
    speakers = iter(ls)
    try:
        current = next(speakers)
    except StopIteration:
        # Empty input has no turns (indexing ls[0] would raise IndexError).
        return 0

    turn = 0
    for speaker in speakers:
        if speaker != current:
            turn += 1
            current = speaker
    return turn
def get_audio_duration(file_path):
    '''Length of the WAV file at ``file_path`` in seconds (frame count / frame rate).'''
    # NOTE: this repeats the definition from an earlier cell; running this cell
    # rebinds the name to this copy.
    with contextlib.closing(wave.open(file_path, 'r')) as wav_reader:
        n_frames, frame_rate = wav_reader.getnframes(), wav_reader.getframerate()
    return n_frames / float(frame_rate)
def get_total_time(video, root_path='/Teamwork-clip-audio'):
    '''Return the duration in seconds of one audio clip, or None if it is missing.

    The clip is looked up at ``<root_path>/<token>-a/<video>``, where
    ``<token>`` is the second ``_``-separated part of ``video``
    (``'a_Gong_10_0.wav'`` -> folder ``'Gong-a'``).

    Parameters
    ----------
    video : str
        Audio file name; must contain at least one ``_``.
    root_path : str, optional
        Directory holding the per-speaker audio folders.  Defaults to
        ``'/Teamwork-clip-audio'``.
        NOTE(review): another cell of this notebook uses
        ``'/AsdData/DMHXM/Teamwork-clip-audio'``; pass that value here if it
        is the correct location.

    Returns
    -------
    float or None
        Clip length from ``get_audio_duration``; ``None`` (after printing a
        ``PATH ERROR`` line) when the file does not exist.
    '''
    folder = video.split('_')[1]+'-a'
    video_path = os.path.join(root_path,folder,video)
    if not os.path.exists(video_path):
        print('PATH ERROR',video_path)
        return None
    return get_audio_duration(video_path)

In [41]:
def summary_df(df_diari):
    '''
    Compute per-clip speaking statistics from the diarization table.

    Parameters
    ----------
    df_diari : pandas.DataFrame
        Needs the columns y2 (audio file name), y5 (segment duration) and
        y8 (speaker id).

    Returns
    -------
    dict
        Maps the clip id (y2 without its first two and last four characters)
        to [total_time, turns, avg_turns, duration_sum, duration_per]:
        total_time   -- clip length returned by get_total_time
        turns        -- speaker changes counted by get_turns
        avg_turns    -- turns / total_time
        duration_sum -- sum of the y5 values of the clip
        duration_per -- duration_sum / total_time
        Clips for which get_total_time returns None are left out.
    '''
    diari_per_dict = {}
    # A single groupby pass replaces re-filtering the whole frame once per
    # clip; sort=False keeps clips in first-appearance order and rows in
    # their original order.
    for video, mini_df in df_diari.groupby('y2', sort=False):
        total_time = get_total_time(video)
        if total_time is None:
            # Audio file not found; get_total_time has already reported it.
            continue

        duration_sum = sum(mini_df['y5'].values)
        turns = get_turns(list(mini_df['y8']))
        duration_per = duration_sum/total_time
        avg_turns = turns/total_time

        video_key = video[2:-4]
        diari_per_dict[video_key] = [total_time,turns,avg_turns,duration_sum,duration_per]

    return diari_per_dict

In [48]:
def get_seq_matrix_dict(type_names=None, clips_by_type=None, clip_stats=None):
    '''
    Aggregate per-clip statistics into one vector per clip group.

    Every clip contributes a row
    [total_time, turns, avg_turns, duration_sum, duration_per].
    For a group with several clips the rows are summed and the two ratio
    entries are recomputed from the sums:
    [sum(total_time), sum(turns), sum(turns)/sum(total_time),
     sum(duration_sum), sum(duration_sum)/sum(total_time)].
    A group with exactly one clip keeps that clip's row unchanged.
    A group without clips is printed and left out of the result.

    Parameters
    ----------
    type_names : iterable, optional
        Group labels.  Defaults to the module-level ``big_type_ls``.
    clips_by_type : dict, optional
        Group label -> list of clip ids.  Every key must also appear in
        ``type_names``.  Defaults to the module-level ``ls_dict``.
    clip_stats : dict, optional
        Clip id -> statistics row.  Defaults to the module-level
        ``single_clip_dict``.

    Returns
    -------
    dict
        Group label -> numpy array of length 5.
    '''
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if type_names is None:
        type_names = big_type_ls
    if clips_by_type is None:
        clips_by_type = ls_dict
    if clip_stats is None:
        clip_stats = single_clip_dict

    seq_dict = {type_name: [] for type_name in type_names}
    for type_name, clip_ids in clips_by_type.items():
        for clip_id in clip_ids:
            seq_dict[type_name].append(clip_stats[clip_id])

    seq_matrix_dict = {}
    for type_name, rows in seq_dict.items():
        if len(rows) > 1:
            totals = np.vstack(rows).sum(axis=0)
            # Ratios are rebuilt from the totals; summing per-clip ratios would be meaningless.
            seq_matrix_dict[type_name] = np.array([
                totals[0],
                totals[1],
                totals[1]/totals[0],
                totals[3],
                totals[3]/totals[0],
            ])
        elif len(rows) == 1:
            seq_matrix_dict[type_name] = np.array(rows[0])
        else:
            # No clip belongs to this group: report it and skip.
            print(type_name)
    return seq_matrix_dict

In [44]:
# Per-clip statistics: clip id -> [total_time, turns, avg_turns, duration_sum, duration_per].
diari_per_dict = summary_df(df_diari)

# big_type_one_dict is looked up by clip id below and yields a group label.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
big_type_one_dict = np.load('big_type_one_dict.npy', allow_pickle=True).item()
# Distinct group labels; the order is arbitrary because it comes from a set.
big_type_ls = list(set(big_type_one_dict.values()))

# big_type_ls, ls_dict and single_clip_dict are module-level names that
# get_seq_matrix_dict reads, so this cell has to run before that function is called.
single_clip_dict = diari_per_dict
# ls_dict: group label -> list of clip ids that belong to the group.
ls_dict = {}
for clip_big in big_type_ls:
    ls_dict[clip_big]=[]
for key in single_clip_dict.keys():
    
    big_key = key
    
    if big_key in big_type_one_dict.keys():
        big_value = big_type_one_dict[big_key]
        ls_dict[big_value].append(big_key)
    else:
        # Clip without a group label: print its id so it can be checked by hand.
        print(big_key)

In [51]:
# Persist the per-clip statistics dict. To read it back, use the same pattern
# as for the other .npy dictionaries in this notebook:
# np.load('diari_per_dict.npy', allow_pickle=True).item()
np.save('diari_per_dict.npy',diari_per_dict)