In [2]:
import json
import pandas as pd
import subprocess
import time

from pydub import AudioSegment
from tqdm import tqdm

WAV_OUTPUT_DIR = '../data/processed/wav/'

# Video data

In [3]:
df_videos = pd.read_csv('../data/processed/videos_with_segmented_audio.csv')
df_videos.drop_duplicates(subset=['id'], inplace=True)
df_videos.head()

Unnamed: 0,search_tag,video_path,video_classes,id,desc,create_time,is_duet_enabled,hashtags,cover,play_address,...,author_followers_count,author_heart_count,author_digg_count,author_heart,video_contains_music,video_contains_male,video_contains_female,video_contains_noise,video_contains_no_energy,segments
0,biden2020,data/videos/6892193566290889985.mp4,,6892193566290889985,"We'll be right, thanks. New Zealand.. anyone? ...",1604713867,True,"bidenharris2020, straya, aussie, biden2020",https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,1800000,82800000,7080,82800000,True,True,True,False,True,"[{'label': 'music', 'start': 0.0, 'end': 3.86}..."
1,biden2020,data/videos/6839079433794505989.mp4,,6839079433794505989,Reply to @nickh1940 #repost share this so more...,1592347270,True,", repost, trump, viral, fyp, foryoupage, parat...",https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,208200,6600000,9305,6600000,True,True,False,False,False,"[{'label': 'music', 'start': 0.0, 'end': 1.54}..."
2,biden2020,data/videos/6885403130297732357.mp4,,6885403130297732357,anyways vote bidenðŸ’™#fyp #foryou #foryoupage #b...,1603132900,False,"fyp, foryou, foryoupage, biden2020",https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,236200,15700000,100400,15700000,True,False,False,False,False,"[{'label': 'music', 'start': 0.0, 'end': 6.54}]"
3,biden2020,data/videos/6889924077721521413.mp4,,6889924077721521413,Oh my god Barack Obama ðŸ¥µðŸ¥µðŸ¥µ #biden2020,1604185495,True,biden2020,https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,2200000,81400000,14300,81400000,True,False,False,True,False,"[{'label': 'noise', 'start': 0.0, 'end': 3.9},..."
4,biden2020,data/videos/6891017029051108614.mp4,,6891017029051108614,Reply to @taybarnett262 IDK HOW TO FEEL RN HE...,1604439965,True,", fyp, election2020, biden2020, trump2020",https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,206100,6200000,78000,6200000,True,True,True,True,True,"[{'label': 'music', 'start': 0.0, 'end': 14.88..."


# Checking what percentage of the video is made of music

In [4]:
audio_segments_df = pd.read_csv('../data/processed/video_audio_segments.csv')
audio_segments_df['total_duration'] = audio_segments_df['end'] - audio_segments_df['start']

# Grouping by video_id and label
grouped_audio_segments = audio_segments_df.groupby(['video_id', 'label']).agg({'total_duration': 'sum'})
grouped_audio_segments = grouped_audio_segments.reset_index()

videos_with_music = grouped_audio_segments.loc[grouped_audio_segments['label'] == 'music', ['video_id', 'total_duration']]
videos_with_music.columns = ['video_id', 'total_music_duration']
videos_with_music.head()

Unnamed: 0,video_id,total_music_duration
2,6748605663171136773,10.9
3,6750663345616981253,14.98
5,6752944407395175686,4.54
9,6755172553632926981,16.22
11,6756623980511137026,10.28


# Merging dataframe with audio segment

In [5]:
df_merged_videos_annotations = df_videos.merge(videos_with_music, left_on='id', right_on='video_id', how='left')
df_merged_videos_annotations = df_merged_videos_annotations.drop(['video_id'], axis=1)

# Calculating percentage of music in video
df_merged_videos_annotations['total_music_duration'] = df_merged_videos_annotations['total_music_duration'].fillna(0)
df_merged_videos_annotations['percentage_of_video_made_of_music'] = df_merged_videos_annotations['total_music_duration'] * 100.00 / df_merged_videos_annotations['duration']
df_merged_videos_annotations.set_index('id', inplace=True)
df_merged_videos_annotations.head()

Unnamed: 0_level_0,search_tag,video_path,video_classes,desc,create_time,is_duet_enabled,hashtags,cover,play_address,download_address,...,author_digg_count,author_heart,video_contains_music,video_contains_male,video_contains_female,video_contains_noise,video_contains_no_energy,segments,total_music_duration,percentage_of_video_made_of_music
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6892193566290889985,biden2020,data/videos/6892193566290889985.mp4,,"We'll be right, thanks. New Zealand.. anyone? ...",1604713867,True,"bidenharris2020, straya, aussie, biden2020",https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,https://v16-webapp-prime.tiktok.com/video/tos/...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,7080,82800000,True,True,True,False,True,"[{'label': 'music', 'start': 0.0, 'end': 3.86}...",3.86,13.310345
6839079433794505989,biden2020,data/videos/6839079433794505989.mp4,,Reply to @nickh1940 #repost share this so more...,1592347270,True,", repost, trump, viral, fyp, foryoupage, parat...",https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,9305,6600000,True,True,False,False,False,"[{'label': 'music', 'start': 0.0, 'end': 1.54}...",1.54,6.695652
6885403130297732357,biden2020,data/videos/6885403130297732357.mp4,,anyways vote bidenðŸ’™#fyp #foryou #foryoupage #b...,1603132900,False,"fyp, foryou, foryoupage, biden2020",https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,100400,15700000,True,False,False,False,False,"[{'label': 'music', 'start': 0.0, 'end': 6.54}]",6.54,109.0
6889924077721521413,biden2020,data/videos/6889924077721521413.mp4,,Oh my god Barack Obama ðŸ¥µðŸ¥µðŸ¥µ #biden2020,1604185495,True,biden2020,https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,14300,81400000,True,False,False,True,False,"[{'label': 'noise', 'start': 0.0, 'end': 3.9},...",12.84,80.25
6891017029051108614,biden2020,data/videos/6891017029051108614.mp4,,Reply to @taybarnett262 IDK HOW TO FEEL RN HE...,1604439965,True,", fyp, election2020, biden2020, trump2020",https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,78000,6200000,True,True,True,True,True,"[{'label': 'music', 'start': 0.0, 'end': 14.88...",14.88,26.105263


# Trimming audio containing only male and female voice

In [7]:
successful_audios_with_voice = 0
for video_id in tqdm(df_merged_videos_annotations.index):
    video = df_merged_videos_annotations.loc[[video_id]]
    video_id = video_id

    video_filename = '../' + video['video_path'].values[0]

    audio_voice_segments = audio_segments_df.loc[(audio_segments_df['video_id'] == video_id) & 
                                                 (audio_segments_df['label'].isin(['male', 'female']))].copy()
    audio_voice_segments.loc[:, 'next_start'] = audio_voice_segments.loc[:, 'start'].shift(-1)

    audio = AudioSegment.from_file(video_filename)
    trimmed_audio_voices = []

    audio_start = audio_voice_segments['start'].min() * 1000
    for idx, row in audio_voice_segments.iterrows():
        if row['next_start'] < row['end'] + 1:
            continue

        start = audio_start        
        end = (row['end'] + 1) * 1000
        if len(trimmed_audio_voices) == 0:
            trimmed_audio_voices = audio[start:end]
        else:
            trimmed_audio_voices = trimmed_audio_voices + audio[start:end]

        audio_start = row['next_start'] * 1000

    if len(trimmed_audio_voices) > 0:
        trimmed_audio_voices.export(f"../data/audios/{video_id}.mp3", format="mp3")
        successful_audios_with_voice += 1

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2029/2029 [10:06<00:00,  3.35it/s]


# Saving transcripted video

In [8]:
df_merged_videos_annotations.to_csv('../data/processed/transcripted_processed_videos.csv')