In [11]:
import pandas as pd
# Show every float with three decimal places when pandas renders frames/series.
def _format_float(value):
    """Return `value` formatted with exactly three decimals."""
    return '%.3f' % value


pd.set_option('display.float_format', _format_float)

# Loading data
## Loading Whisper-transcribed data

In [12]:
# Read the transcription results, report the frame's shape, then preview the first rows.
transcripts_csv = '../data/processed/transcripted_audios.csv'
df_transcripted_videos = pd.read_csv(transcripts_csv)
print(df_transcripted_videos.shape)
df_transcripted_videos.head()

(1185, 3)


Unnamed: 0,audio_id,transcription,detected_language
0,../data/audios\6746590978280279301,Do you know what Dab Me Up is? Do you know wha...,en
1,../data/audios\6748582567995378949,"Go ahead, go ahead. No, not you, not you. Your...",en
2,../data/audios\6752944407395175686,There's not any.,en
3,../data/audios\6755172553632926981,If I give you one message to hold in your hear...,en
4,../data/audios\6758547542276852998,"What? Did I kill them, sir? Are you kidding me?",en


In [13]:
# fixing the audio_id column
# Each value is the path of an audio file whose last component is the numeric id
# (e.g. '../data/audios\6746590978280279301'). Backslashes are normalised to '/'
# before taking the last component, so the extraction works whichever separator
# the paths in the CSV were written with.
audio_file_names = (
    df_transcripted_videos['audio_id']
    .str.replace('\\', '/', regex=False)
    .str.rsplit('/', n=1)
    .str[-1]
)
# Cast the id strings to the nullable integer dtype (same dtype as before).
df_transcripted_videos['audio_id'] = audio_file_names.astype('Int64')
df_transcripted_videos.head()

Unnamed: 0,audio_id,transcription,detected_language
0,6746590978280279301,Do you know what Dab Me Up is? Do you know wha...,en
1,6748582567995378949,"Go ahead, go ahead. No, not you, not you. Your...",en
2,6752944407395175686,There's not any.,en
3,6755172553632926981,If I give you one message to hold in your hear...,en
4,6758547542276852998,"What? Did I kill them, sir? Are you kidding me?",en


## Parsed data

In [14]:
# Read the parsed video metadata, report the frame's shape, then preview the first rows.
parsed_videos_csv = '../data/processed/transcripted_processed_videos.csv'
df_parsed_data = pd.read_csv(parsed_videos_csv)
print(df_parsed_data.shape)
df_parsed_data.head()

(2029, 42)


Unnamed: 0,id,search_tag,video_path,video_classes,desc,create_time,is_duet_enabled,hashtags,cover,play_address,...,author_digg_count,author_heart,video_contains_music,video_contains_male,video_contains_female,video_contains_noise,video_contains_no_energy,segments,total_music_duration,percentage_of_video_made_of_music
0,6892193566290889985,biden2020,data/videos/6892193566290889985.mp4,,"We'll be right, thanks. New Zealand.. anyone? ...",1604713867,True,"bidenharris2020, straya, aussie, biden2020",https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,7080,82800000,True,True,True,False,True,"[{'label': 'music', 'start': 0.0, 'end': 3.86}...",3.86,13.31
1,6839079433794505989,biden2020,data/videos/6839079433794505989.mp4,,Reply to @nickh1940 #repost share this so more...,1592347270,True,", repost, trump, viral, fyp, foryoupage, parat...",https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,9305,6600000,True,True,False,False,False,"[{'label': 'music', 'start': 0.0, 'end': 1.54}...",1.54,6.696
2,6885403130297732357,biden2020,data/videos/6885403130297732357.mp4,,anyways vote biden💙#fyp #foryou #foryoupage #b...,1603132900,False,"fyp, foryou, foryoupage, biden2020",https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,100400,15700000,True,False,False,False,False,"[{'label': 'music', 'start': 0.0, 'end': 6.54}]",6.54,109.0
3,6889924077721521413,biden2020,data/videos/6889924077721521413.mp4,,Oh my god Barack Obama 🥵🥵🥵 #biden2020,1604185495,True,biden2020,https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,14300,81400000,True,False,False,True,False,"[{'label': 'noise', 'start': 0.0, 'end': 3.9},...",12.84,80.25
4,6891017029051108614,biden2020,data/videos/6891017029051108614.mp4,,Reply to @taybarnett262 IDK HOW TO FEEL RN HE...,1604439965,True,", fyp, election2020, biden2020, trump2020",https://p16-sign-va.tiktokcdn.com/obj/tos-mali...,https://v16-webapp-prime.tiktok.com/video/tos/...,...,78000,6200000,True,True,True,True,True,"[{'label': 'music', 'start': 0.0, 'end': 14.88...",14.88,26.105


## Selecting only English ('en') videos

In [15]:
# Keep only the rows whose detected language is 'en'.
is_english = df_transcripted_videos['detected_language'].eq('en')
df_transcripted_videos = df_transcripted_videos[is_english]

# Merging DataFrames

In [16]:
# Left join: every transcription row is kept and enriched with the parsed
# metadata whose `id` equals the transcription's `audio_id`.
df_transcripted_videos = pd.merge(
    df_transcripted_videos,
    df_parsed_data,
    left_on='audio_id',
    right_on='id',
    how='left',
)

## Dealing with NaN

In [17]:
# Count missing values per column, order the counts from largest to smallest,
# and display only the columns that actually contain missing values.
missing_per_column = df_transcripted_videos.isnull().sum()
null_values = missing_per_column.sort_values(ascending=False)
null_values[null_values > 0]

author_avatar          1156
author_signature       1156
author_is_verified     1156
video_classes          1156
author_duet_setting    1156
author_unique_id       1156
author_nickname        1138
stickers_on_video       508
music_url                18
transcription             4
desc                      4
music_author              3
dtype: int64

In [18]:
# Rows without a transcription are dropped; missing values in the columns listed
# below are replaced by the empty string.
# Both steps return a new frame that is assigned back, rather than calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series, is
# deprecated in pandas 2.x and has no effect on the frame under Copy-on-Write.
empty_string_fill = {
    'stickers_on_video': '',
    'author_signature': '',
    'music_url': '',
    'music_author': '',
}
df_transcripted_videos = (
    df_transcripted_videos
    .dropna(subset=['transcription'])
    .fillna(value=empty_string_fill)
)

## Datetime columns

In [19]:
# Interpret `create_time` as seconds since the Unix epoch, mark the resulting
# timestamps as UTC, then express them in the America/New_York time zone.
created_utc = pd.to_datetime(df_transcripted_videos['create_time'], unit='s').dt.tz_localize('UTC')
df_transcripted_videos['create_time'] = created_utc.dt.tz_convert('America/New_York')

# Creating new columns

In [20]:
# Column with video URL
# A missing author handle is replaced by '' so the concatenation still yields a
# string (the URL then has an empty handle after the '@').
author_handle = df_transcripted_videos['author_unique_id'].fillna('')
video_id_text = df_transcripted_videos['id'].astype(str)
df_transcripted_videos['video_url'] = 'https://www.tiktok.com/@' + author_handle + '/video/' + video_id_text

# Filtering columns

In [21]:
# Columns kept in the processed dataset, in output order.
processed_columns = [
    # video
    'id', 'search_tag', 'create_time', 'desc', 'stickers_on_video', 'hashtags', 'duration',
    'is_duet_enabled', 'duet_from_id',
    # music
    'music_id', 'music_title', 'music_url', 'music_author', 'music_is_original_audio',
    # engagement counters
    'digg_count', 'share_count', 'comment_count', 'play_count',
    # author
    'author_unique_id', 'author_nickname', 'author_avatar', 'author_signature',
    'author_is_verified', 'author_duet_setting', 'author_following_count',
    'author_followers_count', 'author_heart_count', 'author_digg_count', 'author_heart',
    # file location, URL and transcription
    'video_path', 'video_url', 'transcription',
    # audio segmentation
    'video_contains_music', 'video_contains_male', 'video_contains_female',
    'video_contains_noise', 'video_contains_no_energy', 'segments',
    'total_music_duration', 'percentage_of_video_made_of_music',
]
df_processed = df_transcripted_videos.loc[:, processed_columns]
df_processed.head()

Unnamed: 0,id,search_tag,create_time,desc,stickers_on_video,hashtags,duration,is_duet_enabled,duet_from_id,music_id,...,video_url,transcription,video_contains_music,video_contains_male,video_contains_female,video_contains_noise,video_contains_no_energy,segments,total_music_duration,percentage_of_video_made_of_music
0,6746590978280279301,joebiden,2019-10-11 12:58:42-04:00,so the former Vice President and I are best bu...,,"joebiden, foryoupage",32,True,0,6746567236959406853,...,https://www.tiktok.com/@/video/674659097828027...,Do you know what Dab Me Up is? Do you know wha...,False,True,False,False,False,"[{'label': 'male', 'start': 0.0, 'end': 32.92}]",0.0,0.0
1,6748582567995378949,maga,2019-10-16 21:47:08-04:00,“Your organization is terrible” #maga #fyp,Trump talking to CNN reporter,"maga, fyp",20,True,0,6748577002363652870,...,https://www.tiktok.com/@/video/674858256799537...,"Go ahead, go ahead. No, not you, not you. Your...",False,True,False,False,False,"[{'label': 'male', 'start': 0.0, 'end': 20.580...",0.0,0.0
2,6752944407395175686,trump2020,2019-10-28 15:53:18-04:00,"The hat didn’t fit, but can the vid go viral s...",,"fyp, foryou, trump2020, republican",10,True,0,6744484944707406597,...,https://www.tiktok.com/@/video/675294440739517...,There's not any.,True,True,False,True,True,"[{'label': 'noise', 'start': 0.0, 'end': 2.260...",4.54,45.4
3,6755172553632926981,trump2020,2019-11-03 14:59:36-05:00,Keep America great #donaldtrump #trump2020 #tr...,Trump 20 20,"donaldtrump, trump2020, trump, trumptrain, ins...",38,True,0,6755152356901423877,...,https://www.tiktok.com/@/video/675517255363292...,If I give you one message to hold in your hear...,True,True,False,False,True,"[{'label': 'male', 'start': 0.0, 'end': 4.24},...",16.22,42.684
4,6758547542276852998,maga,2019-11-12 17:16:18-05:00,#trump2020 #foryourpage #maga #maga #kag,,"trump2020, foryourpage, maga, maga, kag",45,False,0,6758505275898039046,...,https://www.tiktok.com/@/video/675854754227685...,"What? Did I kill them, sir? Are you kidding me?",True,True,False,True,False,"[{'label': 'music', 'start': 0.0, 'end': 11.46...",29.76,66.133


In [22]:
# List the columns of the merged frame that were left out of the processed frame.
print('Difference of columns of processed and transcripted dataset')
dropped_columns = set(df_transcripted_videos.columns) - set(df_processed.columns)
print(dropped_columns)

Difference of columns of processed and transcripted dataset
{'play_address', 'download_address', 'cover', 'detected_language', 'audio_id', 'video_classes'}


# Renaming columns

In [23]:
# Old name -> new name. `errors='raise'` makes the rename fail if any of the
# old names is not present in the frame.
column_renames = {
    'hashtags': 'video_hashtags',
    'desc': 'video_desc',
    'duration': 'video_duration_in_sec',
}
df_processed.rename(columns=column_renames, errors='raise', inplace=True)

# Saving dataframe

In [24]:
# Write the processed frame to CSV without the index column.
output_csv = '../data/processed/cleaned_transcripted_dataset.csv'
df_processed.to_csv(output_csv, index=False)