# TikTok data parsing

In [1]:
# library imports
import ast
import glob, os

import pandas as pd

In [2]:
# functions

# get info about music track
def parse_music(s):
    """Copy the music track's id, title, author and 'original' flag into flat columns.

    s['music'] is expected to hold the text form of a Python dict, for example
    "{'id': '123', 'title': 'Some song', 'authorName': 'Someone', 'original': True}".
    The text is parsed with ast.literal_eval rather than split on ', ', so a
    value that contains a comma or an apostrophe is kept whole instead of being
    truncated or mistaken for a key.

    Parameters:
        s: one row (e.g. the pandas Series handed over by DataFrame.apply with
           axis=1, or any mapping) that has a 'music' entry and supports item
           assignment.

    Returns:
        The same object s. For each of the keys 'id', 'title', 'authorName'
        and 'original' found in the music dict, the matching column
        ('music_id', 'music_title', 'music_author', 'music_original') is set to
        the value converted to a string (so the flag is stored as 'True' or
        'False'). If the text cannot be parsed into a dict, s is returned
        without any of those columns being touched.
    """
    try:
        music = ast.literal_eval(s['music'])
    except (ValueError, SyntaxError, TypeError):
        # missing or malformed cell: keep whatever defaults the caller put in place
        return s

    if not isinstance(music, dict):
        return s

    key_to_column = (
        ('id', 'music_id'),
        ('title', 'music_title'),
        ('authorName', 'music_author'),
        ('original', 'music_original'),
    )
    for key, column in key_to_column:
        if key in music:
            # stored as text, which also keeps 'original' as 'True' / 'False'
            s[column] = str(music[key])

    return s


# extract hashtags used
def parse_hashtags(s):
    """Collect the hashtag titles of one row and summarise them in flat columns.

    s['hashtags'] is expected to hold the text form of a Python list of dicts,
    for example "[{'id': '1', 'title': 'blm'}, {'id': '2', 'title': 'duet'}]".
    The text is parsed with ast.literal_eval rather than split on '}, {' and
    ', ', so free-text fields that happen to contain ', ' or the word 'title'
    can no longer produce bogus hashtags.

    Parameters:
        s: one row (e.g. the pandas Series handed over by DataFrame.apply with
           axis=1, or any mapping) that has a 'hashtags' entry and supports
           item assignment.

    Returns:
        The same object s with
        - 'nr_hashtags' set to the number of dicts that carry a 'title' key,
        - 'hashtags_used' set to those titles joined by ', ',
        - 'duet_used' set to True when one of the titles is exactly 'duet'
          (it is left as it was otherwise).
        If the text cannot be parsed into a list, the row is treated as having
        no hashtags.
    """
    try:
        hashtags = ast.literal_eval(s['hashtags'])
    except (ValueError, SyntaxError, TypeError):
        hashtags = []

    if not isinstance(hashtags, (list, tuple)):
        hashtags = []

    titles = [
        str(tag['title'])
        for tag in hashtags
        if isinstance(tag, dict) and 'title' in tag
    ]

    if 'duet' in titles:
        s['duet_used'] = True

    s['nr_hashtags'] = len(titles)
    s['hashtags_used'] = ', '.join(titles)

    return s


# extract tagged users
def parse_extras(s):
    """Collect the ids of the users tagged in one row.

    s['extras'] is expected to hold the text form of a Python list of dicts,
    where an entry that tags a user carries a non-empty 'userId'. The text is
    parsed with ast.literal_eval and scanned once, instead of being split on
    '}, {' and ', ' and scanned twice, so ids are taken whole regardless of
    how the value was quoted.

    Parameters:
        s: one row (e.g. the pandas Series handed over by DataFrame.apply with
           axis=1, or any mapping) that has an 'extras' entry and supports
           item assignment.

    Returns:
        The same object s with
        - 'nr_tags' set to the number of entries with a non-empty 'userId',
        - 'tagged_users' set to those ids (as text) joined by ', '.
        If the text cannot be parsed into a list, the row is treated as having
        no tagged users.
    """
    try:
        extras = ast.literal_eval(s['extras'])
    except (ValueError, SyntaxError, TypeError):
        extras = []

    if not isinstance(extras, (list, tuple)):
        extras = []

    # an empty 'userId' means the entry does not tag a user, so it is skipped
    user_ids = [
        str(entry['userId'])
        for entry in extras
        if isinstance(entry, dict) and entry.get('userId')
    ]

    s['nr_tags'] = len(user_ids)
    s['tagged_users'] = ', '.join(user_ids)

    return s

# function that calls all of the other functions
def parse_table(df, hashtagname):
    """Flatten the nested columns of one scraped hashtag table.

    Adds the derived music / hashtag / tagged-user columns with default values,
    runs the three row parsers over every row, removes the 'hashtags',
    'extras' and 'music' source columns, and records where the rows came from.

    Parameters:
        df: DataFrame with at least the 'music', 'hashtags' and 'extras'
            columns. The default-valued columns are added to this object
            before the row parsers run.
        hashtagname: label written to the 'obtained_via' column of every row.

    Returns:
        The parsed DataFrame, including 'obtained_via' and a 'rank' column
        that holds each row's index label.
    """
    # defaults for the derived columns, in the order they are appended
    column_defaults = {
        'music_id': '',
        'music_title': '',
        'music_author': '',
        'music_original': '',
        'hashtags_used': '',
        'nr_hashtags': 0,
        'duet_used': False,
        'tagged_users': '',
        'nr_tags': 0,
    }
    for column, default in column_defaults.items():
        df[column] = default

    # each parser receives one row at a time and hands the updated row back
    for row_parser in (parse_music, parse_hashtags, parse_extras):
        df = df.apply(row_parser, axis=1)

    # the nested source columns are dropped once they have been flattened
    for consumed in ('hashtags', 'extras', 'music'):
        del df[consumed]

    df['obtained_via'] = hashtagname
    df['rank'] = df.index

    return df

In [3]:
# which files should I parse?
# The working directory is listed (not a hard-coded absolute path) because the
# pd.read_csv calls in the cells below use bare file names, which are resolved
# against the working directory.
for file in os.listdir('.'):
    if file.endswith('_videos.csv'):
        print(file)

blacklivesmattermovement_videos.csv
alllivesmatterwhenblacklivesmatter_videos.csv
blacklivesmatter_videos.csv
bluelivesmatter_videos.csv
all_videos.csv
blacklivesmatterally_videos.csv
alllivesmattertrump2020_videos.csv
alllivesmatternotjustblacklives_videos.csv
blm_videos.csv
georgefloyd_videos.csv
justiceforgeorgefloyd_videos.csv
babylivesmatter_videos.csv
backtheblue_videos.csv
policelivesmatter_videos.csv
breonnataylor_videos.csv
alllivesmatter_videos.csv


In [4]:
# load the scraped table for this hashtag, flatten it, and report its row count
blacklivesmattermovement = parse_table(
    pd.read_csv('blacklivesmattermovement_videos.csv'),
    'blacklivesmattermovement',
)
print('Number of blacklivesmattermovement videos: ', len(blacklivesmattermovement))

Number of blacklivesmattermovement videos:  1975


In [5]:
# load the scraped table for this hashtag, flatten it, and report its row count
alllivesmatterwhenblacklivesmatter = parse_table(
    pd.read_csv('alllivesmatterwhenblacklivesmatter_videos.csv'),
    'alllivesmatterwhenblacklivesmatter',
)
print('Number of alllivesmatterwhenblacklivesmatter videos: ', len(alllivesmatterwhenblacklivesmatter))

Number of alllivesmatterwhenblacklivesmatter videos:  1973


In [6]:
# load the scraped table for this hashtag, flatten it, and report its row count
blacklivesmatter = parse_table(
    pd.read_csv('blacklivesmatter_videos.csv'),
    'blacklivesmatter',
)
print('Number of blacklivesmatter videos: ', len(blacklivesmatter))

Number of blacklivesmatter videos:  1956


In [7]:
# load the scraped table for this hashtag, flatten it, and report its row count
blacklivesmatterally = parse_table(
    pd.read_csv('blacklivesmatterally_videos.csv'),
    'blacklivesmatterally',
)
print('Number of blacklivesmatterally videos: ', len(blacklivesmatterally))

Number of blacklivesmatterally videos:  197


In [8]:
# load the scraped table for this hashtag, flatten it, and report its row count
alllivesmatternotjustblacklives = parse_table(
    pd.read_csv('alllivesmatternotjustblacklives_videos.csv'),
    'alllivesmatternotjustblacklives',
)
print('Number of alllivesmatternotjustblacklives videos: ', len(alllivesmatternotjustblacklives))

Number of alllivesmatternotjustblacklives videos:  755


In [9]:
# load the scraped table for this hashtag, flatten it, and report its row count
blm = parse_table(
    pd.read_csv('blm_videos.csv'),
    'blm',
)
print('Number of blm videos: ', len(blm))

Number of blm videos:  1894


In [10]:
# load the scraped table for this hashtag, flatten it, and report its row count
georgefloyd = parse_table(
    pd.read_csv('georgefloyd_videos.csv'),
    'georgefloyd',
)
print('Number of georgefloyd videos: ', len(georgefloyd))

Number of georgefloyd videos:  1992


In [11]:
# load the scraped table for this hashtag, flatten it, and report its row count
justiceforgeorgefloyd = parse_table(
    pd.read_csv('justiceforgeorgefloyd_videos.csv'),
    'justiceforgeorgefloyd',
)
print('Number of justiceforgeorgefloyd videos: ', len(justiceforgeorgefloyd))

Number of justiceforgeorgefloyd videos:  1997


In [12]:
# bluelivesmatter is labelled with its own hashtag so that 'obtained_via'
# matches the file the rows were read from.
# NOTE(review): the earlier comment here ("probably skip this, unless looking at
# anti abortion stuff") and the earlier label 'babylivesmatter' appear to
# describe babylivesmatter_videos.csv, which this cell does not load - confirm.
bluelivesmatter = pd.read_csv('bluelivesmatter_videos.csv')
bluelivesmatter = parse_table(bluelivesmatter, 'bluelivesmatter')
print('Number of bluelivesmatter videos: ', bluelivesmatter.shape[0])

Number of bluelivesmatter videos:  1745


In [13]:
# load the scraped table for this hashtag, flatten it, and report its row count
breonnataylor = parse_table(
    pd.read_csv('breonnataylor_videos.csv'),
    'breonnataylor',
)
print('Number of breonnataylor videos: ', len(breonnataylor))

Number of breonnataylor videos:  1958


In [14]:
# load the scraped table for this hashtag, flatten it, and report its row count
alllivesmatter = parse_table(
    pd.read_csv('alllivesmatter_videos.csv'),
    'alllivesmatter',
)
print('Number of alllivesmatter videos: ', len(alllivesmatter))

Number of alllivesmatter videos:  1917


In [15]:
# load the scraped table for this hashtag, flatten it, and report its row count
alllivesmattertrump2020 = parse_table(
    pd.read_csv('alllivesmattertrump2020_videos.csv'),
    'alllivesmattertrump2020',
)
print('Number of alllivesmattertrump2020 videos: ', len(alllivesmattertrump2020))

Number of alllivesmattertrump2020 videos:  1969


In [16]:
# load the scraped table for this hashtag, flatten it, and report its row count
backtheblue = parse_table(
    pd.read_csv('backtheblue_videos.csv'),
    'backtheblue',
)
print('Number of backtheblue videos: ', len(backtheblue))

Number of backtheblue videos:  1810


In [17]:
# load the scraped table for this hashtag, flatten it, and report its row count
policelivesmatter = parse_table(
    pd.read_csv('policelivesmatter_videos.csv'),
    'policelivesmatter',
)
print('Number of policelivesmatter videos: ', len(policelivesmatter))

Number of policelivesmatter videos:  1636


In [18]:
# label hashtags as pro or against to track easily

# tables obtained via pro BLM hashtags
pro_frames = [
    blacklivesmatter,
    blacklivesmattermovement,
    alllivesmatterwhenblacklivesmatter,
    blacklivesmatterally,
    blm,
    georgefloyd,
    justiceforgeorgefloyd,
    breonnataylor,
]

# tables obtained via against BLM hashtags
con_frames = [
    alllivesmatter,
    alllivesmatternotjustblacklives,
    bluelivesmatter,
    alllivesmattertrump2020,
    backtheblue,
    policelivesmatter,
]

# the column is written onto each table in place
for frame in pro_frames:
    frame['leaning'] = 'pro'

for frame in con_frames:
    frame['leaning'] = 'con'

In [19]:
# Merge all the individual tables into pro and con camp

pro_tables = [
    blacklivesmatter,
    blacklivesmattermovement,
    alllivesmatterwhenblacklivesmatter,
    blacklivesmatterally,
    blm,
    georgefloyd,
    justiceforgeorgefloyd,
    breonnataylor,
]
con_tables = [
    alllivesmatter,
    alllivesmatternotjustblacklives,
    bluelivesmatter,
    alllivesmattertrump2020,
    backtheblue,
    policelivesmatter,
]

merged_pro = pd.concat(pro_tables)
merged_con = pd.concat(con_tables)

# pro rows first, then con rows; the original index labels are kept
all_videos = pd.concat([merged_pro, merged_con])

In [20]:
# row filters shared by the last three counts
has_duet_hashtag = all_videos.duet_used
has_tagged_user = all_videos.nr_tags > 0

# (label, value) pairs, printed in this order
summary_counts = [
    ('Number of unique pro videos: ', merged_pro['video_id'].nunique(dropna=False)),
    ('Number of unique con videos: ', merged_con['video_id'].nunique(dropna=False)),
    ('Total unique videos: ', all_videos['video_id'].nunique(dropna=False)),
    ('Total unique users: ', all_videos['user_id'].nunique(dropna=False)),
    ('Total videos with duet hashtag: ', len(all_videos[has_duet_hashtag])),
    ('Total videos with a tagged used: ', len(all_videos[has_tagged_user])),
    ('My best approximation for actual number of duets: ',
     len(all_videos[has_tagged_user & has_duet_hashtag])),
]

for label, count in summary_counts:
    print(label, count)

Number of unique pro videos:  12339
Number of unique con videos:  9006
Total unique videos:  21202
Total unique users:  13291
Total videos with duet hashtag:  1736
Total videos with a tagged used:  4435
My best approximation for actual number of duets:  1548


In [21]:
# Offload to CSV
all_videos.to_csv('all_videos.csv')

# rows that both tag at least one user and carry the duet hashtag
likely_duets = all_videos[(all_videos.nr_tags > 0) & (all_videos.duet_used)]
likely_duets.to_csv('duets_only.csv')

In [43]:
# rows whose 'raw' column contains the text 'duetFromId'
mentions_duet_source = all_videos['raw'].str.contains('duetFromId')
all_videos[mentions_duet_source]

Unnamed: 0,user_id,user_name,user_signature,user_verified,video_id,video_desc,video_time,privateItem,duetEnabled,stitchEnabled,...,music_author,music_original,hashtags_used,nr_hashtags,duet_used,tagged_users,nr_tags,obtained_via,rank,leaning
