In [5]:
from langdetect import detect

In [4]:
import pandas as pd

In [6]:
def parse_gdf_channel(file_name):
    """Split a channel-network GDF export into a metadata and an edge table.

    The file is read as UTF-8 text and every line is split on commas.
    Lines with more than three fields are collected as node metadata,
    lines with exactly three fields as edge data, and shorter lines are
    ignored. Within each group the first line supplies the column names.

    Parameters
    ----------
    file_name : str
        Path of the ``.gdf`` file to read.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(metadata, graph_data)``. Every value is kept as the string
        that was read from the file.

    Raises
    ------
    IndexError
        If the file contains no line with more than three fields, or no
        line with exactly three fields (there is no header row to use).
    """
    # Decode explicitly as UTF-8, like parse_gdf_video does, so labels with
    # non-ASCII characters do not depend on the platform's default encoding.
    with open(file_name, encoding='utf-8') as f:
        rows = [line.strip().split(',') for line in f]

    # The field count is the only thing that tells node rows from edge rows.
    metadata = [row for row in rows if len(row) > 3]
    graph_data = [row for row in rows if len(row) == 3]

    return pd.DataFrame(metadata[1:], columns=metadata[0]), pd.DataFrame(
        graph_data[1:], columns=graph_data[0])

def parse_gdf_video(file_name):
    """Read a video-network GDF export and return its two tables.

    Each line of the UTF-8 file is stripped and split on commas. A line
    with more than three fields is treated as a node row and cut down to
    its first 14 fields; a line with exactly three fields is treated as
    an edge row; anything shorter is skipped. The first row of each kind
    becomes that table's header.

    Parameters
    ----------
    file_name : str
        Path of the ``.gdf`` file to read.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(metadata, graph_data)`` with all values left as strings.
    """
    node_rows = []
    edge_rows = []
    with open(file_name, encoding='utf-8') as handle:
        for raw_line in handle.readlines():
            fields = raw_line.strip().split(',')
            if len(fields) > 3:
                # Keep only the first 14 fields of a node row.
                node_rows.append(fields[:14])
            elif len(fields) == 3:
                edge_rows.append(fields)

    nodes = pd.DataFrame(node_rows[1:], columns=node_rows[0])
    edges = pd.DataFrame(edge_rows[1:], columns=edge_rows[0])
    return nodes, edges
    

def all_caps_to_proper(string):
    """Soften shouting: capitalize every space-separated token that is all upper case.

    Tokens for which ``str.isupper()`` is true are passed through
    ``str.capitalize()``; every other token is left untouched. The tokens
    are re-joined with single spaces and the result is stripped of
    leading and trailing whitespace.

    Parameters
    ----------
    string : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text.
    """
    words = [
        word.capitalize() if word.isupper() else word
        for word in string.split(' ')
    ]
    return ' '.join(words).strip()
            

In [7]:
def join_update_clean_dfs(df_graph, df_metadata, title, replace_node_ids=True):
    """Rename the GDF columns and optionally swap node ids for node labels.

    The metadata columns ``'nodedef>name VARCHAR'``, ``'label VARCHAR'``
    and ``'subscriberCount INT'`` become ``nodeid``, ``label`` and
    ``subscriberCount``; the edge columns become ``nodeid_1``,
    ``nodeid_2`` and ``directed``. Columns that are not present are
    simply not renamed.

    When ``replace_node_ids`` is ``True`` the edge table is merged twice
    against the metadata so that it ends up with the columns ``node1``,
    ``node2`` and ``directed``, where the two node columns hold labels
    instead of ids. Both merges use ``how='right'`` with the metadata on
    the right-hand side, so edges whose endpoint id is missing from the
    metadata are dropped and a metadata node without a matching edge
    produces a row with missing values.

    Parameters
    ----------
    df_graph : pandas.DataFrame
        Edge table as returned by the GDF parsers.
    df_metadata : pandas.DataFrame
        Node metadata table as returned by the GDF parsers.
    title : str
        Not used by this function; kept so existing calls keep working.
    replace_node_ids : bool, default True
        Replace node ids by labels in the returned edge table.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_metadata, df_graph)`` -- note the order is the reverse of
        the argument order. Both are new frames; the inputs are not
        modified.
    """
    # Rename into new frames instead of using inplace=True, so the caller's
    # objects are not silently changed.
    df_metadata = df_metadata.rename(columns={
        'nodedef>name VARCHAR': 'nodeid',
        'label VARCHAR': 'label',
        'subscriberCount INT': 'subscriberCount'
    })
    df_graph = df_graph.rename(columns={
        'edgedef>node1 VARCHAR': 'nodeid_1',
        'node2 VARCHAR': 'nodeid_2',
        'directed BOOLEAN': 'directed'
    })

    # Identity check kept on purpose: only the literal True enables replacement.
    if replace_node_ids is True:
        node_labels = df_metadata[['label', 'nodeid']]

        # Source endpoint: look up the label of nodeid_1.
        df_graph = pd.merge(df_graph,
                            node_labels,
                            left_on='nodeid_1',
                            right_on='nodeid',
                            how='right')[['label', 'nodeid_2', 'directed']]
        df_graph = df_graph.rename(columns={'label': 'node1'})

        # Target endpoint: look up the label of nodeid_2.
        df_graph = pd.merge(df_graph,
                            node_labels,
                            left_on='nodeid_2',
                            right_on='nodeid',
                            how='right')[['node1', 'label', 'directed']]
        df_graph = df_graph.rename(columns={'label': 'node2'})

    return df_metadata, df_graph


def load_video_data(filename):
    """Load a video-network GDF export, enrich it and write the results to CSV.

    Reads ``filename + '.gdf'`` and writes three files next to it:
    ``_relations_orig.csv`` (edges exactly as parsed), ``_metadata.csv``
    (enriched node metadata) and ``_relations.csv`` (edges after
    ``join_update_clean_dfs``).

    The metadata gains four columns: ``label_clean`` (title with
    all-caps tokens capitalized), ``title_language`` (result of
    ``detect`` on the cleaned title), ``url`` (watch URL built from the
    node id) and ``seed_channel`` (whether the video's channel id is one
    of the channel ids of the seed videos).

    Parameters
    ----------
    filename : str
        Path of the GDF file without the ``.gdf`` extension; also used
        as the prefix of the CSV files that are written.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_video_metadata, df_video_rel)``.
    """
    df_video_metadata, df_video_rel = parse_gdf_video(filename + '.gdf')
    # write un-cleaned relations to file for use in gephi later
    df_video_rel.to_csv(filename + '_relations_orig.csv')
    # clean up titles for language classification
    df_video_metadata['label_clean'] = df_video_metadata['label VARCHAR'].apply(all_caps_to_proper)
    # detect video title language
    # NOTE(review): langdetect's detect() is reported to raise on text with no
    # usable features and to be non-deterministic unless seeded -- confirm
    # whether either matters for this data set.
    df_video_metadata['title_language'] = df_video_metadata['label_clean'].apply(detect)
    # clean column names and update node ids
    df_video_metadata, df_video_rel = join_update_clean_dfs(df_video_rel, df_video_metadata, filename)
    df_video_metadata['url'] = 'https://www.youtube.com/watch?v=' + df_video_metadata['nodeid']
    # get channel ids for seed videos
    is_seed = df_video_metadata['isSeed VARCHAR'] == 'yes'
    seed_channel_ids = list(df_video_metadata.loc[is_seed, 'channelId VARCHAR'])
    # flag every video whose channel also published one of the seed videos;
    # isin() does the membership test in one vectorised call instead of a
    # per-row lambda scanning the list
    df_video_metadata['seed_channel'] = df_video_metadata['channelId VARCHAR'].isin(seed_channel_ids)

    df_video_metadata.to_csv(filename + '_metadata.csv')
    df_video_rel.to_csv(filename + '_relations.csv')
    return df_video_metadata, df_video_rel
    

def load_channel_data(filename):
    """Load a channel-network GDF export, tidy it and write the results to CSV.

    Reads ``filename + '.gdf'``, passes both parsed tables through
    ``join_update_clean_dfs``, adds a ``url`` column built from the node
    id, and writes ``_metadata.csv`` and ``_relations.csv`` using
    ``filename`` as the prefix.

    Parameters
    ----------
    filename : str
        Path of the GDF file without the ``.gdf`` extension.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(metadata, relations)`` as written to disk.
    """
    gdf_path = filename + '.gdf'
    channel_nodes, channel_edges = parse_gdf_channel(gdf_path)

    # clean column names and update node ids
    channel_nodes, channel_edges = join_update_clean_dfs(
        channel_edges, channel_nodes, filename)

    channel_urls = 'https://www.youtube.com/channel/' + channel_nodes['nodeid']
    channel_nodes['url'] = channel_urls

    # Metadata is written first, then the relations.
    for frame, suffix in ((channel_nodes, '_metadata.csv'),
                          (channel_edges, '_relations.csv')):
        frame.to_csv(filename + suffix)

    return channel_nodes, channel_edges
    

    
# Load the video network: reads 'masha_and_shark_2019_05_14.gdf' and writes the
# derived CSV files with the same prefix as a side effect.
df_video_metadata, df_video_graph = load_video_data('masha_and_shark_2019_05_14')

In [8]:
# Load the channel network: reads 'bad_channels_2019_05_18.gdf' and writes the
# derived CSV files with the same prefix as a side effect.
df_channel_metadata, df_channel_graph = load_channel_data('bad_channels_2019_05_18')

In [9]:
# Show the channel edge table returned by load_channel_data.
df_channel_graph

Unnamed: 0,node1,node2,directed
0,Patty Shukla Kids TV - Children s songs,Bounce Patrol Kids,true
1,Bounce Patrol Kids,[토이푸딩] ToyPudding TV,true
2,Bounce Patrol Kids,Patty Shukla Kids TV - Children s songs,true
3,PlayToys,Baby Dolls & Little Girls,true
4,PlayToys,Toys Unlimited,true
5,Superdoll adventures,divernic doll adventures,true
6,Numberer1,Cool 3D World,true
7,divernic doll adventures,Superdoll adventures,true
8,FunToys Collector Disney Toys Review,FunToys Collector Disney Toys Review,true
9,CichyCooLover,minco2,true


In [9]:
# Show the enriched video metadata returned by load_video_data.
df_video_metadata

Unnamed: 0,nodeid,label,isSeed VARCHAR,seedRank INT,publishedAt INT,channelTitle VARCHAR,channelId VARCHAR,videoCategoryLabel VARCHAR,viewCount INT,likeCount INT,dislikeCount INT,dislikeLikeRatio FLOAT,favoriteCount INT,commentCount INT,label_clean,title_language,url,seed_channel
0,KYniUCGPGLs,Маша и Медведь (Masha and The Bear) - Маша плю...,yes,1,1328013326,Get Movies,UClZkHt2kNIgyrTTPnSQV3SA,Entertainment,3572394626,4511814,2450208,0.54306494017706,0,27170,Маша и Медведь (Masha and The Bear) - Маша плю...,bg,https://www.youtube.com/watch?v=KYniUCGPGLs,True
1,XqZsoesa55w,Baby Shark Dance | Sing and Dance! | Animal So...,yes,2,1466204430,Pinkfong! Kids Songs & Stories,UCcdwLMPsaU2ezNSJU1nFoBQ,Education,2762274288,7161771,2567562,0.35850936870224,0,,Baby Shark Dance | Sing and Dance! | Animal So...,en,https://www.youtube.com/watch?v=XqZsoesa55w,True
2,x1fe8-Qli9E,Маша и Медведь (Masha and The Bear) - Приятног...,no,,1348230424,Get Movies,UClZkHt2kNIgyrTTPnSQV3SA,Entertainment,1236260225,1418208,755111,0.5324402344367,0,9374,Маша и Медведь (Masha and The Bear) - Приятног...,bg,https://www.youtube.com/watch?v=x1fe8-Qli9E,True
3,_b9F05WE28Y,👶 Baby Songs | Dave and Ava | Nursery Rhymes 👶,no,,1557832469,Dave and Ava - Nursery Rhymes and Baby Songs,UC6zhI71atP7YLoZyIyCIGNw,Education,3262,308,160,0.51948051948052,0,0,👶 Baby Songs | Dave and Ava | Nursery Rhymes 👶,en,https://www.youtube.com/watch?v=_b9F05WE28Y,False
4,rKk9uJqi4hY,Surprise Eggs Wildlife Toys | Learn Wild Anima...,no,,1447556913,ChuChuTV Surprise Eggs Learning Videos,UCKcQ7Jo2VAGHiPMfDwzeRUw,Education,589326232,756885,467884,0.61817052788733,0,22364,Surprise Eggs Wildlife Toys | Learn Wild Anima...,en,https://www.youtube.com/watch?v=rKk9uJqi4hY,False
5,Bczkf9MjMBE,Couleurs Voitures Dessin Animé pour Enfants av...,no,,1557814110,Car For Kids,UCO0dv06t_DDJDthBlzY4RFA,Gaming,13137,157,70,0.44585987261146,0,1,Couleurs Voitures Dessin Animé pour Enfants av...,fr,https://www.youtube.com/watch?v=Bczkf9MjMBE,False
6,6VSaExP2anI,Mickey Mouse and Friends | Minnie s Bow-Toons ...,no,,1321036829,DisneyJuniorUK,UC7Gf2tZ8coTX2ckTPgn62iQ,Entertainment,121480020,64250,38704,0.60239688715953,0,247,Mickey Mouse and Friends | Minnie s Bow-Toons ...,en,https://www.youtube.com/watch?v=6VSaExP2anI,False
7,arqQd-nP-bc,Буба все серии Мультики для детей 🔴,no,,1544784790,KEDOO МУЛЬТИКИ для детей,UCwMABIDNcqe2GS_Sm2jHKGg,Film & Animation,44007408,68565,39448,0.57533727120251,0,0,Буба все серии Мультики для детей 🔴,ru,https://www.youtube.com/watch?v=arqQd-nP-bc,False
8,nShz5rkeX2E,Masha y el Oso - Episodios favoritos de Masha ...,no,,1490349603,Masha y el Oso,UCuSo4gcgxJRf4Bzu43wwVyg,Film & Animation,128762289,165663,93309,0.56324586660872,0,175,Masha y el Oso - Episodios favoritos de Masha ...,es,https://www.youtube.com/watch?v=nShz5rkeX2E,False
9,xIbrxsirEnA,Mascha und der Bär - Folge 2: Das Mascha Speziale,no,,1405440223,EUROPA Kinderprogramm,UCxkQwIshySkwOqEuZ28o8KA,Film & Animation,588631242,803031,387974,0.4831370146358,0,,Mascha und der Bär - Folge 2: Das Mascha Speziale,de,https://www.youtube.com/watch?v=xIbrxsirEnA,False


In [10]:
# Show the video edge table returned by load_video_data.
df_video_graph

Unnamed: 0,node1,node2,directed
0,Маша и Медведь (Masha and The Bear) - Маша плю...,Маша и Медведь (Masha and The Bear) - Приятног...,true
1,Маша и Медведь (Masha and The Bear) - Большая ...,Маша и Медведь (Masha and The Bear) - Приятног...,true
2,Маша и Медведь (Masha and The Bear) - Дальний ...,Маша и Медведь (Masha and The Bear) - Приятног...,true
3,Маша и Медведь (Masha and The Bear) - Новая ме...,Маша и Медведь (Masha and The Bear) - Приятног...,true
4,Маша и Медведь (Masha and The Bear) - Первая в...,Маша и Медведь (Masha and The Bear) - Приятног...,true
5,Маша и Медведь (Masha and The Bear) - Подкидыш...,Маша и Медведь (Masha and The Bear) - Приятног...,true
6,Маша и Медведь (Masha and The Bear) - Раз два...,Маша и Медведь (Masha and The Bear) - Приятног...,true
7,Маша и Медведь (Masha and The Bear) - Фокус-по...,Маша и Медведь (Masha and The Bear) - Приятног...,true
8,Маша и Медведь (Masha and The Bear) - Усатый-П...,Маша и Медведь (Masha and The Bear) - Приятног...,true
9,Маша и Медведь (Masha and The Bear) - Первый р...,Маша и Медведь (Masha and The Bear) - Приятног...,true
