# Reassemble .gdf

Take the annotated data and reassemble it into `.gdf` files that can be read into `R` and Gephi.

In [1]:
from langdetect import detect

In [2]:
import pandas as pd
import math

After annotating the video data, we reassemble the `.gdf` files so that they can be visualized in Gephi or loaded back into `R` in `parse_load_gdf.ipynb`.

In [3]:
def df_row_to_csv_row(row_list):
    """Serialise one data-frame row as a comma-separated line for the gdf file.

    Float NaN entries are written as empty fields; every other value is
    converted with ``str``. No quoting or escaping is applied to the values.
    """
    def _as_field(value):
        # NaN marks a missing cell and is emitted as an empty field
        is_missing = isinstance(value, float) and math.isnan(value)
        return '' if is_missing else str(value)

    return ','.join(map(_as_field, row_list))
        

In [4]:
def load_reassemble_annotated_video_data(metatdata_filename, rels_filename,
                                         output_filename='annotated_videos_gdf.gdf'):
    """Rebuild a gdf file from the annotated video metadata and relations CSVs.

    Parameters
    ----------
    metatdata_filename : str
        CSV with the annotated video metadata. It must contain the
        ``Unnamed: 0`` row-index column, an ``is_inappropriate`` column and a
        label column that ends up named ``label VARCHAR`` after renaming.
    rels_filename : str
        CSV with the video relations; it must also contain ``Unnamed: 0``.
        Its column names are written to the gdf edge header unchanged.
    output_filename : str, optional
        Path of the gdf file to write (defaults to the previously hard-coded
        ``annotated_videos_gdf.gdf``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The renamed/annotated metadata frame and the relations frame.
    """
    annotated_metadata = pd.read_csv(metatdata_filename)
    annotated_rels = pd.read_csv(rels_filename)
    # drop redundant row index column
    annotated_metadata = annotated_metadata.drop("Unnamed: 0", axis=1)
    annotated_rels = annotated_rels.drop("Unnamed: 0", axis=1)

    # restore the typed gdf column names
    annotated_metadata = annotated_metadata.rename(columns={
        'nodeid': 'nodedef>name VARCHAR',
        'label': 'label VARCHAR',
        'subscriberCount': 'subscriberCount INT',
        'label_clean': 'labelClean VARCHAR',
        'title_language': 'titleLanguage VARCHAR',
        'url': 'url VARCHAR',
        'seed_channel': 'seedChannel BOOLEAN'
    })

    # Keep the original label in 'label_old VARCHAR' and replace the node label
    # with a marker: 'X' for videos annotated 'YES', '_' for everything else.
    annotated_metadata['label_old VARCHAR'] = annotated_metadata['label VARCHAR']
    annotated_metadata['label VARCHAR'] = annotated_metadata.is_inappropriate.apply(
        lambda x: 'X' if x == 'YES' else '_')

    # write to .gdf file
    gdf_metadata_header = ','.join(str(x) for x in annotated_metadata.columns)
    gdf_rels_header = ','.join(str(x) for x in annotated_rels.columns)

    # Explicit UTF-8: the gdf files are read back as UTF-8 elsewhere in this
    # notebook, and the platform default encoding may not be able to
    # represent non-ASCII video titles.
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(gdf_metadata_header + '\n')
        for row in annotated_metadata.values:
            f.write(df_row_to_csv_row(row) + '\n')
        f.write(gdf_rels_header + '\n')
        for row in annotated_rels.values:
            f.write(','.join(str(x) for x in row) + '\n')
    return annotated_metadata, annotated_rels
            
# Rebuild the video-level gdf from the annotated metadata and the original relations
annotated_metadata, annotated_rels = load_reassemble_annotated_video_data(
    'annotated_videonet_seeds_elsa_spiderman_2019_08_18_metadata.csv',
    'videonet_seeds_elsa_spiderman_2019_08_18_relations_orig.csv',
)


Aggregate the video annotations to the channel level: a channel receives an annotation only when all of its annotated videos agree.

In [6]:
# collect the annotations of all videos that belong to each channel
channel_ids = {}
for row in annotated_metadata.to_dict(orient='records'):
    channel_id = row['channelId VARCHAR']
    channel_ids.setdefault(channel_id, []).append(row['is_inappropriate'])

# get the unanimous video annotations of each channel
# (channels whose videos carry differing annotations are left out)
channels_videos_annotated = {
    channel: next(iter(set(annotations)))
    for channel, annotations in channel_ids.items()
    if len(set(annotations)) == 1
}
channels_videos_annotated = (
    pd.Series(channels_videos_annotated)
    .to_frame()
    .rename(columns={0: 'is_inappropriate VARCHAR'})
)
channels_videos_annotated.index.names = ['channelId']

In [7]:
def load_reassemble_annotated_channel_data(metatdata_filename, rels_filename, channel_video_annotations,
                                           output_filename='annotated_channels_gdf.gdf'):
    """Rebuild a channel gdf file, enriched with channel-level annotations.

    Parameters
    ----------
    metatdata_filename : str
        CSV with the channel metadata; must contain the ``Unnamed: 0``
        row-index column and a ``nodeid`` column.
    rels_filename : str
        CSV with the channel relations; must also contain ``Unnamed: 0``.
    channel_video_annotations : pandas.DataFrame
        Annotations that are merged onto the metadata by matching ``nodeid``
        against ``channelId``.
    output_filename : str, optional
        Path of the gdf file to write (defaults to the previously hard-coded
        ``annotated_channels_gdf.gdf``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The merged and renamed metadata frame, and the relations frame with
        rows containing missing values removed.
    """
    annotated_metadata = pd.read_csv(metatdata_filename)
    annotated_rels = pd.read_csv(rels_filename)
    # drop redundant row index column
    annotated_metadata = annotated_metadata.drop("Unnamed: 0", axis=1)
    annotated_rels = annotated_rels.drop("Unnamed: 0", axis=1)

    # merge channel video annotations to channels
    # (shows the metadata columns before merging, for a quick visual check)
    print(annotated_metadata.columns)
    # merge() uses its default join type here (inner in pandas), so channels
    # without a matching annotation do not appear in the result.
    annotated_metadata = annotated_metadata.merge(channel_video_annotations, left_on='nodeid', right_on='channelId')

    # restore the typed gdf column names for nodes and edges
    annotated_metadata = annotated_metadata.rename(columns={
        'nodeid': 'nodedef>name VARCHAR',
        'label': 'label VARCHAR',
        'subscriberCount': 'subscriberCount INT',
        'label_clean': 'labelClean VARCHAR',
        'title_language': 'titleLanguage VARCHAR',
        'url': 'url VARCHAR',
        'seed_channel': 'seedChannel BOOLEAN'
    })
    annotated_rels = annotated_rels.rename(columns={
        'node1': 'edgedef>node1 VARCHAR',
        'node2': 'node2 VARCHAR',
        'directed': 'directed BOOLEAN'
    })

    # write to .gdf file
    gdf_metadata_header = ','.join(str(x) for x in annotated_metadata.columns)
    gdf_rels_header = ','.join(str(x) for x in annotated_rels.columns)
    # edges with a missing endpoint or attribute are not written
    annotated_rels = annotated_rels.dropna()
    # Explicit UTF-8: channel labels can contain non-ASCII characters and the
    # platform default encoding may not be able to represent them.
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(gdf_metadata_header + '\n')
        for row in annotated_metadata.values:
            f.write(df_row_to_csv_row(row) + '\n')
        f.write(gdf_rels_header + '\n')
        for row in annotated_rels.values:
            f.write(','.join(str(x) for x in row) + '\n')
    return annotated_metadata, annotated_rels

# Rebuild the channel-level gdf, attaching the unanimous per-channel video annotations
annotated_channels_metadata, annotated_channels_rels = load_reassemble_annotated_channel_data(
    'bad_channels_2019_05_18_metadata.csv',
    'bad_channels_2019_05_18_relations.csv',
    channels_videos_annotated,
)

Index(['nodeid', 'label', 'isSeed VARCHAR', 'seedRank INT', 'subscriberCount',
       'videoCount INT', 'viewCount(100s) INT', 'country VARCHAR',
       'publishedAt VARCHAR', 'daysactive INT', 'url'],
      dtype='object')


Unnamed: 0,nodedef>name VARCHAR,label VARCHAR,isSeed VARCHAR,seedRank INT,subscriberCount INT,videoCount INT,viewCount(100s) INT,country VARCHAR,publishedAt VARCHAR,daysactive INT,url VARCHAR,is_inappropriate VARCHAR
0,UCir-ay37PYv8-lQOk_dxzFA,wai bom,yes,1,5346,1,24099,not set,2017-08-22T10:42:40.000Z,634,https://www.youtube.com/channel/UCir-ay37PYv8-...,YES
1,UC96vy6lHoIKWSevWJEC47xA,Mathias Jost,yes,2,4311,26,14301,not set,2016-12-25T14:30:29.000Z,874,https://www.youtube.com/channel/UC96vy6lHoIKWS...,YES
2,UC0xH3QQdLAIu7srkkbTG8CQ,Eve My Tube,yes,3,1673550,1036,3306465,TH,2016-01-29T15:24:19.000Z,1205,https://www.youtube.com/channel/UC0xH3QQdLAIu7...,YES
3,UCY9eEaH19ulZmdusctU1ejA,Glamour,yes,4,2109252,1654,6585187,US,2007-04-03T15:03:38.000Z,4428,https://www.youtube.com/channel/UCY9eEaH19ulZm...,NO
4,UCD4MbghmkGS3aF5ymsc8_bw,Post Hostel,yes,5,8652,5,78937,not set,2015-02-19T09:33:52.000Z,1549,https://www.youtube.com/channel/UCD4MbghmkGS3a...,NO
5,UCp_CBY2rEWs_CPI3HaC6tbg,Alex Ramirès,yes,6,324934,62,664676,not set,2011-07-19T10:28:08.000Z,2860,https://www.youtube.com/channel/UCp_CBY2rEWs_C...,NO
6,UCN6rjYMkYv5eUAHbFwDbQmg,ClubEngendros,yes,7,19446,40,196866,not set,2008-10-01T14:29:38.000Z,3881,https://www.youtube.com/channel/UCN6rjYMkYv5eU...,NO
7,UCRBhx25KHV4wUhKsO29BNgw,Way to Hero,yes,8,3700,62,6652,BR,2015-10-27T19:57:23.000Z,1299,https://www.youtube.com/channel/UCRBhx25KHV4wU...,YES
8,UCYDKUHScAVoI_UTVuA7ThcQ,Crafty Panda DIY,yes,9,0,65,256182,US,2018-04-17T11:07:20.000Z,396,https://www.youtube.com/channel/UCYDKUHScAVoI_...,YES
9,UCfaZw8XH_zmAVkBst_MPD6w,CKN Toys,yes,10,12488147,529,109306937,AU,2015-03-28T04:11:34.000Z,1513,https://www.youtube.com/channel/UCfaZw8XH_zmAV...,NO


In [8]:
def parse_gdf_channel(file_name):
    """Parse a channel-network gdf file into a metadata and an edge frame.

    Each line is split on commas. Lines with more than three fields are
    treated as node metadata, lines with exactly three fields as edges; in
    both groups the first such line supplies the column names. Lines with
    fewer than three fields are ignored.

    Parameters
    ----------
    file_name : str
        Path of the gdf file, read as UTF-8.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(metadata, edges)``; a group with no lines yields an empty frame.
    """
    # Explicit UTF-8 so parsing does not depend on the platform default
    # encoding (matches parse_gdf_video).
    with open(file_name, encoding='utf-8') as f:
        rows = [line.strip().split(',') for line in f]

    metadata = [row for row in rows if len(row) > 3]
    graph_data = [row for row in rows if len(row) == 3]

    # Guard against a missing section instead of failing on the [0] lookup.
    metadata_df = (pd.DataFrame(metadata[1:], columns=metadata[0])
                   if metadata else pd.DataFrame())
    graph_df = (pd.DataFrame(graph_data[1:], columns=graph_data[0])
                if graph_data else pd.DataFrame())
    return metadata_df, graph_df

def parse_gdf_video(file_name, n_metadata_columns=14):
    """Parse a video-network gdf file into a metadata and an edge frame.

    Each line is split on commas. Lines with more than three fields are
    treated as node metadata and truncated to ``n_metadata_columns`` fields;
    lines with exactly three fields are treated as edges. In both groups the
    first such line supplies the column names. Lines with fewer than three
    fields are ignored.

    Parameters
    ----------
    file_name : str
        Path of the gdf file, read as UTF-8.
    n_metadata_columns : int, optional
        Number of leading fields kept from every metadata line, including the
        header (defaults to the previously hard-coded 14).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(metadata, edges)``; a group with no lines yields an empty frame.
    """
    with open(file_name, encoding='utf-8') as f:
        rows = [line.strip().split(',') for line in f]

    # Surplus trailing fields on a metadata line are discarded.
    metadata = [row[:n_metadata_columns] for row in rows if len(row) > 3]
    graph_data = [row for row in rows if len(row) == 3]

    # Guard against a missing section instead of failing on the [0] lookup.
    metadata_df = (pd.DataFrame(metadata[1:], columns=metadata[0])
                   if metadata else pd.DataFrame())
    graph_df = (pd.DataFrame(graph_data[1:], columns=graph_data[0])
                if graph_data else pd.DataFrame())
    return metadata_df, graph_df
    

def all_caps_to_proper(string):
    """Return ``string`` with every fully upper-case token capitalised.

    The text is split on single spaces; a token for which ``str.isupper()``
    holds is replaced by its ``capitalize()`` form, all other tokens are kept
    unchanged. Surrounding whitespace is stripped from the result.
    """
    normalised_tokens = (
        word.capitalize() if word.isupper() else word
        for word in string.split(' ')
    )
    return ' '.join(normalised_tokens).strip()
            

In [9]:
def join_update_clean_dfs(df_graph, df_metadata, title, replace_node_ids=True):
    """Strip the gdf type suffixes from column names and optionally relabel edges.

    Both input frames are renamed in place, so the caller's objects are
    modified. When ``replace_node_ids`` is exactly ``True``, the edge frame is
    rebuilt with the node ids of both endpoints replaced by the matching
    metadata labels (columns ``node1``, ``node2``, ``directed``); both lookups
    are right joins against the metadata. ``title`` is accepted but not used.

    Returns the tuple ``(df_metadata, df_graph)``.
    """
    node_column_names = {
        'nodedef>name VARCHAR': 'nodeid',
        'label VARCHAR': 'label',
        'subscriberCount INT': 'subscriberCount',
    }
    edge_column_names = {
        'edgedef>node1 VARCHAR': 'nodeid_1',
        'node2 VARCHAR': 'nodeid_2',
        'directed BOOLEAN': 'directed',
    }
    df_metadata.rename(columns=node_column_names, inplace=True)
    df_graph.rename(columns=edge_column_names, inplace=True)

    # only the literal True triggers relabelling
    if replace_node_ids is not True:
        return df_metadata, df_graph

    label_lookup = df_metadata[['label', 'nodeid']]

    # first endpoint: node id -> label
    source_joined = df_graph.merge(label_lookup,
                                   left_on='nodeid_1',
                                   right_on='nodeid',
                                   how='right')
    df_graph = source_joined[['label', 'nodeid_2', 'directed']].rename(
        columns={'label': 'node1'})

    # second endpoint: node id -> label
    target_joined = df_graph.merge(label_lookup,
                                   left_on='nodeid_2',
                                   right_on='nodeid',
                                   how='right')
    df_graph = target_joined[['node1', 'label', 'directed']].rename(
        columns={'label': 'node2'})

    return df_metadata, df_graph


def load_video_data(filename):
    """Load a video-network gdf file, enrich its metadata and export CSVs.

    Reads ``filename + '.gdf'``, adds the columns ``label_clean``,
    ``title_language``, ``url`` and ``seed_channel`` to the metadata, and
    writes ``filename + '_metadata.csv'`` and ``filename + '_relations.csv'``.

    Parameters
    ----------
    filename : str
        Path of the gdf file without the ``.gdf`` extension.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The enriched metadata frame and the relabelled relations frame.
    """
    df_video_metadata, df_video_rel = parse_gdf_video(filename + '.gdf')
    # clean up titles for language classification
    df_video_metadata['label_clean'] = df_video_metadata['label VARCHAR'].apply(all_caps_to_proper)
    # detect video title language
    # NOTE(review): langdetect may raise on titles without usable text and is
    # reportedly non-deterministic unless seeded - confirm against its docs.
    df_video_metadata['title_language'] = df_video_metadata['label_clean'].apply(detect)
    # clean column names and update node ids
    df_video_metadata, df_video_graph = join_update_clean_dfs(df_video_rel, df_video_metadata, filename)
    df_video_metadata['url'] = 'https://www.youtube.com/watch?v=' + df_video_metadata['nodeid']
    # get channel ids for seed videos
    is_seed_video = df_video_metadata['isSeed VARCHAR'] == 'yes'
    seed_channel_ids = df_video_metadata.loc[is_seed_video, 'channelId VARCHAR'].unique()
    # flag every video whose channel also published a seed video
    # (vectorised membership test instead of a per-row scan of a list)
    df_video_metadata['seed_channel'] = df_video_metadata['channelId VARCHAR'].isin(seed_channel_ids)
    df_video_metadata.to_csv(filename + '_metadata.csv')
    df_video_graph.to_csv(filename + '_relations.csv')
    return df_video_metadata, df_video_graph
    

def load_channel_data(filename):
    """Load a channel-network gdf file, add channel URLs and export CSVs.

    Reads ``filename + '.gdf'``, cleans the column names, replaces node ids
    in the relations by labels, adds a ``url`` column to the metadata and
    writes ``filename + '_metadata.csv'`` and ``filename + '_relations.csv'``.
    Returns the tuple ``(metadata, relations)``.
    """
    gdf_path = filename + '.gdf'
    raw_metadata, raw_relations = parse_gdf_channel(gdf_path)

    # clean column names and update node ids
    channel_metadata, channel_graph = join_update_clean_dfs(raw_relations, raw_metadata, filename)

    channel_urls = 'https://www.youtube.com/channel/' + channel_metadata['nodeid']
    channel_metadata['url'] = channel_urls

    # export both frames next to the source file
    exports = ((channel_metadata, '_metadata.csv'), (channel_graph, '_relations.csv'))
    for frame, suffix in exports:
        frame.to_csv(filename + suffix)
    return channel_metadata, channel_graph
    

    
# Parse the video network export and write its *_metadata.csv / *_relations.csv files
df_video_metadata, df_video_graph = load_video_data(
    'masha_and_shark_2019_05_14'
)