# LoadFullGraphs

This notebook reads the JSON network files and writes tidy CSV files for ML work. It also contains some network analysis.

In [261]:
# Standard imports
import requests
import regex as re
import json
import time
import random
import networkx as nx
import pandas as pd

In [218]:
def open_file(filepath:str='cache.json')->dict:
    '''Opens a json file at location filepath and outputs a dictionary.

    The file is read inside a ``with`` block so the handle is closed even if
    reading or JSON parsing raises. It is decoded as UTF-8 (the encoding JSON
    is specified in) instead of the platform's locale default.

    Parameters:
        filepath: path of the JSON file to read; defaults to 'cache.json'.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: if the file cannot be opened (e.g. FileNotFoundError).
        json.JSONDecodeError: if the content is not valid JSON.
    '''
    with open(filepath, 'r', encoding='utf-8') as cache_file:
        return json.load(cache_file)

In [219]:
def load_full_network(playlist_link:str,random_link:str,cache_file:str)->nx.Graph:
    '''Build one undirected collaboration graph from the playlist and random networks.

    Parameters:
        playlist_link: path of the playlist network json.
        random_link: path of the random network json.
        cache_file: path of the json cache holding per-artist info.

    Returns:
        An nx.Graph whose nodes are artist names and whose edges link an artist
        to each of its collaborators (self-collaborations are skipped).
        Every node carries a 'source' attribute ('playlist' or 'random').
        Artists that are keys of either network file also carry that file's
        attributes; other nodes of the playlist graph are filled from the cache.
        Nodes that first appear only as collaborators in the random network
        carry nothing but 'source'.

    Note:
        Any artist that is a key of the random network ends up with
        in_playlist False and source 'random', even if it is also a key of
        the playlist network.
    '''
    # read the three json inputs
    playlist_network = open_file(playlist_link)
    random_network = open_file(random_link)
    cache = open_file(cache_file)

    g = nx.Graph()

    # Playlist layer. The playlist file has no 'name' key, so add it here;
    # add_edge creates the collaborator node when it does not exist yet.
    for artist, info in playlist_network.items():
        info['name'] = artist
        g.add_node(artist)
        for partner in info['collaborators']:
            if partner != artist:
                g.add_edge(artist, partner)

    # Attributes for everything found so far: playlist artists use their own
    # record, their collaborators fall back to the cache.
    for node in g.nodes():
        attrs = g.nodes[node]
        attrs['source'] = 'playlist'
        if node in playlist_network:
            attrs.update(playlist_network[node])
        else:
            attrs.update(cache[node])
            attrs['in_playlist'] = False
            attrs['name'] = node

    # Random layer. Collaborators reached from here are (re)labelled 'random',
    # including nodes that were already present from the playlist layer.
    for artist, info in random_network.items():
        if artist not in g:
            g.add_node(artist, source='random')
        for partner in info['collaborators']:
            if partner == artist:
                continue
            g.add_edge(artist, partner)
            g.nodes[partner]['source'] = 'random'

    # Attributes for the artists that are keys of the random network.
    for node in g.nodes():
        if node not in random_network:
            continue
        attrs = g.nodes[node]
        attrs.update(random_network[node])
        attrs['in_playlist'] = False
        attrs['name'] = node
        if node not in playlist_network:
            # cache values win over the random file for non-playlist artists
            attrs.update(cache[node])
        attrs['source'] = 'random'
    return g

In [220]:
# Build the complete network for each weekly playlist snapshot; all three
# weeks reuse the same random network file and the same cache.
week1_g, week2_g, week3_g = (
    load_full_network(playlist_json, '10_26_random_2layer.json', 'cache.json')
    for playlist_json in (
        '10_26_playlist_layer2.json',
        '11_01_playlist_layer2.json',
        '11_08_playlist_layer2.json',
    )
)

Next, I gather information about each weekly network to see how the networks change from week to week.

In [221]:
# One dataframe per week: each row is a node, each column a node attribute
week1_df, week2_df, week3_df = (
    pd.DataFrame.from_dict(dict(weekly_graph.nodes(data=True)), orient='index')
    for weekly_graph in (week1_g, week2_g, week3_g)
)
# show which attribute columns are available
print(week1_df.columns)

Index(['source', 'genres', 'img_info', 'popularity', 'followers',
       'collaborators', 'in_playlist', 'link', 'name', 'api_link'],
      dtype='object')


In [222]:
# Pair each week's graph with its dataframe so later cells can loop over weeks
graphs = {
    label: {'graph': weekly_graph, 'df': weekly_df}
    for label, weekly_graph, weekly_df in (
        ('week1', week1_g, week1_df),
        ('week2', week2_g, week2_df),
        ('week3', week3_g, week3_df),
    )
}

In [223]:
# Copy the index (the node name) into a regular 'name' column
for weekly_df in (week3_df, week2_df, week1_df):
    weekly_df['name'] = weekly_df.index

In [224]:
# NOTE: this cell takes about an hour to run, which is why the run was cancelled.

# (column name, networkx function) pairs, in the order the columns are added
node_metrics = (
    ('pagerank', nx.pagerank),
    ('closeness_cent', nx.closeness_centrality),
    ('clustering', nx.clustering),
    ('deg_cent', nx.degree_centrality),
    ('btwn_centr', nx.betweenness_centrality),
    ('degree', nx.degree),
)

for week, bundle in graphs.items():
    print(week)
    # graph and dataframe belonging to this week
    g, df = bundle['graph'], bundle['df']
    for column, compute in node_metrics:
        scores = compute(g)
        # look up each node's score by its name
        df[column] = df['name'].apply(scores.__getitem__)

week1
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/Skyeler/anaconda3/envs/si649f23/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/r0/2fgtj8y934zddv46dgfbq1xw0000gn/T/ipykernel_19298/1484722221.py", line 9, in <module>
    cc = nx.closeness_centrality(g)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/Skyeler/anaconda3/envs/si649f23/lib/python3.11/site-packages/networkx/algorithms/centrality/closeness.py", line 122, in closeness_centrality
    sp = path_length(G, n)
         ^^^^^^^^^^^^^^^^^
  File "/Users/Skyeler/anaconda3/envs/si649f23/lib/python3.11/site-packages/networkx/classes/backends.py", line 148, in wrapper
    return func(*args, **kwds)
           ^^^^^^^^^^^^^^^^^^^
  File "/Users/Skyeler/anaconda3/envs/si649f23/lib/python3.11/site-packages/networkx/algorithms/shortest_paths/unweighted.py", line 62, in single_source_shortest_path_length
    return dict(_singl

In [None]:
# Example: split week 1's nodes by the query ('playlist' or 'random') recorded
# in their 'source' attribute
playlist_nodes = [
    node
    for node in week1_g.nodes
    if week1_g.nodes[node]['source'] == 'playlist'
]
random_nodes = [
    node
    for node in week1_g.nodes
    if week1_g.nodes[node]['source'] == 'random'
]

In [None]:
# Tag every weekly dataframe with its week number (1, 2, 3)
for week_number, week_label in enumerate(('week1', 'week2', 'week3'), start=1):
    graphs[week_label]['df']['week'] = week_number
# Stack the three weeks into one long dataframe for analysis
mega = pd.concat(
    [graphs[week_label]['df'] for week_label in ('week1', 'week2', 'week3')],
    axis=0,
)

In [None]:
# Mean of every network metric, grouped by playlist membership and week
playlist_week = (
    mega.groupby(['in_playlist', 'week'])[
        ['pagerank', 'closeness_cent', 'clustering', 'deg_cent', 'degree', 'btwn_centr']
    ]
    .mean()
    .reset_index()
)
playlist_week

Unnamed: 0,in_playlist,week,pagerank,closeness_cent,clustering,deg_cent,degree,btwn_centr
0,False,1,4.7e-05,0.182954,0.072061,0.000168,6.066004,0.000274
1,False,2,4.7e-05,0.183071,0.071849,0.000169,6.09353,0.000277
2,False,3,4.7e-05,0.183057,0.072211,0.00017,6.107378,0.000278
3,True,1,0.000116,0.223681,0.138634,0.000469,16.964789,0.002109
4,True,2,0.000114,0.222563,0.140451,0.000466,16.798561,0.002047
5,True,3,0.000113,0.222744,0.143961,0.000461,16.6,0.002024


In [259]:
# Average each node's metrics over its rows (one per week it appears in),
# then show the mean of those per-node averages
x = (
    mega.groupby(['name'])[
        ['pagerank', 'closeness_cent', 'clustering', 'deg_cent', 'degree', 'btwn_centr']
    ]
    .mean()
    .reset_index()
)
x[['pagerank', 'closeness_cent', 'clustering', 'deg_cent', 'degree', 'btwn_centr']].mean()

pagerank          0.000028
closeness_cent    0.183444
clustering        0.081526
deg_cent          0.000094
degree            3.377028
btwn_centr        0.000123
dtype: float64

In [260]:
# Average each node's metrics within its in_playlist group, then average those
# per-node means for nodes in the playlist versus nodes not in it
average_node = (
    mega.groupby(['in_playlist', 'name'])[
        ['pagerank', 'closeness_cent', 'clustering', 'deg_cent', 'degree', 'btwn_centr']
    ]
    .mean()
    .reset_index()
)
average_node.groupby('in_playlist')[
    ['pagerank', 'closeness_cent', 'clustering', 'deg_cent', 'degree', 'btwn_centr']
].mean()

Unnamed: 0_level_0,pagerank,closeness_cent,clustering,deg_cent,degree,btwn_centr
in_playlist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,4.6e-05,0.182982,0.07118,0.000165,5.964489,0.00027
True,0.000113,0.224034,0.14018,0.000458,16.513514,0.002055


In [None]:
############# For all items in each week #############
# For each metric, print the distinct names among the 9 highest-scoring rows.
# Rows are per week, so one artist can occupy several of the 9 rows.
for heading, column in (
    ('Top Pagerank', 'pagerank'),
    ('Highest degree', 'degree'),
    ('Highest closeness cent', 'closeness_cent'),
    ('Highest btwn cent', 'btwn_centr'),
):
    top_rows = mega.sort_values(column, ascending=False).head(9)
    print(heading, top_rows['name'].unique())
# number of distinct names that have a clustering value of exactly 1
print('There are these many nodes with clustering of 1: ',len(mega[mega.clustering==1]['name'].unique()))

Top Pagerank ['Various Artists' 'Gioachino Rossini' 'Christoph Willibald Gluck']
Highest degree ['Various Artists' 'Christoph Willibald Gluck' 'Gioachino Rossini']
Highest closeness cent ['Various Artists' "Orchestre National de l'O.R.T.F." 'Maurice André']
Highest btwn cent ['Various Artists' "Orchestre National de l'O.R.T.F." 'Maurice André']
There are these many nodes with clustering of 1:  2021


The most important nodes by PageRank are 'Various Artists', 'Gioachino Rossini', and 'Christoph Willibald Gluck'. They are also the artists with the highest degree.

In [None]:
############# For nodes in playlist #############
# keep only the rows whose node is in the playlist
in_playlist_rows = mega[mega.in_playlist==True]
# For each metric, print the distinct names among the 9 highest-scoring rows
for heading, column in (
    ('Top Pagerank', 'pagerank'),
    ('Highest degree', 'degree'),
    ('Highest closeness cent', 'closeness_cent'),
    ('Highest btwn cent', 'btwn_centr'),
):
    top_rows = in_playlist_rows.sort_values(column, ascending=False).head(9)
    print(heading, top_rows['name'].unique())
# names of the playlist nodes that have a clustering value of exactly 1
print('There are these many nodes with clustering of 1: ',in_playlist_rows[in_playlist_rows.clustering==1]['name'].unique())

Top Pagerank ['Jetason' 'mavzy grx' 'Egzod']
Highest degree ['Jetason' 'mavzy grx' 'Egzod']
Highest closeness cent ['Bruno Mars' 'Waxel' 'Playboi Carti']
Highest btwn cent ['Egzod' 'Gulmee' 'Jetason' 'Hanumankind']
There are these many nodes with clustering of 1:  ['JOVINCII' 'Rave The Storm' 'Arnie Way' 'HH韩湘子' 'Hasan Aydın'
 'Sanjesh Meshram']


In [None]:
############# Analysis of only Classical Music #############
t = mega.copy()
# Assign the filled column back instead of calling fillna(inplace=True) on
# t.genres: the chained in-place form acts on an intermediate Series, is
# deprecated, and leaves t unchanged under pandas Copy-on-Write.
t['genres'] = t['genres'].fillna('')
# flag rows whose genres value, rendered as text, contains 'classical'
t['classical'] = t.genres.apply(lambda z: 'classical' in str(z))
t=t[t.classical]
# average each classical artist's metrics over its rows
t=t.groupby('name')[['pagerank','closeness_cent','clustering','deg_cent','btwn_centr','degree']].mean().reset_index()
print('Classical Musicians')
# mean of the per-artist averages
t[['pagerank','closeness_cent','clustering','deg_cent','btwn_centr','degree']].mean()

Classical Musicians


pagerank           0.000572
closeness_cent     0.225286
clustering         0.129495
deg_cent           0.002648
btwn_centr         0.002903
degree            95.534722
dtype: float64

In [None]:
############# For nodes NOT in playlist #############
# keep only the rows whose node is not in the playlist
not_in_playlist_rows = mega[mega.in_playlist==False]
# For each metric, print the distinct names among the 9 highest-scoring rows
for heading, column in (
    ('Top Pagerank', 'pagerank'),
    ('Highest degree', 'degree'),
    ('Highest closeness cent', 'closeness_cent'),
    ('Highest btwn cent', 'btwn_centr'),
):
    top_rows = not_in_playlist_rows.sort_values(column, ascending=False).head(9)
    print(heading, top_rows['name'].unique())
# names of the non-playlist nodes that have a clustering value of exactly 1
print('There are these many nodes with clustering of 1: ',not_in_playlist_rows[not_in_playlist_rows.clustering==1]['name'].unique())

Top Pagerank ['Various Artists' 'Gioachino Rossini' 'Christoph Willibald Gluck']
Highest degree ['Various Artists' 'Christoph Willibald Gluck' 'Gioachino Rossini']
Highest closeness cent ['Various Artists' "Orchestre National de l'O.R.T.F." 'Maurice André']
Highest btwn cent ['Various Artists' "Orchestre National de l'O.R.T.F." 'Maurice André']
There are these many nodes with clustering of 1:  ['Yoko Gold' 'Chris Sonic' 'Arya mewada' 'Arya Mewada' 'Marc Rain' 'NYK'
 'Jon Nelson' 'Lady Yasmina' 'Cast of Joker: Folie à Deux' 'Zenit'
 'Trevor Spitta' 'Meta' 'Axciss' 'Nalty' 'AKSD' 'GJBR' 'Filip Melvan'
 'Daniel Prisco-Buxbaum' 'Marcello' 'YUEM' 'The Applepolishers' 'The Yes'
 'NONS!DES' 'ech0' 'Funk Orquestra' 'JOTTA R' 'Feelix' 'H4rdEdge'
 'Lacuna Manti' 'Black Auerbach' 'Schipe' 'WVWII' 'Nikke Yumnam'
 'Studio Rio' 'Tokiah' 'azmaze.' 'Racha' 'Txuilson' 'Quinncy' 'Dj Dags'
 'Aytide' 'SOFI' 'MARRVELLO' 'MAC KYINJING' 'HEVALEN' 'Charon'
 'Neil Sircar' 'Faaves Music' 'Eagle-Eye Cherry' 'Rea

In [None]:
# Compare the mean metrics across playlist membership, query source
# ('random' or 'playlist'), and week
(
    mega.groupby(['in_playlist', 'source', 'week'])[
        ['pagerank', 'closeness_cent', 'clustering', 'deg_cent', 'degree', 'btwn_centr']
    ]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pagerank,closeness_cent,clustering,deg_cent,degree,btwn_centr
in_playlist,source,week,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,playlist,1,2.2e-05,0.175267,0.053805,6.7e-05,2.406095,5e-05
False,playlist,2,2.2e-05,0.175277,0.052675,6.7e-05,2.401583,5e-05
False,playlist,3,2.2e-05,0.175233,0.053135,6.7e-05,2.402312,5e-05
False,random,1,9.9e-05,0.198879,0.109884,0.000377,13.648421,0.00074
False,random,2,9.8e-05,0.198846,0.11066,0.000376,13.566504,0.000735
False,random,3,9.9e-05,0.198842,0.110696,0.000377,13.582117,0.000738
True,playlist,1,0.000103,0.223608,0.154641,0.00041,14.850467,0.001935
True,playlist,2,0.000102,0.222161,0.159711,0.000406,14.637255,0.001866
True,playlist,3,0.000102,0.221795,0.162075,0.000408,14.693069,0.00187
True,random,1,0.000153,0.223906,0.089698,0.000648,23.428571,0.002643


The average shortest path length (ASP) cannot be computed for the full graph because it is not connected, so let's compute the ASP of the largest connected component instead.

In [265]:
print('Number of Connected Components per Graph: ')
# one count per week, in week order
tuple(
    nx.number_connected_components(graphs[label]['graph'])
    for label in ('week1', 'week2', 'week3')
)

Number of Connected Components per Graph: 


(29, 29, 29)

In [266]:
print('Size of largest component per graph:')
# Take the maximum component size directly instead of sorting every component
# and keeping only the first one; one size per week, in week order.
tuple(
    max(len(component) for component in nx.connected_components(graphs[label]['graph']))
    for label in ('week1', 'week2', 'week3')
)

Size of largest component per graph:


(36043, 35915, 35881)

In [267]:
# Node set of the largest connected component of each weekly graph.
# max(..., key=len) picks the same component as sorting by size and taking the
# first element, without sorting all of the components.
nodes1 = max(nx.connected_components(graphs['week1']['graph']), key=len)
nodes2 = max(nx.connected_components(graphs['week2']['graph']), key=len)
nodes3 = max(nx.connected_components(graphs['week3']['graph']), key=len)

# Subgraph of each weekly graph restricted to its largest component
lc_1 = nx.subgraph(graphs['week1']['graph'],nodes1)
lc_2 = nx.subgraph(graphs['week2']['graph'],nodes2)
lc_3 = nx.subgraph(graphs['week3']['graph'],nodes3)

In [269]:
# Average shortest path length of each week's largest-component subgraph
tuple(
    nx.average_shortest_path_length(component)
    for component in (lc_1, lc_2, lc_3)
)

Save the data, making sure it is written to CSV files for my peers.

In [227]:
# Load the artist cache; NA values need to be filled from it before saving
cache = open_file(filepath='cache.json')

In [None]:
def update_null_collaborators(row:pd.Series):
    '''Return the collaborators value for a row, filling a null from the cache.

    Parameters:
        row: one dataframe row with 'name' and 'collaborators' entries.

    Returns:
        The row's own collaborators when they are not null. Otherwise the
        'collaborators' entry stored for the row's name in the module-level
        cache, or the unchanged (null) value when the cache has no such entry.
    '''
    current = row['collaborators']
    # nothing to fill when the row already has collaborators
    if not row.isna()['collaborators']:
        return current
    artist = row['name']
    if artist in cache and 'collaborators' in cache[artist]:
        return cache[artist]['collaborators']
    # no cached collaborators either: leave the null as it is
    return current

In [229]:
# Fill null collaborators in every weekly dataframe, one row at a time
for week_label in ('week3', 'week2', 'week1'):
    weekly_df = graphs[week_label]['df']
    weekly_df['collaborators'] = weekly_df.apply(update_null_collaborators, axis=1)

In [None]:
# Drop the rows that still have no collaborators, because those nodes are the
# fourth layer of the network.
# Previously week2 and week1 called dropna(inplace=True) on the 'collaborators'
# column, which does not remove any rows from the dataframe, while week3
# dropped rows with an NA in any column. All three weeks now use the same rule.
# NOTE(review): week3 now keeps rows whose only NAs are in other columns -
# confirm that this is the intended filter.
for week_label in ('week3', 'week2', 'week1'):
    graphs[week_label]['df'].dropna(subset=['collaborators'], inplace=True)

In [254]:
# Write each week's dataframe to its own csv file for my peers
for csv_name, week_label in (
    ('week1.csv', 'week1'),
    ('week2.csv', 'week2'),
    ('week3.csv', 'week3'),
):
    graphs[week_label]['df'].to_csv(csv_name)
