# EDA 


In [10]:
import networkx as nx
import json 
import pandas
import matplotlib.pyplot as plt
import altair as alt
import nx_altair as nxa
from itertools import count

# Turn off Altair's row-count limit on chart datasets for this notebook.
alt.data_transformers.disable_max_rows() # No maximum dataset

DataTransformerRegistry.enable('default')

In [11]:
# Custom Altair theme, adapted from https://www.geeksforgeeks.org/setting-a-custom-color-theme-in-altair-as-default/
def brat():
    """Build the 'brat' Altair theme specification.

    Returns:
        dict: a fresh ``{"config": {...}}`` mapping with the default view
        size, default mark colour, axis label/title styling and the
        categorical colour palette.
    """
    # Categorical palette, in the order colours are assigned to categories.
    palette = ["#8ACE00","#D81E5B", "#35605A", "#420039", "#BDD4E7"]

    view_size = dict(continuousWidth=400, continuousHeight=300)
    default_mark = dict(color="steelblue")
    axis_style = dict(
        labelFontSize=12,
        titleFontSize=14,
        labelColor="gray",
        titleColor="black",
    )

    theme_config = dict(
        view=view_size,
        mark=default_mark,
        axis=axis_style,
        range=dict(category=palette),
    )
    return {"config": theme_config}

# Register the theme function under the name 'brat', then select it as the
# active Altair theme.
alt.themes.register('brat', brat)
alt.themes.enable('brat')

ThemeRegistry.enable('brat')

In [12]:
def open_file(path:str)->dict:
    """Load a JSON file from disk and return its parsed contents.

    Args:
        path: location of the JSON file to read.

    Returns:
        The decoded JSON document (a dict for the network files used here).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # The context manager closes the handle even when parsing fails; JSON is
    # UTF-8 by specification, so do not rely on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as cache_file:
        return json.load(cache_file)


def get_graph(network:dict)->nx.Graph:
    """Turn an artist dictionary into an undirected collaboration graph.

    Every key of ``network`` becomes a node, and each key of that artist's
    ``'collaborators'`` mapping becomes a neighbour (self-references are
    skipped). Nodes that are keys of ``network`` receive all of that entry's
    fields as node attributes; nodes that only appear as collaborators get
    ``in_playlist=False`` and ``name`` set to the node id.

    Args:
        network: mapping of artist id -> record containing at least a
            ``'collaborators'`` mapping.

    Returns:
        nx.Graph: the annotated collaboration graph.
    """
    graph = nx.Graph()

    # Pass 1: structure. add_edge creates a missing endpoint on its own, so
    # collaborators that are not top-level keys still become nodes.
    for artist, record in network.items():
        graph.add_node(artist)
        for partner in record['collaborators'].keys():
            if partner == artist:
                continue
            graph.add_edge(artist, partner)

    # Pass 2: attributes.
    for node in graph.nodes():
        attrs = graph.nodes[node]
        if node not in network.keys():
            attrs['in_playlist'] = False
            attrs['name'] = node
            continue
        attrs.update(network[node])
    return graph

def SCC_plot (network:dict,title:str,seed=None):
    """Plot the largest connected component of a collaboration network.

    The network is converted to an undirected graph with ``get_graph``, the
    connected component with the most nodes is extracted, and it is drawn
    with a spring layout. Nodes are coloured by their ``in_playlist``
    attribute and show their ``name`` attribute as a tooltip.

    Args:
        network: artist dictionary accepted by ``get_graph``.
        title: chart title.
        seed: optional seed forwarded to ``nx.spring_layout``. The default
            ``None`` keeps the layout random on every call; pass an integer
            to get the same node positions on every run.

    Returns:
        The interactive edge chart layered with the interactive node chart
        (800x800, titled ``title``).

    Raises:
        ValueError: if the network produces a graph with no nodes.
    """
    g = get_graph(network)
    if g.number_of_nodes() == 0:
        # max() over zero components would raise an opaque ValueError.
        raise ValueError("cannot plot the largest connected component of an empty network")

    largest_cc = max(nx.connected_components(g), key=len)
    g_scc = g.subgraph(largest_cc).copy()
    position = nx.spring_layout(g_scc, seed=seed)

    edges = nxa.draw_networkx_edges(g_scc,pos=position).interactive()
    nodes = nxa.draw_networkx_nodes(
        g_scc,
        pos=position,
        node_color='in_playlist',
        tooltip=['name'],
    ).properties(
        width=800,
        height=800,
        title=title
    ).interactive()
    return edges + nodes



# Summary statistics for a collaboration network, returned as a one-row dataframe.
def network_statistics(network:dict)->pandas.DataFrame:
    """Compute headline statistics for a collaboration network.

    Args:
        network: artist dictionary accepted by ``get_graph``.

    Returns:
        pandas.DataFrame: a single row with the columns ``avg_degree``,
        ``avg_clustering``, ``num_connected_components`` and
        ``avg_path_length``. The path length is measured on the largest
        connected component only.
    """
    graph = get_graph(network)

    # Mean degree over every node in the undirected graph.
    degree_total = sum(degree for _, degree in graph.degree())
    mean_degree = degree_total / len(graph.nodes())

    mean_clustering = nx.average_clustering(graph)

    # Number of connected components in the undirected graph.
    component_count = nx.number_connected_components(graph)

    # Average shortest-path length, restricted to the largest component.
    biggest_component = max(nx.connected_components(graph), key=len)
    core = graph.subgraph(biggest_component).copy()
    mean_path_length = nx.average_shortest_path_length(core)

    summary = {
        'avg_degree': mean_degree,
        'avg_clustering': mean_clustering,
        'num_connected_components': component_count,
        'avg_path_length': mean_path_length,
    }
    return pandas.DataFrame({column: [value] for column, value in summary.items()})




In [13]:
# Load the 1-layer random and playlist networks from their JSON files, then
# plot the largest connected component of the random network (the last
# expression is what the cell displays).
random_network = open_file('10_26_random_1layer.json')
layer_network = open_file('10_26_playlist_layer1.json')
SCC_plot(random_network,'Random Network Strongest Connected Component 10/26/24')



In [14]:
# Plot the largest connected component of the 1-layer playlist network.
SCC_plot(layer_network,'Instagram playlist Network Strongest Connected Component 10/26/24')

In [15]:
# Load the 1-layer and 2-layer networks (random sample vs. playlist).
# The 2-layer networks are loaded here but not used in this cell.
random_network = open_file('10_26_random_1layer.json')
layer_network = open_file('10_26_playlist_layer1.json')
random2_network = open_file('10_26_random_2layer.json')
layer2_network = open_file('10_26_playlist_layer2.json')

random_network_stats = network_statistics(random_network)
layer_network_stats = network_statistics(layer_network)

# Stack the two one-row frames. ignore_index gives the rows labels 0 and 1
# instead of two rows both labelled 0, so each row can be addressed by label.
stats = pandas.concat([random_network_stats,layer_network_stats],axis=0,ignore_index=True)
stats['network'] = ['random','layer']
# Move the network label to the first column.
stats = stats[['network','avg_degree','avg_clustering','num_connected_components', 'avg_path_length']]
stats


Unnamed: 0,network,avg_degree,avg_clustering,num_connected_components,avg_path_length
0,random,1.963466,5.6e-05,72,5.938039
0,layer,2.390382,0.050586,17,5.311345


In [16]:
# Largest connected component of the 1-layer random network.
SCC_plot(random_network,'Random Artists Collaborators SCC 10/26/24')



In [17]:


# Largest connected component of the 1-layer playlist network.
SCC_plot(layer_network,'Instagram Reels Top Songs Artists 10/26/24 - 1st Layer SCC')



In [18]:
# Top genres of a network, used below for the random and layer 1 networks.
def get_top_genres(network:dict)->pandas.DataFrame:
    """Count genre occurrences across all artists and return the ten most common.

    Args:
        network: mapping of artist id -> record with a ``'genres'`` iterable.

    Returns:
        pandas.DataFrame: columns ``genre`` and ``count``, sorted by ``count``
        in descending order and truncated to at most ten rows.
    """
    tally = {}
    for record in network.values():
        for label in record['genres']:
            tally[label] = tally.get(label, 0) + 1

    table = pandas.DataFrame(tally.items(),columns=['genre','count'])
    ranked = table.sort_values('count',ascending=False)
    return ranked.head(10)

# Ten most frequent genres in the 1-layer random and playlist networks.
random_genres = get_top_genres(random_network)
layer_genres = get_top_genres(layer_network)
