In [1]:
# Import the modules used throughout the notebook (including NetworkX and Seaborn)
import json
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx

# Generating the network
As a first step in analysing the dataset, we need to build a network on which all the subsequent analyses are based. This is done by creating a co-occurrence network of Spotify artist names from samples of the dataset's playlists.

For this, we'll use an adaptation of the code found in [vaslnk's repository](https://github.com/vaslnk/Spotify-Song-Recommendation-ML/blob/master/restructureData.py).

In [2]:
from utils import list_slices_filepaths

def full_artists_data(data_dir: str = 'data_CSV/') -> pd.DataFrame:
    '''
    Concatenate every slice .csv listed for a folder into one DataFrame.

    Parameters:
    -----------
        data_dir: str
            Folder that holds the slice .csv files. Defaults to 'data_CSV/',
            the folder this function has always read from.
    Output:
    -------
        slice_df: pd.DataFrame
            Rows of all slices stacked in the order returned by
            list_slices_filepaths. Each slice keeps its own row index, so
            index labels repeat across slices.
    Raises:
    -------
        ValueError
            If list_slices_filepaths returns no file for data_dir.
    '''
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring `os` into scope.
    import os

    # NOTE(review): list_slices_filepaths is assumed to return bare file names
    # relative to data_dir (the original code prefixed each one with the folder).
    slice_filenames = list(list_slices_filepaths(data_dir))
    if not slice_filenames:
        # pd.concat([]) would raise a generic "No objects to concatenate";
        # keep the exception type but say which folder was empty.
        raise ValueError('No slice .csv files found in {!r}'.format(data_dir))

    slice_df_array = [
        pd.read_csv(os.path.join(data_dir, slice_filename))
        for slice_filename in slice_filenames
    ]
    slice_df = pd.concat(slice_df_array)

    return slice_df

In [3]:
# Load every slice into a single frame, then preview its first five rows.
artists_df = full_artists_data()
artists_df.head(n=5)

Unnamed: 0,artist_name,pid
0,Nina Simone,414000
1,The Jackson 5,414000
2,The Temptations,414000
3,Bill Withers,414000
4,Bill Withers,414000


In [4]:
# Overall size of the concatenated data and how many distinct artists it holds.
data_shape = artists_df.shape
unique_artist_count = artists_df['artist_name'].nunique()
print('Data shape:', data_shape)
print('Number of unique artists: ', unique_artist_count)

Data shape: (269580, 2)
Number of unique artists:  22626


In [5]:
# Build a pd.Series indexed by playlist id (pid) whose values are the list of
# artist names in that playlist, with repetitions kept. The repetitions still
# have to be counted per playlist afterwards.
artist_names_by_pid = artists_df.groupby('pid')['artist_name']
artist_df_minimal = artist_names_by_pid.apply(list)
artist_df_minimal.head(n=5)

pid
161000    [Shawn Mendes, Cheat Codes, DJ Khaled, Natasha...
161001    [Counting Crows, Sister Hazel, Gin Blossoms, N...
161002    [George Strait, George Strait, George Strait, ...
161003    [Gipsy Kings, Gipsy Kings, Texas Tornados, Fla...
161004    [N2DEEP, Warren G, Westside Connection, Digabl...
Name: artist_name, dtype: object

In [6]:
# Replace each playlist's list of artist names with a {artist: occurrences}
# dict. value_counts() keeps the dict ordered from most to least frequent.
# Series.iteritems() was removed in pandas 2.0, so the Series is rebuilt with
# map() instead of being mutated while iterating over it. Entries that are
# already dicts are left untouched, so re-running this cell does not end up
# counting the counts.
artist_df_minimal = artist_df_minimal.map(
    lambda artists: artists
    if isinstance(artists, dict)
    else pd.Series(artists).value_counts().to_dict()
)

In [7]:
# Preview the per-playlist {artist: occurrences} dicts.
artist_df_minimal.head(n=5)

pid
161000    {'The Chainsmokers': 4, 'Halsey': 3, 'Borgeous...
161001    {'Green Day': 6, 'Everclear': 5, 'Sugar Ray': ...
161002    {'George Strait': 13, 'Kenny Chesney': 10, 'Bl...
161003    {'Gipsy Kings': 4, 'Mongo Santamaria': 2, 'Tex...
161004    {'Ice Cube': 2, 'Westside Connection': 2, 'The...
Name: artist_name, dtype: object

In [9]:
# Build the artist graph: one node per artist, carrying the list of playlist
# ids (PID) in which that artist appears.
ARTISTS_GRAPH = nx.Graph()

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for PID, artist_dict in artist_df_minimal.items():
    for node_name, node_frequency in artist_dict.items():
        if not ARTISTS_GRAPH.has_node(node_name):
            # First time this artist is seen: register the node and add a
            # self-loop weighted by its occurrences in this playlist minus one.
            # NOTE(review): the self-loop weight only reflects the first
            # playlist containing the artist; later playlists only extend the
            # PID list. Confirm this is intended.
            ARTISTS_GRAPH.add_node(node_name, PID=[PID])
            ARTISTS_GRAPH.add_edge(node_name, node_name, weight=(node_frequency - 1))
        elif PID not in ARTISTS_GRAPH.nodes[node_name]['PID']:
            ARTISTS_GRAPH.nodes[node_name]['PID'].append(PID)

# NOTE(review): this prints every node with its full PID list, which is a very
# large output; consider printing a summary or a small sample instead.
print(ARTISTS_GRAPH.nodes(data=True))


[('The Chainsmokers', {'PID': [161000, 161018, 161028, 161033, 161044, 161062, 161073, 161076, 161086, 161098, 161105, 161129, 161156, 161162, 161167, 161169, 161180, 161184, 161190, 161200, 161225, 161227, 161230, 161238, 161242, 161245, 161284, 161289, 161292, 161298, 161300, 161312, 161335, 161339, 161340, 161347, 161348, 161355, 161371, 161385, 161420, 161422, 161432, 161440, 161452, 161456, 161474, 161514, 161519, 161525, 161537, 161538, 161554, 161570, 161583, 161584, 161587, 161589, 161590, 161602, 161625, 161630, 161631, 161633, 161654, 161665, 161667, 161677, 161678, 161706, 161731, 161735, 161742, 161743, 161793, 161797, 161811, 161832, 161845, 161846, 161858, 161868, 161873, 161889, 161890, 161902, 161907, 161911, 161917, 161927, 161930, 161942, 161951, 161955, 161981, 161989, 161993, 161994, 414011, 414022, 414023, 414031, 414037, 414064, 414067, 414077, 414081, 414086, 414090, 414099, 414105, 414109, 414131, 414135, 414143, 414164, 414175, 414204, 414206, 414209, 414210, 4