In [1]:
# Importing the modules needed to run the project
import json
import os
from collections import deque
from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns

# Generating the network
As a first step in analysing the dataset, we need to generate a proper network for all the follow-up analyses. This is done by creating a co-occurrence network of Spotify artist names from samples of the dataset's playlists.

For this, we'll use an adaptation of the code found in [vaslnk's repository](https://github.com/vaslnk/Spotify-Song-Recommendation-ML/blob/master/restructureData.py).

In [2]:
from utils import list_slices_filepaths

def full_artists_data(data_dir: str = 'data_CSV/') -> pd.DataFrame:
    """
    Read every CSV slice listed for ``data_dir`` and stack them into one DataFrame.

    Parameters:
    -----------
        data_dir: str
            Folder holding the slice CSVs. It is handed to
            ``list_slices_filepaths`` and joined with each name that helper
            returns (assumed to be names relative to ``data_dir`` -- TODO
            confirm against ``utils``). Defaults to ``'data_CSV/'``.
    Output:
    -------
        artists_df: pd.DataFrame
            Row-wise concatenation of all slices. Each slice keeps the index
            it was read with, so index labels may repeat across slices.
    Raises:
    -------
        ValueError
            If no slice is listed for ``data_dir``.
    """
    slice_frames = [
        # os.path.join works whether or not data_dir ends with a separator
        pd.read_csv(os.path.join(data_dir, slice_filename))
        for slice_filename in list_slices_filepaths(data_dir)
    ]
    if not slice_frames:
        # pd.concat would also raise ValueError here, but without naming the folder
        raise ValueError(f'No CSV slices found in {data_dir!r}')

    return pd.concat(slice_frames)

In [3]:
# Load every playlist slice into a single DataFrame and preview its first rows
artists_df = full_artists_data()
artists_df.head()

Unnamed: 0,artist_name,pid
0,Nina Simone,414000
1,The Jackson 5,414000
2,The Temptations,414000
3,Bill Withers,414000
4,Bill Withers,414000


In [4]:
# Swap every '$' in an artist name for 'S' (presumably so that names are not
# parsed as math text when used as plot labels -- TODO confirm the motivation).
# The vectorised `.str` accessor leaves missing names untouched instead of
# raising, and regex=False keeps '$' a literal character rather than a regex anchor.
artists_df['artist_name'] = artists_df['artist_name'].str.replace('$', 'S', regex=False)
artists_df.head()

Unnamed: 0,artist_name,pid
0,Nina Simone,414000
1,The Jackson 5,414000
2,The Temptations,414000
3,Bill Withers,414000
4,Bill Withers,414000


In [5]:
# Quick size check: (rows, columns) of the frame and how many distinct artists it holds
print(f'Data shape: {artists_df.shape}')
print(f'Number of unique artists: {artists_df["artist_name"].nunique()}')

Data shape: (269580, 2)
Number of unique artists: 22626


In [6]:
# Collapse the frame into a pd.Series indexed by playlist id (pid) whose values
# are the list of artist names in that playlist, one entry per row, so an
# artist is repeated as many times as it appears. The repetitions are counted
# in the next step.
artist_df_minimal = artists_df.groupby('pid')['artist_name'].agg(list)
artist_df_minimal.head()

pid
161000    [Shawn Mendes, Cheat Codes, DJ Khaled, Natasha...
161001    [Counting Crows, Sister Hazel, Gin Blossoms, N...
161002    [George Strait, George Strait, George Strait, ...
161003    [Gipsy Kings, Gipsy Kings, Texas Tornados, Fla...
161004    [N2DEEP, Warren G, Westside Connection, Digabl...
Name: artist_name, dtype: object

In [7]:
def _count_artists(artists):
    """Turn a playlist's list of artist names into an {artist_name: occurrences} dict."""
    if isinstance(artists, dict):
        # Already counted (this cell was re-run): leave the mapping as it is
        return artists
    return pd.Series(artists).value_counts().to_dict()


# Build the counted Series in one pass instead of assigning into the Series
# while iterating over it; this also avoids `Series.iteritems()`, which was
# removed in pandas 2.0.
artist_df_minimal = artist_df_minimal.map(_count_artists)

In [8]:
# Preview the per-playlist {artist_name: occurrences} mappings
artist_df_minimal.head()

pid
161000    {'The Chainsmokers': 4, 'Halsey': 3, 'Borgeous...
161001    {'Green Day': 6, 'Everclear': 5, 'Sugar Ray': ...
161002    {'George Strait': 13, 'Kenny Chesney': 10, 'Bl...
161003    {'Gipsy Kings': 4, 'Mongo Santamaria': 2, 'Tex...
161004    {'Ice Cube': 2, 'Westside Connection': 2, 'The...
Name: artist_name, dtype: object

In [9]:
ARTISTS_GRAPH = nx.Graph()

# Build the artist co-occurrence graph one playlist at a time: every artist is
# a node, and two artists are linked when they share a playlist.
# `Series.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0.
for PID, artist_dict in artist_df_minimal.items():
    artist_array = list(artist_dict.items())

    # Register each artist of this playlist and record the playlist on the node
    for node_name, node_frequency in artist_array:
        if not ARTISTS_GRAPH.has_node(node_name):
            ARTISTS_GRAPH.add_node(node_name, PID=[PID])
            # Self-loop weighted by the artist's occurrences in this playlist minus one.
            # NOTE(review): the self-loop is only written the first time an
            # artist is seen; later playlists never update its weight, unlike
            # the pairwise edges below -- confirm this is intended.
            ARTISTS_GRAPH.add_edge(node_name, node_name, weight=node_frequency - 1)
        elif PID not in ARTISTS_GRAPH.nodes[node_name]['PID']:
            ARTISTS_GRAPH.nodes[node_name]['PID'].append(PID)

    # Link every unordered pair of artists in the playlist; the edge weight
    # accumulates the product of their occurrence counts across playlists
    for (left_name, left_frequency), (right_name, right_frequency) in combinations(artist_array, 2):
        pair_weight = left_frequency * right_frequency
        if ARTISTS_GRAPH.has_edge(left_name, right_name):
            ARTISTS_GRAPH[left_name][right_name]['weight'] += pair_weight
        else:
            ARTISTS_GRAPH.add_edge(left_name, right_name, weight=pair_weight)

In [10]:
# Check whether the whole artist graph forms a single connected component
nx.is_connected(ARTISTS_GRAPH)

False

In [11]:
# Keep only the largest connected component of the artist graph.
# max(..., key=len) picks it in a single pass instead of sorting every
# component, and returns the same component as the sorted-descending [0] did.
largest_component = max(nx.connected_components(ARTISTS_GRAPH), key=len)
artists_subgraph = nx.subgraph(ARTISTS_GRAPH, largest_component)
print(artists_subgraph)

Graph with 22565 nodes and 2727091 edges


In [14]:
fig, ax = plt.subplots(1, 1, figsize=(20, 18))
# draw_networkx computes a spring layout when no positions are given, and that
# layout starts from random positions; seeding it makes the figure reproducible.
# NOTE(review): the graph printed above has ~22k nodes and ~2.7M edges, so this
# cell is slow and node_size=1000 makes the nodes overlap heavily.
node_positions = nx.spring_layout(artists_subgraph, seed=42)
nx.draw_networkx(artists_subgraph, pos=node_positions, ax=ax, with_labels=False, node_size=1000)
ax.set_title("Largest connected component of the artist co-occurrence network")
plt.axis("off")
plt.show()