What genre of music contains the most connected network?

This is the question that I will attempt to answer

Data:
The data comes from: 

First We Will Import the necessary packages

In [105]:
## Import Packages
import pandas as pd
import numpy as np
import networkx as nx
import nxviz as nv
import matplotlib.pyplot as plt
import json
import igraph as ig

Next we will read in the data.
The data consists of:

* An excel file that has information on the edges in the network

* A json file that has metadata information on the nodes in the dataset. The data contains genres that the nodes (users) liked


In [72]:
## Read in our dataframes with our edge information
df_ro = pd.read_csv("RO_edges.csv")
df_hr = pd.read_csv("HR_edges.csv")
df_hu = pd.read_csv("HU_edges.csv")



## Read in the json files from paths and then load them using the json.loads() function

json_list = ["RO_genres.json", "HR_genres.json", "HU_genres.json"]

with open(json_list[0], "r") as j:
    json1 = json.loads(j.read())
with open(json_list[1], "r") as j:
    json2 = json.loads(j.read())
with open(json_list[2], "r") as j:
    json3 = json.loads(j.read())

In [73]:
## Look at dataframe and datatypes
## Currently each edge relationship is denoted by two colummns with a node in each
print(df_ro.head())
print(df_ro.dtypes)

   node_1  node_2
0       0   37825
1       0   17224
2       0    6356
3       1   18053
4       1   31265
node_1    int64
node_2    int64
dtype: object


In [74]:
## Look at the metadata that is contained within the json file
print(list(json1.items())[:2])

[('0', ['Films/Games', 'Dance', 'Pop', 'International Pop', 'Techno/House', 'R&B', 'Contemporary R&B', 'Film Scores', 'Electro']), ('1', ['Reggae', 'Electro', 'Dance', 'Rap/Hip Hop', 'Metal', 'Pop', 'East Coast', 'Techno/House', 'Rock', 'Alternative', 'Hard Rock'])]


In [75]:
## Convert the jsons to dataframes

genre_ro = pd.DataFrame(list(json1.items()), columns = ["node", "genre_list"])
genre_hr = pd.DataFrame(list(json2.items()), columns = ["node", "genre_list"]) 
genre_hu = pd.DataFrame(list(json3.items()), columns = ["node", "genre_list"]) 

## Look at the structure
genre_ro.head()

Unnamed: 0,node,genre_list
0,0,"[Films/Games, Dance, Pop, International Pop, T..."
1,1,"[Reggae, Electro, Dance, Rap/Hip Hop, Metal, P..."
2,2,[Pop]
3,3,"[Dance, Rap/Hip Hop, Pop, Disco, Alternative, ..."
4,4,"[Dance, Rap/Hip Hop, Contemporary R&B, Pop, Di..."


In [76]:
## look at data types:
genre_ro.dtypes

node          object
genre_list    object
dtype: object

In [77]:
genre_ro["node"] = genre_ro["node"].astype(int)
genre_hr["node"] = genre_hr["node"].astype(int)
genre_hu["node"] = genre_hu["node"].astype(int)

##  make sure the node is an int

genre_ro.dtypes

node           int32
genre_list    object
dtype: object

In [78]:
## Data needs to be in a specific format to be easily read into a network. I will make

## Romanian Dataset
genre_ro["genre_list_dict"] = pd.DataFrame(genre_ro["genre_list"]).to_dict("records")
genre_ro["create_nodes"] = list(zip(genre_ro.node, genre_ro.genre_list_dict))

## Hungarian Dataset
genre_hu["genre_list_dict"] = pd.DataFrame(genre_hu["genre_list"]).to_dict("records")
genre_hu["create_nodes"] = list(zip(genre_hu.node, genre_hu.genre_list_dict))

## Croatian Dataset

genre_hr["genre_list_dict"] = pd.DataFrame(genre_hr["genre_list"]).to_dict("records")
genre_hr["create_nodes"] = list(zip(genre_hr.node, genre_hr.genre_list_dict))

In [79]:
## A column for the edge list will make working with the data more convenient
df_ro["edges"] = list(zip(df_ro.node_1, df_ro.node_2))
df_hr["edges"] = list(zip(df_hr.node_1, df_hr.node_2))
df_hu["edges"] = list(zip(df_hu.node_1, df_hu.node_2))

The information on the dataset said that there were 84 total genres that the nodes could have

In [80]:
## Initiate an empty list to store genres
list_of_genres = []
for row in genre_ro["genre_list_dict"]:
    for music_list in row.values():
        for item in music_list:
            if item not in list_of_genres:
                list_of_genres.append(item)

## 84 values are expected
len(list_of_genres)

84

Lets look at what genres are included in the networks:

In [81]:
list_of_genres[:5]

['Films/Games',
 'Dance',
 'Pop',
 'International Pop',
 'Techno/House',
 'R&B',
 'Contemporary R&B',
 'Film Scores',
 'Electro',
 'Reggae',
 'Rap/Hip Hop',
 'Metal',
 'East Coast',
 'Rock',
 'Alternative',
 'Hard Rock',
 'Disco',
 'Singer & Songwriter',
 'Trance',
 'Latin Music',
 'Indie Rock',
 'Dancefloor',
 'Indie Pop/Folk',
 'Indie Rock/Rock pop',
 'Dancehall/Ragga',
 'Indie Pop',
 'Country',
 'Rock & Roll/Rockabilly',
 'Blues',
 'Dubstep',
 'Electro Hip Hop',
 'Kids',
 'Chill Out/Trip-Hop/Lounge',
 'Electro Pop/Electro Rock',
 'Contemporary Soul',
 'Classical',
 'Vocal jazz',
 'Jazz',
 'Dub',
 'Soundtracks',
 'Comedy',
 'Alternative Country',
 'TV Soundtracks',
 'Musicals',
 'Folk',
 'Grime',
 'Asian Music',
 'Oldschool R&B',
 'Brazilian Music',
 'Spirituality & Religion',
 'Old school soul',
 'Game Scores',
 'Dirty South',
 'Soul & Funk',
 'Old School',
 'Modern',
 'West Coast',
 'Instrumental jazz',
 'Opera',
 'Urban Cowboy',
 'Sports',
 'Chicago Blues',
 'Bolero',
 'Baroque',


Lets determine the frequency of genres in each of the datasets 

In [82]:
country_genre_counts = pd.DataFrame()
country_genre_counts["category"] = list_of_genres
df_list = [genre_ro, genre_hu, genre_hr]
df_str = ["romanian_counts",  "hungarian_counts", "croatian_counts"]
index = 0
for df in df_list:
    genre_counts = {}
    for row in df["genre_list_dict"]:
        for music_list in row.values():
            for item in music_list:
                if item not in genre_counts.keys():
                    genre_counts[item] = 1
                else:
                    genre_counts[item] += 1
    country_genre_counts[df_str[index]] = genre_counts.values()
    index += 1

Sorted by Romanian Genres Counts:

In [83]:
country_genre_counts.sort_values(by  = ["romanian_counts"], ascending= False).head(10)

Unnamed: 0,category,romanian_counts,hungarian_counts,croatian_counts
2,Pop,33420,18620,12030
1,Dance,21208,21033,12111
10,Rap/Hip Hop,18849,17585,10436
13,Rock,16630,5236,9435
8,Electro,16550,3820,27101
14,Alternative,14966,1848,7192
5,R&B,11933,25201,25998
3,International Pop,11620,9458,13723
20,Indie Rock,7365,10181,5333
0,Films/Games,7353,22749,47303


Sorted by Hungarian Genre Counts:

In [84]:
country_genre_counts.sort_values(by = ["hungarian_counts"], ascending= False).head(10)

Unnamed: 0,category,romanian_counts,hungarian_counts,croatian_counts
7,Film Scores,6924,37889,20707
5,R&B,11933,25201,25998
0,Films/Games,7353,22749,47303
1,Dance,21208,21033,12111
2,Pop,33420,18620,12030
10,Rap/Hip Hop,18849,17585,10436
16,Disco,5841,13830,8926
17,Singer & Songwriter,5520,11134,3797
20,Indie Rock,7365,10181,5333
3,International Pop,11620,9458,13723


Sorted by Croatian Genre Counts:

In [85]:
country_genre_counts.sort_values(by = ["croatian_counts"], ascending= False).head(10)

Unnamed: 0,category,romanian_counts,hungarian_counts,croatian_counts
0,Films/Games,7353,22749,47303
8,Electro,16550,3820,27101
5,R&B,11933,25201,25998
4,Techno/House,6811,5158,21142
7,Film Scores,6924,37889,20707
11,Metal,4264,3765,19654
15,Hard Rock,3189,8878,15164
3,International Pop,11620,9458,13723
1,Dance,21208,21033,12111
2,Pop,33420,18620,12030


Interestingly, the top genres for all three countries are quite different. It will be cool to see if this has an impact on the connectedness of the networks, or if the most connected genre will still be consistent accross all of the networks. Romania enjoys more conventional genres, while Croatia and Hungary prefer movie/game scores as well as electronic or dance music

Now lets Create our Networks:

In [86]:
## initiate all networks
graph_ro = nx.Graph()
graph_hu = nx.Graph()
graph_cr = nx.Graph()

## Create romanian Network
graph_ro.add_nodes_from(genre_ro["create_nodes"], key =int)
graph_ro.add_edges_from(df_ro["edges"])

## Create hungarian Network
graph_hu.add_nodes_from(genre_hu["create_nodes"], key =int)
graph_hu.add_edges_from(df_hu["edges"])


## Create croatian network
graph_cr.add_nodes_from(genre_hr["create_nodes"], key =int)
graph_cr.add_edges_from(df_hr["edges"])


Lets look at the data contained within our network nodes:

In [101]:
list(graph_ro.nodes(data = True))[:2]

[(0,
  {'key': int,
   'genre_list': ['Films/Games',
    'Dance',
    'Pop',
    'International Pop',
    'Techno/House',
    'R&B',
    'Contemporary R&B',
    'Film Scores',
    'Electro']}),
 (1,
  {'key': int,
   'genre_list': ['Reggae',
    'Electro',
    'Dance',
    'Rap/Hip Hop',
    'Metal',
    'Pop',
    'East Coast',
    'Techno/House',
    'Rock',
    'Alternative',
    'Hard Rock']})]

Check the number of nodes and edges:

In [88]:
## Romanian Counts
print("Romanian Nodes: " + str(graph_ro.number_of_nodes()))
print("Romanian Edges: " + str(graph_ro.number_of_edges()))

## Croatian Counts

print("Croatian Nodes: " + str(graph_cr.number_of_nodes()))
print("Croatian Edges: " + str(graph_cr.number_of_edges()))

## Hungarian Counts

print("Hungarian Nodes: " + str(graph_hu.number_of_nodes()))
print("Hungarian Edges: " + str(graph_hu.number_of_edges()))

Romanian Nodes: 41773
Romanian Edges: 125826
Croatian Nodes: 54573
Croatian Edges: 498202
Hungarian Nodes: 47538
Hungarian Edges: 222887


The Croatian network is the largest and likely the most connected. Now the centrality measures should be examined.

In [103]:
## Romanian Centrality measures
degree_centrality_ro = nx.degree_centrality(graph_ro)
eigen_centrality_ro = nx.eigenvector_centrality(graph_ro)

## Croatian Centrality measures
degree_centrality_cr = nx.degree_centrality(graph_cr)
eigen_centrality_cr = nx.eigenvector_centrality(graph_cr)

## Hungarian Centrality measures
degree_centrality_hu = nx.degree_centrality(graph_hu)
eigen_centrality_hu = nx.eigenvector_centrality(graph_hu)

## In order to find betweenness centrality, we must convert the network to an igraph network. The algorithm that networkx uses is too slow for networks that are as large as the three that are being examined

igraph_ro = ig.Graph.from_networkx(graph_ro)
igraph_cr = ig.Graph.from_networkx(graph_cr)
igraph_hu = ig.Graph.from_networkx(graph_hu)


In [107]:
## Calculate beteenness centrality

betweenness_centrality_ro = igraph_ro.betweenness(directed = False)

In [108]:
betweenness_centrality_cr = igraph_cr.betweenness(directed = False)

In [109]:
betweenness_centrality_hu = igraph_hu.betweenness(directed =False)

In [112]:
## lets add centrality values to our genre dataframe

## Romanian Dataframe
genre_ro["degree_centrality"] = degree_centrality_ro.values()
genre_ro["eigenvector_centrality"] = eigen_centrality_ro.values()
genre_ro["betweenness_centrality"] = betweenness_centrality_ro
## Croation Dataframe
genre_hr["degree_centrality"] = degree_centrality_cr.values()
genre_hr["eigenvector_centrality"] = eigen_centrality_cr.values()
genre_hr["betweenness_centrality"] = betweenness_centrality_cr

## Hungarian Dataframe
genre_hu["degree_centrality"] = degree_centrality_hu.values()
genre_hu["eigenvector_centrality"] = eigen_centrality_hu.values()
genre_hu["betweenness_centrality"] = betweenness_centrality_hu

In [122]:
## Sort hungarian values by descending degree_centrality
genre_hu.sort_values(by = ["betweenness_centrality"], ascending= False)

Unnamed: 0,node,genre_list,genre_list_dict,create_nodes,degree_centrality,eigenvector_centrality,betweenness_centrality
14900,14900,"[Techno/House, Dance, Electro, Pop]","{'genre_list': ['Techno/House', 'Dance', 'Elec...","(14900, {'genre_list': ['Techno/House', 'Dance...",0.002356,3.175087e-02,8.877837e+06
40491,40491,"[Comedy, Dance, Rap/Hip Hop, Pop, R&B, Rock, E...","{'genre_list': ['Comedy', 'Dance', 'Rap/Hip Ho...","(40491, {'genre_list': ['Comedy', 'Dance', 'Ra...",0.002041,8.660385e-03,7.066625e+06
24218,24218,"[International Pop, Rap/Hip Hop, Latin Music, ...","{'genre_list': ['International Pop', 'Rap/Hip ...","(24218, {'genre_list': ['International Pop', '...",0.001851,7.635477e-03,5.833603e+06
14597,14597,"[Rap/Hip Hop, Pop]","{'genre_list': ['Rap/Hip Hop', 'Pop']}","(14597, {'genre_list': ['Rap/Hip Hop', 'Pop']})",0.001725,8.568430e-03,5.663767e+06
15724,15724,"[Films/Games, Dancefloor, International Pop, R...","{'genre_list': ['Films/Games', 'Dancefloor', '...","(15724, {'genre_list': ['Films/Games', 'Dancef...",0.001830,9.452062e-03,4.975937e+06
...,...,...,...,...,...,...,...
16573,16573,[Rap/Hip Hop],{'genre_list': ['Rap/Hip Hop']},"(16573, {'genre_list': ['Rap/Hip Hop']})",0.000021,7.779920e-07,0.000000e+00
41217,41217,"[Pop, Rock]","{'genre_list': ['Pop', 'Rock']}","(41217, {'genre_list': ['Pop', 'Rock']})",0.000021,9.109243e-07,0.000000e+00
6653,6653,"[International Pop, Rap/Hip Hop, Pop, R&B]","{'genre_list': ['International Pop', 'Rap/Hip ...","(6653, {'genre_list': ['International Pop', 'R...",0.000021,2.575258e-05,0.000000e+00
6666,6666,"[Films/Games, Indie Rock, R&B, International P...","{'genre_list': ['Films/Games', 'Indie Rock', '...","(6666, {'genre_list': ['Films/Games', 'Indie R...",0.000021,9.625089e-06,0.000000e+00
