In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.pyplot import figure
import seaborn as sns
import os


In [None]:
# Load the release data and summarise every column (numeric and non-numeric alike).
RELEASE_DATA_PATH = "/kaggle/input/discogs-database-all-release-data/release_data/release_data.csv"

df = pd.read_csv(RELEASE_DATA_PATH)
df.describe(include="all")

In [None]:
# Drop every row that contains a missing value, then list the distinct
# genres, formats and years that remain (one blank line between the lists).
df = df.dropna()
print(
    df["genre"].unique(),
    df["format"].unique(),
    df["year"].unique(),
    sep="\n\n",
)

In [None]:
# Prepare the tables to plot: number of releases per year / genre / country / format.

df["year"] = df["year"].astype(int)


def _ranked_release_counts(column):
    """Count releases per value of `column`, sorted from largest to smallest count."""
    per_value = df.groupby(column)["release_id"].count()
    return per_value.reset_index(name="count").sort_values(["count"], ascending=False)


df_year = df.groupby("year")["release_id"].count()  # Series indexed by year, not ranked
df_genre = _ranked_release_counts("genre")
df_country = _ranked_release_counts("country").head(10)  # 10 largest countries only
df_format = _ranked_release_counts("format").head(5)  # 5 largest formats only

In [None]:
# First look at the data: one bar chart per dimension (country, format, genre),
# followed by a line chart of the number of releases per year.
LARGE_FIGURE = {"figsize": (20, 20), "dpi": 80}

bar_chart_specs = [
    # (column to plot, counts table, create an explicit large figure first?)
    ("country", df_country, True),
    ("format", df_format, False),  # no explicit figure is created for this one
    ("genre", df_genre, True),
]

for column, counts, use_large_figure in bar_chart_specs:
    if use_large_figure:
        plt.figure(**LARGE_FIGURE)
    plt.bar(counts[column], counts["count"])
    plt.title(column)
    plt.show()

plt.plot(df_year)
plt.title("year")
plt.show()

In [None]:
# Inputs for the stacked bar charts showing how formats/genres evolve over time.
# There are many genres/formats, so only those above a minimum share of the
# releases are kept (both thresholds are arbitrary).
MIN_GENRE_SHARE = 0.03
MIN_FORMAT_SHARE = 0.01

# x axis: every year present in the data, ascending
years = sorted(df["year"].unique())

genre_cutoff = df_genre["count"].sum() * MIN_GENRE_SHARE
genres = df_genre.loc[df_genre["count"] > genre_cutoff, "genre"].to_list()

# df_format only holds the 5 largest formats, so the share is relative to those
format_cutoff = df_format["count"].sum() * MIN_FORMAT_SHARE
formats = df_format.loc[df_format["count"] > format_cutoff, "format"].to_list()

In [None]:
# ... now the y values: long-format tables with the number of releases
# per genre per year, and per format per year.
#
# Problem: many (category, year) pairs have no release at all (data is scarce
# for older years). Every category is therefore expanded to the full range of
# years and the gaps are filled with 0, so that all categories line up when
# they are stacked on top of each other.


def count_releases_per_year(releases, category):
    """Count releases per `category` value and per year, with explicit zeros.

    Parameters
    ----------
    releases : pd.DataFrame
        Must contain the columns `category`, "year" (integers) and "release_id".
    category : str
        Name of the column to break the counts down by ("genre", "format", ...).

    Returns
    -------
    pd.DataFrame
        Columns "year", `category` and "count_releases", with one row for each
        category value and each year between the first and the last year found
        in `releases`, ordered by category then by year. Years without any
        release for a category get a count of 0.
    """
    counts = releases.groupby([category, "year"])["release_id"].count()
    if counts.empty:
        # Nothing to expand: return an empty table with the expected columns.
        return pd.DataFrame(columns=["year", category, "count_releases"])

    year_level = counts.index.get_level_values("year")
    every_year = range(int(year_level.min()), int(year_level.max()) + 1)
    # Cartesian product (category value x year): reindexing on it creates the
    # missing rows in one vectorised step instead of a concat per category.
    full_index = pd.MultiIndex.from_product(
        [counts.index.get_level_values(category).unique(), every_year],
        names=[category, "year"],
    )
    return (
        counts.reindex(full_index, fill_value=0)
        .rename("count_releases")
        .reset_index()[["year", category, "count_releases"]]
    )


# Number of releases per genre per year...
df_genre_year = count_releases_per_year(df, "genre")

# ...and number of releases per format per year.
df_format_year = count_releases_per_year(df, "format")

In [None]:
# Time to plot some nice stacked bar charts !

colors = ['g', 'r', 'b', 'c', 'm', 'y', 'k', 'lime']


def plot_stacked_releases(counts_long, category_col, categories, label, palette=colors):
    """Draw a stacked bar chart of releases per year, one layer per category.

    Parameters
    ----------
    counts_long : pd.DataFrame
        Long-format table with the columns "year", `category_col` and
        "count_releases".
    category_col : str
        Column of `counts_long` holding the category of each row.
    categories : list
        Category values to draw, bottom layer first.
    label : str
        Word inserted in the y label and the title ("genre", "format", ...).
    palette : list
        Bar colours; reused cyclically when there are more categories than
        colours.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)
        The figure and axes that were drawn on.
    """
    fig, ax = plt.subplots(figsize=(15, 15))

    # Bars are positioned at the real year values. Positioning them at
    # categorical slots 0..n-1 with a fixed label list (what DataFrame.plot.bar
    # does) would make MultipleLocator(5) below attach the wrong year to each tick.
    year_axis = np.sort(counts_long["year"].astype(int).unique())
    stack_top = np.zeros(len(year_axis))

    for num, category in enumerate(categories):
        rows = counts_long[counts_long[category_col] == category]
        # Align on the shared year axis so every layer has the same length.
        heights = (
            pd.Series(
                rows["count_releases"].to_numpy(dtype=float),
                index=rows["year"].astype(int).to_numpy(),
            )
            .reindex(year_axis, fill_value=0.0)
            .to_numpy()
        )
        ax.bar(
            year_axis,
            heights,
            bottom=stack_top,
            color=palette[num % len(palette)],  # wrap around instead of IndexError
            label=category,
        )
        stack_top = stack_top + heights

    ax.set_xlabel("year")
    ax.set_ylabel(f"# of releases per {label}")
    ax.set_title(f"# of releases per {label} per year")
    ax.xaxis.set_major_locator(ticker.MultipleLocator(5))  # one tick every 5 years
    ax.legend()
    plt.show()
    return fig, ax


fig1, ax1 = plot_stacked_releases(df_genre_year, "genre", genres, "genre")
fig2, ax2 = plot_stacked_releases(df_format_year, "format", formats, "format")

In [None]:
# These graphs are sooooo interesting: we can see the rise and fall of formats and styles over time. Too bad we can't see the rise of streaming services...
# I'm actually surprised by the not-so-dominant position of hip-hop. I thought there would be waaaay more releases than that...
# The only reason I can think of is that mixtapes, a format beloved by both rookie and superstar rappers, don't usually have a physical release...

In [None]:
# Now let's see what's the dominant genre per country:
# count the releases per (genre, country), then keep for each country the
# genre with the highest count.
releases_per_genre_country = (
    df.groupby(["genre", "country"])["release_id"]
    .count()
    .reset_index(name="count_releases")
)

countries = releases_per_genre_country["country"].unique()

# idxmax returns the label of the first maximum of each group, so when several
# genres tie within a country the first row in (genre, country) order wins.
# sort=False keeps the countries in their order of first appearance.
top_row_labels = (
    releases_per_genre_country
    .groupby("country", sort=False)["count_releases"]
    .idxmax()
)

df_genre_country = (
    releases_per_genre_country.loc[top_row_labels]
    .reset_index(drop=True)
    .sort_values(by=["count_releases"], ascending=False)
)
df_genre_country.describe(include="all")

In [None]:
# Next step: a world map of the dominant genre per country, drawn with pygal.

import pygal
from pygal_maps_world.maps import World

# pygal identifies each country by a two-letter code: load the pygal country
# code table and attach the codes to our per-country table. The left join
# keeps the countries that have no match in the table.
country_code = pd.read_csv("/kaggle/input/pygal-table/pygal_table.csv", sep=";")
df_genre_country = df_genre_country.merge(country_code, on="country", how="left")

# Empty map; it is populated and rendered further down.
music_map = World()
music_map.title = 'Dominant music genre per country'

df_genre_country.head(10)

In [None]:
# The two biggest countries (by number of releases), US and UK, found no code
# in the pygal table, which lists them as "United States" and "United Kingdom".
# Their codes are set by hand here; countries still lacking a code are dropped afterwards.
manual_codes = {"US": "us", "UK": "gb"}

for country_name, pygal_code in manual_codes.items():
    df_genre_country.loc[df_genre_country["country"] == country_name, "code"] = pygal_code

df_genre_country.describe(include="all")

In [None]:
# 163 countries out of 278 have been found in the pygal table, not too bad !
# This number could be improved, but to keep things simple for now every
# country without a code is dropped.
has_code = df_genre_country["code"].notna()
df_genre_country = df_genre_country[has_code]

In [None]:
# Time to plot our world map: one series per dominant genre, holding the codes
# of the countries where that genre dominates, then write the map to an HTML file.

for genre in df_genre_country["genre"].unique():
    is_genre = df_genre_country["genre"] == genre
    music_map.add(genre, df_genre_country.loc[is_genre, "code"].tolist())

music_map.render_to_file('music_map.html')