In [None]:
# DISCLAIMER: THIS WAS MY VERY FIRST "IN-DEPTH" DATASET ANALYSIS, SO TAKE IT FOR WHAT IT IS: ROOKIE WORK BY A WANNABE DATA ENGINEER.
# FEEL FREE TO LEAVE ANY CONSTRUCTIVE COMMENTS; I WILL WELCOME THEM AS A MEANS OF SELF-IMPROVEMENT!

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [None]:
# Dataset overview
# Load the raw music-sales table; every later cell works on this frame.
df = pd.read_csv("/kaggle/input/music-sales/musicdata.csv")
# Only the last expression of a cell is rendered, so a bare `df.shape` in the
# middle of the cell is silently discarded; print it to actually show it.
print(df.shape)
# Summary statistics for both numeric and non-numeric columns.
df.describe(include="all")

In [None]:
# Dataset exploration
# Comments:
# 1. Useless data => every value of the "number_of_records" column is equal to 1.0
# 2. Missing data => only 1351 non-null values in the "value_actual" column, versus 3008 in the other columns !
# 3. Unusual values => min(value_actual) < 0 ? How can a sales value be negative ?
# 4. format/metrics => What are the 3 different metrics ? And the 24 values ?

# Let's start by 4 in order to have a better understanding of our dataset
# Show the distinct values of both categorical columns, metrics first.
for column in ("metric", "format"):
    print(df[column].unique())


In [None]:
# Our question 4 has thus been answered:
## 3 metrics => Units (number of format sold), Value ($ earned), and Value (Adjusted) (constant 2019 $)
## 24 formats => Some are recent (streaming, CD...), some others are old and gone (8-track, SACD...).

# Additional comments:
# 5. format misspelling => "Paid Subscription" and "Paid SubscriptionS" both exist in the list of distinct format values
# 6. There should be a total of:
## total format * total metric * total year = (24-1) * 3 *(2019-1973+1) = 3243 values but the table overview showed only 3008 values...

# Let's dig deeper on comment 5
# Summarise only the rows recorded under the singular spelling.
df.loc[df["format"].eq("Paid Subscription")].describe(include="all")


In [None]:
# Summarise only the rows recorded under the plural spelling.
df.loc[df["format"].eq("Paid Subscriptions")].describe(include="all")

In [None]:
# Our question 5 has thus been answered:
## The difference between "Paid Subscription" and "Paid SubscriptionS" lies in the metrics: the former has both "Value" and "Value (Adjusted)", while the latter has "Units"

# Let's now investigate comment 6: why are there only 3008 values when there should be 3243 values ?
# Let's break down the number of values per metric and format
# For each categorical column (metrics first, then formats), print every
# distinct value followed by the number of rows carrying it and a blank line.
for column in ("metric", "format"):
    for label in df[column].unique():
        matching = df.loc[df[column] == label, column]
        print(label)
        print(matching.count())
        print()

In [None]:
# Our question 6 has thus been answered:
## The 235 missing data come from the metric "Units" of recent formats (streaming, synchronization...)
## This makes sense since these formats are hardly measurable in units

# Now let's go back to the first comments and investigate comment 2: what are the missing data in the "value_actual" column ?
# It's time to plot !
# One figure per format, with one line per metric, to see in which years each
# format actually has recorded values.
for fmt in df["format"].unique():
    # Filter the rows of this format once instead of rebuilding the same mask
    # for every x/y argument of every plot call.
    format_rows = df[df["format"] == fmt]

    fig, ax = plt.subplots()
    for metric in ("Units", "Value", "Value (Adjusted)"):
        metric_rows = format_rows[format_rows["metric"] == metric]
        ax.plot(metric_rows["year"], metric_rows["value_actual"], label=metric)

    ax.legend()
    ax.set_title(fmt)
    ax.set_xlabel("year")
    ax.set_ylabel("sales")
    plt.show()
    # Release the figure: the loop creates one per format and they would
    # otherwise all stay alive on backends that do not close them on show().
    plt.close(fig)
        


In [None]:
# Our question 2 has thus been answered:
## The missing data in the "value_actual" column come from years where the format either did not exist yet or had already disappeared.
## Example: "Synchronization" did not exist until 2009, hence the missing values.

# To end with data exploration, let's investigate comment 3: what are the negative values ?
# List every record whose reported value is below zero.
df.loc[df["value_actual"].lt(0)]

In [None]:
# Our question 3 has thus been answered:
## There are only a few such values and they are close to zero. We choose here to replace them all with zeros.

# DATASET CLEANING
# Comment 1 => Useless column.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been removed.
df = df.drop(columns=["number_of_records"], errors="ignore")

# Comment 2 => Missing data, and Comment 3 => negative values.
# Missing values become 0, then anything below 0 is raised to 0.
df["value_actual"] = df["value_actual"].fillna(0).clip(lower=0)

# Comment 5 => Misspelling: fold the plural spelling into the singular one.
df["format"] = df["format"].replace("Paid Subscriptions", "Paid Subscription")

In [None]:
# Let's make a pretty stacked bar chart to summarize !
# Reshape to one row per year and one column per format so each bar segment
# can be drawn on top of the running total of the formats drawn before it.
# Without `bottom=`, every bar starts at 0 and the formats overlap and hide one
# another instead of stacking.
adjusted_by_year = (
    df[df["metric"] == "Value (Adjusted)"]
    .pivot_table(index="year", columns="format", values="value_actual", aggfunc="sum")
    # Keep the formats in first-appearance order and keep every format in the
    # legend, even one with no "Value (Adjusted)" rows.
    .reindex(columns=df["format"].unique())
    .fillna(0)
)

fig, ax = plt.subplots(figsize=(15, 15))
stack_top = np.zeros(len(adjusted_by_year))  # current height of each year's stack
for fmt in adjusted_by_year.columns:
    heights = adjusted_by_year[fmt].to_numpy()
    ax.bar(adjusted_by_year.index, heights, bottom=stack_top, label=fmt)
    stack_top = stack_top + heights

ax.set_xlabel('year')
ax.set_ylabel('Value (Adjusted)')
ax.set_title('Value (adjusted) per format per year')
ax.legend()
plt.show()

In [None]:
# I love this chart because it clearly shows the whole History of recorded music in America:
## The rise and fall of many formats such as the tape, the vinyl, the CD...
## The industry valuation was at an all time high in the late 90s/early 00s thanks to the CD itself ! 
## But the dynamic changed when Napster (and piracy at large) came in... Within barely 10 years, the music industry lost 3/4 of its value !
## The digital downloads did not help much, and it really is streaming that helped the industry to recover.