# K-Pop Dashboard Project

## Import libraries

In [None]:
from dash import Dash, dcc, html, Output, Input
import dash_bootstrap_components as dbc
import numpy as np
import pandas as pd
# from plotly_calplot import calplot
import plotly.express as px
from summarytools import dfSummary

## Exploratory data analysis

### Load dataset

In [None]:
# Data retrieved from https://www.kaggle.com/datasets/nicolsalayoarias/all-kpop-idols/?select=kpopidolsv3.csv
# Load the raw idol records; the relative path means the CSV must sit in the working directory
data = pd.read_csv('kpopidolsv3.csv')
# Preview the first rows to sanity-check the load
data.head()

### Data cleaning

In [None]:
# Narrow the raw data down to the identity, group, date, origin and gender
# columns that the rest of the notebook works with
df = data.loc[
    :,
    [
        "Full Name",
        "Group",
        "Former Group",
        "Date of Birth",
        "Debut",
        "Country",
        "Gender",
    ],
]

# Summarise the selected columns before any cleaning
dfSummary(df)

In [None]:
# Drop rows where Full Name is missing (unable to identify) and rows whose
# Debut is the placeholder date (not yet debuted).
# `.copy()` gives an independent frame, so the column assignments below write
# to df itself rather than to a slice of another frame.
df = df.loc[df["Full Name"].notna() & (df["Debut"] != "0/01/1900")].copy()

# Convert columns to required datatypes
# NOTE(review): depending on the pandas version, astype(str) may turn missing
# values into the literal string "nan" - confirm how downstream cells treat it
df = df.astype({"Full Name": str, "Group": str, "Former Group": str, "Country": str, "Gender": str})
df["Date of Birth"] = pd.to_datetime(df["Date of Birth"], format="%d/%m/%Y")
df["Debut"] = pd.to_datetime(df["Debut"], format="%d/%m/%Y")

# Filter for idols from 2nd Generation onwards (debuted in or after 2005).
# An explicit Timestamp avoids relying on how the string "01/01/2005" would be
# parsed; rows whose Debut is NaT fail the comparison and are dropped as well.
df = df.loc[df["Debut"] >= pd.Timestamp(2005, 1, 1)].copy()

# Show the resulting column dtypes
df.dtypes

In [None]:
# Summarise df again, this time after the cleaning step above
dfSummary(df)

### Feature engineering

In [None]:
# Derive the year and month parts from both date columns
for label, date_column in (("Debut", "Debut"), ("Birth", "Date of Birth")):
    date_parts = df[date_column].dt
    df[f"{label} Year"] = date_parts.year
    df[f"{label} Month"] = date_parts.month

# Age at debut, taken as the difference between the two calendar years
df["Debut Age"] = df["Debut Year"] - df["Birth Year"]

In [None]:
# The derived columns hold NaN wherever the source date was missing, which
# blocks a cast to int; use 0 as the stand-in value and then cast
engineered_columns = ["Debut Year", "Debut Month", "Birth Year", "Birth Month", "Debut Age"]
df[engineered_columns] = df[engineered_columns].fillna(0).astype(int)

# Confirm the resulting dtypes
df.dtypes

## Data visualisation

### Idol birthday analysis

1. Calendar plot: idol birthday distribution

In [None]:
# TODO: group by birthday for the calendar plot (not implemented yet)

2. Bar plot: idol birth year distribution (all / male / female)

In [None]:
# Part 1: Group by birth year on all idols
# Missing birth dates were filled with 0 during feature engineering. Exclude
# that sentinel by value instead of slicing off the first group by position,
# which would discard a real year whenever no row carries the sentinel.
bday2 = (
    df.loc[df['Birth Year'] != 0]
    .groupby('Birth Year')['Full Name']
    .count()
    .rename('Idol count')
)

In [None]:
# Bar chart of idol counts per birth year, shaded by the count itself
fig_bday2 = px.bar(
    bday2,
    x=bday2.index,
    y='Idol count',
    color='Idol count',
    title='Birth year distribution of K-Pop idols',
)

# Transparent backgrounds and one tick per year, applied in a single update
fig_bday2.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    xaxis={'tickmode': 'linear', 'tick0': 1, 'dtick': 1},
)

fig_bday2.show()

In [None]:
# Part 2: Group by birth year and gender on all idols
# Rows whose birth year is the 0 sentinel (missing birth date) are excluded by
# value; the hard-coded row positions used previously only match one exact
# snapshot of the dataset.
bday2a = (
    df.loc[df['Birth Year'] != 0]
    .groupby(['Gender', 'Birth Year'])['Full Name']
    .count()
    .reset_index(name='Idol count')  # flat frame: Gender, Birth Year, Idol count
)

In [None]:
# Grouped bar chart: one bar per gender for every birth year
fig_bday2a = px.bar(
    bday2a,
    x='Birth Year',
    y='Idol count',
    color='Gender',
    barmode='group',
    color_discrete_sequence=px.colors.sequential.Purp,
    title='Birth year distribution of K-Pop idols by gender',
)

# Transparent backgrounds; x axis ticks at every year
fig_bday2a.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    xaxis_tickmode='linear',
    xaxis_tick0=1,
    xaxis_dtick=1,
)

fig_bday2a.show()

3. Bar plot: idol birth month distribution (all / male / female)

In [None]:
# Part 1: Group by birth month on all idols
# Month 0 is the stand-in written for missing birth dates. Filter it out by
# value rather than dropping the first group by position, so that January is
# never lost when no row carries the sentinel.
bday3 = (
    df.loc[df['Birth Month'] != 0]
    .groupby('Birth Month')['Full Name']
    .count()
    .rename('Idol count')
)

In [None]:
# Bar chart of idol counts per birth month, shaded by the count itself
fig_bday3 = px.bar(
    bday3,
    x=bday3.index,
    y='Idol count',
    color='Idol count',
    title='Birth month distribution of K-Pop idols',
)

# Transparent backgrounds; label month numbers 1-12 with short month names
fig_bday3.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    xaxis={
        'tickmode': 'array',
        'tickvals': list(range(1, 13)),
        'ticktext': ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
    },
)

fig_bday3.show()

In [None]:
# Part 2: Group by birth month and gender on all idols
# Exclude the month-0 sentinel (missing birth date) by value; the previous
# hard-coded row positions depended on the exact group layout of the dataset.
bday3a = (
    df.loc[df['Birth Month'] != 0]
    .groupby(['Gender', 'Birth Month'])['Full Name']
    .count()
    .reset_index(name='Idol count')  # flat frame: Gender, Birth Month, Idol count
)

In [None]:
# Grouped bar chart: one bar per gender for every birth month
fig_bday3a = px.bar(
    bday3a,
    x='Birth Month',
    y='Idol count',
    color='Gender',
    barmode='group',
    color_discrete_sequence=px.colors.sequential.Purp,
    title='Birth month distribution of K-Pop idols by gender',
)

# Transparent backgrounds; month numbers are shown as short month names
fig_bday3a.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    xaxis_tickmode='array',
    xaxis_tickvals=list(range(1, 13)),
    xaxis_ticktext=["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
)

fig_bday3a.show()

### Debut analysis

1. Calendar plot: debut anniversary distribution

In [None]:
# TODO: group by Debut Month for the calendar plot (not implemented yet)

2. Bar plot: debut anniversary distribution by month

In [None]:
# Group by Debut Month and exclude rows with placeholder / invalid dates
# The cleaning step keeps only rows with a real Debut on or after 2005, so no
# row can carry the month-0 sentinel. Dropping the first group by position
# therefore removed the earliest real month; filter on valid month numbers
# instead so that all twelve months are kept.
debut2 = (
    df.loc[df['Debut Month'].between(1, 12)]
    .groupby('Debut Month')['Full Name']
    .count()
    .rename('Idol count')
)

In [None]:
# Bar chart of idol counts per debut month, shaded by the count itself
fig_debut2 = px.bar(
    debut2,
    x=debut2.index,
    y='Idol count',
    color='Idol count',
    title='Debut month distribution of K-Pop idols',
)

# Transparent backgrounds; label month numbers 1-12 with short month names
fig_debut2.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    xaxis={
        'tickmode': 'array',
        'tickvals': list(range(1, 13)),
        'ticktext': ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
    },
)

fig_debut2.show()

3. Bubble plot: debut age distribution (all / male / female)

4. Bar plot: number of debuted idols per year

In [None]:
# Number of idols per debut year: count the Full Name entries within each
# Debut Year group and label the resulting series 'Idol count'
debut4 = df.groupby('Debut Year')['Full Name'].count().rename('Idol count')

In [None]:
# Bar chart of how many idols debuted in each year, shaded by the count itself
fig_debut4 = px.bar(
    debut4,
    x=debut4.index,
    y='Idol count',
    color='Idol count',
    title='Number of debuted K-Pop idols per year',
)

# Transparent backgrounds and one tick per year, applied in a single update
fig_debut4.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    xaxis={'tickmode': 'linear', 'tick0': 1, 'dtick': 1},
)

fig_debut4.show()

5. Bar plot: number of debuted groups per year

In [None]:
# Group by Debut Year and count the distinct group names per year
# The cleaning step casts Group with astype(str), which can turn a missing
# group into the literal string 'nan'. Exclude it so that idols without a
# group do not add a phantom group to every year in which one of them debuted.
debut5 = (
    df.loc[df['Group'] != 'nan']
    .groupby('Debut Year')['Group']
    .nunique()
    .rename('Group count')
)

In [None]:
# Bar chart of how many distinct groups debuted in each year
fig_debut5 = px.bar(
    debut5,
    x=debut5.index,
    y='Group count',
    color='Group count',
    title='Number of debuted K-Pop groups per year',
)

# Transparent backgrounds; x axis ticks at every year
fig_debut5.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    xaxis_tickmode='linear',
    xaxis_tick0=1,
    xaxis_dtick=1,
)

fig_debut5.show()

### Meta-analysis

1. Pie chart: distribution of idols by gender

In [None]:
# Idol head-count per gender, stored as a series named 'Idol count'
meta1 = df.groupby('Gender')['Full Name'].count().rename('Idol count')

In [None]:
# Pie chart of the idol gender split.
# meta1 is renamed to 'Idol count' when it is built, so that is the only value
# column available to plotly; the old 'Full Name' reference no longer matches
# any column. This also brings the call in line with the other charts.
fig_meta1 = px.pie(
    meta1,
    values='Idol count',
    names=meta1.index,
    title='Gender ratio of K-Pop idols',
    color_discrete_sequence=px.colors.sequential.Purp,
)

fig_meta1.show()

2. Bar plot: number of members per group

In [None]:
# Step 1: number of distinct member names in every group
members_per_group = df.groupby('Group')['Full Name'].nunique()

# Step 2: how many groups share each member count. The final (largest) bucket
# is dropped.
# NOTE(review): presumably that bucket is the pseudo-group of idols with no
# group name - confirm against the data
meta2 = (
    members_per_group.groupby(members_per_group.values)
    .count()
    .iloc[:-1]
    .rename('Group count')
    .rename_axis('Number of members')
)

In [None]:
# Bar chart of how many groups have each member count
fig_meta2 = px.bar(
    meta2,
    x=meta2.index,
    y='Group count',
    color='Group count',
    title='Number of members per K-Pop group',
)

# Transparent backgrounds and one tick per member count
fig_meta2.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    xaxis={'tickmode': 'linear', 'tick0': 1, 'dtick': 1},
)

fig_meta2.show()

3. Pie chart: idols' country of origin distribution

In [None]:
# Idol head-count per country
meta3 = df.groupby('Country')['Full Name'].count().rename('Idol count')

# Countries with fewer than 3 idols all receive the shared label
# 'Other countries'; the relabelled index is named for display in the chart
meta3.index = pd.Index(
    np.where(meta3 < 3, 'Other countries', meta3.index),
    name='Country of origin',
)

In [None]:
# Pie chart of idols by country of origin, drawn with the Purpor colour scale
fig_meta3 = px.pie(
    meta3,
    names=meta3.index,
    values='Idol count',
    color_discrete_sequence=px.colors.sequential.Purpor,
    title='Country of origin ratio of K-Pop idols',
)

fig_meta3.show()

## Dashboard

### Dash app setup

In [None]:
# Create the Dash application, styled with the PULSE theme from dash-bootstrap-components
app = Dash(__name__, external_stylesheets=[dbc.themes.PULSE])

### Dashboard layout and callback

### Run dashboard 

In [None]:
if __name__ == '__main__':
    # Newer Dash releases replace `run_server` with `run`; prefer `run` and
    # fall back to `run_server` only on older versions that lack it
    start_dashboard = getattr(app, 'run', None) or app.run_server
    # Start the development server with debug mode enabled
    start_dashboard(debug=True)