# Hololive Insights

In [None]:
import numpy as np
import pandas as pd
from glob import iglob
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Raw inputs: chat stats, superchat stats and channel metadata.
stats = pd.read_csv('../input/vtuber-livechat/chat_stats.csv')
sc_stats = pd.read_csv('../input/vtuber-livechat/superchat_stats.csv')
channels = pd.read_csv('../input/vtuber-livechat/channels.csv')

# select only active Hololive-affiliated channels
is_active_hololive = (channels['affiliation'] == 'Hololive') & (channels['group'] != 'INACTIVE')
# .copy() detaches the filtered rows from the raw frame, so the column
# assignment below writes to an independent frame rather than a slice.
channels = channels[is_active_hololive].copy()
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form is chained assignment, which triggers
# warnings and has no effect once pandas copy-on-write is enabled.
channels['group'] = channels['group'].fillna('No Group')

# Channel IDs that are left out of the analysis, grouped by reason:
# official, secondary and graduated channels.
officialChannels = [
    'UCJFZiqLMntJufDCHc6bQixg',
    'UCfrWoRGlawPQDQxxeIDRP0Q',
    'UCotXwY6s8pWmuWd_snKYjhg',
    'UCWsfcksUUpoEvhia0_ut0bA',
]
subChannels = [
    'UCHj_mh57PVMXhAUDphUQDFA',
    'UCLbtM3JZfRTg8v2KGag-RMw',
    'UCp3tgHXw_HI0QMk1K8qh3gQ',
]
graduated = ['UCS9uQI-jC3DE0L4IpXyvr6w']

# Keep only the rows whose channelId is in none of the three lists.
excluded_ids = [*officialChannels, *subChannels, *graduated]
is_excluded = channels['channelId'].isin(excluded_ids)
channels = channels.loc[~is_excluded]

# merge stats columns
# Left join: every chat-stats row is kept; rows with no matching superchat
# row get NaN in the superchat columns.
stats_all = stats.merge(sc_stats, how='left', on=['channelId', 'period'])

# Zero-fill every numeric column and cast it to integer.
numeric_columns = stats_all.select_dtypes(include=['number']).columns
filled_numeric = stats_all[numeric_columns].fillna(0)
stats_all[numeric_columns] = filled_numeric.astype('int')

# Attach the merged stats rows to each remaining channel.
channels = channels.merge(stats_all, how='left', on=['channelId'])

# sex
# Groups whose name starts with 'Holostars' are labelled 'Male', all others 'Female'.
channels['sex'] = [
    'Male' if group_name.startswith('Holostars') else 'Female'
    for group_name in channels['group']
]

# language
def langmatch(channel):
    """Return the language label for one channel row.

    Args:
        channel: Mapping-like row providing the keys 'group' and 'name.en'.

    Returns:
        'English' when the group starts with 'English' or the English name
        is 'IRyS'; 'Indonesian' when the group starts with 'Indonesia';
        'Japanese' otherwise.
    """
    group_name = channel['group']
    is_english = group_name.startswith('English') or channel['name.en'] == 'IRyS'
    if is_english:
        return 'English'
    return 'Indonesian' if group_name.startswith('Indonesia') else 'Japanese'
# Classify every row; axis='columns' hands langmatch one row at a time.
channels['language'] = channels.apply(langmatch, axis='columns')

# aggregate data
# One output row per English channel name. Counts are summed, unique-user
# columns are averaged, and channel-level attributes take their first value.
# NOTE(review): 'scMeanJPY' keeps only the last row's value per channel rather
# than an overall mean - confirm this is intended.
overall_agg_spec = {
    'subscriptionCount': 'first',
    'videoCount': 'first',
    'chatCount': 'sum',
    'chatNunique': 'mean',
    'banCount': 'sum',
    'banNunique': 'mean',
    'deletionCount': 'sum',
    'scCount': 'sum',
    'scNunique': 'mean',
    'scTotalJPY': 'sum',
    'scMeanJPY': 'last',
    'affiliation': 'first',
    'group': 'first',
    'name': 'first',
    'sex': 'first',
    'language': 'first',
}
overall = channels.groupby('name.en').agg(overall_agg_spec).reset_index()

# Total chats divided by the averaged unique-chatter count.
overall['chatCountPerUser'] = overall['chatCount'].div(overall['chatNunique'])

In [None]:
# Channels ranked by subscriber count; bar colour encodes the number of videos.
subscribers_ranked = overall.sort_values(by='subscriptionCount', ascending=False)
subscribers_labels = {
    'subscriptionCount': '# of Subscribers',
    'videoCount': '# of Videos',
    'name.en': 'Name',
    'group': 'Group',
}
px.bar(
    subscribers_ranked,
    title='Most Subscribed Channels',
    x='name.en',
    y='subscriptionCount',
    color='videoCount',
    hover_name='name',
    hover_data=['videoCount', 'group'],
    labels=subscribers_labels,
)

In [None]:
# Channels ranked by subscriber count; bar colour encodes the group.
px.bar(overall.sort_values(by='subscriptionCount', ascending=False),
       x='name.en',
       y='subscriptionCount',
       color='group',
       hover_name='name',
       hover_data=['videoCount'],
       # Label keys must be the actual column names of `overall`; the previous
       # keys ('sub_count', 'video_count', 'name_en') matched no column, so the
       # axis and hover labels fell back to the raw column names.
       labels={
           'subscriptionCount': '# of Subscribers',
           'videoCount': '# of Videos',
           'name.en': 'Name',
           'group': 'Group',
       },
       title='Most Subscribed Channels per Group')

In [None]:
# Treemap sized by subscriber count: groups on the outer level, channels inside.
px.treemap(
    data_frame=overall,
    path=['group', 'name'],
    values='subscriptionCount',
)

In [None]:
# Channels ranked by number of videos; bar colour encodes the subscriber count.
videos_ranked = overall.sort_values(by='videoCount', ascending=False)
videos_labels = {
    'subscriptionCount': '# of Subscribers',
    'videoCount': '# of Videos',
    'name.en': 'Name',
    'group': 'Group',
}
px.bar(
    videos_ranked,
    title='Most Active Channels in terms of Number of Videos',
    x='name.en',
    y='videoCount',
    color='subscriptionCount',
    hover_name='name',
    hover_data=['videoCount', 'subscriptionCount', 'group'],
    labels=videos_labels,
)

In [None]:
# Total chats per channel; bar colour encodes the average chats per user.
chats_ranked = overall.sort_values(by='chatCount', ascending=False)
chat_intensity_fig = px.bar(
    chats_ranked,
    title='Live Chat Intensity (2021-01 to 2021-07)',
    x='name.en',
    y='chatCount',
    color='chatCountPerUser',
    hover_name='name',
    labels={
        'chatCountPerUser': 'Average chats per user',
        'videoCount': '# of Videos',
        'name.en': 'Name',
        'chatCount': '# of Chat',
    },
)
# Order the categories by bar total and print each bar's value above it.
chat_intensity_fig.update_layout(xaxis=dict(categoryorder='total descending'))
chat_intensity_fig.update_traces(texttemplate='%{y:.2s}', textposition='outside')

In [None]:
# Ban events per channel on a log-scaled y-axis; bar colour encodes total chats.
bans_ranked = overall.sort_values(by='banCount', ascending=False)
bans_labels = {
    'chatCount': "# of Chats",
    'subscriptionCount': '# of Subscribers',
    'videoCount': '# of Videos',
    'name.en': 'Name',
    'banCount': '# of Ban',
}
px.bar(
    bans_ranked,
    title='Ban Events (y-axis is log-scaled)',
    x='name.en',
    y='banCount',
    log_y=True,
    color='chatCount',
    hover_name='name',
    hover_data=['chatCount', 'subscriptionCount'],
    labels=bans_labels,
)

In [None]:
# Subscribers (x) against ban events (log-scaled y) with an OLS trendline.
px.scatter(
    data_frame=overall,
    title='Correlation between # of Subscriptions and Ban Events',
    x='subscriptionCount',
    y='banCount',
    log_y=True,
    color='subscriptionCount',
    trendline='ols',
    hover_name='name',
    hover_data=['videoCount'],
    labels={
        'subscriptionCount': '# of Subscribers',
        'name.en': 'Name',
        'banCount': '# of Ban',
    },
)

Channels placed in the lower right corner are considered to be less "trolled".

In [None]:
# Subscribers (x) against total chats (y) with an OLS trendline;
# marker colour encodes the average chats per user.
px.scatter(
    data_frame=overall,
    title='Correlation between # of Subscriptions and Chats',
    x='subscriptionCount',
    y='chatCount',
    color='chatCountPerUser',
    trendline='ols',
    hover_name='name',
    hover_data=['videoCount'],
    labels={
        'subscriptionCount': '# of subscribers',
        'chatCountPerUser': 'avg. # of chats per user',
        'name.en': 'Name',
        'banCount': '# of ban',
        'chatCount': '# of chats',
    },
)

Channels above the trendline receive more chats than their subscriber count alone would predict.

In [None]:
# For each group, show the channel with the highest chatNunique.
(
    overall
    .sort_values('chatNunique', ascending=False)
    .groupby('group')
    .head(1)
    .reset_index(drop=True)
)

In [None]:
# Top channel of each group by chatNunique: sort descending, then keep the
# first row per group.
groupTops = overall.sort_values('chatNunique', ascending=False).groupby('group').head(1).reset_index(drop=True)
px.scatter(groupTops,
       x='subscriptionCount',
       y='chatNunique',
       color='chatCountPerUser',
       trendline='ols',
       hover_name='name',
       hover_data=['chatNunique'],
       text='chatNunique',
       labels={
           'chatNunique': '<b>Unique Chatters</b>',
           # The stray closing </b> that used to end this label was removed so
           # the markup is balanced.
           'chatCountPerUser': 'Avg. <b>Chats</b> per User',
           'subscriptionCount': '<b>Subscribers</b>',
           'name.en': 'Name',
           'group': 'Group',
           'banCount': '# of <b>ban</b>',
           'chatCount': '# of <b>chats</b>'
       },
       # customdata[0] feeds the channel name into the text template below.
       custom_data=['name.en'],
       title='How many people are joining the live chat?'
).update_traces(texttemplate='%{customdata[0]} <b>%{text:.2s}</b> UC', textposition='top center'
).update_layout(uniformtext_minsize=2, uniformtext_mode='hide'
)

Channels above the trendline attract more unique chatters than their subscriber count alone would predict.

In [None]:
# Total chats (x) against total superchat amount (y) with an OLS trendline;
# marker colour encodes the subscriber count.
superchat_scatter_labels = {
    'chatNunique': '# of unique users',
    'chatCountPerUser': 'avg. # of chats per user',
    'subscriptionCount': '# of subscribers',
    'name.en': 'Name',
    'group': 'Group',
    'banCount': '# of ban',
    'chatCount': '# of chats',
    'scTotalJPY': 'Total amount (JPY)',
}
px.scatter(
    data_frame=overall,
    title='Correlation between # of chats and total amount of superchats',
    x='chatCount',
    y='scTotalJPY',
    color='subscriptionCount',
    hover_name='name',
    hover_data=['scTotalJPY', 'chatNunique'],
    trendline='ols',
    labels=superchat_scatter_labels,
)

In [None]:
# Channels ranked by total superchat amount; bar colour encodes the superchat count.
superchat_ranked = overall.sort_values(by='scTotalJPY', ascending=False)
superchat_bar_labels = {
    'chatCount': '# of <b>Chats</b>',
    'subscriptionCount': '# of <b>Subscribers</b>',
    'videoCount': '# of <b>Videos</b>',
    'name.en': 'Name',
    'scCount': '# of <b>Super Chats</b>',
    'banCount': '# of <b>Ban</b>',
    'scTotalJPY': 'Total amount (JPY)',
}
px.bar(
    superchat_ranked,
    title='Most superchatted channels',
    x='name.en',
    y='scTotalJPY',
    color='scCount',
    hover_name='name',
    hover_data=['chatCount', 'subscriptionCount'],
    labels=superchat_bar_labels,
)

In [None]:
# Bucket channels into three quantile-based superchat tiers, stored twice:
# as readable labels (used as a plot dimension) and as integer bin codes
# (used to drive the colour scale).
income_tier_names = ['Lower', 'Medium', 'Higher']
overall['qScTotalJPY'] = pd.qcut(overall['scTotalJPY'], q=3, labels=income_tier_names)
overall['qScTotalJPY_N'] = pd.qcut(overall['scTotalJPY'], q=3, labels=False)

income_fig = px.parallel_categories(
    overall.sort_values(by="group"),
    title='Superchat <b>income distribution</b>',
    dimensions=['sex', 'language', 'qScTotalJPY', 'group'],
    color='qScTotalJPY_N',
    color_continuous_scale=px.colors.sequential.Inferno,
    labels={
        'scTotalJPY': 'Total amount (JPY)',
        'qScTotalJPY': 'Income Level',
        'language': 'Language',
        'sex': 'Sex',
        'group': 'Group',
    },
)
# Hide the colour bar for the numeric tier codes.
income_fig.update_layout(coloraxis_showscale=False)

# Monthly Trend

In [None]:
# Per-period, per-group summary restricted to periods from '2021-03' onwards.
count_stats = ['sum', 'mean', 'median']
unique_stats = ['mean', 'median']
recent_rows = channels[channels['period'] >= '2021-03']
monthly = recent_rows.groupby(['period', 'group']).agg({
    'chatCount': count_stats,
    'chatNunique': unique_stats,
    'banCount': count_stats,
    'banNunique': unique_stats,
    'deletionCount': count_stats,
    'scCount': count_stats,
    'scNunique': unique_stats,
    'scTotalJPY': count_stats,
    'affiliation': 'max',
})
# Flatten the (column, statistic) pairs into single names joined by '_'.
monthly.columns = ['_'.join(pair) for pair in monthly.columns.to_flat_index()]

In [None]:
# Maps each sector name to the `group` values that belong to it.
# NOTE(review): the group names mix 'Gen' and 'Generation' spellings - confirm
# they match the values in the `group` column exactly, since isin() is an
# exact match.
sectorMap = {
    'Hololive': ['1st Generation', '2nd Generation', '3rd Generation', '4th Generation', '5th Generation', 'GAMERS', 'No Group'],
    'Holostars': ['Holostars 1st Gen', 'Holostars 2nd Generation', 'Holostars 3rd Gen'],
    'HoloEN': ['English 1st Gen'],
    'HoloID': ['Indonesia 1st Gen', 'Indonesia 2nd Gen', 'Indonesia 3rd Gen']
}

# Build one per-period summary frame per sector and concatenate them once.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
sector_frames = []
for k, v in sectorMap.items():
    aggsec = channels[(channels['period'] >= '2021-03') & (channels['group'].isin(v))].groupby('period').agg({
        'chatCount': 'sum',
        'chatNunique': 'mean',
        'banCount': 'sum',
        'banNunique': 'mean',
        'deletionCount': 'sum',
        'scCount': 'sum',
        'scNunique': 'mean',
        'scTotalJPY': 'sum',
    }).reset_index()
    # Tag every row with the sector it was aggregated for.
    aggsec['sector'] = k
    sector_frames.append(aggsec)
sector = pd.concat(sector_frames, ignore_index=True)

In [None]:
# Chat count per period as bars coloured by sector.
sector_chats_fig = px.bar(
    data_frame=sector,
    title='<b>Chats</b> intensity',
    x='period',
    y='chatCount',
    color='sector',
    labels={
        'chatCount': 'Total number of <b>chats</b>',
        'sector': 'Sector',
        'period': 'Period',
    },
)
# dtick='M1' sets the x-axis tick step.
sector_chats_fig.update_xaxes(dtick='M1')

In [None]:
# Superchat total per period and sector, drawn as connected markers.
sector_income_fig = px.scatter(
    data_frame=sector,
    title='Monthly <b>super chats</b> income',
    x='period',
    y='scTotalJPY',
    color='sector',
    labels={
        'scTotalJPY': 'Total amount of <b>super chats</b>',
        'sector': 'Sector',
        'period': 'Period',
    },
)
sector_income_fig.update_xaxes(dtick='M1')
sector_income_fig.update_traces(mode='markers+lines')

Note that YouTube takes roughly 30% of the total income from superchats.