# Hololive Insights

In [1]:
import numpy as np
import pandas as pd
from glob import iglob
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

ModuleNotFoundError: No module named 'plotly'

In [2]:
!conda install plotly

^C


In [None]:
# Load chat statistics, superchat statistics and channel metadata.
stats = pd.read_csv('../input/vtuber-livechat-elements/chat_stats.csv')
sc_stats = pd.read_csv('../input/vtuber-livechat-elements/superchat_stats.csv')
channels = pd.read_csv('../input/vtuber-livechat-elements/channels.csv')

# Attach superchat figures to each (channelId, period) row; the left join keeps
# chat rows that have no matching superchat record.
stats = pd.merge(stats, sc_stats, on=['channelId', 'period'], how='left')

# select only active Hololive-affiliated channels
# (.copy() detaches the filtered frame so the column assignment below is not a
# write into a slice of the frame that was read from disk)
channels = channels[(channels['affiliation'] == 'Hololive') & (channels['group'] != 'INACTIVE')].copy()
# Assign the filled column back explicitly: `fillna(..., inplace=True)` on a
# selected column is not guaranteed to update the parent frame.
channels['group'] = channels['group'].fillna('No Group')

# exclude official/secondary/graduated channels
officialChannels = [
    'UCJFZiqLMntJufDCHc6bQixg',
    'UCfrWoRGlawPQDQxxeIDRP0Q',
    'UCotXwY6s8pWmuWd_snKYjhg',
    'UCWsfcksUUpoEvhia0_ut0bA',
]
subChannels = [
    'UCHj_mh57PVMXhAUDphUQDFA',
    'UCLbtM3JZfRTg8v2KGag-RMw',
    'UCp3tgHXw_HI0QMk1K8qh3gQ',
]
graduated = [
    'UCS9uQI-jC3DE0L4IpXyvr6w'
]
channels = channels[~channels['channelId'].isin(officialChannels + subChannels + graduated)]

# Join the statistics onto the channel metadata. With how='left', a channel
# that has no statistics is kept with missing values in the stats columns.
channels = pd.merge(channels, stats, on=['channelId'], how='left')

# sex
# Groups whose name starts with 'Holostars' are labelled 'Male', all others 'Female'.
channels['sex'] = channels['group'].apply(lambda g: 'Male' if g.startswith('Holostars') else 'Female')

# language
def langmatch(channel):
    """Return the language label for a single channel record.

    Parameters
    ----------
    channel : mapping or pandas.Series
        Must provide a string under ``'group'`` and a value under
        ``'englishName'``.

    Returns
    -------
    str
        ``'English'`` when the group starts with ``'English'`` or the channel's
        English name is ``'IRyS'``; ``'Indonesian'`` when the group starts with
        ``'Indonesia'``; ``'Japanese'`` otherwise.
    """
    group = channel['group']
    # The English check comes first, so it wins over the Indonesian check.
    is_english = group.startswith('English') or channel['englishName'] == 'IRyS'
    if is_english:
        return 'English'
    return 'Indonesian' if group.startswith('Indonesia') else 'Japanese'
# Label every row with its language (row-wise because langmatch reads two columns).
channels['language'] = channels.apply(langmatch, axis=1)

# aggregate data
# Collapse the rows of each channel into one: counts are summed, per-row unique
# user figures are averaged, and channel-level attributes take their first value.
# NOTE(review): 'last' keeps only the last non-null averageSC row per channel —
# confirm that this is the intended all-time figure.
overall = (
    channels
    .groupby('englishName')
    .agg(dict(
        subscriptionCount='first',
        videoCount='first',
        chats='sum',
        uniqueChatters='mean',
        bannedChatters='sum',
        deletedChats='sum',
        superChats='sum',
        uniqueSuperChatters='mean',
        totalSC='sum',
        averageSC='last',
        affiliation='first',
        group='first',
        name='first',
        sex='first',
        language='first',
    ))
    .reset_index()
    # total chats divided by the mean unique-chatter figure
    .assign(chatCountPerUser=lambda frame: frame['chats'] / frame['uniqueChatters'])
)

In [None]:
# Channels ranked by subscriber count; bar colour encodes the video count.
px.bar(
    data_frame=overall.sort_values('subscriptionCount', ascending=False),
    x='englishName',
    y='subscriptionCount',
    color='videoCount',
    hover_name='name',
    hover_data=['videoCount', 'group'],
    title='Most Subscribed Channels',
    labels=dict(
        subscriptionCount='# of Subscribers',
        videoCount='# of Videos',
        englishName='Name',
        group='Group',
    ),
)

In [None]:
# Subscribers per video. The divisor is floored at 1 so that a channel with zero
# videos yields its subscriber count instead of a division by zero. Computed
# column-wise; a missing video count propagates as NaN.
overall['subPerVideo'] = overall['subscriptionCount'] / overall['videoCount'].clip(lower=1)
px.bar(overall.sort_values(by='subPerVideo', ascending=False),
       x='englishName',
       y='subPerVideo',
       color='videoCount',
       hover_name='name',
       hover_data=['videoCount', 'group'],
       labels={
           'subscriptionCount': '# of Subscribers',
           'videoCount': '# of Videos',
           'englishName': 'Name',
           'group': 'Group',
           'subPerVideo': 'AASS'
       },
       title='Average Acquired Subscribers per Stream (AASS)')

In [None]:
# Same subscriber ranking, with bars coloured by the channel's group.
px.bar(
    data_frame=overall.sort_values('subscriptionCount', ascending=False),
    x='englishName',
    y='subscriptionCount',
    color='group',
    hover_name='name',
    hover_data=['videoCount'],
    title='Most Subscribed Channels per Group',
    labels=dict(
        subscriptionCount='# of Subscribers',
        videoCount='# of Videos',
        englishName='Name',
        group='Group',
    ),
)

In [None]:
# Treemap of subscriber counts: outer tiles are groups, inner tiles are channels.
px.treemap(
    data_frame=overall,
    path=['group', 'name'],
    values='subscriptionCount',
)

In [None]:
# Channels ranked by number of videos; bar colour encodes the subscriber count.
px.bar(
    data_frame=overall.sort_values('videoCount', ascending=False),
    x='englishName',
    y='videoCount',
    color='subscriptionCount',
    hover_name='name',
    hover_data=['videoCount', 'subscriptionCount', 'group'],
    title='Most Active Channels in terms of Number of Videos',
    labels=dict(
        subscriptionCount='# of Subscribers',
        videoCount='# of Videos',
        englishName='Name',
        group='Group',
    ),
)

In [None]:
# Total chats per channel; colour encodes chats per user, and each bar is
# annotated with its value in abbreviated form.
(
    px.bar(
        data_frame=overall.sort_values('chats', ascending=False),
        x='englishName',
        y='chats',
        color='chatCountPerUser',
        hover_name='name',
        title='Live Chat Intensity (All-time)',
        labels=dict(
            chatCountPerUser='Average chats per user',
            videoCount='# of Videos',
            englishName='Name',
            chats='# of Chat',
        ),
    )
    .update_layout(xaxis=dict(categoryorder='total descending'))
    .update_traces(texttemplate='%{y:.2s}', textposition='outside')
)

In [None]:
# Banned chatters per channel, ranked; colour encodes the channel's chat volume.
# log_y=True makes the axis match the title, which announces a log-scaled y-axis.
px.bar(overall.sort_values(by='bannedChatters', ascending=False),
       x='englishName',
       y='bannedChatters',
       log_y=True,
       color='chats',
       hover_name='name',
       hover_data=['chats', 'subscriptionCount'],
       labels={
           'chats': "# of Chats",
           'subscriptionCount': '# of Subscribers',
           'videoCount': '# of Videos',
           'englishName': 'Name',
           'bannedChatters': '# of Banned Chatters',
       },
       title='Ban Events (y-axis is log-scaled)'
)

In [None]:
# Subscribers against banned chatters (log-scaled y-axis) with an OLS trendline.
px.scatter(
    data_frame=overall,
    x='subscriptionCount',
    y='bannedChatters',
    color='subscriptionCount',
    log_y=True,
    trendline='ols',
    hover_name='name',
    hover_data=['videoCount'],
    title='Correlation between # of Subscriptions and Ban Events',
    labels=dict(
        subscriptionCount='# of Subscribers',
        englishName='Name',
        bannedChatters='# of Banned Chatters',
    ),
)

Channels in the lower-right corner (many subscribers but few banned chatters) are considered to be less "trolled".

In [None]:
# Subscribers against total chats with an OLS trendline; colour encodes chats per user.
px.scatter(
    data_frame=overall,
    x='subscriptionCount',
    y='chats',
    color='chatCountPerUser',
    trendline='ols',
    hover_name='name',
    hover_data=['videoCount'],
    title='Correlation between # of Subscriptions and Chats',
    labels=dict(
        subscriptionCount='# of subscribers',
        chatCountPerUser='avg. # of chats per user',
        englishName='Name',
        chats='# of chats',
    ),
)

Channels above the trendline receive more chats than their subscriber count alone would predict.

In [None]:
# For every group, show the channel with the highest uniqueChatters value:
# sort descending, then keep the first row of each group.
(
    overall
    .sort_values(by='uniqueChatters', ascending=False)
    .groupby('group')
    .head(1)
    .reset_index(drop=True)
)

In [None]:
# Top channel of each group by uniqueChatters (first row per group after a
# descending sort).
groupTops = overall.sort_values('uniqueChatters', ascending=False).groupby('group').head(1).reset_index(drop=True)
# Subscribers against unique chatters for those channels. Each point is labelled
# with the channel's English name (passed through custom_data) and its abbreviated
# uniqueChatters value.
px.scatter(groupTops,
       x='subscriptionCount',
       y='uniqueChatters',
       color='chatCountPerUser',
       trendline='ols',
       hover_name='name',
       hover_data=['uniqueChatters'],
       text='uniqueChatters',
       labels={
           'uniqueChatters': '<b>Unique Chatters (UC)</b>',
           # balanced <b>…</b> markup (no stray closing tag)
           'chatCountPerUser': 'Avg. <b>Chats</b> per User',
           'subscriptionCount': '<b>Subscribers</b>',
           'englishName': 'Name',
           'group': 'Group',
           'chats': '# of <b>chats</b>'
       },
       custom_data=['englishName'],
       title='How many people are joining live chat?'
).update_traces(texttemplate='%{customdata[0]} <b>%{text:.2s}</b> UC', textposition='top center'
).update_layout(uniformtext_minsize=2, uniformtext_mode='hide'
)

Channels above the trendline attract more unique chatters than their subscriber count alone would predict.

In [None]:
# Total chats against total superchat amount with an OLS trendline;
# colour encodes the subscriber count.
px.scatter(
    data_frame=overall,
    x='chats',
    y='totalSC',
    color='subscriptionCount',
    trendline='ols',
    hover_name='name',
    hover_data=['totalSC', 'uniqueChatters'],
    title='Correlation between # of chats and total amount of superchats',
    labels=dict(
        uniqueChatters='# of unique users',
        chatCountPerUser='avg. # of chats per user',
        subscriptionCount='# of subscribers',
        englishName='Name',
        group='Group',
        chats='# of chats',
        totalSC='Total amount (JPY)',
    ),
)

In [None]:
# Channels ranked by total superchat amount; colour encodes the number of superchats.
px.bar(
    data_frame=overall.sort_values('totalSC', ascending=False),
    x='englishName',
    y='totalSC',
    color='superChats',
    hover_name='name',
    hover_data=['chats', 'subscriptionCount'],
    title='Most superchatted channels',
    labels=dict(
        chats='# of <b>Chats</b>',
        subscriptionCount='# of <b>Subscribers</b>',
        videoCount='# of <b>Videos</b>',
        englishName='Name',
        superChats='# of <b>Super Chats</b>',
        totalSC='Total amount (JPY)',
    ),
)

In [None]:
# Split channels into three equal-sized quantile bins of total superchat amount:
# once with readable labels (shown as a dimension) and once as integer bin
# indices (used for colouring).
overall['qScTotalJPY'] = pd.qcut(
    overall['totalSC'],
    q=3,
    labels=['Lower', 'Medium', 'Higher'],
)
overall['qScTotalJPY_N'] = pd.qcut(overall['totalSC'], q=3, labels=False)

# Parallel-categories view of sex -> language -> income level -> group;
# the colour bar is hidden.
px.parallel_categories(
    data_frame=overall.sort_values('group'),
    dimensions=['sex', 'language', 'qScTotalJPY', 'group'],
    color='qScTotalJPY_N',
    color_continuous_scale=px.colors.sequential.Inferno,
    title='Superchat <b>income distribution</b>',
    labels=dict(
        totalSC='Total amount (JPY)',
        qScTotalJPY='Income Level',
        language='Language',
        sex='Sex',
        group='Group',
    ),
).update_layout(coloraxis_showscale=False)

# Monthly Trend

In [None]:
# Per-(period, group) statistics for periods from '2021-03' onwards
# (string comparison on the period column).
monthly = (
    channels
    .loc[channels['period'] >= '2021-03']
    .groupby(['period', 'group'])
    .agg(dict(
        chats=['sum', 'mean', 'median'],
        uniqueChatters=['mean', 'median'],
        deletedChats=['sum', 'mean', 'median'],
        superChats=['sum', 'mean', 'median'],
        uniqueSuperChatters=['mean', 'median'],
        totalSC=['sum', 'mean', 'median'],
        affiliation='max',
    ))
)
# Flatten the two-level (column, statistic) header into names such as 'chats_sum'.
monthly.columns = list(map("_".join, monthly.columns.to_flat_index()))

In [None]:
# Sector name -> the group labels that belong to it. Rows whose group is not
# listed under any sector are left out of the sector totals.
sectorMap = {
    'Hololive': ['1st Generation', '2nd Generation', '3rd Generation', '4th Generation', '5th Generation', 'GAMERS', 'No Group'],
    'Holostars': ['Holostars 1st Gen', 'Holostars 2nd Generation', 'Holostars 3rd Gen'],
    'HoloEN': ['English (Myth)', 'English (Council)'],
    'HoloID': ['Indonesia 1st Gen', 'Indonesia 2nd Gen', 'Indonesia 3rd Gen']
}

# The period filter is the same for every sector, so it is computed once.
recent = channels['period'] >= '2021-03'

# Build one per-period aggregate per sector and concatenate them in a single
# step. DataFrame.append is deprecated and no longer exists in pandas 2.x, and
# growing a frame inside a loop copies it on every iteration.
sector_frames = []
for k, v in sectorMap.items():
    aggsec = channels[recent & (channels['group'].isin(v))].groupby('period').agg({
        'chats': 'sum',
        'uniqueChatters': 'mean',
        'bannedChatters': 'sum',
        'deletedChats': 'sum',
        'superChats': 'sum',
        'uniqueSuperChatters': 'mean',
        'totalSC': 'sum',
    }).reset_index()
    aggsec['sector'] = k
    sector_frames.append(aggsec)
sector = pd.concat(sector_frames, ignore_index=True)

In [None]:
# Chats per period, coloured by sector, with one x-axis tick per month.
px.bar(
    data_frame=sector,
    x='period',
    y='chats',
    color='sector',
    title='<b>Chats</b> intensity',
    labels=dict(
        chats='Total number of <b>chats</b>',
        sector='Sector',
        period='Period',
    ),
).update_xaxes(dtick='M1')

In [None]:
# Superchat total per period and sector on a log-scaled y-axis; markers are
# joined by lines, with one x-axis tick per month.
(
    px.scatter(
        data_frame=sector,
        x='period',
        y='totalSC',
        color='sector',
        log_y=True,
        title='Monthly <b>super chats</b> income',
        labels=dict(
            totalSC='Total amount of <b>super chats</b>',
            sector='Sector',
            period='Period',
        ),
    )
    .update_xaxes(dtick='M1')
    .update_traces(mode='markers+lines')
)

Note that YouTube takes roughly 30% of the total superchat income.