In [None]:
import pandas as pd
import numpy as np
from glob import iglob
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Locations of the input CSV files.
SUPERCHAT_GLOB = '../input/vtuber-livechat/superchats_*.csv'
CHANNELS_CSV = '../input/vtuber-livechat/channels.csv'

# Read every super chat CSV matched by the glob and stack them into one frame.
# Only the empty string is parsed as missing (pandas' default NA markers are
# disabled), so a literal such as "NA" or "null" survives as text.
superchat_frames = [
    pd.read_csv(csv_path, na_values='', keep_default_na=False)
    for csv_path in iglob(SUPERCHAT_GLOB)
]
superchats = pd.concat(superchat_frames, ignore_index=True)

# Character count of each message body; a missing body counts as length 0.
superchats['bodylength'] = superchats['body'].str.len().fillna(0).astype('int')

# Attach the channel metadata of the channel that received the super chat.
# Both frames carry a `channelId` column, so the merge suffixes them:
# `channelId_x` comes from the super chat frame, `channelId_y` from channels.csv.
channels = pd.read_csv(CHANNELS_CSV)
merged = superchats.merge(channels,
                          how='left',
                          left_on='originChannelId',
                          right_on='channelId')
merged = merged.drop(columns=['originChannelId', 'channelId_y'])
merged = merged.rename(columns={'channelId_x': 'authorChannelId'})

# Index the rows by parsed timestamp, in chronological order.
merged.index = pd.to_datetime(merged['timestamp'])
df = merged.sort_index()

# Discard every row with a missing value in any column. This includes rows
# whose body was empty in the CSV (read as NaN above) and rows that found no
# match in channels.csv (left join).
df.dropna(inplace=True)

# Optional: convert the index to JST (assumes tz-aware UTC timestamps - TODO confirm)
# df.index = df.index.tz_convert('Asia/Tokyo')
df.info()

In [None]:
# Super chats whose body announces a first-time super chat, either in Japanese
# (e.g. 初スパ, 初めてスーパ, 初のスパ) or in English (e.g. "first time super",
# "First-Time SC"). The pattern is searched anywhere inside the body.
first_time_sc_pattern = r'(?:初(?:めて|の)?スー?パ|[Ff]irst[-\s][Tt]ime\s(?:[Ss]uper|SC))'
mentions_first_time = df['body'].str.contains(first_time_sc_pattern)
ftsc = df[mentions_first_time]

In [None]:
# Number of first-time super chats per `name.en`.
# The key is turned back into a regular column and passed explicitly as `x`;
# leaving `x` implicit makes plotly fall back to the frame's index and title
# the axis "index" when the index name collides with a column name.
ftsc_per_name = ftsc.groupby('name.en')['body'].count().reset_index()

px.bar(ftsc_per_name,
       x='name.en',
       y='body',
       color='name.en',
       title='Who got first-time superchat the most?',
       labels={'body': 'Number of first-time SC', 'name.en': 'Name'}
).update_xaxes(categoryorder='total descending'  # tallest bar first
).update_layout(showlegend=False)  # the x axis already names every bar

In [None]:
# Per `name.en`: how many first-time super chats were sent (`count`) and by how
# many distinct authors (`nunique`).
fc = (
    ftsc.groupby('name.en')['authorChannelId']
        .agg(['count', 'nunique'])
        .add_prefix('authorChannelId_')
        .reset_index()
)

# An author can only have one genuine first time per name, so every message
# beyond the number of distinct authors is a repeated ("fake") first-time
# super chat. An author with k matching messages contributes k - 1 here.
fc['fakesc'] = fc['authorChannelId_count'] - fc['authorChannelId_nunique']

# The title describes what `fakesc` measures: surplus messages, not users.
px.bar(fc,
       x='name.en',
       y='fakesc',
       color='fakesc',
       labels={'fakesc': 'Number of <b>Fake First-time SC</b>', 'name.en': 'Name'},
       title='Number of fake (repeated) first-time super chats'
).update_xaxes(categoryorder='total descending'  # tallest bar first
).update_layout(showlegend=False)
