# Preparation

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load only the columns the notebook needs from each CSV.
chat_columns = ['id', 'channelId', 'originVideoId', 'body']
ban_columns = ['channelId', 'originVideoId']
deletion_columns = ['id', 'retracted']

# The chat log is capped at its first 10,000,000 rows.
chat_df = pd.read_csv(
    '../input/vtuber-livechat/chats_2021-07.csv',
    usecols=chat_columns,
    nrows=10000000,
)
ban_df = pd.read_csv(
    '../input/vtuber-livechat/ban_events.csv',
    usecols=ban_columns,
)
deletion_df = pd.read_csv(
    '../input/vtuber-livechat/deletion_events.csv',
    usecols=deletion_columns,
)

# Events

## Ban Events

In [None]:
# Flag every chat whose (channelId, originVideoId) pair appears in a ban event.
ban_df['banned'] = True

# De-duplicate the ban keys first: if the same pair were banned more than once,
# a left merge would emit one copy of each matching chat row per ban event.
chat_df = pd.merge(
    chat_df,
    ban_df.drop_duplicates(),
    on=['channelId', 'originVideoId'],
    how='left',
)

# After the left merge the column is True for matches and NaN otherwise, so
# notna() yields a clean boolean flag. Assigning the result back replaces the
# chained `chat_df['banned'].fillna(False, inplace=True)`, which operates on an
# intermediate Series and no longer updates the frame under Copy-on-Write.
chat_df['banned'] = chat_df['banned'].notna()
chat_df.describe()

In [None]:
# Show ten randomly chosen chats that were flagged as banned.
banned_chats = chat_df[chat_df['banned']]
banned_chats.sample(n=10)

## Deletion Events

In [None]:
# Flag every chat whose id appears in a non-retracted deletion event.
deletion_df['deleted'] = True

# Keep only events with retracted == 0.
# NOTE(review): presumably `retracted` marks messages removed by their own
# author rather than by a moderator - confirm against the dataset description.
# Ids are de-duplicated so the left merge cannot multiply chat rows.
deleted_ids = (
    deletion_df.loc[deletion_df['retracted'] == 0, ['id', 'deleted']]
    .drop_duplicates('id')
)

# Join explicitly on `id` instead of relying on merge's inference of the
# shared column names.
chat_df = pd.merge(chat_df, deleted_ids, on='id', how='left')

# True for matched rows, NaN otherwise -> notna() gives a boolean flag.
# Assigning back replaces the chained inplace fillna, which acts on an
# intermediate Series and no longer updates the frame under Copy-on-Write.
chat_df['deleted'] = chat_df['deleted'].notna()
chat_df.describe()

In [None]:
# Show ten randomly chosen chats that were flagged as deleted.
deleted_chats = chat_df[chat_df['deleted']]
deleted_chats.sample(n=10)

# Plotting Sentence Vectors

In [None]:
%pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Instantiate the SentenceTransformer identified by this model name; the
# cells below call its encode() method on the sampled chat bodies.
model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')

In [None]:
n_sample = 1000
# Fixed seed so the sampled chats and the t-SNE layout are reproducible
# across kernel restarts.
seed = 42

# Draw equally sized samples of deleted chats and of chats that were neither
# banned nor deleted. Missing bodies (NaN from empty CSV cells) are dropped
# first so that only strings are passed to the encoder.
toxic = (
    chat_df.query('deleted')['body']
    .dropna()
    .sample(n_sample, random_state=seed)
    .to_list()
)
safe = (
    chat_df.query('not (banned or deleted)')['body']
    .dropna()
    .sample(n_sample, random_state=seed)
    .to_list()
)

toxic_embeds = model.encode(toxic)
safe_embeds = model.encode(safe)

# Project both groups together into 2-D. Rows keep the stacking order:
# the first n_sample rows are the deleted chats, the remainder the safe ones.
tsne = TSNE(
    n_components=2,
    perplexity=40,
    random_state=seed,
).fit_transform(np.vstack([toxic_embeds, safe_embeds]))

In [None]:
# Scatter the 2-D t-SNE coordinates, one trace per group. The first n_sample
# rows of `tsne` belong to the deleted chats and the rest to the ordinary
# chats; the chat text is attached to each marker via `text`.
fig = go.Figure()
trace_specs = [
    ('Deleted Chat', tsne[:n_sample], toxic, 'OrangeRed'),
    ('Chat', tsne[n_sample:], safe, 'CornflowerBlue'),
]
for label, points, hover_text, colour in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=points[:, 0],
            y=points[:, 1],
            mode='markers',
            name=label,
            text=hover_text,
            marker=dict(color=colour, size=4),
        )
    )
fig.update_layout(title='Sentence vector distribution')