In [1]:
from os.path import join

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
from sentence_transformers import SentenceTransformer

from operator import itemgetter

import zlib
import itertools
import re
import math

from tqdm import tqdm

# Data locations, expressed relative to the notebook's working directory.
data_dir = '../../data/'
raw_data_dir = join(data_dir, 'raw')

# Register tqdm's progress_* methods (e.g. progress_apply) on pandas objects.
tqdm.pandas()



In [2]:


def suffix_array(arr):
    """Return the start indices of the suffixes of ``arr`` in sorted order.

    Uses prefix doubling: suffixes are first ordered by their leading two
    symbols, then repeatedly re-ranked and re-sorted on a (rank, rank of the
    suffix ``span`` positions further on) key pair, doubling ``span`` each
    round until it reaches the input length.

    Args:
        arr: Sequence of hashable, mutually comparable symbols (a ``str``
            works). The input itself is not modified.

    Returns:
        list[int]: Suffix start positions, smallest suffix first. A suffix
        that is a prefix of another sorts before it.
    """
    n = len(arr)
    # Replace every symbol by its rank among the distinct symbols.
    symbol_rank = {sym: rank for rank, sym in enumerate(sorted(set(arr)))}
    ranks = [symbol_rank[sym] for sym in arr]
    ranks.append(-1)  # sentinel: "past the end" sorts below every real rank

    # Each entry is [suffix start, primary key, secondary key].
    entries = [[start, ranks[start], ranks[start + 1]] for start in range(n)]
    by_keys = itemgetter(1, 2)
    entries.sort(key=by_keys)

    position = [0] * n  # suffix start -> current position in `entries`
    span = 2
    while span < n:
        rank = 0
        last_primary = entries[0][1]
        for pos, entry in enumerate(entries):
            # A new rank starts whenever the (primary, secondary) pair changes.
            # For pos == 0 the neighbour lookup wraps to the last entry, which
            # can only shift every rank up by one and leaves the order intact.
            if entry[1] != last_primary or entries[pos - 1][2] != entry[2]:
                rank += 1
            # Remember the old primary key before it is overwritten.
            last_primary = entry[1]
            entry[1] = rank
            position[entry[0]] = pos
        for entry in entries:
            follower = entry[0] + span
            # Secondary key: fresh rank of the suffix `span` symbols later.
            entry[2] = entries[position[follower]][1] if follower < n else -1
        entries.sort(key=by_keys)
        span *= 2
    return [entry[0] for entry in entries]

def bwt_encode(data):
    """Return a Burrows-Wheeler-style transform of ``data``.

    The suffixes of ``data`` are ordered with ``suffix_array``; for each
    suffix, in that order, the symbol immediately before its start is
    emitted. For the suffix starting at index 0 the lookup uses index ``-1``
    and therefore wraps around to the last symbol.

    Args:
        data: Indexable sequence of strings, typically a ``str``.

    Returns:
        str: The preceding symbols concatenated in suffix-sorted order; an
        empty string for empty input.
    """
    # NOTE(review): no end-of-text marker is appended before sorting, so this
    # orders suffixes rather than rotations — confirm that is intended.
    return ''.join(data[start - 1] for start in suffix_array(data))


In [3]:
# Collect the raw JSON-lines dumps. glob returns paths in arbitrary order, so
# sort them to keep the concatenation order reproducible between runs.
files = sorted(glob.glob(join(raw_data_dir, "*.jsonl")))
print(f'# of files: {len(files)}')
# Pass the path itself instead of an open() handle: pandas then opens and
# closes each file on its own, and no handle is left dangling.
files = [pd.read_json(f, lines=True) for f in files]

# concat df
# The per-file row indices are kept, so index labels repeat across files.
df = pd.concat(files, axis=0)
df.info()


# of files: 24
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1852551 entries, 0 to 11099
Data columns (total 17 columns):
 #   Column           Dtype         
---  ------           -----         
 0   type             object        
 1   id               object        
 2   timestampUsec    datetime64[ns]
 3   rawMessage       object        
 4   authorName       object        
 5   authorChannelId  object        
 6   authorPhoto      object        
 7   isVerified       float64       
 8   isOwner          float64       
 9   isModerator      float64       
 10  message          object        
 11  membership       object        
 12  originVideoId    object        
 13  originChannelId  object        
 14  targetId         object        
 15  purchase         object        
 16  channelId        object        
dtypes: datetime64[ns](1), float64(3), object(13)
memory usage: 254.4+ MB


In [12]:
# transform
# Every frame below is built with a non-inplace method chain, so each one is a
# new object and is never a slice of df that gets mutated in place afterwards.
chat_df = (
    df[df["type"] == "addChatItemAction"]
    .drop(columns=[
        'type',
        'channelId',
        'targetId',
    ])
    .drop_duplicates('id')
    # remove old chat which has no origin info
    .dropna(subset=['originVideoId'])
    # sort by time
    .sort_values(by=['timestampUsec'])
)

# Author-level deletions; the acting channel id is renamed so it lines up with
# chat_df's authorChannelId for the merge at the end of this cell.
ban_df = (
    df.loc[df["type"] == "markChatItemsByAuthorAsDeletedAction",
           ['channelId', 'originVideoId', 'originChannelId']]
    .rename(columns={'channelId': 'authorChannelId'})
    .dropna(subset=['originVideoId'])
    .drop_duplicates()
)

# Single-message deletions, keyed by the id of the deleted chat item.
delete_actions_df = df.loc[
    df["type"] == "markChatItemAsDeletedAction",
    ['targetId', 'originVideoId', 'originChannelId']
]

# remove custom emojis
chat_df = chat_df.assign(
    message=chat_df['message'].replace(to_replace='<.+?>', value='', regex=True)
)

# remove chat with empty message
# NOTE(review): astype(bool) is True for NaN, so rows whose message is missing
# (as opposed to an empty string) pass this filter — confirm that is intended.
chat_df = chat_df[chat_df['message'].astype(bool)].copy()

# read_csv(squeeze=True) is deprecated; squeezing the single-column frame
# afterwards yields the same Series.
spam_excludes = pd.read_csv(join(data_dir, 'spam_exclusion.txt'),
                            header=None).squeeze("columns")

# # assume chat with flagged as deletion as spam
# chat_df['spam'] = chat_df['authorChannelId'].isin(spam_ids)

ban_df['marked'] = True
markedChatByModerators = pd.merge(chat_df, ban_df, on=['authorChannelId', 'originVideoId', 'originChannelId'])

In [13]:
merge_keys = ['authorChannelId', 'originVideoId', 'originChannelId']

# DataFrame.info() prints its own report and returns None, so it is called
# directly rather than wrapped in print(), which would add a stray "None".
chat_df.info()
# Inner join: number of chat rows that have a matching row in ban_df.
print(len(pd.merge(chat_df, ban_df, on=merge_keys)))
# Left join: every chat row is kept; columns that come only from ban_df are
# missing (NaN) on rows without a match.
markedChatByModerators = pd.merge(chat_df, ban_df, on=merge_keys, how='left')
markedChatByModerators.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1314386 entries, 237237 to 11099
Data columns (total 14 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   id               1314386 non-null  object        
 1   timestampUsec    1314386 non-null  datetime64[ns]
 2   rawMessage       1313772 non-null  object        
 3   authorName       1314181 non-null  object        
 4   authorChannelId  1314386 non-null  object        
 5   authorPhoto      1314386 non-null  object        
 6   isVerified       1314386 non-null  float64       
 7   isOwner          1314386 non-null  float64       
 8   isModerator      1314386 non-null  float64       
 9   message          1313772 non-null  object        
 10  membership       551260 non-null   object        
 11  originVideoId    1314386 non-null  object        
 12  originChannelId  1314386 non-null  object        
 13  purchase         6872 non-null     object        
dtyp

In [10]:
# Show the last five rows of the merged frame.
markedChatByModerators.tail(n=5)

Unnamed: 0,id,timestampUsec,rawMessage,authorName,authorChannelId,authorPhoto,isVerified,isOwner,isModerator,message,membership,originVideoId,originChannelId,purchase,marked
1319149,CjkKGkNOemtuXzJTci00Q0ZjZzNyUVlkMkhnTVBREhtDTk...,2021-01-22 08:43:45.270415,{'runs': [{'text': 'まだー３８痛い'}]},くろの。,UChdPR-yQm3nfyyC2Y2JGlCw,https://yt4.ggpht.com/ytc/AAUvwnh6riGnaM85bnqh...,0.0,0.0,0.0,まだー３８痛い,,x2k0FCEHNqM,UC1opHUrw8rvnsadT-iGp7Cg,,
1319150,CjkKGkNQajQwUDJTci00Q0Zid1RyUVlkeF93RFZREhtDTm...,2021-01-22 08:43:46.075821,{'runs': [{'text': 'BIG BRAIN'}]},Yisrine Yuuki,UCVuaf43_WWvhLyb6fwpUclw,https://yt4.ggpht.com/ytc/AAUvwnhzSFy0fA5y9VGb...,0.0,0.0,0.0,BIG BRAIN,,x2k0FCEHNqM,UC1opHUrw8rvnsadT-iGp7Cg,,
1319151,CkUKGkNJYV9oZi1Tci00Q0ZVS1B3Z0VkWG04RDNBEidDTm...,2021-01-22 08:43:49.033902,{'runs': [{'text': 'you got this Aqua'}]},Celestial Archon,UCMaTPWrw8p_Hm43l2IzcYTQ,https://yt4.ggpht.com/ytc/AAUvwngxxISK6xwI-GNG...,0.0,0.0,0.0,you got this Aqua,,x2k0FCEHNqM,UC1opHUrw8rvnsadT-iGp7Cg,,
1319152,CjkKGkNQQ0l2Zi1Tci00Q0ZiZ1VyUVlkSS00S1lBEhtDT1...,2021-01-22 08:43:49.944465,{'runs': [{'text': 'Sololive oniooon clutch'}]},Jslz cbbd,UChoODK7uJraJSpUjaBN0_zQ,https://yt4.ggpht.com/ytc/AAUvwnggslWMjvBC7d2-...,0.0,0.0,0.0,Sololive oniooon clutch,,x2k0FCEHNqM,UC1opHUrw8rvnsadT-iGp7Cg,,
1319153,CjoKGkNJckt2Zi1Tci00Q0Zjd0RyUVlkdl93RzRREhxDTz...,2021-01-22 08:43:49.952804,{'runs': [{'text': '速攻促進剤つかっても無理っぽかったな'}]},STINGER,UCMBqEKxkpI3htzdTckodWdw,https://yt4.ggpht.com/ytc/AAUvwnjXNaiw3ZUCPYLu...,0.0,0.0,0.0,速攻促進剤つかっても無理っぽかったな,,x2k0FCEHNqM,UC1opHUrw8rvnsadT-iGp7Cg,,


In [None]:
# Messages written by authors whose channel id appears in spam_ids.
# NOTE(review): in the visible notebook spam_ids is only assigned in the
# preprocessing cell further down, so this cell fails on a fresh
# top-to-bottom run — confirm the intended cell order.
spam_rows = chat_df.authorChannelId.isin(spam_ids)
chat_df['message'].loc[spam_rows]

In [None]:

# Rows labelled as spam, renumbered from zero.
spam_df = chat_df[chat_df['spam'] == 1.0].reset_index(drop=True)
# print(spam_df.head())
spam_df['timestampUsec'] = pd.to_datetime(spam_df['timestampUsec'], unit='us')


def calc_bwtrl(df):
    """Score one row of ``spam_df`` from its author's earlier messages.

    ``df`` is a single row (as handed over by ``apply(axis=1)``). The rows of
    the module-level ``spam_df`` that share its ``authorChannelId`` and
    ``originVideoId`` and whose ``timestampUsec`` is not later than the row's
    own are selected and cut to the first 100; their messages are passed as a
    list to ``accum_entropy`` and that call's result is returned.
    """
    same_author = spam_df['authorChannelId'] == df['authorChannelId']
    same_video = spam_df['originVideoId'] == df['originVideoId']
    not_later = spam_df['timestampUsec'] <= df['timestampUsec']
    hist = spam_df[same_author & same_video & not_later].head(100)
    return accum_entropy(hist['message'].to_list())


# `s` is an alias of spam_df, so the new column is added to spam_df itself.
s = spam_df
s['bwtrl'] = s.progress_apply(calc_bwtrl, axis=1)
s.head()

In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every expression statement in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"
pd.set_option("display.max_rows", 500)

# Rows whose bwtrl score is above 50, ordered by score and then author name.
high = s.loc[s['bwtrl'] > 50]
(
    high
    .sort_values(by=['bwtrl', 'authorName'])
    .loc[:, ['bwtrl', 'message', 'authorName', 'authorChannelId']]
)
# pd.Series(high['authorChannelId'].unique()).to_csv('highly_spam.csv')

In [None]:


# preprocessing

# delete redundant chat
# Filter with a boolean mask rather than drop(<index labels>): df is a plain
# concat of several files, so index labels repeat, and dropping by label would
# also remove unrelated rows that share a label with a deleted chat.
deletedChatIds = delete_actions_df['targetId']
chat_df = chat_df[~chat_df['id'].isin(deletedChatIds)]

# remove chat with empty message (mostly superchat?)
# for later processing in sentence encoding
# .copy() gives an independent frame for the column assignments below.
chat_df = chat_df[chat_df['message'].notnull()].copy()

# boolean features
chat_df['isMember'] = pd.notna(chat_df['membership'])
chat_df['isSuperchat'] = chat_df['purchase'].notna()
# fillna returns a new Series, so the result has to be assigned back. The flag
# columns are float64 (0.0 / 1.0), hence 0.0 as the "false" fill value.
for flag_col in ('isOwner', 'isVerified', 'isModerator'):
    chat_df[flag_col] = chat_df[flag_col].fillna(0.0)

# manually exclude wrongly flagged users
# The transform cell renames ban_df's 'channelId' to 'authorChannelId'; accept
# either spelling so this cell works with both layouts of ban_df.
ban_author_col = ('authorChannelId'
                  if 'authorChannelId' in ban_df.columns else 'channelId')
spam_ids = ban_df[ban_author_col]
# read_csv(squeeze=True) is deprecated; squeeze the one-column frame instead.
spam_excludes = pd.read_csv(join(data_dir, 'spam_exclusion.txt'),
                            header=None).squeeze("columns")
spam_ids = spam_ids[~spam_ids.isin(spam_excludes)]

# assume chat with flagged as deletion as spam
chat_df['spam'] = chat_df['authorChannelId'].isin(spam_ids).astype(
    'float64')

# count features
chat_df['authorLength'] = chat_df['authorName'].apply(lambda x: len(str(x)))
chat_df['messageLength'] = chat_df['message'].apply(lambda x: len(str(x)))
chat_df['messageUniqueness'] = chat_df['message'].apply(
    lambda x: len(set(str(x))))

# bwt rl entropy
# NOTE(review): this is the same formula as messageUniqueness (number of
# distinct characters), not a BWT/run-length measure — presumably a
# placeholder; confirm before relying on this feature.
chat_df['bwtRlEntropy'] = chat_df['message'].apply(
    lambda x: len(set(str(x))))

# encode message to embedding vector
# model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')
# message_array = chat_df['message'].to_list()
# embeds = model.encode(message_array, show_progress_bar=True)
# emb_columns = ['emb_' + str(i) for i in range(embeds.shape[1])]
# edf = pd.DataFrame(embeds, columns=emb_columns)
# chat_df[emb_columns] = edf

for category_col in ('isMember', 'isSuperchat', 'isOwner', 'isVerified',
                     'isModerator'):
    chat_df[category_col] = chat_df[category_col].astype('category')

# Keep only the engineered features and the label. errors='ignore' because
# 'type', 'channelId' and 'targetId' are already dropped by the transform cell,
# and a missing label would otherwise raise KeyError.
chat_df = chat_df.drop(columns=[
    'id', 'type', 'authorName', 'authorPhoto', 'authorChannelId',
    'membership', 'purchase', 'rawMessage', 'timestampUsec', 'channelId',
    'targetId', 'originChannelId', 'originVideoId', 'message'
],
                       errors='ignore')

# review
chat_df.info()
print(chat_df.head())

print(f"# of chat: {len(chat_df)}")
print(f"# of banActions: {len(ban_df)}")
print(f"# of deleteActions: {len(delete_actions_df)}")

markedAsSpam = chat_df[chat_df["spam"] == 1.0]
harmless = chat_df[chat_df["spam"] == 0.0]
print(f'# of spam: {len(markedAsSpam)}')
print(markedAsSpam.describe())
print(f'# of not spam: {len(harmless)}')
print(harmless.describe())
# Ratio of spam rows to non-spam rows; NaN when there are no non-spam rows.
print("spam ratio: ",
      len(markedAsSpam) / len(harmless) if len(harmless) else float('nan'))

# split data frame

# markedAsSpam = markedAsSpam.sort_values('authorName')
# target_col = ["authorName", "authorChannelId", "message"]
# markedAsSpam[target_col].to_csv(join(data_dir, 'spam.csv'))

# create dataset

chat_df.to_parquet(join(data_dir, 'train.parquet'))