# YouTube Comment Cleaning and Exploration

Takes data from a CSV file and cleans the data to isolate comments of interest

In [1]:
import re
import pandas as pd
from fugashi import Tagger
from pathlib import Path

In [2]:
input_path = Path('Resources/youtube_comments.csv')
comments_df = pd.read_csv(input_path, index_col=0)

comments_df.head(10)

Unnamed: 0,channel,video_id,category_id,text,date_published,comment_type
0,mwamjapan,Jb6Zlg30rgk,10,This season is going to be a masterpiece <3,2023-04-16T15:39:10Z,top-level
1,mwamjapan,Jb6Zlg30rgk,10,Every season is masterpiece 🔥🔥🔥,2023-04-17T16:41:52Z,reply
2,mwamjapan,Jb6Zlg30rgk,10,@HM cry about it,2023-04-17T16:15:19Z,reply
3,mwamjapan,Jb6Zlg30rgk,10,@HM dude demon slayer has no story but it has ...,2023-04-17T15:44:29Z,reply
4,mwamjapan,Jb6Zlg30rgk,10,@HM 🙂🙃😒😒😒,2023-04-17T13:44:05Z,reply
5,mwamjapan,Jb6Zlg30rgk,10,Honestamente no entiendo porqué esta canción r...,2023-04-17T03:58:25Z,top-level
6,mwamjapan,Jb6Zlg30rgk,10,​@みっくん𓈒𓂂◌𝙼𝙸𝚈𝚄 llora pues,2023-04-17T17:15:35Z,reply
7,mwamjapan,Jb6Zlg30rgk,10,たかがOPで世界観が壊れるアニメじゃないわはげたこ,2023-04-17T16:00:09Z,reply
8,mwamjapan,Jb6Zlg30rgk,10,悩めば、イイ！,2023-04-17T14:53:19Z,reply
9,mwamjapan,Jb6Zlg30rgk,10,@みっくん𓈒𓂂◌𝙼𝙸𝚈𝚄 ニワカほどそう言うよね！,2023-04-17T13:11:13Z,reply


In [3]:
# RE patterns needed for JP text
hiragana = r'\u3041-\u3096'
katakana = r'\u30A1-\u30F6'
kanji = r'\u3006\u4E00-\u9FFF'

# Matches to test for existence of text that uses a Japanese script
jp_text = rf'[{hiragana}{katakana}{kanji}]'

# Accepted characters
accepted_char = rf'[a-zA-Z0-9!?()\u30FC\u3005「」、。{hiragana}{katakana}{kanji}]'

In [4]:
# Remove @username from replies
comments_df['text'] = comments_df['text'].str.replace(r'@\S+\s', '', regex=True)

In [5]:
# Filter out comments that don't have any Japanese characters
has_jp_char = comments_df.loc[comments_df['text'].str.contains(jp_text, regex=True)]

has_jp_char.sample(10)

Unnamed: 0,channel,video_id,category_id,text,date_published,comment_type
90,緑仙 / Ryushen,Sis3y7l8G98,24,めちゃめちゃ面白かった！素敵な企画ありがとうございました🙏🏻🙏🏻お疲れ様です！,2023-04-09T11:01:55Z,top-level
65,ベーコン家のポテとひだり,1FK4rh7iOWs,15,こういう動画が伸びて欲しい。チャーは温かい家庭に引き取られて幸せやね。捨て犬、保護犬引き取っ...,2023-04-14T05:06:55Z,top-level
33,東海オンエア,ogfbE7ymLRU,23,結婚直前の人が言うと説得力ありますよね。,2023-04-16T07:40:42Z,reply
18,Cateen かてぃん,HSaYwPfx0oc,10,ホセとのgroovyなコラボ最高！ノリノリになりました！\nチャンネル登録120万人おめでと...,2023-04-16T02:58:33Z,top-level
94,THE RAMPAGE from EXILE TRIBE,cnxtRkvWG-M,10,クオリティが想像を超えすぎて毎回毎回びっくりさせられる👀.′.′\nランペはほんと凄いよ…🥺...,2023-04-12T14:28:33Z,top-level
139,プレチャン 【プレミアリーグ情報チャンネル】,j9gZfR3j6cU,17,ABEMAのコメ欄はより一層荒れるんやろうけど、正直レオザの解説楽しみ,2023-04-13T13:41:10Z,top-level
92,中町綾チャンネル,H69zvy56BPg,24,綾ちゃん見てたら嫌なことも忘れてまた頑張ろってなる😮,2023-04-15T11:07:10Z,top-level
38,Snow Man,5-iJl-fKHwI,10,新生活が始まって不安だらけの中毎朝この歌を聞かせていただいています😊この歌はひとりじゃないっ...,2023-04-16T21:49:08Z,top-level
109,spitzclips,KbGPM9jFeGg,10,同感です。\n声が変わってないと書かれているコメントよく見かけますが、変わってますよ。そう書...,2023-04-17T17:30:40Z,reply
114,東海オンエア,I0jAyfCsL_8,23,意外とドキドキする場面多いし、トークおじさんすぎておもろいし、一生みていられる😃,2023-04-16T14:50:25Z,top-level


In [6]:
# Remove other unnecessary characters - 
def clean_text(text):
    
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\t', '', text)
    text = re.sub(r'\r', '', text)
    text = re.sub(r'https?://[a-zA-Z0-9.-]*[/?[a-zA-Z0-9.-_]*]*', '', text)
    text = re.sub(r'笑+', '笑', text)
    
    filtered_text = ''
    
    for char in text:
        if re.match(accepted_char, char):
            
            filtered_text += char
            
    return filtered_text

In [7]:
# Clean
cleaned_df = has_jp_char.copy()
cleaned_df['text'] = cleaned_df['text'].apply(clean_text)

cleaned_df.head(10)

Unnamed: 0,channel,video_id,category_id,text,date_published,comment_type
7,mwamjapan,Jb6Zlg30rgk,10,たかがOPで世界観が壊れるアニメじゃないわはげたこ,2023-04-17T16:00:09Z,reply
8,mwamjapan,Jb6Zlg30rgk,10,悩めば、イイ,2023-04-17T14:53:19Z,reply
9,mwamjapan,Jb6Zlg30rgk,10,ニワカほどそう言うよね,2023-04-17T13:11:13Z,reply
18,mwamjapan,Jb6Zlg30rgk,10,珍しくショート動画が出てますね最高すぎる,2023-04-17T10:49:06Z,reply
19,mwamjapan,Jb6Zlg30rgk,10,めっちゃ同意,2023-04-17T08:30:39Z,reply
44,mwamjapan,Jb6Zlg30rgk,10,miletとのコラボには最初びっくりしたけど、声の相性が良くて特に重なった時に何とも言えない...,2023-04-17T00:54:46Z,top-level
45,mwamjapan,Jb6Zlg30rgk,10,全く仰るとおりで、作中にはいろんな絆ノ奇跡がありますよね。特に刀鍛冶の里編のラストは「絆ノ奇...,2023-04-17T13:48:13Z,reply
46,mwamjapan,Jb6Zlg30rgk,10,「我が命果てようとも繋いでいこう」自分が志半ばで死んでも繋いだ命がきっと果たしてくれる、と二...,2023-04-17T13:37:24Z,reply
47,mwamjapan,Jb6Zlg30rgk,10,我が命果てようともには無一郎を思い浮かべました。,2023-04-17T13:07:22Z,reply
48,mwamjapan,Jb6Zlg30rgk,10,鬼滅の刃には、色んな絆ノ奇跡がありますね,2023-04-17T11:53:14Z,reply


In [8]:
# Check size of DF
cleaned_df.shape

(11297, 6)

In [9]:
# Check for null values
cleaned_df.isna().sum()

channel           0
video_id          0
category_id       0
text              0
date_published    0
comment_type      0
dtype: int64

In [10]:
# Check unique values in each
cleaned_df.nunique()

channel              82
video_id             92
category_id          12
text              10670
date_published    11104
comment_type          2
dtype: int64

In [11]:
# Check amounts of top-level vs. replies for comments
cleaned_df['comment_type'].value_counts()

top-level    8320
reply        2977
Name: comment_type, dtype: int64