In [1]:
import pandas as pd
import boto3
import re
import unicodedata
from lingua import Language, LanguageDetectorBuilder

## Load subreddit comments from S3

In [2]:
%%time

# Per-subreddit DataFrames keyed by subreddit name; each holds a single "text" column.
df_dict = {}

subreddits = ["SingaporeRaw", "indonesia", "malaysia"]
path = 'path'

# Upper bound on the number of rows read per subreddit file.
# NOTE: this takes the FIRST rows of each CSV in file order; nothing is
# shuffled at load time. Random sampling happens later, after cleaning.
MAX_ROWS_PER_SUBREDDIT = 1_000_000

for sub in subreddits:
    df_dict[sub] = (
        pd.read_csv(
            f"{path}/{sub}_comments_all.csv",
            usecols=["body"],  # only the comment body is used downstream
            nrows=MAX_ROWS_PER_SUBREDDIT,
        )
        .rename(columns={"body": "text"})
    )


CPU times: user 17 s, sys: 3.13 s, total: 20.1 s
Wall time: 51.1 s


## Clean comments

In [3]:
%%time

# clean text

def clean_text(text):
    """Normalise a raw comment string for downstream processing.

    Steps, in order:
      1. Unicode NFKD normalisation.
      2. Every http/https URL is replaced by the literal token "[URL]".
      3. Newline and tab characters are each replaced by a single space.
      4. Leading and trailing whitespace is stripped.

    Args:
        text: the comment as a str.

    Returns:
        The cleaned str.
    """
    normalised = unicodedata.normalize("NFKD", text)
    without_urls = re.sub(r"http[s]?:\/\/\S+", "[URL]", normalised)

    # flatten line breaks and tabs into spaces, then trim the ends
    flattened = without_urls.replace('\n', ' ').replace('\t', ' ')
    return flattened.strip()

# Overwrite each subreddit's "text" column with its cleaned version.
for sub, frame in df_dict.items():
    frame["text"] = frame["text"].apply(clean_text)
    

CPU times: user 4.2 s, sys: 0 ns, total: 4.2 s
Wall time: 4.21 s


In [4]:
%%time

# drop "useless" comments

def keep_text(text, min_length=4):
    """Decide whether a cleaned comment is long enough to be kept.

    Args:
        text: the comment (anything that supports ``len()``).
        min_length: minimum number of characters a comment must have to be
            kept. The default of 4 reproduces the original heuristic of
            keeping only comments longer than 3 characters.

    Returns:
        True if the comment should be kept, False otherwise.
    """
    # heuristic -> shorter comments are very likely not useful
    # may want some more rules here
    return len(text) >= min_length

# Filter every subreddit frame with keep_text and report row counts before/after.
for sub, comments in df_dict.items():
    print("#"*40)
    print(sub.upper())
    print("before:\t", len(comments))
    kept = comments[comments.text.apply(keep_text)]
    df_dict[sub] = kept  # re-binding an existing key is safe while iterating
    print("now:\t", len(kept),"\n")

########################################
SINGAPORERAW
before:	 218097
now:	 215149 

########################################
INDONESIA
before:	 1000000
now:	 988786 

########################################
MALAYSIA
before:	 1000000
now:	 993562 

CPU times: user 1.11 s, sys: 0 ns, total: 1.11 s
Wall time: 1.11 s


## Take random sample of clean comments (for testing / debugging)

In [5]:
# Number of comments to keep per subreddit, and the seed that makes the draw reproducible.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 123

for sub in df_dict:
    # Cap at the number of available rows: DataFrame.sample raises ValueError
    # when asked for more rows than the frame holds (sampling without replacement).
    n_rows = min(SAMPLE_SIZE, len(df_dict[sub]))
    df_dict[sub] = df_dict[sub].sample(n_rows, random_state=SAMPLE_SEED)

## Run language detection over comments

In [6]:
# Build a lingua detector restricted to the candidate languages below.
languages = [
    Language.ENGLISH,
    Language.INDONESIAN,
    Language.MALAY,
    Language.CHINESE,
]
detector = (
    LanguageDetectorBuilder
    .from_languages(*languages)
    .build()
)

def compute_language_scores(detector, text):
    """Return the detector's confidence values for ``text`` as plain tuples.

    Args:
        detector: object exposing ``compute_language_confidence_values(text)``
            that yields ``(language, score)`` pairs, where ``language`` has a
            ``name`` attribute.
        text: the string to score.

    Returns:
        A list of ``(lowercased language name, score)`` tuples, in the same
        order the detector produced them.
    """
    scores = []
    for lang, score in detector.compute_language_confidence_values(text):
        scores.append((lang.name.lower(), score))
    return scores

def extract_primary_language(scores):
    """Return the language name of the first entry in ``scores``.

    Args:
        scores: list of ``(language name, score)`` tuples.
            Presumably ordered best-first by the detector; this function
            does not sort, it simply takes the first entry.

    Returns:
        The first tuple's language name, or the string "none" when
        ``scores`` is empty.
    """
    if not scores:
        return "none"
    top_language, _ = scores[0][0], scores[0]
    return top_language

In [7]:
%%time

for sub, frame in df_dict.items():

    # full confidence list per comment, then the first-ranked label derived from it
    frame["scores_lingua"] = frame.text.apply(
        lambda comment: compute_language_scores(detector, comment)
    )
    frame["lang_lingua"] = frame["scores_lingua"].apply(extract_primary_language)

    # per-subreddit distribution of predicted primary languages
    print("#"*40)
    print(sub.upper())
    display(frame.lang_lingua.value_counts())
    print()

########################################
SINGAPORERAW


english       9781
malay           97
indonesian      85
chinese         22
none            15
Name: lang_lingua, dtype: int64


########################################
INDONESIA


indonesian    5998
english       2804
malay         1172
none            26
Name: lang_lingua, dtype: int64


########################################
MALAYSIA


english       9304
malay          406
indonesian     283
none             7
Name: lang_lingua, dtype: int64


CPU times: user 3min 29s, sys: 0 ns, total: 3min 29s
Wall time: 3min 29s


In [8]:
# Preview up to 10 comments per subreddit whose predicted primary language is not English.
for sub in df_dict:
    print(sub)
    non_english = df_dict[sub][df_dict[sub].lang_lingua!="english"]
    # Cap the preview size at the number of matching rows: DataFrame.sample
    # raises ValueError when asked for more rows than are available.
    preview_size = min(10, len(non_english))
    display(non_english.sample(preview_size, random_state=123))

SingaporeRaw


Unnamed: 0,text,scores_lingua,lang_lingua
409,Urhhhhhhh,"[(indonesian, 1.0), (malay, 0.8941998602375961...",indonesian
212162,$1200,[],none
69818,Murta baaa,"[(malay, 1.0), (indonesian, 0.9514943107655638...",malay
81001,China,"[(malay, 1.0), (indonesian, 0.910805304210028)...",malay
138082,Huh lampar,"[(malay, 1.0), (indonesian, 0.9806449303890341...",malay
153864,🤣💀💀💀,[],none
179649,Delta variant,"[(indonesian, 1.0), (english, 0.97515500498794...",indonesian
194230,Hmm Simpang Kiri?,"[(indonesian, 1.0), (malay, 0.9844061903024319...",indonesian
141405,🙋‍♂️,[],none
158330,K lol,"[(indonesian, 1.0), (malay, 0.8544381945397174...",indonesian


indonesia


Unnamed: 0,text,scores_lingua,lang_lingua
653235,Ooh Soalnya dulu kan ga kepikiran pake vpn bu...,"[(indonesian, 1.0), (malay, 0.9752853488526585...",indonesian
823693,Makan sushi emang pake tangan kan?,"[(indonesian, 1.0), (malay, 0.9984022509586495...",indonesian
709283,"Gw ga tahu dia pemasarannya dimana, orang kete...","[(indonesian, 1.0), (malay, 0.965337169551563)...",indonesian
870734,Kalau tanya mahal atau nggak yang pasti ya ngg...,"[(indonesian, 1.0), (malay, 0.9367237540939745...",indonesian
229923,Selama gak salah pilih partisi pas install dan...,"[(indonesian, 1.0), (malay, 0.9425328849576655...",indonesian
653841,"gak kepikiran wkwk...next time aja, ini test r...","[(indonesian, 1.0), (malay, 0.9137137198273385...",indonesian
91816,Yes tapi mechanya bentukan pegasus. Omg so cool.,"[(indonesian, 1.0), (malay, 0.9682273192216441...",indonesian
786041,"Iya ya, mau gimana lagi","[(indonesian, 1.0), (malay, 0.959329939323095)...",indonesian
724243,Hahaha nasib ga bs login. Brp hari lalu baru a...,"[(indonesian, 1.0), (malay, 0.9460270287803553...",indonesian
707714,Untuk para suhu2 yg fasih Mandarin dan/atau ng...,"[(indonesian, 1.0), (malay, 0.9768961559315827...",indonesian


malaysia


Unnamed: 0,text,scores_lingua,lang_lingua
107417,Ini semua agenda dap. Kautim.,"[(malay, 1.0), (indonesian, 0.9760410611991557...",malay
381009,Mana boleh? Our korupsi is syariah compliance ...,"[(indonesian, 1.0), (malay, 0.9262467314455257...",indonesian
682238,a lot of hongkies,"[(malay, 1.0), (english, 0.9487693896219637), ...",malay
230436,"""Fujiswara""","[(malay, 1.0), (indonesian, 0.7742256806978776...",malay
648108,ni kalau cakap mcm ni habis la meme 1 malaysia,"[(malay, 1.0), (indonesian, 0.9303457422832289...",malay
726317,"""he is a very good King"".. Wou, Wou, Wou... D...","[(indonesian, 1.0), (english, 0.96421797017926...",indonesian
188423,Dalam dilema diantara jalan derita...,"[(indonesian, 1.0), (malay, 0.9821288075581246...",indonesian
721816,Yang penting habis kol 5:30pm ..,"[(indonesian, 1.0), (malay, 0.9811339295003904...",indonesian
980716,Meme ni dari movie ape ek?,"[(indonesian, 1.0), (malay, 0.9713829151660712...",indonesian
482352,![gif](giphy|jswmTe8SNg0HaBx5W4|downsized),"[(indonesian, 1.0), (english, 0.99004998150944...",indonesian


## Select in-scope comments
Entries whose top two predicted languages are Malay and Indonesian, in either order.

In [9]:
def is_in_scope(scores):
    """Tell whether the two leading predictions are both Malay/Indonesian.

    Args:
        scores: list of ``(language name, score)`` tuples.

    Returns:
        True when ``scores`` has at least two entries and the language names
        of the first two are each either "malay" or "indonesian"; False
        otherwise.
    """
    if len(scores) < 2:
        return False

    target_languages = ("malay", "indonesian")
    # the second entry is only inspected when the first one already matches
    return scores[0][0] in target_languages and scores[1][0] in target_languages

In [10]:
# Per subreddit: how many sampled comments pass the in-scope test.
for sub, frame in df_dict.items():

    print("#"*40)
    print(sub.upper(), "- in scope comments")
    in_scope_flags = frame.scores_lingua.apply(is_in_scope)
    display(in_scope_flags.value_counts())
    print()
    
    #display(df_dict[sub][df_dict[sub].scores_lingua.apply(is_in_scope)])

########################################
SINGAPORERAW - in scope comments


False    9903
True       97
Name: scores_lingua, dtype: int64


########################################
INDONESIA - in scope comments


True     6795
False    3205
Name: scores_lingua, dtype: int64


########################################
MALAYSIA - in scope comments


False    9434
True      566
Name: scores_lingua, dtype: int64




In [11]:
# Per subreddit: show the rows that pass the in-scope test.
for sub, frame in df_dict.items():

    print("#"*40)
    print(sub.upper(), "- in scope comments")
    in_scope_flags = frame.scores_lingua.apply(is_in_scope)
    display(frame[in_scope_flags])
    print()
    

########################################
SINGAPORERAW - in scope comments


Unnamed: 0,text,scores_lingua,lang_lingua
12004,Majulah Singapura,"[(malay, 1.0), (indonesian, 0.9601283935213767...",malay
59141,tf is ayum,"[(malay, 1.0), (indonesian, 0.9854713766834882...",malay
203237,Huh?,"[(malay, 1.0), (indonesian, 0.9455941567562506...",malay
184753,Huh?,"[(malay, 1.0), (indonesian, 0.9455941567562506...",malay
112767,Phua Chu Kang,"[(malay, 1.0), (indonesian, 0.9532817014324827...",malay
...,...,...,...
213203,bruh,"[(indonesian, 1.0), (malay, 0.8517523142492602...",indonesian
177477,Ah ok,"[(malay, 1.0), (indonesian, 0.9983850129198967...",malay
191341,Bowl of laksa in NY is US$25,"[(indonesian, 1.0), (malay, 0.9815370128341464...",indonesian
16393,It's at Malaysia?,"[(malay, 1.0), (indonesian, 0.8392560218049496...",malay



########################################
INDONESIA - in scope comments


Unnamed: 0,text,scores_lingua,lang_lingua
751304,u/bergumul shockingly genshin bisa di samsung ...,"[(indonesian, 1.0), (malay, 0.9725431981668183...",indonesian
83104,cfw itu identik dengan Custom firmware.,"[(malay, 1.0), (indonesian, 0.9208760636372765...",malay
353820,"biasanya programmer web gituan cuma 1 orang, b...","[(malay, 1.0), (indonesian, 0.9745695868760844...",malay
508929,"""mas kira-kira gaya rmabut yang cocok buat say...","[(indonesian, 1.0), (malay, 0.8975774180493827...",indonesian
900681,Lol guru bk sekolah gw ga ada yg bener. Yang p...,"[(indonesian, 1.0), (malay, 0.9781697462079033...",indonesian
...,...,...,...
787740,"Ya gimana ya, bukannya emang ""lahir sebagai co...","[(indonesian, 1.0), (malay, 0.9752252559540702...",indonesian
989616,Sate ayam 🤤,"[(indonesian, 1.0), (malay, 0.9980510464981515...",indonesian
483750,ga muat sela gigi gw pake benang floss! &amp;...,"[(indonesian, 1.0), (malay, 0.9995898670770142...",indonesian
192606,"""gimana mantan lu udah masuk sekolah?""","[(indonesian, 1.0), (malay, 0.9923245485815518...",indonesian



########################################
MALAYSIA - in scope comments


Unnamed: 0,text,scores_lingua,lang_lingua
463261,Tu pun masih ramai yang pelat,"[(malay, 1.0), (indonesian, 0.9952596703409529...",malay
241681,Buhonan,"[(indonesian, 1.0), (malay, 0.9536726231850479...",indonesian
786177,Aku rasa cam nak demam so hilang selera makan....,"[(malay, 1.0), (indonesian, 0.9947883967476997...",malay
133705,Tak semua manusia bawak grab bro...,"[(malay, 1.0), (indonesian, 0.9952314652061162...",malay
117282,So kepoh,"[(malay, 1.0), (indonesian, 0.9811348088531189...",malay
...,...,...,...
672638,After langkah kelima kot baru OYO. Bajet lompa...,"[(malay, 1.0), (indonesian, 0.9776017719023853...",malay
536299,Erm ...,"[(malay, 1.0), (indonesian, 0.9504675946063506...",malay
680742,Apa beza relay yang premium dengan percuma dan...,"[(malay, 1.0), (indonesian, 0.9391630624695672...",malay
332723,Berperisa buntut,"[(malay, 1.0), (indonesian, 0.9744905324300055...",malay



