In [None]:
import re
import unicodedata

from linggapy.utils import Loader
import pandas as pd
import requests

In [186]:
balinese_words = Loader().load_words()
len(balinese_words)

42194

In [None]:
response = requests.get(
    "https://raw.githubusercontent.com/Wikidepia/indonesian_datasets/master/dictionary/wordlist/data/wordlist.txt"
).text
indonesian_words = set(response.split())

In [143]:
len(indonesian_words)

74891

In [144]:
df = pd.read_json("instagram.json")
df = df[["data"]]

In [145]:
all_texts = []
for item in df['data']:
    all_texts.append(item['caption'])
    all_texts.extend(item['comments'])

df_texts = pd.DataFrame({'text': all_texts}).reset_index(drop=True)

In [146]:
df_texts

Unnamed: 0,text
0,#bahasabali #basabali #basabaline #belajarbaha...
1,"Sukseme , tiang melajah mebase bali"
2,@petuah_petuah Ngiring sareng-sareng malajah 🙏
3,@basabali.id sawire timpal timpal dini uli Bali🙏
4,Anadap & alus lebih sopan mana min?
...,...
18219,Ade ne kene? Nawang lagune tuah abesik ne pali...
18220,❤😊🙏!!!
18221,Hahahah nice post
18222,Jayanti ring angga semeton dewata! Suksma Bali...


In [170]:
def clean_text(text: str) -> str:
    text = text.lower()
    # remove emoji/unicode
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("utf-8")
    # keep only @ and #
    text = re.sub(r"[^\w\s@#]", " ", text)
    # remove multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    # remove tag and hashtag
    for word in text.split():
        if word.startswith("@") or word.startswith("#"):
            text = text.replace(word, "")
    # remove non alphabetic
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # remove multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [171]:
def is_dominant_balinese(text: str, threshold: float = 0.5) -> bool:
    if text is None:
        return False

    text = clean_text(text)
    words = text.split()
    if not words:
        return False

    balinese_count = sum(1 for word in words if word in balinese_words)
    indonesian_count = sum(1 for word in words if word in indonesian_words)

    is_dominant = False
    # if balinese is more than threshold
    # and indonesian is less than 1 - threshold
    if (balinese_count / len(words) > threshold) and (
        indonesian_count / len(words) < 1 - threshold
    ):
        is_dominant = True
    return is_dominant

In [172]:
# dominant balinese
df_filtered = df_texts[df_texts['text'].apply(is_dominant_balinese)].reset_index(drop=True)

In [173]:
df_filtered

Unnamed: 0,text
0,@petuah_petuah Ngiring sareng-sareng malajah 🙏
1,@trioe_f nginem
2,@erhasonwafa22 wareg
3,@gnyrs Kalau kantor Gubernurnya mimin nenten t...
4,lue apa lue?
...,...
2785,"Ngiring mlajah angka Bali, durusang ketik jawa..."
2786,Becik pisan Baligrafi puniki 😍😍\n#Repost @rai_...
2787,#Repost @mlajahbasabali\n• • • • • •\nNgiring ...
2788,#Repost @balilango\n• • • • • •\nRahajeng rahi...


In [174]:
def filter_length_word(text: str, length: int = 5) -> bool:
    if text is None:
        return False
    text = clean_text(text)
    words = text.split()
    if not words:
        return False
    return len(words) >= length

In [177]:
df_filtered = df_filtered[df_filtered['text'].apply(filter_length_word)].reset_index(drop=True)

In [178]:
df_filtered

Unnamed: 0,text
0,@gnyrs Kalau kantor Gubernurnya mimin nenten t...
1,@mangabdiii munyi gen wi besik ne
2,"Yen di desan nyama patuh masih nyambat ""pang j..."
3,@mangpink84 Nggih pateh taler nganggen ra repa 🙏
4,"Om swastiastu, nyama sareng sami 🙏🏻\nKenken ka..."
...,...
1405,Buatin kakak nama komang dimas merta sedana
1406,"Ngiring mlajah angka Bali, durusang ketik jawa..."
1407,Becik pisan Baligrafi puniki 😍😍\n#Repost @rai_...
1408,#Repost @mlajahbasabali\n• • • • • •\nNgiring ...


In [182]:
df_filtered["cleaned_text"] = df_filtered["text"].apply(clean_text)
df_filtered = df_filtered.drop_duplicates(subset=['cleaned_text']).reset_index(drop=True)

In [183]:
df_filtered

Unnamed: 0,text,cleaned_text
0,@gnyrs Kalau kantor Gubernurnya mimin nenten t...,kalau kantor gubernurnya mimin nenten tatas un...
1,@mangabdiii munyi gen wi besik ne,munyi gen wi besik ne
2,"Yen di desan nyama patuh masih nyambat ""pang j...",yen di desan nyama patuh masih nyambat pang jo...
3,@mangpink84 Nggih pateh taler nganggen ra repa 🙏,nggih pateh taler nganggen ra repa
4,"Om swastiastu, nyama sareng sami 🙏🏻\nKenken ka...",om swastiastu nyama sareng sami kenken kabare ...
...,...,...
1345,Buatin kakak nama komang dimas merta sedana,buatin kakak nama komang dimas merta sedana
1346,"Ngiring mlajah angka Bali, durusang ketik jawa...",ngiring mlajah angka bali durusang ketik jawab...
1347,Becik pisan Baligrafi puniki 😍😍\n#Repost @rai_...,becik pisan baligrafi puniki baligrafi singa raja
1348,#Repost @mlajahbasabali\n• • • • • •\nNgiring ...,ngiring malajah aksara bali repost fb made


In [184]:
df_filtered = df_filtered.drop(columns=['cleaned_text'])

In [185]:
df_filtered.to_excel("dataset.xlsx", index=False)