In [1]:
import re
from notebook import TFIDF, MultinomialNB, chi_square, SMOTE
from linggapy import Stemmer

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
np.random.seed(42)

## Data Collection

In [3]:
sheet_id_list = [
    "1Spn8g26B1oiTdAIQpf7VTptBMS50oVesE-f4QwaGCuw",  # p1
    "1dCSj3772FI7B6i1ioD1Nq5w6siZuAj1p7i4FizHTqcY",  # p2
    "1a66EfiTsLL8qO3KQ9ahgAuj-w5dfj2_rFS5v5Yyg84g",  # p3
    "1nyOotMZx_nVPaRPqpsDpeIfo0RGQu3rueBDLcsxbmYY",  # p4
]
dataframes = []

for sheet_id in sheet_id_list:
    url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?gid=0&format=xlsx"
    dataframes.append(pd.read_excel(url, "Sheet1", usecols=[1, 2]))

df = pd.concat(dataframes, ignore_index=True)

In [4]:
df

Unnamed: 0,Teks Postingan / Comment,Tingkatan Bahasa/Sor Singgih
0,@gnyrs Kalau kantor Gubernurnya mimin nenten t...,Alus Madia
1,@mangabdiii munyi gen wi besik ne,Basa Andap
2,"Yen di desan nyama patuh masih nyambat ""pang j...",Basa Andap
3,@mangpink84 Nggih pateh taler nganggen ra repa 🙏,Alus Mider
4,"Om swastiastu, nyama sareng sami 🙏🏻\n Kenken k...",
...,...,...
1353,1. Kalimat alus singgih memiliki rasa bahasa y...,
1354,2. Kalimat alus sor merupakan kalimat yang ras...,
1355,3. Kalimat alus madia merupakan kalimat yang m...,
1356,4. Kalimat alus mider merupakan kalimat yang r...,


## Data Preprocessing

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1358 entries, 0 to 1357
Data columns (total 2 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Teks Postingan / Comment      1357 non-null   object
 1   Tingkatan Bahasa/Sor Singgih  1340 non-null   object
dtypes: object(2)
memory usage: 21.3+ KB


In [6]:
df = df.dropna()

In [7]:
df = df.rename(columns={"Teks Postingan / Comment": "text", "Tingkatan Bahasa/Sor Singgih": "label"})

In [8]:
df['label'].value_counts().index.tolist()

['Basa Andap',
 'Alus Mider',
 'Alus Singgih',
 'Alus Madia',
 'alus mider',
 'Alus Sor',
 'basa andap',
 'alus madia',
 'Basa Andap ',
 'Alus Mider ',
 'Alus Madya',
 'Basa Kasar',
 'Basa Kamus',
 'Basa Indonesia',
 'Kruna Mider',
 '-',
 'Alus mider',
 'basa kasar',
 'basa madia',
 'Alus sor',
 'alus sor',
 'Bas Andap',
 'Campur Kode',
 'alus singgih ',
 'Akus Madia',
 'pakeh: andap, jinah:alus mider, pragina: basa mider, kraras: mider, biang:alus singgih, peken: andap, godel: mider.',
 'Suksma bli. Tiang demen ajak konten² basa Bali Alus bli ne😁: Basa Andap, ohh.... Nggih Rahajeng Rahina Pagerwesi bli, Dumogi Rahayu sareng sami 🙏🏻🙏🏻: Alus Madia',
 'Alis Singgih',
 'Alus singgih',
 'Alis Sor',
 'Bahasa Pengantar : Alus Singgih. Bahasa isi satua : Basa Andap',
 'Alus Andap',
 'Mider',
 'Basa Kasar Jabag',
 'Basa',
 'Imba Basa Alus',
 'Alus Singggih, Basa Kasar, Basa Andap, Alus Madya, Alus Sor, Alus Singgih',
 'Basa Alus Mider',
 'Basa Sunda',
 'Basa Madia',
 'Alus Singih',
 'Alus SIng

In [9]:
label_mapping = {
    "Basa Andap": "basa andap",
    "Alus Mider": "alus mider",
    "Alus Singgih": "alus singgih",
    "Alus Madia": "alus madia",
    "alus mider": "alus mider",
    "Alus Sor": "alus sor",
    "basa andap": "basa andap",
    "alus madia": "alus madia",
    "Basa Andap ": "basa andap",
    "Alus Mider ": "alus mider",
    "Alus Madya": "alus madia",
    "Basa Kasar": "basa kasar",
    "Kruna Mider": "alus mider",
    "Alus mider": "alus mider",
    "basa kasar": "basa kasar",
    "basa madia": "alus madia",
    "Alus sor": "alus sor",
    "alus sor": "alus sor",
    "Bas Andap": "basa andap",
    "alus singgih ": "alus singgih",
    "Akus Madia": "alus madia",
    "Alis Singgih": "alus singgih",
    "Alus singgih": "alus singgih",
    "Alis Sor": "alus sor",
    "Alus Andap": "basa andap",
    "Mider": "alus mider",
    "Basa Kasar Jabag": "basa kasar",
    "Basa Alus Mider": "alus mider",
    "Basa Madia": "alus madia",
    "Alus Singih": "alus singgih",
    "Alus SInggih": "alus singgih",
    "Basa Mider": "alus mider",
    "Bassa Andap": "basa andap",
    "alus singgih": "alus singgih",
    "alus madia ": "alus madia",
    "basa mider": "alus mider",
    "alus mider ": "alus mider",
}

In [10]:
df['label'] = df['label'].map(label_mapping)
df['label'].value_counts()

label
basa andap      549
alus mider      326
alus madia      181
alus singgih    174
alus sor         72
basa kasar       12
Name: count, dtype: int64

In [11]:
df['label'].value_counts() * 100 / len(df)

label
basa andap      40.970149
alus mider      24.328358
alus madia      13.507463
alus singgih    12.985075
alus sor         5.373134
basa kasar       0.895522
Name: count, dtype: float64

In [12]:
label_to_idx = {
    "alus singgih": 0,
    "alus sor": 1,
    "alus mider": 2,
    "alus madia": 3,
    "basa andap": 4,
    "basa kasar": 5,
}
idx_to_label = {v: k for k, v in label_to_idx.items()}

In [13]:
df["label"] = df["label"].map(
    label_to_idx
)

In [14]:
df

Unnamed: 0,text,label
0,@gnyrs Kalau kantor Gubernurnya mimin nenten t...,3.0
1,@mangabdiii munyi gen wi besik ne,4.0
2,"Yen di desan nyama patuh masih nyambat ""pang j...",4.0
3,@mangpink84 Nggih pateh taler nganggen ra repa 🙏,2.0
5,Rahajeng semenng 🙏\n \n #basabali #basabaline ...,2.0
...,...,...
1345,Buatin kakak nama komang dimas merta sedana,
1346,"Ngiring mlajah angka Bali, durusang ketik jawa...",3.0
1347,Becik pisan Baligrafi puniki 😍😍\n #Repost @rai...,2.0
1348,#Repost @mlajahbasabali\n • • • • • •\n Ngirin...,2.0


In [15]:
df.dropna()

Unnamed: 0,text,label
0,@gnyrs Kalau kantor Gubernurnya mimin nenten t...,3.0
1,@mangabdiii munyi gen wi besik ne,4.0
2,"Yen di desan nyama patuh masih nyambat ""pang j...",4.0
3,@mangpink84 Nggih pateh taler nganggen ra repa 🙏,2.0
5,Rahajeng semenng 🙏\n \n #basabali #basabaline ...,2.0
...,...,...
1344,#Repost @mlajahbasabali\n • • • • • •\n wenten...,3.0
1346,"Ngiring mlajah angka Bali, durusang ketik jawa...",3.0
1347,Becik pisan Baligrafi puniki 😍😍\n #Repost @rai...,2.0
1348,#Repost @mlajahbasabali\n • • • • • •\n Ngirin...,2.0


In [16]:
# remove mention and hastag
def clean_text(text: str) -> str:
    cleaned_text = re.sub(r"(@\w+|#\w+)", "", text).strip()
    return cleaned_text

In [17]:
stemmer = Stemmer()

In [18]:
df['text'] = df['text'].apply(clean_text)
df['text'] = df['text'].apply(lambda x: stemmer.stem(x))


In [19]:
df = df.dropna()

In [20]:
df

Unnamed: 0,text,label
0,kala kantor gubernurnya limin nenten tatas uni...,3.0
1,munyi gen ki besik ne,4.0
2,yen di desa nyama patuh masih jambat pang joh ra,4.0
3,anggih pateh taler ke ra repa,2.0
5,rahajeng semeng,2.0
...,...,...
1344,wenten sane sampun bani ke aksara bali ring me...,3.0
1346,gir malajah angka bali durus tik jawat ring ko...,3.0
1347,becik pisan baligrafi puniki baligrafi singa raja,2.0
1348,gir malajah aksara bali repot pb made,2.0


## Modeling

In [21]:
X = df['text'].astype(str)
y = df['label']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [23]:
tfidf = TFIDF()
X_train_tfidf = tfidf.fit_transform(X_train.to_list())
X_test_tfidf = tfidf.transform(X_test.to_list())

In [24]:
X_train_tfidf.shape

(1051, 2858)

In [25]:
feature_score = chi_square(X_train_tfidf, y_train.to_numpy())
feature_score

array([1.43271743, 0.81936005, 0.22066901, ..., 2.72203613, 0.73138615,
       0.52584026])

In [26]:
n_features = 2000
top_features = np.argsort(feature_score)[::-1][:n_features]

In [27]:
smote = SMOTE()
X_train_sampled, y_train_sampled = smote.fit_resample(X_train_tfidf, y_train.to_numpy(), 5, N=200)

In [28]:
X_train_selected = X_train_sampled[:, top_features]
X_test_selected = X_test_tfidf[:, top_features]

In [29]:
# TODO: cross validation, tuning
# TODO: implement from scratch split dataset, cv, metrics

mnb = MultinomialNB()
mnb.fit(X_train_selected, y_train_sampled)

In [30]:
y_train_pred = mnb.predict(X_train_selected)
print(classification_report(y_train_sampled, y_train_pred, digits=4, zero_division=0))

              precision    recall  f1-score   support

         0.0     0.9437    0.4621    0.6204       145
         1.0     1.0000    0.0192    0.0377        52
         2.0     0.5882    0.8400    0.6919       250
         3.0     0.9368    0.5597    0.7008       159
         4.0     0.8022    0.9862    0.8848       436
         5.0     1.0000    0.3333    0.5000        27

    accuracy                         0.7540      1069
   macro avg     0.8785    0.5334    0.5726      1069
weighted avg     0.8060    0.7540    0.7255      1069



In [31]:
y_test_pred = mnb.predict(X_test_selected)
print(classification_report(y_test, y_test_pred, digits=4, zero_division=0))

              precision    recall  f1-score   support

         0.0     1.0000    0.1034    0.1875        29
         1.0     0.0000    0.0000    0.0000        20
         2.0     0.4800    0.6316    0.5455        76
         3.0     0.2727    0.1364    0.1818        22
         4.0     0.7114    0.9381    0.8092       113
         5.0     0.0000    0.0000    0.0000         3

    accuracy                         0.6084       263
   macro avg     0.4107    0.3016    0.2873       263
weighted avg     0.5774    0.6084    0.5412       263



In [32]:
text = "nah kemu be jemak jajane"
text = stemmer.stem(text, correct_spelling=True)
prediction = mnb.predict(tfidf.transform([text])[:, top_features])
idx_to_label[prediction[0]]

'basa andap'