In [1]:
import copy
import re
import random
from collections import Counter

from notebook import (
    TFIDF,
    MultinomialNB,
    SMOTE,
    Pipeline,
    chi_square,
    split_data,
    accuracy_score,
    cross_validation,
)
from linggapy import Stemmer

from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import pandas as pd
import numpy as np
import joblib

In [2]:
np.random.seed(42)
random.seed(42)

## Data Collection

In [3]:
dataframes = []

for i in range(4):
    dataframes.append(
        pd.read_excel(
            f"dataset/Klasifikasi Tingkatan Bahasa Bali p{i+1}.xlsx",
            "Sheet1",
            usecols=[1, 2],
        )
    )

df = pd.concat(dataframes, ignore_index=True)

In [4]:
df

Unnamed: 0,Teks Postingan / Comment,Tingkatan Bahasa/Sor Singgih
0,@gnyrs Kalau kantor Gubernurnya mimin nenten t...,Alus Madia
1,@mangabdiii munyi gen wi besik ne,Basa Andap
2,"Yen di desan nyama patuh masih nyambat ""pang j...",Basa Andap
3,@mangpink84 Nggih pateh taler nganggen ra repa 🙏,Alus Mider
4,"Om swastiastu, nyama sareng sami 🙏🏻\n Kenken k...",
...,...,...
1353,1. Kalimat alus singgih memiliki rasa bahasa y...,
1354,2. Kalimat alus sor merupakan kalimat yang ras...,
1355,3. Kalimat alus madia merupakan kalimat yang m...,
1356,4. Kalimat alus mider merupakan kalimat yang r...,


## Data Preprocessing

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1358 entries, 0 to 1357
Data columns (total 2 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Teks Postingan / Comment      1357 non-null   object
 1   Tingkatan Bahasa/Sor Singgih  1340 non-null   object
dtypes: object(2)
memory usage: 21.3+ KB


In [6]:
df = df.dropna()

In [7]:
df = df.rename(columns={"Teks Postingan / Comment": "text", "Tingkatan Bahasa/Sor Singgih": "label"})

In [8]:
df['label'].value_counts().index.tolist()

['Basa Andap',
 'Alus Mider',
 'Alus Singgih',
 'Alus Madia',
 'alus mider',
 'Alus Sor',
 'basa andap',
 'alus madia',
 'Basa Andap ',
 'Alus Mider ',
 'Alus Madya',
 'Basa Kasar',
 'Basa Kamus',
 'Basa Indonesia',
 'Kruna Mider',
 '-',
 'Alus mider',
 'basa kasar',
 'basa madia',
 'Alus sor',
 'alus sor',
 'Bas Andap',
 'Campur Kode',
 'alus singgih ',
 'Akus Madia',
 'pakeh: andap, jinah:alus mider, pragina: basa mider, kraras: mider, biang:alus singgih, peken: andap, godel: mider.',
 'Suksma bli. Tiang demen ajak konten² basa Bali Alus bli ne😁: Basa Andap, ohh.... Nggih Rahajeng Rahina Pagerwesi bli, Dumogi Rahayu sareng sami 🙏🏻🙏🏻: Alus Madia',
 'Alis Singgih',
 'Alus singgih',
 'Alis Sor',
 'Bahasa Pengantar : Alus Singgih. Bahasa isi satua : Basa Andap',
 'Alus Andap',
 'Mider',
 'Basa Kasar Jabag',
 'Basa',
 'Imba Basa Alus',
 'Alus Singggih, Basa Kasar, Basa Andap, Alus Madya, Alus Sor, Alus Singgih',
 'Basa Alus Mider',
 'Basa Sunda',
 'Basa Madia',
 'Alus Singih',
 'Alus SIng

In [9]:
label_mapping = {
    "Basa Andap": "basa andap",
    "Alus Mider": "alus mider",
    "Alus Singgih": "alus singgih",
    "Alus Madia": "alus madia",
    "alus mider": "alus mider",
    "Alus Sor": "alus sor",
    "basa andap": "basa andap",
    "alus madia": "alus madia",
    "Basa Andap ": "basa andap",
    "Alus Mider ": "alus mider",
    "Alus Madya": "alus madia",
    "Basa Kasar": "basa kasar",
    "Kruna Mider": "alus mider",
    "Alus mider": "alus mider",
    "basa kasar": "basa kasar",
    "basa madia": "alus madia",
    "Alus sor": "alus sor",
    "alus sor": "alus sor",
    "Bas Andap": "basa andap",
    "alus singgih ": "alus singgih",
    "Akus Madia": "alus madia",
    "Alis Singgih": "alus singgih",
    "Alus singgih": "alus singgih",
    "Alis Sor": "alus sor",
    "Alus Andap": "basa andap",
    "Mider": "alus mider",
    "Basa Kasar Jabag": "basa kasar",
    "Basa Alus Mider": "alus mider",
    "Basa Madia": "alus madia",
    "Alus Singih": "alus singgih",
    "Alus SInggih": "alus singgih",
    "Basa Mider": "alus mider",
    "Bassa Andap": "basa andap",
    "alus singgih": "alus singgih",
    "alus madia ": "alus madia",
    "basa mider": "alus mider",
    "alus mider ": "alus mider",
}

In [10]:
df['label'] = df['label'].map(label_mapping)
df['label'].value_counts()

label
basa andap      549
alus mider      326
alus madia      181
alus singgih    174
alus sor         72
basa kasar       12
Name: count, dtype: int64

In [11]:
df['label'].value_counts() * 100 / len(df)

label
basa andap      40.970149
alus mider      24.328358
alus madia      13.507463
alus singgih    12.985075
alus sor         5.373134
basa kasar       0.895522
Name: count, dtype: float64

In [12]:
label_to_idx = {
    "alus singgih": 0,
    "alus sor": 1,
    "alus mider": 2,
    "alus madia": 3,
    "basa andap": 4,
    "basa kasar": 5,
}
idx_to_label = {v: k for k, v in label_to_idx.items()}

In [13]:
df["label"] = df["label"].map(
    label_to_idx
)

In [14]:
df

Unnamed: 0,text,label
0,@gnyrs Kalau kantor Gubernurnya mimin nenten t...,3.0
1,@mangabdiii munyi gen wi besik ne,4.0
2,"Yen di desan nyama patuh masih nyambat ""pang j...",4.0
3,@mangpink84 Nggih pateh taler nganggen ra repa 🙏,2.0
5,Rahajeng semenng 🙏\n \n #basabali #basabaline ...,2.0
...,...,...
1345,Buatin kakak nama komang dimas merta sedana,
1346,"Ngiring mlajah angka Bali, durusang ketik jawa...",3.0
1347,Becik pisan Baligrafi puniki 😍😍\n #Repost @rai...,2.0
1348,#Repost @mlajahbasabali\n • • • • • •\n Ngirin...,2.0


In [15]:
df.dropna()

Unnamed: 0,text,label
0,@gnyrs Kalau kantor Gubernurnya mimin nenten t...,3.0
1,@mangabdiii munyi gen wi besik ne,4.0
2,"Yen di desan nyama patuh masih nyambat ""pang j...",4.0
3,@mangpink84 Nggih pateh taler nganggen ra repa 🙏,2.0
5,Rahajeng semenng 🙏\n \n #basabali #basabaline ...,2.0
...,...,...
1344,#Repost @mlajahbasabali\n • • • • • •\n wenten...,3.0
1346,"Ngiring mlajah angka Bali, durusang ketik jawa...",3.0
1347,Becik pisan Baligrafi puniki 😍😍\n #Repost @rai...,2.0
1348,#Repost @mlajahbasabali\n • • • • • •\n Ngirin...,2.0


In [16]:
# remove mention and hastag
def clean_text(text: str) -> str:
    cleaned_text = re.sub(r"(@\w+|#\w+)", "", text).strip()
    # keep only alphabet and space
    cleaned_text = re.sub(r"[^a-zA-Z\s]", "", cleaned_text)
    # remove multiple spaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)
    cleaned_text = cleaned_text.strip()
    cleaned_text = cleaned_text.lower()
    return cleaned_text


def remove_template(text: str) -> str:
    template_list = [
 """🙏 Elingang like, follow lan share instagram @bahasa_bali mangda iraga prasida mlajah basa lan aksara Bali sareng-sareng.
 .....................
 Ngiring wacén suratan lianan ring belajarbahasabali.com
 facebook: facebook.com/bahasabalii/
 youtube: youtube.com/c/BelajarBahasaBalii""",
 """🙏 Elingang like, follow lan share instagram @bahasa_bali mangda iraga prasida mlajah basa lan aksara Ba🙏 Elingang like, follow lan share instagram @bahasa_bali mangda iraga prasida mlajah basa lan aksara Ba#malajahbahasabalili sareng-sareng.
 .....................
 Ngiring wacén suratan lianan ring belajarbahasabali.com
 facebook: facebook.com/bahasabalii/
 youtube: youtube.com/c/BelajarBahasaBalii
 .....................""",
 """ 🙏 Elingang like, follow lan share instagram @bahasa_bali mangda iraga prasida mlajah basa lan aksara Bali sareng-sareng.
 .....................
 Ngiring wacén sesuratan lianan ring belajarbahasabali.com
 instagram: instagram.com/bahasa_bali
 facebook: facebook.com/bahasabalii
 youtube: youtube.com/c/BelajarBahasaBalii""",
 """🙏elingang like, follow lan share instagram @bahasa_bali mangda iraga prasida mlajah basa lan aksara Bali sareng-sareng.""",
 """🙏elingang like, follow lan share instagram
 @bahasa_bali mangda iraga prasida mlajah basa lan
 aksara Bali sareng-sareng.""",
    ]

    for template in template_list:
        text = text.replace(template, "")

    return text

In [17]:
stemmer = Stemmer()

In [18]:
df

Unnamed: 0,text,label
0,@gnyrs Kalau kantor Gubernurnya mimin nenten t...,3.0
1,@mangabdiii munyi gen wi besik ne,4.0
2,"Yen di desan nyama patuh masih nyambat ""pang j...",4.0
3,@mangpink84 Nggih pateh taler nganggen ra repa 🙏,2.0
5,Rahajeng semenng 🙏\n \n #basabali #basabaline ...,2.0
...,...,...
1345,Buatin kakak nama komang dimas merta sedana,
1346,"Ngiring mlajah angka Bali, durusang ketik jawa...",3.0
1347,Becik pisan Baligrafi puniki 😍😍\n #Repost @rai...,2.0
1348,#Repost @mlajahbasabali\n • • • • • •\n Ngirin...,2.0


In [19]:
df['text'] = df['text'].apply(remove_template)
df['text'] = df['text'].apply(clean_text)
df['text'] = df['text'].apply(lambda x: stemmer.stem(x, correct_spelling=False))

In [20]:
df = df.dropna()

In [21]:
df

Unnamed: 0,text,label
0,lau kantor gubernurnya mimi nenten tatas uning...,3.0
1,munyi gen wi besik ne,4.0
2,yen di desa nyama patuh masih jambat pang joh ra,4.0
3,nggih pateh taler kangge ra repa,2.0
5,rahajeng semenng,2.0
...,...,...
1344,wenten sane sampun bani kangge aksara bali rin...,3.0
1346,iring mlajah angka bali durus ketik jawaban ri...,3.0
1347,becik pisan baligrafi puniki baligrafi singa raja,2.0
1348,iring malajah aksara bali repost fb made,2.0


## Modeling

### Hyperparameter Tuning

In [22]:
def smote_resampling(X: np.ndarray, y: np.ndarray):
    """
    Apply SMOTE resampling using not majority
    """
    smote = SMOTE(k_neighbors=5, random_state=42)

    dict_y = Counter(y)
    majority_class = max(dict_y, key=dict_y.get)
    max_count = dict_y[majority_class]

    percentage_increase = {
        cls: round(((max_count - count) / count) * 100)
        for cls, count in dict_y.items()
        if cls != majority_class
    }

    smote = SMOTE()
    X_train_sampled, y_train_sampled = X, y
    for cls, pct in percentage_increase.items():
        X_train_sampled, y_train_sampled = smote.fit_resample(
            X_train_sampled, y_train_sampled, int(cls), N=pct
        )

    return X_train_sampled, y_train_sampled


def find_best_model(X: np.ndarray, y: np.ndarray, options: dict) -> dict:
    progress_bar = tqdm(
        total=options.get("total"), desc="Searching for best hyperparameters"
    )

    best_score = 0
    best_params = None

    for n_pct_features in options.get("n_pct_features"):
        for alpha in options.get("alpha"):
            for resample in options.get("resample"):
                try:
                    X_train = copy.deepcopy(X)
                    y_train = copy.deepcopy(y)

                    tfidf = TFIDF()
                    X_train_tfidf = tfidf.fit_transform(X_train)

                    feature_score = chi_square(X_train_tfidf, y_train)
                    n_features = int(n_pct_features * X_train_tfidf.shape[1])
                    top_features = np.argsort(feature_score)[::-1][:n_features]

                    if resample:
                        X_train_tfidf, y_train = smote_resampling(
                            X_train_tfidf, y_train
                        )

                    X_train_selected = X_train_tfidf[:, top_features]

                    model = MultinomialNB(alpha=alpha)
                    model.fit(X_train_selected, y_train)

                    cv_score = cross_validation(
                        model, X_train_selected, y_train, k_folds=5, random_state=42
                    )
                    avg_score = np.mean(cv_score)

                    if avg_score > best_score:
                        best_score = avg_score
                        best_params = {
                            "best_score": best_score,
                            "n_pct_features": n_pct_features,
                            "n_features": n_features,
                            "alpha": alpha,
                            "resample": resample,
                        }
                    progress_bar.update(1)
                except Exception as e:
                    print(e)
    progress_bar.close()
    return best_params

In [23]:
X = df['text'].astype(str)
y = df['label']

In [24]:
N_PCT_FEATURES_OPTIONS = np.arange(50, 101, 10) / 100 
ALPHA_OPTIONS = np.arange(0.0, 1.1, 0.25)
RESAMPLE_OPTIONS = np.arange(2)

In [25]:
options = {
    "n_pct_features": N_PCT_FEATURES_OPTIONS,
    "alpha": ALPHA_OPTIONS,
    "resample": RESAMPLE_OPTIONS,
    "total": len(N_PCT_FEATURES_OPTIONS) * len(ALPHA_OPTIONS) * len(RESAMPLE_OPTIONS),
}

In [26]:
X_train, X_test, y_train, y_test = split_data(
    X.to_numpy(), y.to_numpy(), test_size=0.2, random_state=42
)

In [27]:
Counter(y_train)

Counter({np.float64(4.0): 442,
         np.float64(2.0): 260,
         np.float64(3.0): 144,
         np.float64(0.0): 140,
         np.float64(1.0): 57,
         np.float64(5.0): 9})

In [29]:
# BEST_MODEL_CONFIG = find_best_model(X_train, y_train, options)
# BEST_MODEL_CONFIG

Searching for best hyperparameters: 100%|██████████| 60/60 [12:20<00:00, 12.35s/it]


{'best_score': np.float64(0.9091645969744864),
 'n_pct_features': np.float64(0.6),
 'n_features': 2037,
 'alpha': np.float64(0.25),
 'resample': np.int64(1)}

In [28]:
BEST_MODEL_CONFIG = {
    "best_score": np.float64(0.9091645969744864),
    "n_pct_features": np.float64(0.6),
    "n_features": 2037,
    "alpha": np.float64(0.25),
    "resample": np.int64(1),
}

### Best Model

In [29]:
tfidf = TFIDF()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [30]:
feature_score = chi_square(X_train_tfidf, y_train)
feature_score

array([3.93846216, 0.89511537, 0.25139719, ..., 1.12244143, 3.90949227,
       3.4917151 ])

In [31]:
n_pct_features = BEST_MODEL_CONFIG.get("n_pct_features")
n_features = int(n_pct_features * X_train_tfidf.shape[1])
top_features = np.argsort(feature_score)[::-1][:n_features]
n_features

2032

In [32]:
resample = BEST_MODEL_CONFIG.get("resample")
if resample:
    X_train_sampled, y_train_sampled = smote_resampling(X_train_tfidf, y_train)
else:
    X_train_sampled, y_train_sampled = X_train_tfidf, y_train

In [33]:
Counter(y_train_sampled)

Counter({np.float64(2.0): 442,
         np.float64(4.0): 442,
         np.float64(5.0): 441,
         np.float64(3.0): 432,
         np.float64(0.0): 420,
         np.float64(1.0): 399})

In [34]:
X_train_selected = X_train_sampled[:, top_features]
X_test_selected = X_test_tfidf[:, top_features]

In [35]:
mnb = MultinomialNB(alpha=BEST_MODEL_CONFIG.get("alpha"))
mnb.fit(X_train_selected, y_train_sampled)

In [36]:
y_train_pred = mnb.predict(X_train_selected)
print(classification_report(y_train_sampled, y_train_pred, digits=4, zero_division=0))

              precision    recall  f1-score   support

         0.0     0.9572    0.9595    0.9584       420
         1.0     0.9587    0.9900    0.9741       399
         2.0     0.9157    0.9344    0.9250       442
         3.0     0.9389    0.9606    0.9497       432
         4.0     0.9925    0.9027    0.9455       442
         5.0     0.9844    1.0000    0.9921       441

    accuracy                         0.9573      2576
   macro avg     0.9579    0.9579    0.9575      2576
weighted avg     0.9580    0.9573    0.9572      2576



In [37]:
Counter(y_test)

Counter({np.float64(4.0): 107,
         np.float64(2.0): 66,
         np.float64(3.0): 37,
         np.float64(0.0): 34,
         np.float64(1.0): 15,
         np.float64(5.0): 3})

In [38]:
y_test_pred = mnb.predict(X_test_selected)
print(
    classification_report(
        y_test,
        y_test_pred,
        digits=4,
        zero_division=0,
        target_names=idx_to_label.values(),
    )
)

              precision    recall  f1-score   support

alus singgih     0.3846    0.4412    0.4110        34
    alus sor     0.2400    0.4000    0.3000        15
  alus mider     0.5781    0.5606    0.5692        66
  alus madia     0.3556    0.4324    0.3902        37
  basa andap     0.9647    0.7664    0.8542       107
  basa kasar     0.5000    0.6667    0.5714         3

    accuracy                         0.6031       262
   macro avg     0.5038    0.5445    0.5160       262
weighted avg     0.6592    0.6031    0.6244       262



In [39]:
accuracy_score(y_test, y_test_pred)

0.6030534351145038

In [40]:
confusion_matrix(y_test, y_test_pred)

array([[15,  4, 10,  5,  0,  0],
       [ 2,  6,  3,  4,  0,  0],
       [ 7,  4, 37, 16,  2,  0],
       [ 5,  9,  6, 16,  1,  0],
       [10,  2,  7,  4, 82,  2],
       [ 0,  0,  1,  0,  0,  2]])

### Result Analysis

In [41]:
for observed in idx_to_label:
    for compared in idx_to_label:
        if observed == compared:
            continue
        else:
            o_list = (X_test[y_test == observed]).tolist()
            c_text = "\n".join(X_train[y_train == compared].tolist())
            res = []
            for sentence in o_list:
                word_split = sentence.split()
                total = len(word_split)
                count = 0

                for i in word_split:
                    if i in c_text:
                        count += 1
                res.append(count / total)
            print(
                f"{idx_to_label[observed]} vs {idx_to_label[compared]}:".ljust(30)
                + f"{np.mean(res)*100:.2f}"
            )
    print("====================================")

alus singgih vs alus sor:     68.36
alus singgih vs alus mider:   77.43
alus singgih vs alus madia:   71.87
alus singgih vs basa andap:   67.79
alus singgih vs basa kasar:   3.60
alus sor vs alus singgih:     76.60
alus sor vs alus mider:       84.52
alus sor vs alus madia:       81.15
alus sor vs basa andap:       73.54
alus sor vs basa kasar:       2.95
alus mider vs alus singgih:   71.36
alus mider vs alus sor:       66.37
alus mider vs alus madia:     74.66
alus mider vs basa andap:     73.09
alus mider vs basa kasar:     3.49
alus madia vs alus singgih:   71.99
alus madia vs alus sor:       67.39
alus madia vs alus mider:     78.16
alus madia vs basa andap:     71.24
alus madia vs basa kasar:     5.73
basa andap vs alus singgih:   49.59
basa andap vs alus sor:       40.48
basa andap vs alus mider:     57.78
basa andap vs alus madia:     51.97
basa andap vs basa kasar:     12.98
basa kasar vs alus singgih:   16.19
basa kasar vs alus sor:       19.89
basa kasar vs alus mider:     26

## Inference

In [42]:
pipeline = Pipeline(mnb, tfidf, top_features)

In [43]:
text = "mai be medaar malu"
text = stemmer.stem(text, correct_spelling=False)
idx_to_label[pipeline.predict(text)]

'basa andap'

In [44]:
pipeline.save("model/pipeline.joblib")

In [45]:
loaded_pipeline = joblib.load("model/pipeline.joblib")

In [46]:
text = "ratu durung polih mireng orti punika"
text = stemmer.stem(text, correct_spelling=False)
idx_to_label[pipeline.predict(text)]

'alus singgih'