In [1]:
from notebook import TFIDF, MultinomialNB, chi_square, SMOTE
from linggapy import Stemmer

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
np.random.seed(42)

In [3]:
df = pd.read_excel(f"https://docs.google.com/spreadsheets/d/1dCSj3772FI7B6i1ioD1Nq5w6siZuAj1p7i4FizHTqcY/export?gid=0&format=xlsx", 'Sheet1')

In [4]:
df

Unnamed: 0,No.,Teks Postingan / Comment,Tingkatan Bahasa/Sor Singgih
0,1,Ngaad artosne ngadu ang atau ngomong janji,Alus Sor
1,2,Arti sujati Ngad. Arti paribasa Ngaduang. Suks...,Alis Sor
2,3,Tiuk tiying arti sujati ngad .. metiiuk tiying...,Alus Sor
3,4,Sira sane seneng ngalawar semeton ? komen nggi...,Alus Singgih
4,5,Jegeg pisan ane me satwe i belong.😂😍,
...,...,...,...
333,334,"Tata Cara Nyurat I Kara, elingang save & share...",Alus Singgih
334,335,Yening madaging ba kembang sareng ta latik pra...,Alus SInggih
335,336,Napi murda sesuratan puniki? 🤭😂😂\n #aksara #ak...,Alus Singgih
336,337,Wenten sane pateh? wkwkwk \n elingang like & s...,Alus Singgih


In [5]:
df = df.drop(columns=['No.'])
df = df.rename(columns={"Teks Postingan / Comment": "text", "Tingkatan Bahasa/Sor Singgih": "label"})

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    338 non-null    object
 1   label   334 non-null    object
dtypes: object(2)
memory usage: 5.4+ KB


In [7]:
df['label'].value_counts()

label
Alus Singgih                                                      146
Basa Andap                                                        123
Alus Sor                                                           34
Alus Madia                                                         10
-                                                                   4
Basa Kasar                                                          4
Alus Mider                                                          3
Bahasa Pengantar : Alus Singgih. Bahasa isi satua : Basa Andap      1
Alis Sor                                                            1
Alis Singgih                                                        1
Alus singgih                                                        1
Basa                                                                1
Bassa Andap                                                         1
Basa Kasar Jabag                                                    1
Mider         

In [8]:
label_mapping = {
    "Alus Singgih": "alus singgih",
    "Basa Andap": "basa andap",
    "Alus Sor": "alus sor",
    "Alus Madia": "alus madia",
    "Basa Kasar": "basa kasar",
    "Alus Mider": "alus mider",
    "Alus singgih": "alus singgih",
    "Alis Singgih": "alus singgih",
    "Alis Sor": "alus sor",
    "Bassa Andap": "basa andap",
    "Basa Kasar Jabag": "basa kasar",
    "Mider": "alus mider",
    "Alus Singih": "alus singgih",
    "Alus SInggih": "alus singgih",
}

In [9]:
df['label'] = df['label'].map(label_mapping)
df['label'].value_counts()

label
alus singgih    150
basa andap      124
alus sor         35
alus madia       10
basa kasar        5
alus mider        4
Name: count, dtype: int64

In [10]:
df['label'].value_counts() * 100 / len(df)

label
alus singgih    44.378698
basa andap      36.686391
alus sor        10.355030
alus madia       2.958580
basa kasar       1.479290
alus mider       1.183432
Name: count, dtype: float64

In [11]:
label_to_idx = {
    "alus singgih": 0,
    "alus sor": 1,
    "alus mider": 2,
    "alus madia": 3,
    "basa andap": 4,
    "basa kasar": 5,
}
idx_to_label = {v: k for k, v in label_to_idx.items()}

In [12]:
df["label"] = df["label"].map(
    label_to_idx
)

In [13]:
df

Unnamed: 0,text,label
0,Ngaad artosne ngadu ang atau ngomong janji,1.0
1,Arti sujati Ngad. Arti paribasa Ngaduang. Suks...,1.0
2,Tiuk tiying arti sujati ngad .. metiiuk tiying...,1.0
3,Sira sane seneng ngalawar semeton ? komen nggi...,0.0
4,Jegeg pisan ane me satwe i belong.😂😍,
...,...,...
333,"Tata Cara Nyurat I Kara, elingang save & share...",0.0
334,Yening madaging ba kembang sareng ta latik pra...,0.0
335,Napi murda sesuratan puniki? 🤭😂😂\n #aksara #ak...,0.0
336,Wenten sane pateh? wkwkwk \n elingang like & s...,0.0


In [14]:
stemmer = Stemmer()

In [15]:
df['text'] = df['text'].apply(lambda x: stemmer.stem(x, correct_spelling=False))

In [16]:
df

Unnamed: 0,text,label
0,aad artos adu ang tau omong janji,1.0
1,arti sujati ngad arti paribasa adu suksema,1.0
2,tiuk tiying arti sujati ngad metiiuk tiying ki...,1.0
3,sira sane seneng lawar semeton komen nggih bal...,0.0
4,jegeg pisan ane me satwe i belong,
...,...,...
333,tata cara urat i kara eling save share nggih a...,0.0
334,yening daging ba kembang sareng ta latik prasi...,0.0
335,napi murda sesuratan puniki aksara aksarabali,0.0
336,wenten sane pateh wkwkwk eling like share nggi...,0.0


In [17]:
df = df.dropna(subset=['label'])

In [18]:
df

Unnamed: 0,text,label
0,aad artos adu ang tau omong janji,1.0
1,arti sujati ngad arti paribasa adu suksema,1.0
2,tiuk tiying arti sujati ngad metiiuk tiying ki...,1.0
3,sira sane seneng lawar semeton komen nggih bal...,0.0
5,satua pan balang tamak siki nggih,0.0
...,...,...
333,tata cara urat i kara eling save share nggih a...,0.0
334,yening daging ba kembang sareng ta latik prasi...,0.0
335,napi murda sesuratan puniki aksara aksarabali,0.0
336,wenten sane pateh wkwkwk eling like share nggi...,0.0


In [19]:
X = df['text'].astype(str)
y = df['label']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [21]:
tfidf = TFIDF()
X_train_tfidf = tfidf.fit_transform(X_train.to_list())
X_test_tfidf = tfidf.transform(X_test.to_list())

In [22]:
feature_score = chi_square(X_train_tfidf, y_train.to_numpy())
feature_score

array([3.1485421 , 0.04354622, 2.03349044, ..., 0.31150804, 0.67075282,
       0.53116423])

In [23]:
n_features = 500
top_features = np.argsort(feature_score)[::-1][:n_features]

In [24]:
smote = SMOTE()
X_train_sampled, y_train_sampled = smote.fit_resample(X_train_tfidf, y_train.to_numpy(), 1, N=200)

In [25]:
X_train_selected = X_train_sampled[:, top_features]
X_test_selected = X_test_tfidf[:, top_features]

In [26]:
mnb = MultinomialNB()
mnb.fit(X_train_selected, y_train_sampled)

In [27]:
y_train_pred = mnb.predict(X_train_selected)
print(classification_report(y_train_sampled, y_train_pred, digits=4, zero_division=0))

              precision    recall  f1-score   support

         0.0     0.8248    1.0000    0.9040       113
         1.0     0.9663    0.9885    0.9773        87
         2.0     0.0000    0.0000    0.0000         3
         3.0     0.0000    0.0000    0.0000         9
         4.0     0.9468    0.8641    0.9036       103
         5.0     0.0000    0.0000    0.0000         5

    accuracy                         0.9000       320
   macro avg     0.4563    0.4754    0.4641       320
weighted avg     0.8587    0.9000    0.8758       320



In [28]:
y_test_pred = mnb.predict(X_test_selected)
print(classification_report(y_test, y_test_pred, digits=4, zero_division=0))

              precision    recall  f1-score   support

         0.0     0.7727    0.9189    0.8395        37
         1.0     0.5000    0.5000    0.5000         6
         2.0     0.0000    0.0000    0.0000         1
         3.0     0.0000    0.0000    0.0000         1
         4.0     0.9375    0.7143    0.8108        21

    accuracy                         0.7879        66
   macro avg     0.4420    0.4266    0.4301        66
weighted avg     0.7769    0.7879    0.7741        66



In [29]:
text = "rahajeng mewali bli, rahayu rahayu nggih"
text = stemmer.stem(text, correct_spelling=False)
prediction = mnb.predict(tfidf.transform([text])[:, top_features])
idx_to_label[prediction[0]]

'alus singgih'