# Notebook: Uji Cepat Leksara
Notebook ini menguji library `leksara` end-to-end: cek lingkungan, instal, import, uji fungsi/kualitas output, pipeline kustom, I/O, benchmark, hingga pembersihan.

## Mengakses Modul `core`
Modul `leksara.core` menyediakan API tingkat lanjut:
- `chain` untuk membangun dan menjalankan pipeline secara OOP atau fungsional.
- `logging` untuk mengaktifkan pencatatan proses pipeline.
- `presets` untuk mengambil konfigurasi pipeline siap pakai (mis. `ecommerce_review`).

### Contoh `CartBoard`
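`leksara.frames.cartboard` menyediakan kelas `CartBoard` serta fungsi `get_flags`, `get_stats`, dan `noise_detect` untuk mendeteksi noise, menandai (flag) ulasan, dan menghitung statistik teks konsumen. Berikut contoh penggunaan singkatnya: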

In [16]:
import pandas as pd
from leksara.frames.cartboard import CartBoard, get_flags, get_stats, noise_detect

# Small demo frame: three reviews, keyed by review_id. The first one carries an
# e-mail address and a phone number, the second contains HTML markup.
sample_reviews = pd.DataFrame(
    dict(
        review_id=[101, 102, 103],
        text=[
            "Barangnya mantul!!! Email saya user@mail.id, kontak +62 812-3456-7890",
            "<p>Murah banget & dikirim <3</p> Harga 5/5 deh",
            "Produk oke tapi packing kurang rapi :(",
        ],
    )
)

# Single-text usage: wrap the first review (with an explicit rating) in a
# CartBoard and print its dictionary form.
board = CartBoard(sample_reviews.at[0, "text"], rating=4.8)
print("Single review summary:", board.to_dict(), sep="\n")

# Frame-level helpers: each call receives the whole frame and is told to read
# the "text" column.
flags = get_flags(sample_reviews, text_column="text")
stats = get_stats(sample_reviews, as_dict=False, text_column="text")
noise = noise_detect(sample_reviews, include_normalized=False, text_column="text")

# Show the three results in order; only a subset of the stats columns is shown.
display(
    flags,
    stats[["review_id", "length", "word_count", "noise_count", "emojis"]],
    noise,
)

Single review summary:
{'original_text': 'Barangnya mantul!!! Email saya user@mail.id, kontak +62 812-3456-7890', 'rating': 4.8, 'pii_flag': True, 'non_alphabetical_flag': True}


Unnamed: 0,review_id,text,rating_flag,pii_flag,non_alphabetical_flag
0,101,"Barangnya mantul!!! Email saya user@mail.id, k...",False,True,True
1,102,<p>Murah banget & dikirim <3</p> Harga 5/5 deh,True,False,True
2,103,Produk oke tapi packing kurang rapi :(,False,False,False


Unnamed: 0,review_id,length,word_count,noise_count,emojis
0,101,69,8,2,0
1,102,37,5,1,0
2,103,38,6,0,0


Unnamed: 0,review_id,text,detect_noise
0,101,"Barangnya mantul!!! Email saya user@mail.id, k...","{'urls': [], 'html_tags': [], 'emails': ['user..."
1,102,<p>Murah banget & dikirim <3</p> Harga 5/5 deh,"{'urls': [], 'html_tags': ['<p>', '<3</p>'], '..."
2,103,Produk oke tapi packing kurang rapi :(,"{'urls': [], 'html_tags': [], 'emails': [], 'p..."


In [17]:
import pandas as pd
from leksara import leksara
from leksara.function import (
    case_normal,
    remove_punctuation,
    remove_whitespace,
    replace_address,
    replace_email,
    replace_id,
    replace_phone,
    replace_url,
)

# Example frame: two chat messages containing a phone number, an e-mail
# address and an address mention.
df = pd.DataFrame(
    dict(
        chat_id=[1, 2],
        chat_message=[
            "Halo! Nomor saya 0812-3456-7890. Email: x@y.com, Alamat: Jakarta",
            "Hubungi +6281234567890 ya — EMAIL saya: test@mail.co.id! Alamat saya di Bandung",
        ],
    )
)

# Custom pipeline with PII cleaning. Every pattern step is paired with its own
# {"mode": "replace"} keyword dict; the "functions" entries are passed bare.
custom_pipeline = {
    "patterns": [
        (pii_step, {"mode": "replace"})
        for pii_step in (replace_phone, replace_email, replace_address, replace_id)
    ],
    "functions": [case_normal, replace_url, remove_punctuation, remove_whitespace],
}

# Run the pipeline over the chat_message column and store the result.
df["safe_message"] = leksara(df["chat_message"], pipeline=custom_pipeline)

# Show the result.
df[["chat_id", "safe_message"]]

Unnamed: 0,chat_id,safe_message
0,1,halo [ADDRESS] saya [PHONE_NUMBER] email [EMAI...
1,2,hubungi [PHONE_NUMBER] ya email saya [EMAIL] a...


In [18]:
from leksara import leksara

# Re-clean the same chat_message column, this time selecting the pipeline by
# preset name instead of passing an explicit pipeline dict. This overwrites
# the safe_message column written by the previous cell.
df["safe_message"] = leksara(df["chat_message"], preset="ecommerce_review")
df[["chat_id", "safe_message"]]

Unnamed: 0,chat_id,safe_message
0,1,halo [ADDRESS] [PHONE_NUMBER] email [EMAIL] al...
1,2,hubung [PHONE_NUMBER] ya email [EMAIL] alamat ...


In [19]:
# Apply leksara to the chat_message column with neither `pipeline` nor `preset`
# given, i.e. with whatever the library uses by default.
df["safe_message"] = leksara(df["chat_message"])

# Show the result.
df[["chat_id", "safe_message"]]

Unnamed: 0,chat_id,safe_message
0,1,halo nomor saya 081234567890 email xycom alama...
1,2,hubungi 6281234567890 ya email saya testmailco...


In [20]:
from leksara import leksara
from leksara.function import word_normalization

# Plain list input (not a pandas Series) for this example.
data = ["Produk Bagus sekali", "Saya membeli peralatan rumah tangga"]

# Pipeline with no pattern steps and a single function step:
# word_normalization, configured with a one-word list and mode "keep".
custom_pipeline = dict(
    patterns=[],
    functions=[
        (word_normalization, dict(word_list=["Bagus"], mode="keep")),
    ],
)

print(leksara(data, pipeline=custom_pipeline))


['produk Bagus sekali', 'saya beli alat rumah tangga']


In [21]:
from leksara import leksara
from leksara.function import (
    case_normal,
    remove_punctuation,
    remove_stopwords,
    remove_whitespace,
)

# --- example data ---
# NOTE: `pd` is not imported in this cell; it relies on the pandas import
# performed by an earlier cell.
df = pd.DataFrame(
    dict(
        chat_id=[1, 2, 3],
        chat_message=[
            "Saya sangat suka produk ini, dan saya akan beli lagi!",
            "Produk ini bagus sekali untuk dipakai di rumah.",
            "Namun, harga-nya agak mahal ya...",
        ],
    )
)

# --- simple pipeline ---
# Only a "functions" list is supplied; no "patterns" key is given.
custom_pipeline = dict(
    functions=[case_normal, remove_stopwords, remove_punctuation, remove_whitespace],
)

df["cleaned"] = leksara(df["chat_message"], pipeline=custom_pipeline)

# Print the raw and cleaned columns one after the other for comparison.
print("=== Data Asli ===", df[["chat_id", "chat_message"]], sep="\n")
print("\n=== Setelah Cleaning ===", df[["chat_id", "cleaned"]], sep="\n")

=== Data Asli ===
   chat_id                                       chat_message
0        1  Saya sangat suka produk ini, dan saya akan bel...
1        2    Produk ini bagus sekali untuk dipakai di rumah.
2        3                  Namun, harga-nya agak mahal ya...

=== Setelah Cleaning ===
   chat_id                     cleaned
0        1            suka produk beli
1        2  produk bagus dipakai rumah
2        3           harganya mahal ya


In [22]:
from leksara import ReviewChain
from leksara.core.presets import get_preset

# Fetch the ecommerce_review preset; its "patterns" and "functions" entries
# are read below.
pipeline_steps = get_preset("ecommerce_review")

# Build a ReviewChain from the preset's steps.
review_chain = ReviewChain.from_steps(
    functions=pipeline_steps["functions"],
    patterns=pipeline_steps["patterns"],
)

# Sample texts to process.
data = [
    "Produk baru saya: iphone12, harga 12 juta. Hubungi 0812-3456-7890.",
    "Email saya: test@example.com. Produk sangat berkualitas!",
]

# Run the texts through the chain.
processed_data = review_chain.transform(data)

# Print every input next to its processed form, with a blank line after each.
for original, processed in zip(data, processed_data):
    print(f"Original: {original}\nProcessed: {processed}\n")

# Second pass with benchmark=True: the call result is unpacked into the
# processed output and a metrics object, which are then displayed.
out, metrics = review_chain.transform(data, benchmark=True)
display(metrics, out)


Original: Produk baru saya: iphone12, harga 12 juta. Hubungi 0812-3456-7890.
Processed: produk iphone12 harga 12 juta hubung [PHONE_NUMBER]

Original: Email saya: test@example.com. Produk sangat berkualitas!
Processed: email [EMAIL] produk kualitas



{'n_steps': 15,
 'total_time_sec': 0.0003864000100293197,
 'per_step': [('word_normalization', 0.00012089999654563144),
  ('replace_phone', 4.049999552080408e-05),
  ('remove_stopwords', 3.91000066883862e-05),
  ('mask_whitelist', 3.5099998058285564e-05),
  ('remove_emoji', 3.130000550299883e-05),
  ('unmask_whitelist', 2.2000000171829015e-05),
  ('replace_address', 1.8300008377991617e-05),
  ('replace_url', 1.799999881768599e-05),
  ('remove_punctuation', 1.5300000086426735e-05),
  ('shorten_elongation', 1.1699994502123445e-05),
  ('replace_email', 1.0000003385357559e-05),
  ('replace_id', 9.399998816661537e-06),
  ('remove_whitespace', 8.100003469735384e-06),
  ('remove_tags', 4.499997885432094e-06),
  ('case_normal', 2.2000021999701858e-06)]}

['produk iphone12 harga 12 juta hubung [PHONE_NUMBER]',
 'email [EMAIL] produk kualitas']

In [23]:
# Print a 1-based numbered list of the chain's steps. Only the value of each
# (key, value) pair in named_steps is printed; the key is not used.
steps = review_chain.named_steps
for i, (k, v) in enumerate(steps.items(), start=1):
    print(i, v, sep=". ")

1. replace_phone
2. replace_email
3. replace_address
4. replace_id
5. mask_whitelist
6. remove_tags
7. case_normal
8. replace_url
9. remove_emoji
10. replace_rating
11. expand_contraction
12. normalize_slangs
13. replace_acronym
14. word_normalization
15. remove_stopwords
16. shorten_elongation
17. remove_punctuation
18. remove_whitespace
19. unmask_whitelist
