In [None]:
import json
import glob
import pandas as pd
from collections import Counter

def merge_slang_files(paths):
    """Merge slang -> standard-word mappings from several JSON files.

    Each file is expected to contain a single JSON object. Keys and values
    are converted to ``str``, lower-cased and stripped before merging. When a
    normalized key shows up again, the later value overwrites the earlier one
    and, if the two values differ, the clash is recorded.

    A file that cannot be opened, decoded or parsed, or whose top-level JSON
    value is not an object, is reported on stdout and skipped, so one bad
    file does not abort the whole merge.

    Args:
        paths: Iterable of file paths, processed in the order given.

    Returns:
        A ``(merged, conflicts)`` tuple. ``merged`` maps each normalized
        slang word to its normalized standard form. ``conflicts`` is a list
        of dicts with the keys ``'slang'``, ``'existing'``, ``'new'`` and
        ``'file'``, one per clash.
    """
    merged = {}
    conflicts = []

    for path in paths:
        # Keep the try body limited to the I/O and parsing that can fail.
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            print(f"Error membaca file {path}: {e}")
            continue

        if not isinstance(data, dict):
            print(
                f"Error membaca file {path}: "
                f"isi JSON harus berupa objek, bukan {type(data).__name__}"
            )
            continue

        for slang, baku in data.items():
            # Light cleaning (lowercase & strip whitespace). Non-string
            # values are stringified as-is, e.g. JSON null becomes 'none'.
            slang_clean = str(slang).lower().strip()
            baku_clean = str(baku).lower().strip()

            # Record a clash when the key already maps to a different value.
            # This also fires for two keys in the same file that normalize
            # to the same string.
            if slang_clean in merged and merged[slang_clean] != baku_clean:
                conflicts.append({
                    'slang': slang_clean,
                    'existing': merged[slang_clean],
                    'new': baku_clean,
                    'file': path
                })

            # Last definition wins.
            merged[slang_clean] = baku_clean

    return merged, conflicts


file_pattern = 'dict/*.json'
# glob.glob returns paths in arbitrary order; sorting makes "last file wins"
# reproducible across runs and platforms.
json_files = sorted(glob.glob(file_pattern))

print(f"Ditemukan {len(json_files)} file JSON: {json_files}")

# NOTE(review): `duplicates` is collected here but is not reported anywhere
# in the cells visible in this notebook.
combined_dict, duplicates = merge_slang_files(json_files)

# One row per merged entry: the slang word and its standard ("baku") form.
kamus_rows = list(combined_dict.items())
df_kamus = pd.DataFrame(kamus_rows, columns=['slang', 'baku'])

# A. Self-loops: rows whose slang word equals its standard form.
#    These carry no information and should be dropped.
is_self_loop = df_kamus['slang'].eq(df_kamus['baku'])
self_loops = df_kamus.loc[is_self_loop]

# B. How many slang words point at each standard word. This table is meant
#    for spotting near-duplicate targets by eye (e.g. 'tidak', 'enggak',
#    'tak', 'gak' should all be unified as 'tidak').
target_counts = (
    df_kamus['baku']
    .value_counts()
    .rename_axis('kata_baku')
    .reset_index(name='frekuensi_slang')
)

# C. Long phrases: targets with more than two whitespace-separated words,
#    which tend to be definitions rather than replacement words.
words_per_target = df_kamus['baku'].str.split().str.len()
long_phrases = df_kamus.loc[words_per_target > 2]

# ==========================================
# 4. REPORTING
# ==========================================
# Print the validation summary: total entries, self-loops, long phrases,
# and the most frequent target words.
banner = "=" * 30
print("\n" + banner)
print("LAPORAN VALIDASI KAMUS")
print(banner)
print(f"Total kata slang terkumpul: {len(df_kamus)}")

# Self-loops: show the count and, if there are any, up to three examples.
n_self_loops = len(self_loops)
print(f"\n[WARNA KUNING] Self-Loops (Slang == Baku): {n_self_loops}")
if n_self_loops:
    print("Contoh: ", self_loops.head(3).values.tolist())
    print("-> Saran: Hapus baris ini.")

# Long phrases: same layout as the self-loop section.
n_long_phrases = len(long_phrases)
print(f"\n[BAHAYA] Frasa Panjang (>2 kata): {n_long_phrases}")
if n_long_phrases:
    print("Contoh: ", long_phrases.head(3).values.tolist())
    print("-> Saran: Cek manual, biasanya ini halusinasi AI.")

# The ten standard words that the most slang entries map to.
print("\n[CEK KONSISTENSI] Top 10 Kata Baku Paling Sering Jadi Target:")
print(target_counts.head(10))

Ditemukan 7 file JSON: ['dict\\1.json', 'dict\\2.json', 'dict\\3.json', 'dict\\4.json', 'dict\\5.json', 'dict\\6.json', 'dict\\7.json']

LAPORAN VALIDASI KAMUS
Total kata slang terkumpul: 507

[WARNA KUNING] Self-Loops (Slang == Baku): 74
Contoh:  [['sih', 'sih'], ['dong', 'dong'], ['deh', 'deh']]
-> Saran: Hapus baris ini.

[BAHAYA] Frasa Panjang (>2 kata): 2
Contoh:  [['diln', 'di luar negeri'], ['kaburajadlu', 'kabur saja dulu']]
-> Saran: Cek manual, biasanya ini halusinasi AI.

[CEK KONSISTENSI] Top 10 Kata Baku Paling Sering Jadi Target:
      kata_baku  frekuensi_slang
0         tidak               19
1          saja                8
2        sampai                5
3       seperti                5
4          saya                5
5         kalau                5
6        karena                5
7         sudah                5
8        hahaha                5
9  terima kasih                4


In [6]:
# Drop self-loops (slang word identical to its standard form).
df_clean = df_kamus[df_kamus['slang'] != df_kamus['baku']].copy()

print(f"Jumlah setelah hapus self-loop: {len(df_clean)} (Berkurang {len(df_kamus) - len(df_clean)} kata)")

# 3. Save the final JSON
# Convert to a plain dictionary of the form { 'slang': 'baku' }
kamus_final_dict = dict(zip(df_clean['slang'], df_clean['baku']))

output_json = 'kamus_baku.json'
# Write with an explicit encoding so the file does not depend on the platform
# locale; the source dictionaries are read as UTF-8 as well. json.dump keeps
# its default ensure_ascii=True, so the bytes written are unchanged.
with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(kamus_final_dict, f)

print(f"\nSUKSES! Kamus final disimpan di: {output_json}")
print("Siap digunakan untuk preprocessing.")

Jumlah setelah hapus self-loop: 433 (Berkurang 74 kata)

SUKSES! Kamus final disimpan di: kamus_baku.json
Siap digunakan untuk preprocessing.
