# Import

In [79]:
import numpy as np
import pandas as pd
import random
import re

# Library Bantuan
from tqdm import tqdm
import warnings
import pickle
warnings.filterwarnings('ignore')


# Library Sendiri
from libraryBantuan.katla import Katla

# Constants
# Directory containing the word-list CSV and the saved tree. File names are
# appended with plain string concatenation, so the value must end with a
# path separator.
PATH_FILE_TEXT = "D:\\Rama Nitip\\python\\katla\\file text\\"

# Baca Data

In [17]:
# Load the word list: the CSV has no header row, so the words are in column 0.
daftar_kata = pd.read_csv(
    PATH_FILE_TEXT + 'lima huruf katla.csv',
    header=None,
)[0].tolist()
print(f"Banyak Kata : {len(daftar_kata)}")

Banyak Kata : 8314


# Class Node

In [61]:
class Node:
    """One node of the decision tree.

    Stores the word to guess at this node, the candidate words that are
    still possible here, and the child nodes keyed by feedback pattern.
    """

    def __init__(self, kata, daftar_kata):
        # Guess associated with this node and the candidates it was chosen from.
        self.kata, self.daftar_kata = kata, daftar_kata
        # Children: feedback pattern (str) -> Node. Starts empty.
        self.next_node = dict()


# Class Decision Tree

In [84]:
class DecisionTree():
    """Decision tree of guesses for a five-letter word game, keyed by feedback.

    A feedback pattern is a five-character string with one symbol per letter
    of the guess. As interpreted by :meth:`kandidat`:

    * ``'!'`` - the answer has this letter at this position,
    * ``'?'`` - the answer contains this letter, but not at this position,
    * ``'*'`` - the answer does not have this letter at this position, and
      (unless the same letter is also marked ``'?'``) the letter may not
      appear at any position that is not marked ``'!'``.

    Every node holds the word to guess; its children are indexed by the
    pattern that guess received.
    """

    def __init__(self):
        # All 3**5 possible feedback patterns, computed once.
        self.semua_pola = self.__cari_semua_kemungkinan_pola()
        # Root node of the tree; None until buat_tree() is called.
        self.root = None
        # Cursor used by kata_berikutnya() to walk the tree.
        self.now = self.root

    def kata_berikutnya(self, pola, awal=False):
        """Advance the cursor along ``pola`` and return the word to guess next.

        Args:
            pola: feedback pattern received for the word at the cursor.
            awal: when true, the cursor is moved back to the root before
                ``pola`` is applied.

        Returns:
            The word stored at the node reached. If the current node has no
            child for ``pola`` the cursor does not move and the current
            node's word is returned again.

        Raises:
            AttributeError: if no tree has been built yet (``root`` is None).
        """
        # Fall back to the root when the cursor was never positioned, so the
        # first call works without awal=True.
        if awal or self.now is None:
            self.now = self.root
        if pola in self.now.next_node:
            self.now = self.now.next_node[pola]
        return self.now.kata

    def buat_tree(self, kata, daftar_kata):
        """Build the whole tree with ``kata`` as first guess over ``daftar_kata``."""
        self.root = self.__buat_tree(kata, daftar_kata, 0)
        # Reset the cursor: it would otherwise stay None or keep pointing
        # into a previously built tree.
        self.now = self.root

    def __buat_tree(self, kata, daftar_kata, depth):
        """Recursively build the subtree whose guess is ``kata``.

        Expansion stops at depth 6 (the root is depth 0) or when at most one
        candidate word is left.
        """
        node = Node(kata, daftar_kata)
        if (depth == 6) or (len(daftar_kata) <= 1):  # BASE CASE
            return node

        for pola in self.semua_pola:
            next_daftar_kata = self.kandidat(kata, pola, daftar_kata)
            if not next_daftar_kata:
                # No candidate is consistent with this pattern: no child.
                continue
            next_kata = self.kata_terbaik(next_daftar_kata)
            node.next_node[pola] = self.__buat_tree(next_kata, next_daftar_kata, depth + 1)
        return node

    def kandidat(self, tebakan, pola, daf_kata):
        """Return the words of ``daf_kata`` consistent with ``pola`` for ``tebakan``.

        Args:
            tebakan: the guessed five-letter word.
            pola: five-character feedback pattern made of ``'!'``, ``'?'``
                and ``'*'``.
            daf_kata: candidate words. They are joined with spaces and
                scanned with a regular expression, so they must not contain
                spaces.

        Returns:
            A new list with the matching words, in their original order.
        """
        huruf_tanda_tanya = [tebakan[i] for i in range(5) if pola[i] == '?']
        # Letters marked '*' that are not also marked '?' somewhere else:
        # these are barred from every position that is not fixed by '!'.
        huruf_bintang = ''.join(
            tebakan[i] for i in range(5)
            if pola[i] == '*' and tebakan[i] not in huruf_tanda_tanya
        )

        # Build one regex element per position. The space inside each
        # negated class keeps a match from running across the separator
        # between two words.
        pola_regex = [r"\w"] * 5
        for i, p in enumerate(pola):
            huruf = tebakan[i]
            if p == '!':
                pola_regex[i] = re.escape(huruf)
            elif p in ('?', '*'):
                # The guessed letter itself can never be at a position that
                # was not marked '!'. This also holds for a '*' whose letter
                # is marked '?' elsewhere, which is why `huruf` is always
                # part of the excluded set.
                pola_regex[i] = f"[^ {re.escape(huruf + huruf_bintang)}]"

        cocok = re.findall("".join(pola_regex), ' '.join(daf_kata))

        # Keep only words that contain every '?' letter among their
        # non-'!' positions; a letter marked '?' twice must occur twice.
        new_daf_kata = []
        for kata in cocok:
            sisa_huruf = [kata[i] for i in range(5) if pola[i] != '!']
            for harus_ada_huruf in huruf_tanda_tanya:
                if harus_ada_huruf not in sisa_huruf:
                    break
                sisa_huruf.remove(harus_ada_huruf)
            else:
                new_daf_kata.append(kata)
        return new_daf_kata

    def kata_terbaik(self, daftar_kata):
        """Return the word of ``daftar_kata`` with the highest information gain.

        For each word, the candidates are grouped by every feedback pattern
        and the impurity ``(n - 1) / n`` of each group of size ``n`` is
        weighted by ``n / len(daftar_kata)``. The gain is the impurity of the
        whole list minus that weighted sum. Ties go to the earliest word.

        Returns ``"TIDAK ADA"`` when ``daftar_kata`` is empty.

        Note: this calls kandidat() once per word and per pattern.
        """
        def P(n):
            return n / len(daftar_kata)

        def impurity(n):
            return ((n - 1) / n) if n != 0 else 0

        hasil = ("TIDAK ADA", -10000)
        impurity_awal = impurity(len(daftar_kata))
        for kata in daftar_kata:
            impurity_akhir = 0
            for pola in self.semua_pola:
                n = len(self.kandidat(kata, pola, daftar_kata))
                impurity_akhir += P(n) * impurity(n)

            information_gain = impurity_awal - impurity_akhir
            if information_gain > hasil[1]:
                hasil = (kata, information_gain)
        return hasil[0]

    def __cari_semua_kemungkinan_pola(self):
        """Return all 3**5 feedback patterns, ordered as base-3 numbers.

        Digit 0 maps to ``'*'``, 1 to ``'?'`` and 2 to ``'!'``.
        """
        T = {'0': '*', '1': '?', '2': '!'}
        list_pola = []
        for number in range(3**5):
            # Base-3 representation, left-padded with zeros to five digits.
            ternary = np.base_repr(number, base=3).zfill(5)
            list_pola.append(''.join(T[digit] for digit in ternary))
        return list_pola

    def buat_dictionary(self, now):
        """Convert the subtree rooted at ``now`` into nested dictionaries.

        Each node becomes a dictionary with the keys ``'kata'``,
        ``'daftar kata'`` and ``'next'`` (pattern -> converted child).
        """
        return {
            'kata'       : now.kata,
            'daftar kata': now.daftar_kata,
            'next'       : {k: self.buat_dictionary(v) for k, v in now.next_node.items()},
        }

    def simpan_object(self, path):
        """Pickle the tree, converted to nested dictionaries, to ``path``."""
        # Convert before opening the file so that a failure here does not
        # leave an empty or truncated file behind.
        object_dict = self.buat_dictionary(self.root)
        with open(path, 'wb') as file:
            pickle.dump(object_dict, file)

    def baca_object(self, path):
        """Load and return the dictionary stored by simpan_object().

        The tree held by this instance is not modified.

        SECURITY: unpickling can execute arbitrary code; only load files
        that come from a trusted source.
        """
        with open(path, 'rb') as file:
            return pickle.load(file)

# Buat Sample

In [76]:
# Draw the sample WITHOUT replacement. random.choices() samples with
# replacement, so the same word could land in the sample several times.
# min() keeps the call valid if the list has fewer than 1000 entries.
df_sample = random.sample(daftar_kata, k=min(1000, len(daftar_kata)))
katla = Katla(df_sample)
# NOTE(review): presumably returns the suggested opening word for the
# sample; confirm against libraryBantuan.katla.
katla.cari_kata_selanjutnya(1)

100%|██████████| 1000/1000 [01:17<00:00, 12.94it/s]


'sarik'

In [85]:
# Create an empty decision tree; the constructor precomputes all 3**5
# feedback patterns. No tree exists until buat_tree() is called.
DT = DecisionTree()

In [86]:
# Build the tree over the sampled words with "sarik" as the first guess
# (the word shown as the result of the sampling cell).
DT.buat_tree("sarik", df_sample)

# Lihat isi Dictionary

In [78]:
from pprint import pprint

# Convert the tree to nested dictionaries once and print that result,
# instead of walking the whole tree a second time just for printing.
simpan_DT = DT.buat_dictionary(DT.root)

pprint(simpan_DT)

{'daftar kata': ['katah',
                 'kenek',
                 'lupuk',
                 'lekuk',
                 'sekap',
                 'fluor',
                 'imbak',
                 'gulat',
                 'ranai',
                 'rizki',
                 'pujer',
                 'kunut',
                 'khaul',
                 'patos',
                 'rakun',
                 'caduk',
                 'kecai',
                 'lotak',
                 'bidak',
                 'sukur',
                 'kasap',
                 'bulus',
                 'antul',
                 'polok',
                 'fitur',
                 'fetis',
                 'sawar',
                 'gemit',
                 'bindi',
                 'skafa',
                 'pensi',
                 'sepet',
                 'taiko',
                 'karar',
                 'solid',
                 'umrat',
                 'ceret',
                 'kolah',
            

# Simpan Object Decision Tree

In [87]:
# Pickle the tree (converted to nested dictionaries) to the file
# "DT.dictionary" inside PATH_FILE_TEXT.
DT.simpan_object(PATH_FILE_TEXT+"DT.dictionary")