In [1]:
import pandas as pd
from fuzzywuzzy import process

# Load dataset
data = pd.read_csv('kokdata2.csv')
# Strip stray whitespace around header names so the 'English' / 'Kokborok'
# lookups below match.
data.columns = data.columns.str.strip()
# A blank cell is read as NaN; such rows would put non-string keys or values
# into the lookup table, which breaks fuzzy matching and ' '.join() later on.
data = data.dropna(subset=['English', 'Kokborok'])
# Lookup table: trimmed, lower-cased English word/phrase -> Kokborok text.
translation_dict = dict(zip(data['English'].str.strip().str.lower(), data['Kokborok']))

def translate_to_kokborok(sentence):
    """Translate an English sentence to Kokborok by dictionary lookup.

    The sentence is lower-cased and split on whitespace. At each position a
    two-word phrase is tried first, then the single word, then a fuzzy match
    of the single word against the dictionary keys (accepted only when its
    score is above 90). Words that still cannot be translated are emitted
    wrapped in square brackets.

    Args:
        sentence: English text to translate.

    Returns:
        The translated pieces joined by single spaces ('' for blank input).
    """
    words = sentence.lower().split()  # Tokenize by space
    translated_sentence = []
    # Only string keys are valid fuzzy-match candidates; a non-string key
    # (e.g. NaN from a blank CSV cell) must not be handed to the matcher.
    english_phrases = [key for key in translation_dict if isinstance(key, str)]

    i = 0
    while i < len(words):
        # Match 2-word phrase
        if i + 1 < len(words):
            two_word = f"{words[i]} {words[i+1]}"
            if two_word in translation_dict:
                translated_sentence.append(translation_dict[two_word])
                i += 2
                continue

        # Match single word exactly
        if words[i] in translation_dict:
            translated_sentence.append(translation_dict[words[i]])
        else:
            # Optional: fuzzy fallback for single word typos.
            # extractOne yields None when there is nothing to choose from, so
            # the result is only indexed once we know a candidate was found.
            best = process.extractOne(words[i], english_phrases) if english_phrases else None
            if best is not None and best[1] > 90:
                translated_sentence.append(translation_dict[best[0]])
            else:
                translated_sentence.append(f"[{words[i]}]")  # mark untranslated
        i += 1

    return ' '.join(translated_sentence)

# Continuous input loop: keeps prompting until the user types 'exit'.
if __name__ == "__main__":
    print("Type 'exit' to quit the translator.")
    while True:
        try:
            user_input = input("\nEnter an English sentence: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session cleanly
            # instead of surfacing a traceback.
            print("Goodbye! 👋")
            break
        # Ignore surrounding whitespace so " exit " also quits.
        if user_input.strip().lower() == "exit":
            print("Goodbye! 👋")
            break
        kokborok_translation = translate_to_kokborok(user_input)
        print(f"Kokborok translation: {kokborok_translation}")







Type 'exit' to quit the translator.



Enter an English sentence:  hello


Kokborok translation: Khulum



Enter an English sentence:  hello thank you


Kokborok translation: Khulum Bathwngthai



Enter an English sentence:  exit


Goodbye! 👋


In [2]:
import pandas as pd
import re
from fuzzywuzzy import process

# Load dataset
data = pd.read_csv('kokdata2.csv')
data.columns = data.columns.str.strip()
# A blank cell is read as NaN; drop incomplete rows so the lookup table only
# ever holds string keys and string translations.
data = data.dropna(subset=['English', 'Kokborok'])
# Lookup table: trimmed, lower-cased English word/phrase -> Kokborok text.
translation_dict = dict(zip(data['English'].str.strip().str.lower(), data['Kokborok']))

# Parameters
# Minimum fuzzy-match score (compared with >=) for accepting a fuzzy match.
FUZZY_THRESHOLD = 90

# Preprocessing function
def tokenize(text):
    """Split text into lower-case tokens.

    Each of the marks . , ! ? becomes a token of its own; everything else is
    split on whitespace.
    """
    # Pad every punctuation mark with spaces so it separates from its word.
    padded = re.sub(r'([.,!?])', r' \1 ', text.lower())
    # Squeeze the whitespace runs the padding may have introduced.
    squeezed = re.sub(r'\s{2,}', ' ', padded)
    return squeezed.strip().split()

# Translator function
def translate_to_kokborok(sentence):
    """Translate an English sentence to Kokborok by dictionary lookup.

    The sentence is tokenized with `tokenize`. At each position the longest
    dictionary phrase wins: a three-word phrase is tried first, then a
    two-word phrase, then the single word. A single word that has no exact
    entry is fuzzy-matched against the dictionary keys and accepted when the
    score reaches FUZZY_THRESHOLD; otherwise it is emitted in square brackets.

    Args:
        sentence: English text to translate.

    Returns:
        The translated pieces joined by single spaces ('' for blank input).
    """
    words = tokenize(sentence)
    translated_sentence = []
    # Only string keys are valid fuzzy-match candidates; a non-string key
    # (e.g. NaN from a blank CSV cell) must not be handed to the matcher.
    english_phrases = [key for key in translation_dict if isinstance(key, str)]

    i = 0
    while i < len(words):
        # Try trigrams, then bigrams.
        for size in (3, 2):
            if i + size > len(words):
                continue  # not enough words left for a phrase of this size
            phrase = ' '.join(words[i:i + size])
            if phrase in translation_dict:
                translated_sentence.append(translation_dict[phrase])
                i += size
                break
        else:
            # No phrase matched: fall back to the single word.
            word = words[i]
            if word in translation_dict:
                translated_sentence.append(translation_dict[word])
            else:
                # extractOne yields None when there is nothing to choose
                # from, so only index the result once a candidate exists.
                best = process.extractOne(word, english_phrases) if english_phrases else None
                if best is not None and best[1] >= FUZZY_THRESHOLD:
                    translated_sentence.append(translation_dict[best[0]])
                else:
                    translated_sentence.append(f"[{word}]")  # untranslated
            i += 1

    return ' '.join(translated_sentence)

# Continuous input loop: keeps prompting until the user types 'exit'.
if __name__ == "__main__":
    print("Type 'exit' to quit the translator.")
    while True:
        try:
            user_input = input("\nEnter an English sentence: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session cleanly
            # instead of surfacing a traceback.
            print("Goodbye! 👋")
            break
        # Ignore surrounding whitespace so " exit " also quits.
        if user_input.strip().lower() == "exit":
            print("Goodbye! 👋")
            break
        kokborok_translation = translate_to_kokborok(user_input)
        print(f"Kokborok translation: {kokborok_translation}")


Type 'exit' to quit the translator.



Enter an English sentence:  hello


Kokborok translation: Khulum



Enter an English sentence:  hello?




Kokborok translation: Khulum [?]



Enter an English sentence:  exit


Goodbye! 👋


In [3]:
import pandas as pd
import re
from fuzzywuzzy import process
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import numpy as np

# Load dataset
data = pd.read_csv('kokdata2.csv')
data.columns = data.columns.str.strip()
# A blank cell is read as NaN; drop incomplete rows so the lookup table and
# the tokenizer inputs derived from `data` only ever see text.
data = data.dropna(subset=['English', 'Kokborok'])
# Lookup table: trimmed, lower-cased English word/phrase -> Kokborok text.
translation_dict = dict(zip(data['English'].str.strip().str.lower(), data['Kokborok']))

# Parameters
# Minimum fuzzy-match score (compared with >=) for accepting a fuzzy match.
FUZZY_THRESHOLD = 90
# Passed as `maxlen` when padding the model's input sequence.
MAX_SEQUENCE_LENGTH = 20

# Build the tokenizers. NOTE: nothing is loaded from disk here - both
# tokenizers are re-fitted on the dataset, so their word indices presumably
# only line up with the saved model if it was trained with tokenizers fitted
# the same way on the same data (TODO confirm against the training code).
tokenizer_eng = Tokenizer()
tokenizer_kok = Tokenizer()

# Fit tokenizer on dataset. Rows with a blank cell carry NaN instead of text,
# so they are left out; dropping whole rows keeps both lists aligned.
complete_rows = data.dropna(subset=['English', 'Kokborok'])
all_eng = complete_rows['English'].str.lower().tolist()
all_kok = complete_rows['Kokborok'].str.lower().tolist()
tokenizer_eng.fit_on_texts(all_eng)
tokenizer_kok.fit_on_texts(all_kok)

# Load pre-trained model (you must have this saved from training phase)
try:
    model = load_model('kokborok_seq2seq_model.h5')
except Exception as exc:  # not a bare except: Ctrl-C / SystemExit must still propagate
    # The dictionary/fuzzy path works without the model, so degrade
    # gracefully, but report the real reason rather than assuming "not found".
    model = None
    print(f"Warning: Deep learning model could not be loaded ({exc}); "
          "only dictionary/fuzzy matching will be used.")

# Preprocessing function
def tokenize(text):
    """Return the lower-cased tokens of `text`.

    The marks . , ! ? are split off as separate tokens; the rest is divided
    on whitespace.
    """
    lowered = text.lower()
    # Surround punctuation with spaces, then collapse repeated whitespace.
    spaced_out = re.sub(r'\s{2,}', ' ', re.sub(r'([.,!?])', r' \1 ', lowered))
    return spaced_out.strip().split()

# Rule-based Translator
def translate_with_dictionary(words):
    """Translate a token list with the dictionary, longest phrase first.

    At each position a three-word phrase is tried, then a two-word phrase,
    then the single word. A single word without an exact entry is
    fuzzy-matched against the dictionary keys and accepted when the score
    reaches FUZZY_THRESHOLD; otherwise the placeholder "[UNK]" is emitted.

    Args:
        words: list of lower-case tokens (as produced by `tokenize`).

    Returns:
        List of translated pieces, one per matched phrase/word.
    """
    translated_sentence = []
    # Only string keys are valid fuzzy-match candidates; a non-string key
    # (e.g. NaN from a blank CSV cell) must not be handed to the matcher.
    english_phrases = [key for key in translation_dict if isinstance(key, str)]

    i = 0
    while i < len(words):
        matched = False

        # Trigram
        if i + 2 < len(words):
            trigram = f"{words[i]} {words[i+1]} {words[i+2]}"
            if trigram in translation_dict:
                translated_sentence.append(translation_dict[trigram])
                i += 3
                matched = True

        # Bigram
        if not matched and i + 1 < len(words):
            bigram = f"{words[i]} {words[i+1]}"
            if bigram in translation_dict:
                translated_sentence.append(translation_dict[bigram])
                i += 2
                matched = True

        # Single word: exact entry, then fuzzy match, then placeholder
        if not matched:
            word = words[i]
            if word in translation_dict:
                translated_sentence.append(translation_dict[word])
            else:
                # extractOne yields None when there is nothing to choose
                # from, so only index the result once a candidate exists.
                best = process.extractOne(word, english_phrases) if english_phrases else None
                if best is not None and best[1] >= FUZZY_THRESHOLD:
                    translated_sentence.append(translation_dict[best[0]])
                else:
                    translated_sentence.append("[UNK]")
            i += 1

    return translated_sentence

# Deep Learning Translator
def translate_with_model(sentence):
    """Translate `sentence` with the loaded neural model.

    The sentence is turned into an index sequence by `tokenizer_eng`,
    post-padded to MAX_SEQUENCE_LENGTH and passed to `model.predict`; the
    arg-max over the last axis of the prediction is mapped back to text by
    `tokenizer_kok`.

    Args:
        sentence: English text to translate.

    Returns:
        The decoded text for the first (only) sequence in the batch.

    Raises:
        RuntimeError: if no model is available (`model` is None).
    """
    if model is None:
        # Fail with a clear message instead of "'NoneType' has no attribute 'predict'".
        raise RuntimeError("Deep learning model is not loaded; cannot translate with the model.")

    seq = tokenizer_eng.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

    # verbose=0 keeps the per-call progress bar out of the interactive session.
    prediction = model.predict(seq, verbose=0)
    # assumes the model emits per-position scores over the Kokborok vocabulary - TODO confirm
    output_seq = np.argmax(prediction, axis=-1)
    translated_tokens = tokenizer_kok.sequences_to_texts(output_seq)
    return translated_tokens[0]

# Main translator
def hybrid_translate(sentence):
    """Translate with the dictionary, deferring to the model when it struggles.

    The dictionary result is used unless a model is available and more than
    30% of the tokens came back as "[UNK]"; in that case the whole sentence is
    handed to `translate_with_model`.
    """
    tokens = tokenize(sentence)
    pieces = translate_with_dictionary(tokens)
    unknown_total = pieces.count("[UNK]")

    # Dictionary output is good enough (or no model to fall back on).
    if not model or unknown_total <= len(tokens) * 0.3:
        return ' '.join(pieces)

    print("[INFO] Fallback to deep learning model.")
    return translate_with_model(sentence)

# Continuous input loop: keeps prompting until the user types 'exit'.
if __name__ == "__main__":
    print("Type 'exit' to quit the translator.")
    while True:
        try:
            user_input = input("\nEnter an English sentence: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session cleanly
            # instead of surfacing a traceback.
            print("Goodbye! 👋")
            break
        # Ignore surrounding whitespace so " exit " also quits.
        if user_input.strip().lower() == "exit":
            print("Goodbye! 👋")
            break
        kokborok_translation = hybrid_translate(user_input)
        print(f"Kokborok translation: {kokborok_translation}")


Type 'exit' to quit the translator.



Enter an English sentence:  hello


Kokborok translation: Khulum



Enter an English sentence:  hello?




Kokborok translation: Khulum [UNK]



Enter an English sentence:  exit


Goodbye! 👋


In [None]:
import pandas as pd
import re
from fuzzywuzzy import process
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import numpy as np

# Load dataset
data = pd.read_csv('kokdata2.csv')
data.columns = data.columns.str.strip()
# A blank cell is read as NaN; drop incomplete rows so the lookup table and
# the tokenizer inputs derived from `data` only ever see text.
data = data.dropna(subset=['English', 'Kokborok'])
# Lookup table: trimmed, lower-cased English word/phrase -> Kokborok text.
translation_dict = dict(zip(data['English'].str.strip().str.lower(), data['Kokborok']))

# Parameters
# Minimum fuzzy-match score (compared with >=) for accepting a fuzzy match.
FUZZY_THRESHOLD = 90
# Passed as `maxlen` when padding the model's input sequence.
MAX_SEQUENCE_LENGTH = 20

# Build the tokenizers. NOTE: nothing is loaded from disk here - both
# tokenizers are re-fitted on the dataset, so their word indices presumably
# only line up with the saved model if it was trained with tokenizers fitted
# the same way on the same data (TODO confirm against the training code).
tokenizer_eng = Tokenizer()
tokenizer_kok = Tokenizer()

# Fit tokenizer on dataset. Rows with a blank cell carry NaN instead of text,
# so they are left out; dropping whole rows keeps both lists aligned.
complete_rows = data.dropna(subset=['English', 'Kokborok'])
all_eng = complete_rows['English'].str.lower().tolist()
all_kok = complete_rows['Kokborok'].str.lower().tolist()
tokenizer_eng.fit_on_texts(all_eng)
tokenizer_kok.fit_on_texts(all_kok)

# Load pre-trained model (you must have this saved from training phase)
try:
    model = load_model('kokborok_seq2seq_model.h5')
except Exception as exc:  # not a bare except: Ctrl-C / SystemExit must still propagate
    # The dictionary/fuzzy path works without the model, so degrade
    # gracefully, but report the real reason rather than assuming "not found".
    model = None
    print(f"Warning: Deep learning model could not be loaded ({exc}); "
          "only dictionary/fuzzy matching will be used.")

# Preprocessing function
def tokenize(text):
    """Split text into lower-case tokens.

    Each of the marks . , ! ? ; : ( ) becomes a token of its own; everything
    else is split on whitespace.
    """
    # Pad every punctuation mark (?, !, ., ,, ;, :, (, )) with spaces.
    padded = re.sub(r'([.,!?;:()])', r' \1 ', text.lower())
    # Squeeze the whitespace runs the padding may have introduced.
    squeezed = re.sub(r'\s{2,}', ' ', padded)
    return squeezed.strip().split()

# Rule-based Translator
def translate_with_dictionary(words):
    """Translate a token list with the dictionary, longest phrase first.

    At each position a three-word phrase is tried, then a two-word phrase,
    then the single word. A single token without an exact entry is kept
    verbatim if it is one of the punctuation marks . , ! ? ; : ( ); otherwise
    it is fuzzy-matched against the dictionary keys and accepted when the
    score reaches FUZZY_THRESHOLD, else replaced by the placeholder "[UNK]".

    Args:
        words: list of lower-case tokens (as produced by `tokenize`).

    Returns:
        List of translated pieces, one per matched phrase/word.
    """
    translated_sentence = []
    # Only string keys are valid fuzzy-match candidates; a non-string key
    # (e.g. NaN from a blank CSV cell) must not be handed to the matcher.
    english_phrases = [key for key in translation_dict if isinstance(key, str)]

    i = 0
    while i < len(words):
        # Longest match wins: try the trigram, then the bigram.
        for size in (3, 2):
            if i + size > len(words):
                continue  # not enough words left for a phrase of this size
            phrase = ' '.join(words[i:i + size])
            if phrase in translation_dict:
                translated_sentence.append(translation_dict[phrase])
                i += size
                break
        else:
            # No phrase matched: handle the single token.
            word = words[i]
            if word in translation_dict:
                translated_sentence.append(translation_dict[word])
            elif re.fullmatch(r'[.,!?;:()]', word):
                # fullmatch: only a bare punctuation token is passed through,
                # not any word that merely starts with a punctuation mark.
                translated_sentence.append(word)  # keep punctuation
            else:
                # extractOne yields None when there is nothing to choose
                # from, so only index the result once a candidate exists.
                best = process.extractOne(word, english_phrases) if english_phrases else None
                if best is not None and best[1] >= FUZZY_THRESHOLD:
                    translated_sentence.append(translation_dict[best[0]])
                else:
                    translated_sentence.append("[UNK]")
            i += 1

    return translated_sentence

# Deep Learning Translator
def translate_with_model(sentence):
    """Translate `sentence` with the loaded neural model.

    The sentence is turned into an index sequence by `tokenizer_eng`,
    post-padded to MAX_SEQUENCE_LENGTH and passed to `model.predict`; the
    arg-max over the last axis of the prediction is mapped back to text by
    `tokenizer_kok`.

    Args:
        sentence: English text to translate.

    Returns:
        The decoded text for the first (only) sequence in the batch.

    Raises:
        RuntimeError: if no model is available (`model` is None).
    """
    if model is None:
        # Fail with a clear message instead of "'NoneType' has no attribute 'predict'".
        raise RuntimeError("Deep learning model is not loaded; cannot translate with the model.")

    seq = tokenizer_eng.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

    # verbose=0 keeps the per-call progress bar out of the interactive session.
    prediction = model.predict(seq, verbose=0)
    # assumes the model emits per-position scores over the Kokborok vocabulary - TODO confirm
    output_seq = np.argmax(prediction, axis=-1)
    translated_tokens = tokenizer_kok.sequences_to_texts(output_seq)
    return translated_tokens[0]

# Main translator
def hybrid_translate(sentence):
    """Combine dictionary translation with the neural model as a fallback.

    Tokens are translated with the dictionary first. When a model is available
    and the share of "[UNK]" results exceeds 30% of the token count, the
    original sentence is translated by the model instead.
    """
    tokens = tokenize(sentence)
    dictionary_pieces = translate_with_dictionary(tokens)

    unknown_limit = len(tokens) * 0.3  # fallback threshold
    too_many_unknown = dictionary_pieces.count("[UNK]") > unknown_limit

    if model and too_many_unknown:
        print("[INFO] Fallback to deep learning model.")
        return translate_with_model(sentence)
    return ' '.join(dictionary_pieces)

# Continuous input loop: keeps prompting until the user types 'exit'.
if __name__ == "__main__":
    print("Type 'exit' to quit the translator.")
    while True:
        try:
            user_input = input("\nEnter an English sentence: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session cleanly
            # instead of surfacing a traceback.
            print("Goodbye! 👋")
            break
        # Ignore surrounding whitespace so " exit " also quits.
        if user_input.strip().lower() == "exit":
            print("Goodbye! 👋")
            break
        kokborok_translation = hybrid_translate(user_input)
        print(f"Kokborok translation: {kokborok_translation}")


Type 'exit' to quit the translator.



Enter an English sentence:  hello?


Kokborok translation: Khulum ?



Enter an English sentence:  hello thank you ?


Kokborok translation: Khulum Bathwngthai ?
