In [51]:
import pandas as pd
import re

In [53]:
# Read the dictionary CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
file_path = 'kamus_clean.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [55]:
# 1. Clean the dictionary dataset
# Discard the 'Unnamed: 0' column (presumably a saved row index -- confirm against
# the CSV) and remove exact duplicate rows. errors='ignore' keeps this cell working
# when the CSV does not contain that column instead of raising KeyError.
data_cleaned = data.drop(columns=['Unnamed: 0'], errors='ignore').drop_duplicates()

In [57]:
# Lowercase both dictionary columns, non-standard form first, then standard form
for column_name in ('TIDAK BAKU', 'BAKU'):
    data_cleaned[column_name] = data_cleaned[column_name].str.lower()

In [59]:
# Remove any special characters or numbers in "TIDAK BAKU" and "BAKU" columns.
# Rows missing either value are dropped first: they cannot form a mapping, and a
# missing value is a float NaN that string substitution cannot process.
data_cleaned = data_cleaned.dropna(subset=['TIDAK BAKU', 'BAKU'])

# Everything except lowercase a-z and whitespace is deleted; compiled once and
# shared by both columns.
non_letter_pattern = re.compile(r'[^a-z\s]')
for column_name in ('TIDAK BAKU', 'BAKU'):
    data_cleaned[column_name] = data_cleaned[column_name].str.replace(
        non_letter_pattern, '', regex=True
    )

In [61]:
# Build the lookup table used for normalization: each "TIDAK BAKU" entry maps to
# the "BAKU" entry of the same row. If a key occurs in several rows, the last row wins.
normalization_dict = {
    informal: formal
    for informal, formal in zip(data_cleaned['TIDAK BAKU'], data_cleaned['BAKU'])
}

In [63]:
def preprocess_text(text, normalization_map=None):
    """Lowercase ``text``, strip non-letters, and normalize it word by word.

    Args:
        text: Raw input text. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as empty text.
        normalization_map: Optional mapping from a non-standard word to its
            replacement. When omitted, the module-level ``normalization_dict``
            is used.

    Returns:
        The processed words joined by single spaces, with no leading or
        trailing whitespace. Repeated words are kept. Returns ``''`` when no
        words remain.
    """
    if not isinstance(text, str):
        return ''
    if normalization_map is None:
        normalization_map = normalization_dict

    # Lowercase first so the a-z filter keeps letters that were uppercase;
    # digits, punctuation and any other characters are deleted.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    normalized_words = []
    for word in letters_only.split():
        # Replace the word when the mapping has an entry for it
        replacement = normalization_map.get(word, word)
        # A non-string entry (e.g. NaN) cannot be joined; keep the word as is
        if not isinstance(replacement, str):
            replacement = word
        normalized_words.append(replacement)

    # Re-split after joining so padded, empty, or multi-word replacements
    # cannot leave extra spaces in the result.
    return ' '.join(' '.join(normalized_words).split())


In [67]:
# Usage example
# Sample sentences
sample_text_1 = "Abstarck adalah konsep yang sebenenarnya kompleks."
sample_text_2 = "Ini evolusionis dalam pendekatan baru."

# Run the preprocessing on both sample sentences before printing anything
processed_text_1 = preprocess_text(sample_text_1)
processed_text_2 = preprocess_text(sample_text_2)

# Print each original sentence followed by its processed form
for label, value in (
    ("Teks Asli 1:", sample_text_1),
    ("Teks Setelah Preprocessing 1:", processed_text_1),
    ("\nTeks Asli 2:", sample_text_2),
    ("Teks Setelah Preprocessing 2:", processed_text_2),
):
    print(label, value)

Teks Asli 1: Abstarck adalah konsep yang sebenenarnya kompleks.
Teks Setelah Preprocessing 1: abstrak adalah konsep yang sebenarnya kompleks

Teks Asli 2: Ini evolusionis dalam pendekatan baru.
Teks Setelah Preprocessing 2: ini revolusioner dalam pendekatan baru
