In [1]:
import pandas as pd
import re, string
from spellchecker import SpellChecker
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Read the per-video transcript metadata into a DataFrame and preview the first rows.
df = pd.read_json(path_or_buf="../data/transcript/metadata.json")
df.head()

Unnamed: 0,sample_id,source,keyword,video_id,title,channel,duration,upload_date,url,audio_path,thumbnail_path,transcript_path,transcript_text,transcript_length,language,label,status,transcribed_at
0,YT_00001,youtube,Prabowo presiden Indonesia 2024,IuE7NGQs_xk,Presiden Prabowo Tiba di Malaysia untuk Hadiri...,Sekretariat Presiden,286,20251025,https://www.youtube.com/watch?v=IuE7NGQs_xk,./data/raw/youtube/audio/YT_00001.wav,./data/raw/youtube/thumbnails/YT_00001.jpg,./data/transcript/YT_00001.txt,lotus tonka Ayo! Ayo! Ayo! Ayo! Ayo! Ayo! Ayo...,204,id,0,success,2025-10-31 18:29:39.418341+00:00
1,YT_00002,youtube,Prabowo presiden Indonesia 2024,n7rRd0IF_yc,[FULL] Wapres Gibran Umumkan Kado Istimewa dar...,KOMPASTV,456,20251024,https://www.youtube.com/watch?v=n7rRd0IF_yc,./data/raw/youtube/audio/YT_00002.wav,./data/raw/youtube/thumbnails/YT_00002.jpg,./data/transcript/YT_00002.txt,yang saya hormati sebuah pondok bahagia ia ji...,3573,id,0,success,2025-10-31 18:30:08.634255+00:00
2,YT_00003,youtube,Prabowo presiden Indonesia 2024,6f17uwi2Vtw,Live Event Rekam Jejak Prabowo Menjadi Preside...,METRO TV,222,20241020,https://www.youtube.com/watch?v=6f17uwi2Vtw,./data/raw/youtube/audio/YT_00003.wav,./data/raw/youtube/thumbnails/YT_00003.jpg,./data/transcript/YT_00003.txt,Puka perjalanan mudah bagi Prabowo Subianto h...,3071,id,0,success,2025-10-31 18:30:28.377608+00:00
3,YT_00004,youtube,Prabowo presiden Indonesia 2024,ExCBplKi6kw,"Ulas Utas, Setahun Presiden Prabowo: 'Koruptor...",tvOneNews,567,20251023,https://www.youtube.com/watch?v=ExCBplKi6kw,./data/raw/youtube/audio/YT_00004.wav,./data/raw/youtube/thumbnails/YT_00004.webp,./data/transcript/YT_00004.txt,Selama perioda satu tahun menjabat President ...,6428,id,0,success,2025-10-31 18:31:24.564553+00:00
4,YT_00005,youtube,Kabinet merah putih Prabowo,PqjwO9JEtBM,Reshuffle Kabinet Merah Putih Prabowo-Gibran |...,tvOneNews,467,20250917,https://www.youtube.com/watch?v=PqjwO9JEtBM,./data/raw/youtube/audio/YT_00005.wav,./data/raw/youtube/thumbnails/YT_00005.jpg,./data/transcript/YT_00005.txt,Presiden Perabo Subiantok kembali melakukan p...,6049,id,0,success,2025-10-31 18:32:02.061119+00:00


In [3]:
def clean_text(text):
    """Lower-case a transcript and strip digits, punctuation and repeated words.

    Steps, in order: lower-case, delete digit runs, turn ASCII punctuation
    into spaces, collapse whitespace, drop immediately repeated words, trim.

    Args:
        text: Raw transcript text. Anything that is not a ``str`` (for
            example ``None`` or a float NaN) is treated as empty.

    Returns:
        The cleaned text, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # remove digits
    # Replace punctuation with a space instead of deleting it, so that words
    # joined only by punctuation ("prabowo-gibran", "satu,dua") remain
    # separate tokens. re.escape keeps the character class valid whatever
    # characters string.punctuation contains.
    text = re.sub(r'[' + re.escape(string.punctuation) + r']', ' ', text)
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs
    # Drop a word when the very next word is identical ("ayo ayo ayo" -> "ayo").
    text = re.sub(r'(?:\b(\w+)\b\s+)(?=\1\b)', '', text)
    return text.strip()


In [4]:
# Known mis-transcriptions (lower-case) mapped to the intended spelling.
common_errors = {
    "peraputik": "merah putih",
    "perabo": "prabowo",
    "subiantok": "subianto",
    "president": "presiden",
    "puka": "buka",
    "menkopolokap": "menkopolhukam",
    "komps": "kompas",
    "presidenmu": "presiden",
    "parabohu": "prabowo",
    "perabohu": "prabowo",
    "perabuhu": "prabowo",
    "prasidian": "presiden",
    "peraputi": "merah putih",
    "pariburnah": "paripurna",
    "pejapat": "pejabat"
}

# Single alternation over every key, anchored on word boundaries so a key only
# matches a whole word. Longest keys come first so that a key which is a
# prefix of another ("perabo" / "perabohu") cannot shadow the longer one.
# Built once when this cell runs; re-run the cell after editing common_errors.
_COMMON_ERRORS_RE = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(k) for k in sorted(common_errors, key=len, reverse=True))
    + r')\b'
)

def fix_common_errors(text):
    """Replace known mis-transcribed words with their intended spelling.

    Matching is case-sensitive and whole-word only: a key embedded in a longer
    word is left untouched, and all replacements happen in one pass so the
    output of one replacement is never rewritten by another.

    Args:
        text: Text to correct (the keys are lower-case, so lower-cased input
            is expected for them to match).

    Returns:
        The text with every whole-word occurrence of a ``common_errors`` key
        replaced by its mapped value.
    """
    # .get() keeps the matched word if common_errors was edited after the
    # pattern was compiled and the key is no longer present.
    return _COMMON_ERRORS_RE.sub(
        lambda m: common_errors.get(m.group(0), m.group(0)), text
    )

In [5]:
# language=None: no bundled dictionary is loaded, so the checker only knows
# the words registered below.
spell = SpellChecker(language=None)
spell.word_frequency.load_words([
    "prabowo","gibran","indonesia","merah","putih","kabinet",
    "tvone","kompastv","metrotv","sindonews","politik","presiden"
])  # domain (politics) vocabulary so that these words are not miscorrected

# NOTE(review): with only the words above loaded, an unknown word that lies
# within the checker's edit distance of one of them will presumably be
# rewritten to it (e.g. a word one letter away from "merah"). Confirm this is
# intended, or load a full Indonesian word list first.

# Memo of word -> correction. spell.correction() is costly for unknown words
# and transcripts repeat the same words many times, so each distinct word is
# looked up once. Reset whenever this cell is re-run.
_correction_cache = {}

def correct_spelling(text):
    """Spell-correct every word longer than three characters.

    Args:
        text: Whitespace-separated text.

    Returns:
        The words re-joined with single spaces. Words of length <= 3 are kept
        as they are; longer words are replaced by ``spell.correction(word)``,
        falling back to the original word when no correction is returned.
    """
    corrected = []
    for w in text.split():
        if len(w) > 3:
            fixed = _correction_cache.get(w)
            if fixed is None:
                # `or w`: keep the word when the checker has no suggestion.
                fixed = spell.correction(w) or w
                _correction_cache[w] = fixed
            corrected.append(fixed)
        else:
            corrected.append(w)
    return ' '.join(corrected)

In [6]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

def _drop_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``."""
    return [w for w in tokens if w not in stop_words]

def normalize_tokens(text):
    """Stem the text and remove Indonesian stop words.

    Stop words are filtered twice. The pass before stemming catches surface
    forms that are in the list but whose stem may not be (an inflected stop
    word can be reduced to a root the list does not contain). The pass after
    stemming catches words that only become stop words once stemmed, which is
    the filtering the function has always done.

    Args:
        text: Whitespace-separated text.

    Returns:
        The stemmed text with stop words removed, joined by single spaces.
    """
    text = ' '.join(_drop_stop_words(text.split()))
    text = stemmer.stem(text)
    return ' '.join(_drop_stop_words(text.split()))


In [None]:
def normalize_pipeline(text):
    """Run the full normalisation chain on one transcript.

    Order: ``clean_text`` -> ``fix_common_errors`` -> ``correct_spelling`` ->
    ``normalize_tokens``.

    Args:
        text: Raw transcript text. A value that is not a ``str`` is treated
            as an empty transcript.

    Returns:
        The normalised transcript, or ``''`` for non-string input.
    """
    # A row without a transcript may hold None/NaN instead of a string; return
    # an empty result rather than letting .apply() fail on that row.
    if not isinstance(text, str):
        return ''
    text = clean_text(text)
    text = fix_common_errors(text)
    text = correct_spelling(text)
    text = normalize_tokens(text)
    return text

df['normalized_transcript'] = df['transcript_text'].apply(normalize_pipeline)

In [None]:
# Write the identifying columns plus the normalised text to CSV, without the index.
df[['sample_id', 'channel', 'normalized_transcript']].to_csv("normalized_transcript.csv", index=False)
# The check-mark was stored as a mis-decoded byte sequence; restored to the intended character.
print("✅ Normalisasi selesai! File tersimpan sebagai 'normalized_transcript.csv'")

# Preview a sample of the result
df[['channel', 'normalized_transcript']].head(10)