In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read dataset_artikel_labeled_raw.csv (relative to the working directory)
# into the working DataFrame `df`.
df = pd.read_csv("dataset_artikel_labeled_raw.csv")

In [None]:
# Preview the first five rows of the loaded data.
df.head(5)

Unnamed: 0,link,judul,konten,tanggal,portal,tag,sentiment
0,https://kumparan.com/kumparanbisnis/garuda-ind...,Garuda Indonesia Kembali RUPSLB di Tengah Isu ...,Garuda Indonesia Kembali RUPSLB di Tengah Isu ...,30/09/2025,Kumparan,Manajemen,Neutral
1,https://www.bloombergtechnoz.com/detail-news/8...,Garuda Gelar RUPSLB di Tengah Isu Masuknya Dir...,Garuda Gelar RUPSLB di Tengah Isu Masuknya Dir...,29/09/2025,Bloomberg Technoz,Manajemen,Neutral
2,https://voi.id/ekonomi/519004/komisi-v-dpr-bak...,Komisi V DPR Bakal Dalami Dugaan Mafia Jual Be...,JAKARTA - Ketua Komisi V DPR Lasarus mengataka...,29/09/2025,VOI.ID,Rute/Operasional,Negative
3,https://www.kompasiana.com/zainularifin2714/68...,Rencana Merger Garuda Indonesia - Pelita Air: ...,"Latar Belakang\nPada pertengahan 2023, wacana ...",29/09/2025,Kompasiana.com,Lainnya,Neutral
4,https://www.cnnindonesia.com/ekonomi/202509292...,Dony Oskaria Pastikan Merger Pelita Air-Garuda...,--\nPlt Menteri Badan Usaha Milik Negara (BUMN...,29/09/2025,CNN Indonesia,Lainnya,Neutral


# Normalisasi

In [None]:
# Read the colloquial-Indonesian lexicon CSV; the following cells use its
# 'slang' and 'formal' columns to build a slang -> formal mapping.
lex = pd.read_csv("colloquial-indonesian-lexicon.csv")

In [None]:
# Keep only the two columns needed for the mapping and drop incomplete rows.
lex = lex[['slang', 'formal']].dropna()

# Light normalization: lowercase and trim both columns. Casting to str first
# keeps non-string cells (e.g. numbers) from turning into NaN under `.str`.
lex['slang'] = lex['slang'].astype(str).str.lower().str.strip()
lex['formal'] = lex['formal'].astype(str).str.lower().str.strip()

# A blank slang key would become an empty alternative in the regex built from
# these keys, matching at every word boundary and injecting its replacement
# text everywhere -- drop such rows.
lex = lex[lex['slang'] != '']

# If a slang term appears more than once, keep its first occurrence.
lex = lex.drop_duplicates(subset='slang')

# Build the slang -> formal lookup dictionary.
slang2formal = dict(zip(lex['slang'], lex['formal']))

In [None]:
import re

# Order the slang terms from longest to shortest: regex alternation tries the
# alternatives left to right, so longer terms are attempted first.
sorted_slang = sorted(slang2formal, key=len, reverse=True)

# Escape regex metacharacters in every term and join them into one
# case-insensitive, word-bounded alternation.
pattern = re.compile(
    r'\b(' + '|'.join(re.escape(term) for term in sorted_slang) + r')\b',
    flags=re.IGNORECASE
)

In [None]:
def normalize_slang(text):
    """Replace slang words in ``text`` with their formal equivalents.

    Every match of the module-level ``pattern`` is looked up, lowercased, in
    ``slang2formal``; a match without a mapping is left as it was. Afterwards
    each run of whitespace (including line breaks) is collapsed to a single
    space and the result is trimmed.

    Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        def _to_formal(found):
            # The regex ignores case, so look the word up in lowercase.
            word = found.group(0)
            return slang2formal.get(word.lower(), word)

        replaced = pattern.sub(_to_formal, text)

        # Tidy up repeated whitespace.
        return re.sub(r'\s+', ' ', replaced).strip()
    return text


# Hapus URL yang ikut ter-scrape

In [None]:
import re

def remove_url(text):
    """Strip URLs from ``text`` and tidy the remaining whitespace.

    Anything starting with ``http://``, ``https://`` or ``www.`` is removed up
    to the next whitespace character. Every whitespace run (including line
    breaks) is then collapsed to one space and the result is trimmed.

    Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        # Remove http, https and www links.
        without_links = re.sub(r'https?://\S+|www\.\S+', '', text)

        # Collapse the gaps left behind by the removed links.
        return re.sub(r'\s+', ' ', without_links).strip()
    return text


# Hapus CTA ("baca selengkapnya", "baca lebih lanjut", "baca juga", "klik di sini")

In [None]:
import re

def remove_news_cta(text):
    """Remove call-to-action phrases ("baca juga", "klik di sini", ...).

    Each phrase is matched case-insensitively and removed together with
    everything after it up to the end of that line ('.' does not match a
    newline). Every whitespace run is then collapsed to a single space and the
    result is trimmed.

    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    phrases = (
        r'\bbaca selengkapnya\b.*',
        r'\bbaca lebih lanjut\b.*',
        r'\bbaca juga\b.*',
        r'\bselengkapnya\b.*',
        r'\bklik di sini\b.*',
        r'\bklik disini\b.*',
    )

    cleaned = text
    for phrase in phrases:
        cleaned = re.compile(phrase, re.IGNORECASE).sub('', cleaned)

    # Tidy up whitespace left behind by the removals.
    return re.sub(r'\s+', ' ', cleaned).strip()


# Clean Encoding Artifacts

In [None]:
pip install ftfy



In [None]:
import ftfy

def fix_text_encoding(text):
    """Run ``ftfy.fix_text`` on string input; return anything else unchanged."""
    return ftfy.fix_text(text) if isinstance(text, str) else text

In [None]:
import re

def remove_encoding_artifacts(text):
    """Clean up leftover mojibake and stray bullet/arrow symbols.

    Steps, in order:
      1. every "Â" character is replaced by a space;
      2. common mis-decoded quote, dash and ellipsis sequences are mapped back
         to plain ASCII punctuation;
      3. bullet and arrow symbols (· • ● ► » «) are replaced by a space.

    Whitespace is not collapsed here. Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # 1) Drop the "Â" character that is often left over from a badly decoded
    #    non-breaking space.
    text = text.replace("Â", " ")

    # 2) Repair common quote/dash mojibake. Order matters: the bare "â€"
    #    fallback is a prefix of every other sequence, so it must run last;
    #    otherwise e.g. "â€™" would become '"™' instead of "'".
    mojibake_fixes = (
        ("â€œ", '"'),
        ("â€˜", "'"),
        ("â€™", "'"),
        ("â€“", "-"),
        ("â€”", "-"),
        ("â€¦", "..."),
        ("â€\x9d", '"'),   # closing quote whose third byte survived as U+009D
        ("â€", '"'),       # closing quote whose third byte was lost
    )
    for broken, fixed in mojibake_fixes:
        text = text.replace(broken, fixed)

    # 3) Replace bullet/arrow symbols that tend to get scraped along.
    text = re.sub(r"[·•●►»«]", " ", text)

    return text


# Remove Boilerplate


In [None]:
def remove_news_boilerplate(text):
    """Remove "read more"-style boilerplate lines from article text.

    Each phrase is matched case-insensitively, with any whitespace between its
    words, and removed together with the rest of the line it appears on.
    ``re.MULTILINE`` makes ``$`` anchor at every line end, so a boilerplate
    line in the middle of a multi-line article is removed too (not only one on
    the final line). On text without line breaks the removal runs to the end
    of the string.

    Whitespace is not normalized here. Non-string values are returned
    unchanged.
    """
    if not isinstance(text, str):
        return text

    # Catch "baca lebih lanjut", "baca selengkapnya", "baca juga", etc.
    patterns = [
        # lanjut / lajut / lanjt / lanj: the "n" and the "ut" tail are optional
        # so the common "lajut" misspelling is matched as well.
        r"\bbaca\s+lebih\s+lan?j(?:u?t)?\b.*$",
        r"\bbaca\s+selengkapnya\b.*$",
        r"\bbaca\s+juga\b.*$",
        r"\bselengkapnya\b.*$",
        r"\bklik\s+di\s+sini\b.*$",
        r"\bklik\s+disini\b.*$",
    ]

    for p in patterns:
        text = re.sub(p, "", text, flags=re.IGNORECASE | re.MULTILINE)

    return text


# Normalize Whitespace

In [None]:
def normalize_whitespace(text):
    """Collapse every whitespace run in ``text`` to one space and trim the ends.

    Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return text

In [None]:
# Order matters. The CTA/boilerplate patterns delete from the matched phrase to
# the end of the line, so they must run while the text still has its line
# breaks. remove_url and normalize_slang both collapse all whitespace
# (including newlines) into single spaces; running them first turned each
# article into one long line, and the first "baca juga"/"selengkapnya" then
# wiped out the rest of the article.
# NOTE(review): articles scraped without any line breaks are still cut from the
# first CTA phrase onward -- confirm whether that is acceptable.
df["konten_clean_final"] = (
    df["konten"]
    .apply(fix_text_encoding)
    .apply(remove_encoding_artifacts)     # right after decoding fixes
    .apply(remove_news_boilerplate)       # line-based: needs the original newlines
    .apply(remove_news_cta)               # line-based; collapses whitespace at the end
    .apply(remove_url)                    # before slang replacement can rewrite URL parts
    .apply(normalize_slang)
    .apply(normalize_whitespace)          # final whitespace pass
)


In [None]:
# Sanity check: flag rows whose cleaned text still contains one of the common
# mojibake sequences (missing values count as "no match"), then show up to ten
# of the flagged texts.
mask = df["konten_clean_final"].str.contains(r"(?:Â|â€|â€œ|â€™|Â·|Â»)", regex=True, na=False)
df.loc[mask, "konten_clean_final"].head(10)


Unnamed: 0,konten_clean_final


In [None]:
# Number of rows flagged by `mask` (True values are summed as 1).
print("Rows with artifacts:", mask.sum())

Rows with artifacts: 0


In [None]:
# buang baris kosong setelah cleaning
df = df.dropna(subset=['konten_clean_final'])
df = df[df['konten_clean_final'].str.strip() != '']

# (opsional) hapus duplikat konten
df = df.drop_duplicates(subset=['konten_clean_final'])


In [None]:
# Write the preprocessed frame to a UTF-8 CSV in the working directory;
# index=False leaves the DataFrame index out of the file.
df.to_csv(
    "garuda_news_preprocessed_final.csv",
    index=False,
    encoding="utf-8"
)


In [None]:
# Preview the first five rows, now including the `konten_clean_final` column.
df.head(5)

Unnamed: 0,link,judul,konten,tanggal,portal,tag,sentiment,konten_clean_final
0,https://kumparan.com/kumparanbisnis/garuda-ind...,Garuda Indonesia Kembali RUPSLB di Tengah Isu ...,Garuda Indonesia Kembali RUPSLB di Tengah Isu ...,30/09/2025,Kumparan,Manajemen,Neutral,Garuda Indonesia Kembali RUPSLB di Tengah Isu ...
1,https://www.bloombergtechnoz.com/detail-news/8...,Garuda Gelar RUPSLB di Tengah Isu Masuknya Dir...,Garuda Gelar RUPSLB di Tengah Isu Masuknya Dir...,29/09/2025,Bloomberg Technoz,Manajemen,Neutral,Garuda Gelar RUPSLB di Tengah Isu Masuknya Dir...
2,https://voi.id/ekonomi/519004/komisi-v-dpr-bak...,Komisi V DPR Bakal Dalami Dugaan Mafia Jual Be...,JAKARTA - Ketua Komisi V DPR Lasarus mengataka...,29/09/2025,VOI.ID,Rute/Operasional,Negative,JAKARTA - Ketua Komisi V DPR Lasarus mengataka...
3,https://www.kompasiana.com/zainularifin2714/68...,Rencana Merger Garuda Indonesia - Pelita Air: ...,"Latar Belakang\nPada pertengahan 2023, wacana ...",29/09/2025,Kompasiana.com,Lainnya,Neutral,"Latar Belakang Pada pertengahan 2023, wacana k..."
4,https://www.cnnindonesia.com/ekonomi/202509292...,Dony Oskaria Pastikan Merger Pelita Air-Garuda...,--\nPlt Menteri Badan Usaha Milik Negara (BUMN...,29/09/2025,CNN Indonesia,Lainnya,Neutral,-- Plt Menteri Badan Usaha Milik Negara (BUMN)...
