# Read Dataset

In [1]:
import pandas as pd

data = pd.read_csv("data/diy/sample_data_2.csv")
data.head()

Unnamed: 0,id,date,username,content
0,1929092282449371145,Sun Jun 01 2025 15:26:55 GMT+0700 (Western Ind...,txtfromjogja,Sudah biasaaa https://t.co/EEEQ0s9baL
1,1928993917166399801,Sun Jun 01 2025 08:56:03 GMT+0700 (Western Ind...,txtfromjogja,Iki yo mbulett ae 👀 https://t.co/T1hBxB07S7
2,1928744695951196511,Sat May 31 2025 16:25:44 GMT+0700 (Western Ind...,txtfromjogja,👎👎 https://t.co/QQci5hKbFT
3,1928667158675951668,Sat May 31 2025 11:17:38 GMT+0700 (Western Ind...,txtfromjogja,Zeremmmmm https://t.co/vGu6eLLZv6
4,1928667161624519081,Sat May 31 2025 11:17:38 GMT+0700 (Western Ind...,txtfromjogja,https://t.co/iLhtTQRj1D


In [2]:
# Example: style the cells so long text wraps
def display_fullscreen_wrap(df):
    """Return ``df.style`` with wrapping-related CSS applied to every cell.

    Args:
        df: Object exposing a pandas-style ``.style`` accessor (a DataFrame).

    Returns:
        Whatever ``df.style.set_properties`` returns (a Styler for a DataFrame).

    NOTE(review): "width" (900px) exceeds "max-width" (700px); confirm which
    of the two values is actually intended.
    """
    cell_css = {
        "white-space": "pre-wrap",  # wrap the cell content
        "word-break": "break-word",  # break anywhere when the text is too long
        "width": "900px",  # adjust as needed
        "max-width": "700px",  # maximum column width
    }
    styler = df.style
    return styler.set_properties(**cell_css)

# EDA

In [3]:
data.columns

Index(['id', 'date', 'username', 'content'], dtype='object')

### Summary

In [4]:
# DataFrame.info() prints its report itself and returns None, so capturing the
# return value and printing it only produced a stray "Summary : None" line.
print("Summary :")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27536 entries, 0 to 27535
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        27536 non-null  int64 
 1   date      27536 non-null  object
 2   username  27536 non-null  object
 3   content   27536 non-null  object
dtypes: int64(1), object(3)
memory usage: 860.6+ KB
Summary : None


### Find Missing Values

In [5]:
# Count and show the missing values per column.
# Printed as two separate calls: passing both to one print() inserts the default
# " " separator after the newline, which misaligns the first row of the report.
print("Jumlah missing values disetiap kolom:")
print(data.isnull().sum())

Jumlah missing values disetiap kolom:
 id          0
date        0
username    0
content     0
dtype: int64


### Drop columns that do not provide any useful information

In [6]:
# Keep only the tweet text. .copy() makes this an independent frame, so the
# column assignments in the following cells write to it directly instead of to a
# slice of the original frame (which can raise SettingWithCopyWarning).
data = data[["content"]].copy()
data.head()

Unnamed: 0,content
0,Sudah biasaaa https://t.co/EEEQ0s9baL
1,Iki yo mbulett ae 👀 https://t.co/T1hBxB07S7
2,👎👎 https://t.co/QQci5hKbFT
3,Zeremmmmm https://t.co/vGu6eLLZv6
4,https://t.co/iLhtTQRj1D


# Data Preparation

### Case Folding

In [7]:
# merubah jenis huruf menjadi huruf kecil
data["content"] = data["content"].str.lower()
data.head()

Unnamed: 0,content
0,sudah biasaaa https://t.co/eeeq0s9bal
1,iki yo mbulett ae 👀 https://t.co/t1hbxb07s7
2,👎👎 https://t.co/qqci5hkbft
3,zeremmmmm https://t.co/vgu6ellzv6
4,https://t.co/ilhttqrj1d


## Cleaning Data

### Emoji To Word

In [8]:
import pandas as pd
from indoNLP.preprocessing import (
    pipeline,
    replace_word_elongation,
    replace_slang,
    remove_html,
    remove_url,
    emoji_to_words
)
# Apply emoji_to_words to the text column
data["content"] = data["content"].apply(lambda x: emoji_to_words(str(x), lang="id"))
display_fullscreen_wrap(data.head())

Unnamed: 0,content
0,sudah biasaaa https://t.co/eeeq0s9bal
1,iki yo mbulett ae !dua_mata! https://t.co/t1hbxb07s7
2,!jempol_ke_bawah!!jempol_ke_bawah! https://t.co/qqci5hkbft
3,zeremmmmm https://t.co/vgu6ellzv6
4,https://t.co/ilhttqrj1d


### Remove HTML

In [9]:
data["content"] = data["content"].apply(lambda x: remove_html(str(x)))
display_fullscreen_wrap(data.head())

Unnamed: 0,content
0,sudah biasaaa https://t.co/eeeq0s9bal
1,iki yo mbulett ae !dua_mata! https://t.co/t1hbxb07s7
2,!jempol_ke_bawah!!jempol_ke_bawah! https://t.co/qqci5hkbft
3,zeremmmmm https://t.co/vgu6ellzv6
4,https://t.co/ilhttqrj1d


### Remove URL

In [10]:
data["content"] = data["content"].apply(lambda x: remove_url(str(x)))
display_fullscreen_wrap(data.head())

Unnamed: 0,content
0,sudah biasaaa
1,iki yo mbulett ae !dua_mata!
2,!jempol_ke_bawah!!jempol_ke_bawah!
3,zeremmmmm
4,


### Replace Slang

In [11]:
data["content"] = data["content"].apply(lambda x: replace_slang(str(x)))
display_fullscreen_wrap(data.head())

Unnamed: 0,content
0,sudah biasa
1,iki ya mbulett saja !dua_mata!
2,!jempol_ke_bawah!!jempol_ke_bawah!
3,zeremmmmm
4,


### Replace Word Elongation

In [12]:
data["content"] = data["content"].apply(lambda x: replace_word_elongation(str(x)))
display_fullscreen_wrap(data.head())

Unnamed: 0,content
0,sudah biasa
1,iki ya mbulet saja !dua_mata!
2,!jempol_ke_bawah!!jempol_ke_bawah!
3,zerem
4,


### Remove Unnecessary Char

In [13]:
import re

def remove_unnecessary_char(text):
    """Replace newlines and the standalone retweet marker with spaces.

    Args:
        text: Input string (already lower-cased by the caller in this notebook).

    Returns:
        The string with every newline and every standalone ``rt`` token replaced
        by a space, and runs of two or more spaces collapsed to one. Leading or
        trailing single spaces are not stripped.
    """
    text = re.sub(r"\n", " ", text)  # newlines become spaces
    # Match "rt" only as a whole word; a bare 'rt' pattern also mangled ordinary
    # words such as "pertama" -> "pe tama" or "artinya" -> "a inya".
    text = re.sub(r"\brt\b", " ", text)
    text = re.sub(r" {2,}", " ", text)  # collapse repeated spaces
    return text

data["content"] = data["content"].apply(lambda x: remove_unnecessary_char(str(x)))
display_fullscreen_wrap(data.head())

Unnamed: 0,content
0,sudah biasa
1,iki ya mbulet saja !dua_mata!
2,!jempol_ke_bawah!!jempol_ke_bawah!
3,zerem
4,


### Remove Unicode

In [14]:
def remove_unicode(text):
    """Strip non-ASCII characters, escape residue, emoji tokens, punctuation and digits.

    Args:
        text: Input string.

    Returns:
        The cleaned string: only ASCII letters and underscores remain, separated
        by single spaces, with no leading or trailing whitespace.
    """
    # Convert to ASCII, silently dropping characters that cannot be encoded
    text = text.encode("ascii", "ignore").decode("ascii")

    # Remove literal hex escape sequences left in the text (e.g. \xf0\x9f\x98\xad)
    text = re.sub(r"(\\x[a-fA-F0-9]{2})+", " ", text)

    # Remove literal unicode escape sequences left in the text (e.g. \u1234)
    text = re.sub(r"(\\u[a-fA-F0-9]{4})+", " ", text)

    # Remove converted emoji placeholders such as !dua_mata!. The token must not
    # contain whitespace; the previous pattern (![^!]*!) also swallowed ordinary
    # words between two exclamation marks ("wow! keren banget! ..." lost "keren banget").
    text = re.sub(r"![^!\s]+!", " ", text)

    # Replace every remaining non-word, non-space character (quotes, punctuation,
    # stray "!") with a space. Underscore counts as a word character and is kept.
    text = re.sub(r"[^\w\s]", " ", text)

    # Remove digits
    text = re.sub(r"\d+", " ", text)

    # Collapse whitespace runs and trim both ends
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Apply the updated function
data["content"] = data["content"].apply(lambda x: remove_unicode(str(x)))
display_fullscreen_wrap(data.head())

Unnamed: 0,content
0,sudah biasa
1,iki ya mbulet saja
2,
3,zerem
4,


### Replace User Mentions

In [15]:
# Replace user mentions with a placeholder token
def replace_urls_and_mentions(text):
    """Replace every ``@handle`` in ``text`` with the literal ``@USER``.

    A handle is ``@`` followed by one or more ASCII letters, digits or
    underscores. Despite the function name, URLs are not modified here (an
    earlier URL -> HTTPURL substitution was disabled).

    NOTE(review): in this notebook the function is applied after
    ``remove_unicode``, which already replaces every non-word character
    (including "@") with a space, so no mention is left to match at that
    point. Consider applying it before that step.

    Args:
        text: Input string.

    Returns:
        The string with each mention replaced by ``@USER``.
    """
    mention_pattern = re.compile(r"@[A-Za-z0-9_]+")
    return mention_pattern.sub("@USER", text)

# Apply URL and mention replacement
data["content"] = data["content"].apply(lambda x: replace_urls_and_mentions(str(x)))
display_fullscreen_wrap(data.head())

Unnamed: 0,content
0,sudah biasa
1,iki ya mbulet saja
2,
3,zerem
4,


### Check for duplicate rows in the content column

In [16]:
data.duplicated().sum()

np.int64(6077)

In [17]:
# Rebind instead of using inplace=True: the result is the same set of rows, but
# no frame derived from an earlier selection is mutated in place.
data = data.drop_duplicates()

### Check Missing Value

In [18]:
# check missing value
data.isnull().sum()

content    0
dtype: int64

In [None]:
# Alternative: process in batches for efficiency - FIRST 10 ROWS ONLY
try:
    from deep_translator import GoogleTranslator
    import time

    def translate_batch(texts, batch_size=10):
        """Translate ``texts`` from Javanese to Indonesian, one request per text.

        ``batch_size`` only groups the progress messages; every text is still
        sent to the translator individually. A text that is blank, fails to
        translate, or yields an empty result is returned unchanged (as ``str``).

        Args:
            texts: Sequence of values to translate; each is converted with ``str``.
            batch_size: Number of texts per progress message.

        Returns:
            A list of strings with the same length and order as ``texts``.
        """
        translator = GoogleTranslator(source="javanese", target="id")
        translated = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i : i + batch_size]
            print(f"Processing batch {i//batch_size + 1}...")

            for text in batch:
                text = str(text)

                # Nothing to translate: skip the network round-trip entirely
                if not text.strip():
                    translated.append(text)
                    continue

                try:
                    result = translator.translate(text)
                    translated.append(result if result else text)
                    time.sleep(0.2)  # short pause between requests
                except Exception as exc:
                    # Best effort: keep the original text, but report why it
                    # failed. A bare "except:" would also swallow
                    # KeyboardInterrupt and make the loop impossible to stop.
                    print(f"Translation failed for {text!r}: {exc}")
                    translated.append(text)

        return translated

    # Apply the translation to the first 10 rows only
    translated_content = translate_batch(data.head(10)["content"].tolist())

    # Write the translations back into the first 10 rows of the original data
    data.loc[data.index[:10], "content"] = translated_content

    print("Translation completed for first 10 rows!")
    display_fullscreen_wrap(data.head(10))

except Exception as e:
    print(f"Error: {e}")

In [None]:
data.head(10)

In [19]:
from deep_translator import GoogleTranslator

In [20]:
langs_list = GoogleTranslator().get_supported_languages()
print("Supported languages:", langs_list)

Supported languages: ['afrikaans', 'albanian', 'amharic', 'arabic', 'armenian', 'assamese', 'aymara', 'azerbaijani', 'bambara', 'basque', 'belarusian', 'bengali', 'bhojpuri', 'bosnian', 'bulgarian', 'catalan', 'cebuano', 'chichewa', 'chinese (simplified)', 'chinese (traditional)', 'corsican', 'croatian', 'czech', 'danish', 'dhivehi', 'dogri', 'dutch', 'english', 'esperanto', 'estonian', 'ewe', 'filipino', 'finnish', 'french', 'frisian', 'galician', 'georgian', 'german', 'greek', 'guarani', 'gujarati', 'haitian creole', 'hausa', 'hawaiian', 'hebrew', 'hindi', 'hmong', 'hungarian', 'icelandic', 'igbo', 'ilocano', 'indonesian', 'irish', 'italian', 'japanese', 'javanese', 'kannada', 'kazakh', 'khmer', 'kinyarwanda', 'konkani', 'korean', 'krio', 'kurdish (kurmanji)', 'kurdish (sorani)', 'kyrgyz', 'lao', 'latin', 'latvian', 'lingala', 'lithuanian', 'luganda', 'luxembourgish', 'macedonian', 'maithili', 'malagasy', 'malay', 'malayalam', 'maltese', 'maori', 'marathi', 'meiteilon (manipuri)', 'm

In [22]:
# Try the Javanese -> Indonesian translator on a single sample sentence
text = "iki ya mbulet saja"
my_translator = GoogleTranslator(source="javanese", target="indonesian")

print(f"Original text: {text}")
print(f"Translated text: {my_translator.translate(text)}")

Original text: iki ya mbulet saja
Translated text: Ini hanya stront


In [None]:
# Write the cleaned data to a new CSV file, leaving out the index column
output_path = "data/diy/cleaned_data.csv"
data.to_csv(output_path, index=False)
print(f"Data cleaned and saved to '{output_path}'")