# LIBRARIES


In [1]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
import logging
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abelc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abelc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abelc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# INISIASI LOGGER


In [2]:
# Configure the root logger: INFO and above, with timestamp and level prefix.
logging.basicConfig(
  format='%(asctime)s - %(levelname)s - %(message)s',
  level=logging.INFO,
)
# Root logger instance; later cells use it for progress messages.
logger = logging.getLogger()

# INISIASI DATASET


In [3]:
# Load the merged tweet dataset from disk.
DATASET_FILE_NAME = "merged_dataset.csv"
DATASET_FILE_PATH = f"datasets/merged/{DATASET_FILE_NAME}"
DATA_FRAME_DATASET = pd.read_csv(DATASET_FILE_PATH)
DATA_FRAME_DATASET_LENGTH = DATA_FRAME_DATASET.shape[0]
print(f"fresh dataset: {DATA_FRAME_DATASET_LENGTH} tweets")
display(DATA_FRAME_DATASET.tail(1))

# Drop the columns that the preprocessing steps below never read.
DATA_FRAME_DATASET = DATA_FRAME_DATASET.drop(
  columns=["Unnamed: 0", "no", "urls", "retweet_count", "like_count"]
)
DATA_FRAME_DATASET_LENGTH = DATA_FRAME_DATASET.shape[0]
print("dataset after unused columns removed")
display(DATA_FRAME_DATASET.tail(1))

# Keep one row per distinct tweet text, then renumber the rows from 0.
DATA_FRAME_DATASET = (
  DATA_FRAME_DATASET
  .drop_duplicates(subset=["full_text"])
  .reset_index(drop=True)
)
DATA_FRAME_DATASET_LENGTH = DATA_FRAME_DATASET.shape[0]
print(f"dataset after duplicate removal: {DATA_FRAME_DATASET_LENGTH} tweets")
display(DATA_FRAME_DATASET.tail(1))

fresh dataset: 5341 tweets


Unnamed: 0.1,Unnamed: 0,no,urls,user_id,username,user_display_name,tweet_id,full_text,created_at,retweet_count,like_count
5340,5340,223,[],1497098972443602944,RismaII4,Risma II 🇮🇩🇮🇩🇮🇩,1839477624550412418,Sebuah langkah besar menuju masa depan yang le...,Fri Sep 27 01:30:16 +0000 2024,0,1


dataset after unused columns removed


Unnamed: 0,user_id,username,user_display_name,tweet_id,full_text,created_at
5340,1497098972443602944,RismaII4,Risma II 🇮🇩🇮🇩🇮🇩,1839477624550412418,Sebuah langkah besar menuju masa depan yang le...,Fri Sep 27 01:30:16 +0000 2024


dataset after duplicate removal: 4913 tweets


Unnamed: 0,user_id,username,user_display_name,tweet_id,full_text,created_at
4912,1497098972443602944,RismaII4,Risma II 🇮🇩🇮🇩🇮🇩,1839477624550412418,Sebuah langkah besar menuju masa depan yang le...,Fri Sep 27 01:30:16 +0000 2024


# TEXT CLEANING


In [4]:
def clean_text(text):
  """Normalize a raw tweet into lowercase, ASCII-only, space-separated words.

  Steps, in order: strip the retweet marker, @mentions, escaped unicode
  sequences, newlines and non-ASCII characters; collapse letters repeated
  three or more times; remove URLs; lowercase; remove hashtags, digits and
  punctuation; squeeze spaces; finally expand the standalone word "ikn".

  Args:
    text: Raw tweet text. Any non-string value (for example a NaN produced
      by pandas for an empty cell) is treated as empty text.

  Returns:
    The cleaned text, or "" when ``text`` is not a string.
  """
  # Non-string cells would make re.sub raise TypeError; treat them as empty.
  if not isinstance(text, str):
    return ""

  # remove RT tag; \b keeps words that merely end in "RT" (e.g. "SMART") intact
  text = re.sub(r'\bRT\s', '', text)
  # remove @username mentions
  text = re.sub(r"\@([\w]+)", " ", text)
  # replace literal "\uXXXX" escape sequences (undecoded emoji) with a space
  text = re.sub(r"\\u[a-zA-Z0-9]{4}", " ", text)
  # replace newlines (and the whitespace character right after them) with a space
  text = re.sub(r"\n\s", " ", text)
  text = re.sub(r"\n", " ", text)
  # remove non-ascii
  text = re.sub(r'[^\x00-\x7F]+',' ', text)
  # collapse any run of 3+ identical letters to one (ex: hellooooo -> hello);
  # {2,} consumes the whole run instead of only the first three letters
  text = re.sub(r'([a-zA-Z])\1{2,}', r'\1', text)
  # replace url
  text = re.sub(r'http[s]?\:\/\/.[a-zA-Z0-9\.\/\_?=%&#\-\+!]+',' ', text)
  text = re.sub(r'pic.twitter.com?.[a-zA-Z0-9\.\/\_?=%&#\-\+!]+',' ', text)
  # convert to lowercase
  text = text.lower()
  # remove hashtag
  text = re.sub(r'\#[a-zA-Z0-9_]+','', text)
  # remove numbers
  text = re.sub(r'[0-9]+',' ', text)
  # remove symbols
  text = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,.\/]', ' ', text)
  # remove extra spaces to one space
  text = re.sub(r' +', ' ', text)
  # remove leading and trailing spaces
  text = re.sub(r'^[ ]|[ ]$','', text)
  # expand the standalone word "ikn" to "ibukota negara baru"; word boundaries
  # prevent corrupting words that only contain the letters (e.g. "piknik")
  text = re.sub(r'\bikn\b', 'ibukota negara baru', text)

  return text

# Clean every tweet and preview the last cleaned row.
DATA_FRAME_DATASET["cleaned_text"] = (
  DATA_FRAME_DATASET["full_text"].apply(clean_text)
)
display(DATA_FRAME_DATASET[["cleaned_text"]].tail(1))

Unnamed: 0,cleaned_text
4912,sebuah langkah besar menuju masa depan yang le...


# INISIASI KAMUS


SLANG, STOPWORDS, DLL.


In [5]:
# Slang dictionaries: CSV files with "slang" and "formal" columns.
SLANG_DICTIONARY_FILE_NAME_1 = "kamus_slang_1.csv"
SLANG_DICTIONARY_FILE_PATH_1 = f"dictionaries/{SLANG_DICTIONARY_FILE_NAME_1}"
DATA_FRAME_SLANG_DICTIONARY_1 = pd.read_csv(SLANG_DICTIONARY_FILE_PATH_1)

SLANG_DICTIONARY_FILE_NAME_2 = "kamus_slang_2.csv"
SLANG_DICTIONARY_FILE_PATH_2 = f"dictionaries/{SLANG_DICTIONARY_FILE_NAME_2}"
DATA_FRAME_SLANG_DICTIONARY_2 = pd.read_csv(SLANG_DICTIONARY_FILE_PATH_2)

# slang -> formal lookup tables used by the slang integration functions
SLANG_DICTIONARY_1 = (
  DATA_FRAME_SLANG_DICTIONARY_1.set_index("slang")["formal"].to_dict()
)
SLANG_DICTIONARY_2 = (
  DATA_FRAME_SLANG_DICTIONARY_2.set_index("slang")["formal"].to_dict()
)

display(DATA_FRAME_SLANG_DICTIONARY_1.tail())
display(DATA_FRAME_SLANG_DICTIONARY_2.tail())

# Stop-word dictionary: CSV with a "stopwords" column.
SW_DICTIONARY_FILE_NAME_1 = "kamus_stopwords_1.csv"
SW_DICTIONARY_FILE_PATH_1 = f"dictionaries/{SW_DICTIONARY_FILE_NAME_1}"
DATA_FRAME_SW_DICTIONARY_1 = pd.read_csv(SW_DICTIONARY_FILE_PATH_1)

display(DATA_FRAME_SW_DICTIONARY_1.tail())

# Negation dictionary: CSV with a "negasi" column.
NEGASI_DICTIONARY_FILE_NAME_1 = "negasi.csv"
NEGASI_DICTIONARY_FILE_PATH_1 = f"dictionaries/{NEGASI_DICTIONARY_FILE_NAME_1}"
DATA_FRAME_NEGASI_DICTIONARY_1 = pd.read_csv(NEGASI_DICTIONARY_FILE_PATH_1)

display(DATA_FRAME_NEGASI_DICTIONARY_1.tail())

# Antonym dictionary: CSV with "word" and "antonim" columns.
ANTONYM_DICTIONARY_FILE_NAME_1 = "antonim_bahasa_indonesia.csv"
ANTONYM_DICTIONARY_FILE_PATH_1 = f"dictionaries/{ANTONYM_DICTIONARY_FILE_NAME_1}"
DATA_FRAME_ANTONYM_DICTIONARY_1 = pd.read_csv(ANTONYM_DICTIONARY_FILE_PATH_1)

# word -> antonym lookup table
ANTONYM_DICTIONARY_1 = (
  DATA_FRAME_ANTONYM_DICTIONARY_1.set_index("word")["antonim"].to_dict()
)

display(DATA_FRAME_ANTONYM_DICTIONARY_1.tail())

Unnamed: 0,slang,formal
3258,boz,bos
3259,mayan,lumayan
3260,ribed,ribet
3261,ntapz,mantap
3262,ntaps,mantap


Unnamed: 0,slang,formal
1499,uu,peraturan
1500,keppres,keputusan presiden
1501,mancet,macet
1502,eo,panitia
1503,survei,survey


Unnamed: 0,stopwords
751,wong
752,yaitu
753,yakin
754,yakni
755,yang


Unnamed: 0,negasi
5,tanpa
6,pantang
7,jangan
8,bukanlah
9,sok


Unnamed: 0,word,antonim
1688,waspada,lalai
1689,ya,"bukan, tidak"
1690,yakin,ragu-ragu
1691,zalim,baik
1692,hanya,banyak


# INTEGRASI KAMUS SLANG


In [6]:
def slang_dict_integration_kamus_1(text, slang_dictionary=None):
  """Replace slang words in ``text`` with their formal form.

  Args:
    text: Whitespace-separated words.
    slang_dictionary: Optional mapping of slang word -> formal replacement.
      When omitted, the module-level ``SLANG_DICTIONARY_1`` is used, so
      existing single-argument calls (e.g. ``Series.apply``) are unchanged.

  Returns:
    The text with every word found in the dictionary replaced by its mapped
    value; other words are kept as-is. Words are re-joined with single spaces.
  """
  if slang_dictionary is None:
    slang_dictionary = SLANG_DICTIONARY_1

  # dict.get falls back to the word itself when there is no slang entry
  return " ".join(slang_dictionary.get(word, word) for word in text.split())

def slang_dict_integration_kamus_2(text, slang_dictionary=None):
  """Replace slang words in ``text`` with their formal form.

  Args:
    text: Whitespace-separated words.
    slang_dictionary: Optional mapping of slang word -> formal replacement.
      When omitted, the module-level ``SLANG_DICTIONARY_2`` is used, so
      existing single-argument calls (e.g. ``Series.apply``) are unchanged.

  Returns:
    The text with every word found in the dictionary replaced by its mapped
    value; other words are kept as-is. Words are re-joined with single spaces.
  """
  if slang_dictionary is None:
    slang_dictionary = SLANG_DICTIONARY_2

  # dict.get falls back to the word itself when there is no slang entry
  return " ".join(slang_dictionary.get(word, word) for word in text.split())

# Normalize slang with dictionary 1 first, then dictionary 2 on that result.
DATA_FRAME_DATASET["after_slang_text"] = (
  DATA_FRAME_DATASET["cleaned_text"]
  .apply(slang_dict_integration_kamus_1)
  .apply(slang_dict_integration_kamus_2)
)
display(DATA_FRAME_DATASET[["after_slang_text"]].tail())

Unnamed: 0,after_slang_text
4908,jokowi bilang ibukota negara baru bukan sekada...
4909,panitia tidak survey dulu ke eh before diorang...
4910,negara yang maju ekonominya seperti china sing...
4911,sebuah langkah besar menuju masa depan yang le...
4912,sebuah langkah besar menuju masa depan yang le...


# INTEGRASI KAMUS NEGASI


Combine each negation word with the word that follows it, joined by an underscore (\_).


In [7]:
def underscore_negation(text):
  """Join each negation word to the word after it with an underscore.

  Negation words are read from the "negasi" column of
  DATA_FRAME_NEGASI_DICTIONARY_1. A negation word at the very end of the
  text has nothing to attach to and is kept unchanged. The word that gets
  attached is consumed, so it is never treated as a negation itself.
  """
  tokens = text.split()
  negations = set(DATA_FRAME_NEGASI_DICTIONARY_1["negasi"].values)
  merged = []
  total = len(tokens)

  cursor = 0
  while cursor < total:
    current = tokens[cursor]
    has_follower = cursor + 1 < total
    if has_follower and current in negations:
      # fuse "negation next" into "negation_next" and step over both tokens
      merged.append(f"{current}_{tokens[cursor + 1]}")
      cursor += 2
    else:
      merged.append(current)
      cursor += 1

  return " ".join(merged)

# Mark negation pairs in the slang-normalized text and preview the result.
DATA_FRAME_DATASET["underscore_negation_text"] = (
  DATA_FRAME_DATASET["after_slang_text"].apply(underscore_negation)
)
display(DATA_FRAME_DATASET[["underscore_negation_text"]].tail())

Unnamed: 0,underscore_negation_text
4908,jokowi bilang ibukota negara baru bukan_sekada...
4909,panitia tidak_survey dulu ke eh before diorang...
4910,negara yang maju ekonominya seperti china sing...
4911,sebuah langkah besar menuju masa depan yang le...
4912,sebuah langkah besar menuju masa depan yang le...


Replace each negation pair with the antonym of the word that follows the negation word, if that word exists in the antonym dictionary.


In [8]:
def swap_antonyms(text, antonym_dict=None):
  """Replace "negation_word" tokens with the antonym of ``word``.

  Only tokens containing an underscore are considered. The part after the
  first underscore is looked up in the antonym mapping; when a usable
  antonym exists the whole token is replaced by it, otherwise the token is
  kept unchanged (underscore included).

  Args:
    text: Whitespace-separated tokens, with negation pairs joined by "_".
    antonym_dict: Optional mapping of word -> antonym. When omitted, the
      module-level ``ANTONYM_DICTIONARY_1`` is used instead of rebuilding
      the mapping from the DataFrame on every call.

  Returns:
    The text with negation pairs swapped for antonyms where available.
  """
  if antonym_dict is None:
    antonym_dict = ANTONYM_DICTIONARY_1

  new_words = []
  for word in text.split():
    # separator is "" when the token has no underscore, i.e. is not a pair
    _negation, separator, next_word = word.partition("_")
    antonym = antonym_dict.get(next_word) if separator else None

    replacement = ""
    # Non-string values (e.g. NaN from an empty CSV cell) are not usable.
    if isinstance(antonym, str):
      # Some entries list several alternatives ("bukan, tidak"); keep only
      # the first so no comma is reintroduced into the cleaned text.
      # NOTE(review): choosing the first alternative is an assumption — confirm.
      replacement = antonym.split(",")[0].strip()

    new_words.append(replacement if replacement else word)

  return " ".join(new_words)

# Swap marked negation pairs for antonyms and preview the result.
DATA_FRAME_DATASET["swap_negation_text"] = (
  DATA_FRAME_DATASET["underscore_negation_text"].apply(swap_antonyms)
)
display(DATA_FRAME_DATASET[["swap_negation_text"]].tail())

Unnamed: 0,swap_negation_text
4908,jokowi bilang ibukota negara baru bukan_sekada...
4909,panitia tidak_survey dulu ke eh before diorang...
4910,negara yang maju ekonominya seperti china sing...
4911,sebuah langkah besar menuju masa depan yang le...
4912,sebuah langkah besar menuju masa depan yang le...


Replace any remaining "\_" with a space in pairs for which no antonym was found.


In [9]:
def replace_underscore(text):
  """Return ``text`` with every underscore turned into a single space."""
  return text.replace("_", " ")

# Split the negation pairs that were not swapped back into separate words.
DATA_FRAME_DATASET["final_negation_text"] = (
  DATA_FRAME_DATASET["swap_negation_text"].apply(replace_underscore)
)
display(DATA_FRAME_DATASET[["final_negation_text"]].tail())

Unnamed: 0,final_negation_text
4908,jokowi bilang ibukota negara baru bukan sekada...
4909,panitia tidak survey dulu ke eh before diorang...
4910,negara yang maju ekonominya seperti china sing...
4911,sebuah langkah besar menuju masa depan yang le...
4912,sebuah langkah besar menuju masa depan yang le...


Compare the slang-normalized text with the text after negation integration to see which rows changed.


In [10]:
# Flag the rows whose text was altered by the negation steps.
DATA_FRAME_DATASET['is_different'] = (
  DATA_FRAME_DATASET['after_slang_text'] != DATA_FRAME_DATASET['final_negation_text']
)

# Show the before/after text for the flagged rows only.
changed_rows = DATA_FRAME_DATASET.loc[DATA_FRAME_DATASET['is_different']]
display(changed_rows.loc[:, ['after_slang_text', 'final_negation_text']])

Unnamed: 0,after_slang_text,final_negation_text
4,jangan lupa sih kuningg,ingat sih kuningg
27,ibukota negara baru itu dianggap strategis ban...,ibukota negara baru itu dianggap strategis ban...
37,bukan membela tapi memang benar jakarta masih ...,bukan membela tapi memang benar jakarta masih ...
49,agak bingung juga indonesia sekarang ibukota r...,agak bingung juga indonesia sekarang ibukota r...
56,dan sekarang beliau ragu tidak berani teken ke...,dan sekarang beliau ragu takut teken keputusan...
...,...,...
4845,kalau ibukota negara baru memang belum siap ke...,kalau ibukota negara baru memang belum siap ke...
4859,padahal sih mul sendiri tidak pernah referendu...,padahal sih mul sendiri belum referendum sama ...
4873,situasi stasiun manggarai jam sore coba saja i...,situasi stasiun manggarai jam sore coba saja i...
4874,bodo amat mau pakai apbn kayak mau tidak kayak...,bodo amat mau pakai apbn kayak mau tidak kayak...


# INTEGRASI KAMUS STOPWORDS


In [11]:
# Project stop words come only from the CSV dictionary. The previous code
# loaded NLTK's Indonesian list and cleared it right away, so that list never
# contributed; building the list directly gives the same result without
# requiring the NLTK corpus to be installed.
custom_stopwords = list(DATA_FRAME_SW_DICTIONARY_1["stopwords"].values)

# Stop words bundled with Sastrawi.
factory = StopWordRemoverFactory()
sastrawi_stopwords = factory.get_stop_words()

# A set gives fast membership tests when filtering every token.
combined_stopwords = set(custom_stopwords).union(set(sastrawi_stopwords))

def drop_stopwords(text):
  """Return ``text`` without the words found in ``combined_stopwords``.

  The text is split on whitespace and the surviving words are re-joined
  with single spaces.
  """
  kept_tokens = []
  for token in text.split():
    if token in combined_stopwords:
      continue
    kept_tokens.append(token)
  return " ".join(kept_tokens)

# Remove stop words from the negation-processed text and preview the result.
DATA_FRAME_DATASET["after_stopwords_text"] = (
  DATA_FRAME_DATASET["final_negation_text"].apply(drop_stopwords)
)
display(DATA_FRAME_DATASET[["after_stopwords_text"]].tail())

Unnamed: 0,after_stopwords_text
4908,jokowi bilang ibukota negara proyek presiden k...
4909,panitia survey eh before diorang pilih venue k...
4910,negara maju ekonominya china singapura jepang ...
4911,langkah berkesinambungan inklusif dukungan pem...
4912,langkah berkesinambungan inklusif dukungan pem...


# STEMMING


In [12]:
# Build the Sastrawi stemmer once; the stemming function below reuses it for every word.
# NOTE(review): this rebinds `factory`, which held the StopWordRemoverFactory in the stop-word cell.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_indonesian_text(text, index):
  """Stem every word of ``text`` with the module-level ``stemmer``.

  Args:
    text: Whitespace-separated words.
    index: Row position, used only for progress logging; a message is
      logged whenever it is a multiple of 1000.

  Returns:
    The stemmed words joined with single spaces.
  """
  at_checkpoint = index % 1000 == 0
  if at_checkpoint:
    logger.info(f"stemming progress: {index}/{DATA_FRAME_DATASET_LENGTH}")

  stemmed_tokens = []
  for token in text.split():
    stemmed_tokens.append(stemmer.stem(token))
  return " ".join(stemmed_tokens)

# Apply stemming with streaming logs. The column is built in one assignment
# instead of writing cell by cell with .at inside iterrows(): pandas warns
# against modifying a frame while iterating over it. Each row label is still
# passed along so progress is logged every 1000 rows.
DATA_FRAME_DATASET["after_stemming_text"] = [
  stem_indonesian_text(stopword_free_text, row_label)
  for row_label, stopword_free_text in zip(
    DATA_FRAME_DATASET.index, DATA_FRAME_DATASET["after_stopwords_text"]
  )
]

display(DATA_FRAME_DATASET["after_stemming_text"].tail().to_frame())

2025-01-03 20:52:36,920 - INFO - stemming progress: 0/4913
2025-01-03 20:54:38,036 - INFO - stemming progress: 1000/4913
2025-01-03 20:55:58,502 - INFO - stemming progress: 2000/4913
2025-01-03 20:57:14,571 - INFO - stemming progress: 3000/4913
2025-01-03 20:58:33,285 - INFO - stemming progress: 4000/4913


Unnamed: 0,after_stemming_text
4908,jokowi bilang ibukota negara proyek presiden p...
4909,panitia survey eh before orang pilih venue kat...
4910,negara maju ekonomi china singapura jepang kor...
4911,langkah sambung inklusif dukung bangun kota ne...
4912,langkah sambung inklusif dukung bangun kota ne...


# EXPORT INDONESIAN PREPROCESSED DATASET


In [13]:
# final column
DATA_FRAME_DATASET["preprocessed_text"] = DATA_FRAME_DATASET["after_stemming_text"]

DATA_FRAME_DATASET = DATA_FRAME_DATASET[
    DATA_FRAME_DATASET["preprocessed_text"].apply(lambda x: isinstance(x, str) and x.strip() != "")
]

# DATA_FRAME_DATASET.drop_duplicates(subset=["preprocessed_text"], inplace=True)
# DATA_FRAME_DATASET = DATA_FRAME_DATASET.reset_index(drop=True)

DATA_FRAME_DATASET.to_csv("outputs/preprocessed.csv", index=False)