# LIBRARIES


In [35]:
import logging
import re
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# INISIASI LOGGER


In [36]:
# Configure logging once for the notebook: INFO level and
# "timestamp - level - message" formatted lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# getLogger() without a name returns the root logger; the stemming cell uses
# it to report progress.
logger = logging.getLogger()

# INISIASI DATASET


In [37]:
# Load the merged tweet export and report its size.
DATASET_FILE_NAME = "merged_dataset.csv"
DATASET_FILE_PATH = f"datasets/merged/{DATASET_FILE_NAME}"
DATA_FRAME_DATASET = pd.read_csv(DATASET_FILE_PATH)
DATA_FRAME_DATASET_LENGTH = DATA_FRAME_DATASET.shape[0]
print(f"fresh dataset: {DATA_FRAME_DATASET_LENGTH} tweets")
display(DATA_FRAME_DATASET.tail(1))

# Drop the columns the preprocessing steps below never read.
DATA_FRAME_DATASET = DATA_FRAME_DATASET.drop(
  columns=["Unnamed: 0", "no", "urls", "retweet_count", "like_count"]
)
DATA_FRAME_DATASET_LENGTH = DATA_FRAME_DATASET.shape[0]
print(f"dataset after unused columns removed: {DATA_FRAME_DATASET_LENGTH} tweets")
display(DATA_FRAME_DATASET.tail(1))

# Keep only the first row for each distinct tweet text, then renumber the
# index from 0 so later steps can rely on consecutive labels.
DATA_FRAME_DATASET = (
  DATA_FRAME_DATASET
  .drop_duplicates(subset=["full_text"])
  .reset_index(drop=True)
)
DATA_FRAME_DATASET_LENGTH = DATA_FRAME_DATASET.shape[0]
print(f"dataset after duplicate removal: {DATA_FRAME_DATASET_LENGTH} tweets")
display(DATA_FRAME_DATASET.tail(1))

fresh dataset: 4593 tweets


Unnamed: 0.1,Unnamed: 0,no,urls,user_id,username,user_display_name,tweet_id,full_text,created_at,retweet_count,like_count
4592,4592,1005,[],1685924328851038208,maulani_salsa76,Maulani Salsa,1822028454411563237,@aminun40 Dukung pemindahan Ibu Kota Nusantara...,Fri Aug 09 21:53:29 +0000 2024,0,0


dataset after unused columns removed: 4593 tweets


Unnamed: 0,user_id,username,user_display_name,tweet_id,full_text,created_at
4592,1685924328851038208,maulani_salsa76,Maulani Salsa,1822028454411563237,@aminun40 Dukung pemindahan Ibu Kota Nusantara...,Fri Aug 09 21:53:29 +0000 2024


dataset after duplicate removal: 4042 tweets


Unnamed: 0,user_id,username,user_display_name,tweet_id,full_text,created_at
4041,1685924328851038208,maulani_salsa76,Maulani Salsa,1822028454411563237,@aminun40 Dukung pemindahan Ibu Kota Nusantara...,Fri Aug 09 21:53:29 +0000 2024


# TEXT CLEANING


In [38]:
def clean_text(text):
  """Normalize a raw tweet into lowercase, ASCII-only, single-spaced words.

  Steps, in order: strip the retweet tag, @mentions, literal ``\\uXXXX``
  escapes, newlines and non-ASCII characters; collapse letters repeated three
  or more times; strip URLs; lowercase; strip hashtags, digits and
  punctuation; squeeze spaces; finally expand the standalone word "ikn" to
  "ibu kota negara baru".

  Args:
    text: Raw tweet text. Non-string values (for example NaN produced by
      pandas for a missing cell) are treated as an empty tweet.

  Returns:
    The cleaned text, or "" when `text` is not a string.
  """
  # A missing tweet has nothing to clean; returning "" keeps the downstream
  # `.split()`-based steps working instead of raising inside re.sub.
  if not isinstance(text, str):
    return ""

  # remove RT tag (word boundary so words ending in "RT" such as "SMART" survive)
  text = re.sub(r'\bRT\s', '', text)
  # remove @_username
  text = re.sub(r"\@([\w]+)", " ", text)
  # replace emoji decode with space
  text = re.sub(r"\\u[a-zA-Z0-9]{4}", " ", text)
  # replace enter /n/ with space
  text = re.sub(r"\n\s", " ", text)
  text = re.sub(r"\n", " ", text)
  # remove non-ascii
  text = re.sub(r'[^\x00-\x7F]+',' ', text)
  # fix duplicate characters (ex: hellooooo): any run of 3+ identical letters
  # becomes one letter, regardless of how long the run is
  text = re.sub(r'([a-zA-Z])\1{2,}', r'\1', text)
  # replace url
  text = re.sub(r'http[s]?\:\/\/.[a-zA-Z0-9\.\/\_?=%&#\-\+!]+',' ', text)
  text = re.sub(r'pic.twitter.com?.[a-zA-Z0-9\.\/\_?=%&#\-\+!]+',' ', text)
  # convert to lowercase
  text = text.lower()
  # remove hashtag
  text = re.sub(r'\#[a-zA-Z0-9_]+','', text)
  # remove numbers
  text = re.sub(r'[0-9]+',' ', text)
  # remove symbols
  text = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,.\/]', ' ', text)
  # remove extra spaces to one space
  text = re.sub(r' +', ' ', text)
  # remove leading and trailing spaces
  text = re.sub(r'^[ ]|[ ]$','', text)
  # replace ikn with ibu kota negara baru; match whole words only so that
  # words merely containing "ikn" (e.g. "politiknya", "piknik") stay intact
  text = re.sub(r'\bikn\b', 'ibu kota negara baru', text)

  return text

# Clean every raw tweet and store the result next to the original text,
# then preview the last cleaned row.
DATA_FRAME_DATASET["cleaned_text"] = DATA_FRAME_DATASET["full_text"].apply(clean_text)
display(DATA_FRAME_DATASET[["cleaned_text"]].tail(1))

Unnamed: 0,cleaned_text
4041,dukung pemindahan ibu kota nusantara untuk men...


# INISIASI KAMUS


SLANG, STOPWORDS, DLL.


In [39]:
# --- Slang dictionaries: "slang" column -> "formal" column -----------------
SLANG_DICTIONARY_FILE_NAME_1 = "kamus_slang_1.csv"
SLANG_DICTIONARY_FILE_PATH_1 = "dictionaries/" + SLANG_DICTIONARY_FILE_NAME_1
DATA_FRAME_SLANG_DICTIONARY_1 = pd.read_csv(SLANG_DICTIONARY_FILE_PATH_1)

SLANG_DICTIONARY_FILE_NAME_2 = "kamus_slang_2.csv"
SLANG_DICTIONARY_FILE_PATH_2 = "dictionaries/" + SLANG_DICTIONARY_FILE_NAME_2
DATA_FRAME_SLANG_DICTIONARY_2 = pd.read_csv(SLANG_DICTIONARY_FILE_PATH_2)

# Plain dict lookups are what the slang integration functions use.
SLANG_DICTIONARY_1 = dict(zip(
  DATA_FRAME_SLANG_DICTIONARY_1["slang"],
  DATA_FRAME_SLANG_DICTIONARY_1["formal"],
))
SLANG_DICTIONARY_2 = dict(zip(
  DATA_FRAME_SLANG_DICTIONARY_2["slang"],
  DATA_FRAME_SLANG_DICTIONARY_2["formal"],
))

display(DATA_FRAME_SLANG_DICTIONARY_1.tail())
display(DATA_FRAME_SLANG_DICTIONARY_2.tail())

# --- Stopword list ("stopwords" column) -------------------------------------
SW_DICTIONARY_FILE_NAME_1 = "kamus_stopwords_1.csv"
SW_DICTIONARY_FILE_PATH_1 = "dictionaries/" + SW_DICTIONARY_FILE_NAME_1
DATA_FRAME_SW_DICTIONARY_1 = pd.read_csv(SW_DICTIONARY_FILE_PATH_1)

display(DATA_FRAME_SW_DICTIONARY_1.tail())

# --- Negation words ("negasi" column) ---------------------------------------
NEGASI_DICTIONARY_FILE_NAME_1 = "negasi.csv"
NEGASI_DICTIONARY_FILE_PATH_1 = "dictionaries/" + NEGASI_DICTIONARY_FILE_NAME_1
DATA_FRAME_NEGASI_DICTIONARY_1 = pd.read_csv(NEGASI_DICTIONARY_FILE_PATH_1)

display(DATA_FRAME_NEGASI_DICTIONARY_1.tail())

# --- Antonym dictionary: "word" column -> "antonim" column ------------------
ANTONYM_DICTIONARY_FILE_NAME_1 = "antonim_bahasa_indonesia.csv"
ANTONYM_DICTIONARY_FILE_PATH_1 = "dictionaries/" + ANTONYM_DICTIONARY_FILE_NAME_1
DATA_FRAME_ANTONYM_DICTIONARY_1 = pd.read_csv(ANTONYM_DICTIONARY_FILE_PATH_1)

ANTONYM_DICTIONARY_1 = dict(zip(
  DATA_FRAME_ANTONYM_DICTIONARY_1["word"],
  DATA_FRAME_ANTONYM_DICTIONARY_1["antonim"],
))

display(DATA_FRAME_ANTONYM_DICTIONARY_1.tail())

Unnamed: 0,slang,formal
3258,boz,bos
3259,mayan,lumayan
3260,ribed,ribet
3261,ntapz,mantap
3262,ntaps,mantap


Unnamed: 0,slang,formal
1492,bajing,anjing
1493,ngentod,anjing
1494,trouble,masalah
1495,tengkyu,terima kasih
1496,thanks,terima kasih


Unnamed: 0,stopwords
753,wong
754,yaitu
755,yakin
756,yakni
757,yang


Unnamed: 0,negasi
0,tidak
1,bukan
2,belum
3,tak
4,kurang


Unnamed: 0,word,antonim
1687,waspada,lalai
1688,ya,"bukan, tidak"
1689,yakin,ragu-ragu
1690,zalim,baik
1691,hanya,banyak


# INTEGRASI KAMUS SLANG


In [40]:
def slang_dict_integration_kamus_1(text):
  """Replace each whitespace-separated token found in SLANG_DICTIONARY_1.

  Tokens that are keys of SLANG_DICTIONARY_1 are swapped for the mapped
  value; every other token is kept as-is. The result is re-joined with
  single spaces.
  """
  normalized_tokens = [
    SLANG_DICTIONARY_1.get(token, token)  # fall back to the token itself
    for token in text.split()
  ]
  return " ".join(normalized_tokens)

def slang_dict_integration_kamus_2(text):
  """Replace each whitespace-separated token found in SLANG_DICTIONARY_2.

  Works token by token: a token that is a key of SLANG_DICTIONARY_2 is
  emitted as its mapped value, any other token is emitted unchanged. Tokens
  are joined back with single spaces.
  """
  return " ".join(
    SLANG_DICTIONARY_2[token] if token in SLANG_DICTIONARY_2 else token
    for token in text.split()
  )

# Normalize slang with dictionary 1 first, then run dictionary 2 over that
# result; only the final text is stored.
DATA_FRAME_DATASET["after_slang_text"] = (
  DATA_FRAME_DATASET["cleaned_text"]
  .apply(slang_dict_integration_kamus_1)
  .apply(slang_dict_integration_kamus_2)
)
display(DATA_FRAME_DATASET[["after_slang_text"]].tail())

Unnamed: 0,after_slang_text
4037,pemindahan ibu kota menggambarkan tekad indone...
4038,pemindahan ibu kota bukti bahwa kita serius da...
4039,pemindahan ibu kota nusantara akan membuka pel...
4040,presiden jokowi meyakini bahwa pemindahan ibu ...
4041,dukung pemindahan ibu kota nusantara untuk men...


# INTEGRASI KAMUS NEGASI


Combine each negation word with the word that follows it, joined by an underscore (\_).


In [41]:
def underscore_negation(text):
  """Glue each negation word to the token that follows it with "_".

  Negation words come from the "negasi" column of
  DATA_FRAME_NEGASI_DICTIONARY_1. A negation word that is the last token has
  nothing to attach to and is kept unchanged. The token consumed by a merge
  is not examined again, so "tidak tidak baik" becomes "tidak_tidak baik".
  """
  tokens = text.split()
  negations = set(DATA_FRAME_NEGASI_DICTIONARY_1["negasi"].values)
  merged_tokens = []

  last_position = len(tokens) - 1
  position = 0
  while position <= last_position:
    current = tokens[position]
    if position < last_position and current in negations:
      # Merge with the following token and step over it.
      merged_tokens.append(current + "_" + tokens[position + 1])
      position += 2
    else:
      merged_tokens.append(current)
      position += 1

  return " ".join(merged_tokens)

# Attach negation words to their following word for every slang-normalized
# tweet, then preview the last rows.
DATA_FRAME_DATASET["underscore_negation_text"] = (
  DATA_FRAME_DATASET["after_slang_text"].apply(underscore_negation)
)
display(DATA_FRAME_DATASET[["underscore_negation_text"]].tail())

Unnamed: 0,underscore_negation_text
4037,pemindahan ibu kota menggambarkan tekad indone...
4038,pemindahan ibu kota bukti bahwa kita serius da...
4039,pemindahan ibu kota nusantara akan membuka pel...
4040,presiden jokowi meyakini bahwa pemindahan ibu ...
4041,dukung pemindahan ibu kota nusantara untuk men...


Swap the word that follows a negation word with its antonym (if one exists in the antonym dictionary).


In [42]:
def swap_antonyms(text, antonym_dict=None):
  """Replace "negation_word" tokens with the antonym of `word` when known.

  Every token containing "_" is split at the first underscore; if the part
  after it has an antonym, the whole token is replaced by that antonym.
  Tokens without "_" and tokens whose word has no usable antonym are kept
  unchanged.

  Args:
    text: Space-separated text in which negations were already joined to the
      following word with "_".
    antonym_dict: Optional mapping of word -> antonym. Defaults to the
      module-level ANTONYM_DICTIONARY_1, which is built once when the
      dictionaries are loaded, so nothing is rebuilt per call.

  Returns:
    The text with known negated words replaced, joined by single spaces.
  """
  if antonym_dict is None:
    antonym_dict = ANTONYM_DICTIONARY_1

  new_words = []

  for word in text.split():
    if "_" not in word:
      new_words.append(word)
      continue

    _, next_word = word.split("_", 1)
    antonym = antonym_dict.get(next_word)

    # Dictionary entries may list several antonyms separated by commas
    # (e.g. "bukan, tidak"); use the first one so no punctuation leaks back
    # into the cleaned text. Non-string entries (e.g. NaN from an empty CSV
    # cell) are treated as "no antonym".
    alternatives = []
    if isinstance(antonym, str):
      alternatives = [part.strip() for part in antonym.split(",") if part.strip()]

    new_words.append(alternatives[0] if alternatives else word)

  return " ".join(new_words)

# Resolve "negation_word" tokens to antonyms for every tweet, then preview
# the last rows.
DATA_FRAME_DATASET["swap_negation_text"] = (
  DATA_FRAME_DATASET["underscore_negation_text"].apply(swap_antonyms)
)
display(DATA_FRAME_DATASET[["swap_negation_text"]].tail())

Unnamed: 0,swap_negation_text
4037,pemindahan ibu kota menggambarkan tekad indone...
4038,pemindahan ibu kota bukti bahwa kita serius da...
4039,pemindahan ibu kota nusantara akan membuka pel...
4040,presiden jokowi meyakini bahwa pemindahan ibu ...
4041,dukung pemindahan ibu kota nusantara untuk men...


Check the difference between the cleaned text and the text after slang normalization and negation integration.


In [43]:
# Mark rows whose text is no longer equal to the cleaned text once the slang
# and negation steps have run.
DATA_FRAME_DATASET['is_different'] = (
  DATA_FRAME_DATASET['cleaned_text'] != DATA_FRAME_DATASET['swap_negation_text']
)

# Show the before/after pair for the rows that changed.
changed_rows = DATA_FRAME_DATASET.loc[DATA_FRAME_DATASET['is_different']]
display(changed_rows[['cleaned_text', 'swap_negation_text']])

Unnamed: 0,cleaned_text,swap_negation_text
1,halo pemerintah telah mempersiapkan nusantara ...,halo pemerintah telah mempersiapkan nusantara ...
4,brengsek lu ngebet bikin ibu kota negara baru ...,brengsek kamu ngebet bikin ibu kota negara bar...
5,cuma kalimantan yg ga kena itu yaa ibu kota ba...,cuma kalimantan yang tidak_kena itu iya ibu ko...
6,dok anakku pernah ke salah satu dokter ibu kot...,dok anakku pernah ke salah satu dokter ibu kot...
7,reposted from halo pemerintah telah mempersiap...,reposted from halo pemerintah telah mempersiap...
...,...,...
3996,tapi ingat hanya indonesia saja pemindahan ibu...,tapi ingat hanya indonesia saja pemindahan ibu...
3997,woi goblok rancangan pemindahan ibu kota itu d...,woi bodoh rancangan pemindahan ibu kota itu du...
3998,tadi kan pemindahan bendera pusaka ke ibu kota...,tadi kan pemindahan bendera pusaka ke ibu kota...
4013,apa pentingnya pak kan masih ibu kota nusantar...,apa pentingnya pak kan masih ibu kota nusantar...


# INTEGRASI KAMUS STOPWORDS


In [44]:
# Project stopword list, taken from the "stopwords" column of the CSV.
# NOTE(review): NLTK's Indonesian stopword list is not merged in. The earlier
# version of this cell loaded it and cleared it straight away, so it never
# contributed any words; the dead load is removed and the resulting set is
# unchanged. Confirm that leaving NLTK's list out is intended.
custom_stopwords = list(DATA_FRAME_SW_DICTIONARY_1["stopwords"].values)

# Stopwords shipped with Sastrawi.
factory = StopWordRemoverFactory()
sastrawi_stopwords = factory.get_stop_words()

# Union of both sources as a set, for fast membership tests in drop_stopwords.
combined_stopwords = set(custom_stopwords).union(set(sastrawi_stopwords))

def drop_stopwords(text):
  """Return `text` without the tokens that appear in combined_stopwords.

  The text is split on whitespace, stopword tokens are discarded, and the
  remaining tokens are joined with single spaces.
  """
  kept_tokens = filter(lambda token: token not in combined_stopwords, text.split())
  return " ".join(kept_tokens)

# Remove stopwords from the negation-resolved text of every tweet, then
# preview the last rows.
DATA_FRAME_DATASET["after_stopwords_text"] = (
  DATA_FRAME_DATASET["swap_negation_text"].apply(drop_stopwords)
)
display(DATA_FRAME_DATASET[["after_stopwords_text"]].tail())

Unnamed: 0,after_stopwords_text
4037,pemindahan kota menggambarkan tekad indonesia ...
4038,pemindahan kota bukti serius mengelola sumber ...
4039,pemindahan kota nusantara membuka peluang bisn...
4040,presiden jokowi pemindahan kota nusantara memb...
4041,dukung pemindahan kota nusantara mengoptimalka...


# STEMMING


In [45]:
# Build the Sastrawi stemmer once; stem_indonesian_text reuses this instance.
# Note: `factory` is rebound here (it held the StopWordRemoverFactory before).
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_indonesian_text(text, index):
  """Stem every whitespace-separated token of `text` with the shared stemmer.

  Args:
    text: Space-separated text to stem.
    index: Row position of `text`; used only for progress logging, which
      fires whenever `index` is a multiple of 100.

  Returns:
    The stemmed tokens joined with single spaces.
  """
  # logger
  is_checkpoint = index % 100 == 0
  if is_checkpoint:
    logger.info(f"stemming progress: {index}/{DATA_FRAME_DATASET_LENGTH}")

  stemmed_tokens = map(stemmer.stem, text.split())
  return " ".join(stemmed_tokens)

# apply stemming with streaming logs
# Build the whole column in one assignment instead of writing cell by cell
# with iterrows() + .at, which modifies the frame while it is being iterated.
# Series.items() yields (index label, text) pairs, so stem_indonesian_text
# still receives the row index it needs for its progress log.
DATA_FRAME_DATASET["after_stemming_text"] = [
  stem_indonesian_text(text, index)
  for index, text in DATA_FRAME_DATASET["after_stopwords_text"].items()
]

display(DATA_FRAME_DATASET["after_stemming_text"].tail().to_frame())

2024-10-01 13:07:29,304 - INFO - stemming progress: 0/4042
2024-10-01 13:07:40,798 - INFO - stemming progress: 100/4042
2024-10-01 13:07:51,322 - INFO - stemming progress: 200/4042
2024-10-01 13:08:06,040 - INFO - stemming progress: 300/4042
2024-10-01 13:08:15,803 - INFO - stemming progress: 400/4042
2024-10-01 13:08:21,595 - INFO - stemming progress: 500/4042
2024-10-01 13:08:28,418 - INFO - stemming progress: 600/4042
2024-10-01 13:08:32,274 - INFO - stemming progress: 700/4042
2024-10-01 13:08:37,103 - INFO - stemming progress: 800/4042
2024-10-01 13:08:41,014 - INFO - stemming progress: 900/4042
2024-10-01 13:08:45,453 - INFO - stemming progress: 1000/4042
2024-10-01 13:08:49,271 - INFO - stemming progress: 1100/4042
2024-10-01 13:08:55,195 - INFO - stemming progress: 1200/4042
2024-10-01 13:08:59,263 - INFO - stemming progress: 1300/4042
2024-10-01 13:09:04,585 - INFO - stemming progress: 1400/4042
2024-10-01 13:09:11,074 - INFO - stemming progress: 1500/4042
2024-10-01 13:09:19,

Unnamed: 0,after_stemming_text
4037,pindah kota gambar tekad indonesia capai tuju
4038,pindah kota bukti serius kelola sumber daya alam
4039,pindah kota nusantara buka peluang bisnis warg...
4040,presiden jokowi pindah kota nusantara buka pel...
4041,dukung pindah kota nusantara optimal guna sumb...


# EXPORT INDONESIAN PREPROCESSED DATASET


In [46]:
# final column
DATA_FRAME_DATASET["ind_preprocessed_text"] = DATA_FRAME_DATASET["after_stemming_text"]

# to_csv does not create missing folders; make sure the target directory
# exists so the export cannot fail after the long stemming step.
OUTPUT_FILE_PATH = Path("outputs/ind-preprocessed.csv")
OUTPUT_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)

DATA_FRAME_DATASET.to_csv(OUTPUT_FILE_PATH, index=False)