# LIBRARIES


In [None]:
import logging
import os
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


# INISIASI LOGGER


In [34]:
# Configure the root logger: INFO level, "<timestamp> - <level> - <message>" lines.
logging.basicConfig(
  format='%(asctime)s - %(levelname)s - %(message)s',
  level=logging.INFO,
)
# Handle to the root logger, used by later cells for progress messages.
logger = logging.getLogger()

# INISIASI DATASET


In [None]:
# Load the merged tweet dataset from disk and report its size.
DATASET_FILE_NAME = "merged_dataset.csv"
DATASET_FILE_PATH = f"datasets/merged/{DATASET_FILE_NAME}"
DATA_FRAME_DATASET = pd.read_csv(DATASET_FILE_PATH)
DATA_FRAME_DATASET_LENGTH = len(DATA_FRAME_DATASET)
print(f"fresh dataset: {DATA_FRAME_DATASET_LENGTH} tweets")
display(DATA_FRAME_DATASET.tail(1))

# Drop columns that the later cells visible in this notebook never reference.
DATA_FRAME_DATASET = DATA_FRAME_DATASET.drop(
  columns=["Unnamed: 0", "no", "urls", "retweet_count", "like_count"]
)
DATA_FRAME_DATASET_LENGTH = len(DATA_FRAME_DATASET)
print(f"dataset after unused columns removed: {DATA_FRAME_DATASET_LENGTH} tweets")
display(DATA_FRAME_DATASET.tail(1))

# Keep one row per distinct `full_text`, then renumber the rows from 0 so the
# index is contiguous again (the stemming progress log relies on the index).
DATA_FRAME_DATASET = (
  DATA_FRAME_DATASET
  .drop_duplicates(subset=["full_text"])
  .reset_index(drop=True)
)
DATA_FRAME_DATASET_LENGTH = len(DATA_FRAME_DATASET)
print(f"dataset after duplicate removal: {DATA_FRAME_DATASET_LENGTH} tweets")
display(DATA_FRAME_DATASET.tail(1))

# TEXT CLEANING


In [None]:
def clean_text(text):
  """Normalize one raw tweet into lowercase, space-separated plain words.

  Steps, in order: strip the retweet marker, @mentions, escaped ``\\uXXXX``
  sequences, newlines, non-ASCII characters, elongated letters, URLs,
  hashtags, digits and punctuation; collapse whitespace; finally expand the
  standalone abbreviation "ikn" to "ibu kota negara baru".

  Args:
    text: Raw tweet text. Any non-string value (e.g. the float NaN pandas
      uses for an empty CSV cell) is treated as an empty tweet.

  Returns:
    The cleaned text, or "" when `text` is not a string.
  """
  # Empty CSV cells arrive as float NaN; re.sub would raise TypeError on them.
  if not isinstance(text, str):
    return ""

  # remove RT tag -- \b keeps words that merely end in "RT" (e.g. "START ") intact
  text = re.sub(r'\bRT\s', '', text)
  # remove @_username
  text = re.sub(r"\@([\w]+)", " ", text)
  # replace escaped emoji code points (literal "\uXXXX" text) with space
  text = re.sub(r"\\u[a-zA-Z0-9]{4}", " ", text)
  # replace newline (plus one optional following whitespace char) with space
  text = re.sub(r"\n\s?", " ", text)
  # remove non-ascii
  text = re.sub(r'[^\x00-\x7F]+',' ', text)
  # collapse any run of 3+ identical letters to one (ex: hellooooo -> hello);
  # matching exactly three would leave "hellooo" behind for longer runs
  text = re.sub(r'([a-zA-Z])\1{2,}','\\1', text)
  # replace url
  text = re.sub(r'http[s]?\:\/\/.[a-zA-Z0-9\.\/\_?=%&#\-\+!]+',' ', text)
  text = re.sub(r'pic.twitter.com?.[a-zA-Z0-9\.\/\_?=%&#\-\+!]+',' ', text)
  # convert to lowercase
  text = text.lower()
  # remove hashtag
  text = re.sub(r'\#[a-zA-Z0-9_]+','', text)
  # remove numbers
  text = re.sub(r'[0-9]+',' ', text)
  # remove symbols
  text = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,.\/]', ' ', text)
  # collapse every whitespace run (spaces, tabs, \r) to one space and trim the ends
  text = re.sub(r'\s+', ' ', text).strip()
  # expand "ikn" only as a whole word; a plain substring replace would corrupt
  # ordinary words that contain it, such as "baiknya" or "piknik"
  text = re.sub(r'\bikn\b', 'ibu kota negara baru', text)

  return text

# Clean every raw tweet and store the result next to the original text.
DATA_FRAME_DATASET["cleaned_text"] = DATA_FRAME_DATASET["full_text"].apply(clean_text)
# Preview the last cleaned row as a one-column frame.
display(DATA_FRAME_DATASET[["cleaned_text"]].tail(1))

# INISIASI KAMUS


SLANG, STOPWORDS, DLL.


> Source
>
> - [kamus_slang_1](https://github.com/lailikanabila/ANALISIS-SENTIMEN-PADA-PERPINDAHAN-IBUKOTA-INDONESIA-DENGAN-ALGORITMA-SUPPORT-VECTOR-MACHINE/blob/main/Dictionary/Stemming-Kamus_Alay-%20nasalsabila/_json_colloquial-indonesian-lexicon.txt)
> - [kamus_slang_2](https://github.com/MreRes/sentiment-analysis-knn/blob/main/kbba.txt)
> - [kamus_stopwords](https://github.com/lailikanabila/ANALISIS-SENTIMEN-PADA-PERPINDAHAN-IBUKOTA-INDONESIA-DENGAN-ALGORITMA-SUPPORT-VECTOR-MACHINE/tree/main/Dictionary/StopWord)

> References
>
> - `kamus_stopwords`
>   - [1] Tala, F. Z. (2003). A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia. M.Sc. Thesis. Master of Logic Project. Institute for Logic, Language and Computation. Universiteit van Amsterdam, The Netherlands.


In [None]:
# First slang dictionary (read as a frame with `slang` and `formal` columns).
SLANG_DICTIONARY_FILE_NAME_1 = "kamus_slang_1.csv"
SLANG_DICTIONARY_FILE_PATH_1 = f"dictionaries/{SLANG_DICTIONARY_FILE_NAME_1}"
DATA_FRAME_SLANG_DICTIONARY_1 = pd.read_csv(SLANG_DICTIONARY_FILE_PATH_1)

# Second slang dictionary, same column layout.
SLANG_DICTIONARY_FILE_NAME_2 = "kamus_slang_2.csv"
SLANG_DICTIONARY_FILE_PATH_2 = f"dictionaries/{SLANG_DICTIONARY_FILE_NAME_2}"
DATA_FRAME_SLANG_DICTIONARY_2 = pd.read_csv(SLANG_DICTIONARY_FILE_PATH_2)

display(DATA_FRAME_SLANG_DICTIONARY_1.tail(), DATA_FRAME_SLANG_DICTIONARY_2.tail())

# slang -> formal lookup tables; when a slang entry repeats, the last row wins.
SLANG_DICTIONARY_1 = dict(zip(
  DATA_FRAME_SLANG_DICTIONARY_1["slang"],
  DATA_FRAME_SLANG_DICTIONARY_1["formal"],
))
SLANG_DICTIONARY_2 = dict(zip(
  DATA_FRAME_SLANG_DICTIONARY_2["slang"],
  DATA_FRAME_SLANG_DICTIONARY_2["formal"],
))

# Stopword dictionary (its `stopwords` column is read by the stopwords cell).
SW_DICTIONARY_FILE_NAME_1 = "kamus_stopwords_1.csv"
SW_DICTIONARY_FILE_PATH_1 = f"dictionaries/{SW_DICTIONARY_FILE_NAME_1}"
DATA_FRAME_SW_DICTIONARY_1 = pd.read_csv(SW_DICTIONARY_FILE_PATH_1)

display(DATA_FRAME_SW_DICTIONARY_1.tail())

# INTEGRASI KAMUS SLANG


In [None]:
def slang_dict_integration(text, dictionaries=None):
  """Replace slang words in `text` with their formal equivalents.

  Each whitespace-separated word is looked up in the dictionaries in order;
  the first dictionary that maps it to a string supplies the replacement.
  Words found in no dictionary are kept unchanged.

  Args:
    text: Space-separated text to standardize.
    dictionaries: Optional sequence of slang -> formal mappings, in priority
      order. Defaults to (SLANG_DICTIONARY_1, SLANG_DICTIONARY_2).

  Returns:
    The standardized words joined by single spaces.
  """
  if dictionaries is None:
    dictionaries = (SLANG_DICTIONARY_1, SLANG_DICTIONARY_2)

  standardization_words = []
  for word in text.split():
    replacement = word
    for dictionary in dictionaries:
      candidate = dictionary.get(word)
      # An empty `formal` cell in the CSV is loaded as float NaN; appending it
      # would make " ".join raise TypeError, so only string values are used.
      if isinstance(candidate, str):
        replacement = candidate
        break
    standardization_words.append(replacement)

  return " ".join(standardization_words)

# Standardize slang in every cleaned tweet.
DATA_FRAME_DATASET["after_slang_text"] = DATA_FRAME_DATASET["cleaned_text"].apply(slang_dict_integration)
# Preview the last rows as a one-column frame.
display(DATA_FRAME_DATASET[["after_slang_text"]].tail())

# INTEGRASI KAMUS STOPWORDS


Menggunakan modul `stopwords` dari library Python NLTK agar dapat memakai _custom dictionary_ kumpulan _stopword_ bahasa Indonesia (`stopwords-id`) karya Gene Diaz, yang dapat diakses melalui [https://github.com/stopwords-iso/stopwords-id](https://github.com/stopwords-iso/stopwords-id). Kamus `stopwords-id` berlisensi `MIT License`.


In [None]:
# Project stopword list, taken from the `stopwords` column of the dictionary CSV.
# (Previously the NLTK Indonesian list was loaded and immediately cleared, so it
# never contributed any words; that dead load is removed.)
custom_stopwords = list(DATA_FRAME_SW_DICTIONARY_1["stopwords"].values)

# Stopword list shipped with Sastrawi.
factory = StopWordRemoverFactory()
sastrawi_stopwords = factory.get_stop_words()

# Union of both lists as a set, so membership tests in drop_stopwords are O(1).
combined_stopwords = set(custom_stopwords).union(sastrawi_stopwords)

def drop_stopwords(text):
  """Return `text` without the words that appear in `combined_stopwords`.

  The text is split on whitespace and the surviving words are re-joined with
  single spaces, in their original order.
  """
  kept_words = []
  for candidate in text.split():
    if candidate in combined_stopwords:
      continue
    kept_words.append(candidate)
  return " ".join(kept_words)

# Remove stopwords from every slang-standardized tweet.
DATA_FRAME_DATASET["after_stopwords_text"] = DATA_FRAME_DATASET["after_slang_text"].apply(drop_stopwords)
# Preview the last rows as a one-column frame.
display(DATA_FRAME_DATASET[["after_stopwords_text"]].tail())

# STEMMING


In [None]:
# Build the Sastrawi stemmer that stem_indonesian_text calls for every word.
# Note: this rebinds `factory`, which the stopwords cell had assigned a
# StopWordRemoverFactory; from here on `factory` is a StemmerFactory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_indonesian_text(text, index):
  """Stem every whitespace-separated word of `text` with the module-level `stemmer`.

  Args:
    text: Space-separated text to stem.
    index: Row index of `text`; used only to emit a progress log line whenever
      it is a multiple of 100.

  Returns:
    The stemmed words joined by single spaces.
  """
  at_progress_checkpoint = index % 100 == 0
  if at_progress_checkpoint:
    logger.info(f"stemming progress: {index}/{DATA_FRAME_DATASET_LENGTH}")

  stemmed_words = map(stemmer.stem, text.split())
  return " ".join(stemmed_words)

# Apply stemming to every row, passing the row index so stem_indonesian_text
# can log progress. The whole column is built in one pass and assigned once:
# this avoids iterrows() and per-cell `.at` enlargement, and it still creates
# the column when the frame is empty, so the display below cannot raise KeyError.
DATA_FRAME_DATASET["after_stemming_text"] = [
  stem_indonesian_text(text, index)
  for index, text in zip(DATA_FRAME_DATASET.index, DATA_FRAME_DATASET["after_stopwords_text"])
]

display(DATA_FRAME_DATASET["after_stemming_text"].tail().to_frame())

# EXPORT INDONESIAN PREPROCESSED DATASET


In [42]:
# final column
DATA_FRAME_DATASET["ind_preprocessed_text"] = DATA_FRAME_DATASET["after_stemming_text"]

# to_csv does not create missing parent directories; make sure the output
# directory exists so the export cannot fail after the long stemming step.
OUTPUT_FILE_PATH = "outputs/ind-preprocessed.csv"
os.makedirs(os.path.dirname(OUTPUT_FILE_PATH), exist_ok=True)
DATA_FRAME_DATASET.to_csv(OUTPUT_FILE_PATH, index=False)