# LIBRARIES


In [11]:
import pandas as pd
import string 
import re
import nltk
import logging
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from deep_translator import GoogleTranslator
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# INISIALISASI LOGGER


In [12]:
# Root logging setup: INFO and above, each record rendered as
# "<timestamp> - <level name> - <message>".
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# getLogger() without a name returns the root logger.
logger = logging.getLogger()

# INISIALISASI DATASET


In [13]:
# Load the merged tweet dataset from disk and report its size.
DATASET_FILE_NAME = "merged_dataset.csv"
DATASET_FILE_PATH = f"datasets/merged/{DATASET_FILE_NAME}"
DATA_FRAME_DATASET = pd.read_csv(DATASET_FILE_PATH)
DATA_FRAME_DATASET_LENGTH = len(DATA_FRAME_DATASET)
print(f"fresh dataset: {DATA_FRAME_DATASET_LENGTH} tweets")
display(DATA_FRAME_DATASET.tail(1))

# Drop the columns that none of the visible preprocessing steps read.
UNUSED_COLUMNS = ["Unnamed: 0", "no", "urls", "retweet_count", "like_count"]
DATA_FRAME_DATASET = DATA_FRAME_DATASET.drop(columns=UNUSED_COLUMNS)
DATA_FRAME_DATASET_LENGTH = len(DATA_FRAME_DATASET)
print(f"dataset after unused columns removed: {DATA_FRAME_DATASET_LENGTH} tweets")
display(DATA_FRAME_DATASET.tail(1))

# Keep the first row for every distinct tweet text, then renumber the rows
# from 0 so the row labels are consecutive again.
DATA_FRAME_DATASET = (
  DATA_FRAME_DATASET
  .drop_duplicates(subset=["full_text"])
  .reset_index(drop=True)
)
DATA_FRAME_DATASET_LENGTH = len(DATA_FRAME_DATASET)
print(f"dataset after duplicate removal: {DATA_FRAME_DATASET_LENGTH} tweets")
display(DATA_FRAME_DATASET.tail(1))

fresh dataset: 4593 tweets


Unnamed: 0.1,Unnamed: 0,no,urls,user_id,username,user_display_name,tweet_id,full_text,created_at,retweet_count,like_count
4592,4592,1005,[],1685924328851038208,maulani_salsa76,Maulani Salsa,1822028454411563237,@aminun40 Dukung pemindahan Ibu Kota Nusantara...,Fri Aug 09 21:53:29 +0000 2024,0,0


dataset after unused columns removed: 4593 tweets


Unnamed: 0,user_id,username,user_display_name,tweet_id,full_text,created_at
4592,1685924328851038208,maulani_salsa76,Maulani Salsa,1822028454411563237,@aminun40 Dukung pemindahan Ibu Kota Nusantara...,Fri Aug 09 21:53:29 +0000 2024


dataset after duplicate removal: 4042 tweets


Unnamed: 0,user_id,username,user_display_name,tweet_id,full_text,created_at
4041,1685924328851038208,maulani_salsa76,Maulani Salsa,1822028454411563237,@aminun40 Dukung pemindahan Ibu Kota Nusantara...,Fri Aug 09 21:53:29 +0000 2024


# TEXT CLEANING


In [14]:
def clean_text(text):
  """Normalize a tweet into lowercase, space-separated ASCII words.

  Steps, in order: drop the retweet marker, @mentions, literal backslash-u
  escape sequences, newlines and non-ASCII characters; collapse any letter
  repeated three or more times into one; strip URLs; lowercase; drop
  hashtags, digits and punctuation; squeeze whitespace; finally expand the
  standalone token "ikn" to "ibu kota negara baru".

  Args:
    text: Raw text. Anything that is not a ``str`` (for example the NaN
      pandas uses for a missing cell, or ``None``) is treated as empty.

  Returns:
    The cleaned string; ``""`` for non-string input.
  """
  # re.sub raises TypeError on non-strings, so handle missing values up front.
  if not isinstance(text, str):
    return ""

  # remove RT tag; \b keeps words that merely end in "RT" (e.g. "SMART") intact
  text = re.sub(r'\bRT\s', '', text)
  # remove @_username
  text = re.sub(r"\@([\w]+)", " ", text)
  # replace emoji decode with space
  text = re.sub(r"\\u[a-zA-Z0-9]{4}", " ", text)
  # replace newline (optionally followed by one whitespace char) with space
  text = re.sub(r"\n\s", " ", text)
  text = re.sub(r"\n", " ", text)
  # remove non-ascii
  text = re.sub(r'[^\x00-\x7F]+',' ', text)
  # collapse a letter repeated 3+ times into one (ex: hellooooo -> hello);
  # {2,} consumes the whole run instead of only the first three characters
  text = re.sub(r'([a-zA-Z])\1{2,}', r'\1', text)
  # replace url
  text = re.sub(r'http[s]?\:\/\/.[a-zA-Z0-9\.\/\_?=%&#\-\+!]+',' ', text)
  text = re.sub(r'pic.twitter.com?.[a-zA-Z0-9\.\/\_?=%&#\-\+!]+',' ', text)
  # convert to lowercase
  text = text.lower()
  # remove hashtag
  text = re.sub(r'\#[a-zA-Z0-9_]+','', text)
  # remove numbers
  text = re.sub(r'[0-9]+',' ', text)
  # remove symbols
  text = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,.\/]', ' ', text)
  # remove extra spaces to one space
  text = re.sub(r' +', ' ', text)
  # remove leading and trailing spaces
  text = re.sub(r'^[ ]|[ ]$','', text)
  # expand the abbreviation "ikn" only when it is a whole word, so words that
  # merely contain those letters (e.g. "piknik") are left untouched
  text = re.sub(r'\bikn\b', 'ibu kota negara baru', text)

  return text

# Run the cleaner over every raw tweet and store the result in a new column,
# leaving "full_text" untouched.
DATA_FRAME_DATASET["cleaned_text"] = DATA_FRAME_DATASET["full_text"].map(clean_text)
display(DATA_FRAME_DATASET[["cleaned_text"]].tail(1))

Unnamed: 0,cleaned_text
4041,dukung pemindahan ibu kota nusantara untuk men...


# INISIALISASI KAMUS


SLANG, DLL.


In [15]:
# Locations of the two slang lookup tables.
SLANG_DICTIONARY_FILE_NAME_1 = "kamus_slang_1.csv"
SLANG_DICTIONARY_FILE_PATH_1 = f"dictionaries/{SLANG_DICTIONARY_FILE_NAME_1}"
SLANG_DICTIONARY_FILE_NAME_2 = "kamus_slang_2.csv"
SLANG_DICTIONARY_FILE_PATH_2 = f"dictionaries/{SLANG_DICTIONARY_FILE_NAME_2}"

# Each CSV is read into a frame; the code below uses its "slang" and "formal" columns.
DATA_FRAME_SLANG_DICTIONARY_1 = pd.read_csv(SLANG_DICTIONARY_FILE_PATH_1)
DATA_FRAME_SLANG_DICTIONARY_2 = pd.read_csv(SLANG_DICTIONARY_FILE_PATH_2)

display(DATA_FRAME_SLANG_DICTIONARY_1.tail(), DATA_FRAME_SLANG_DICTIONARY_2.tail())

# Flatten each frame into a plain {slang: formal} dict for fast word lookups.
# If a slang word appears more than once, the last row wins.
SLANG_DICTIONARY_1 = dict(zip(
  DATA_FRAME_SLANG_DICTIONARY_1["slang"],
  DATA_FRAME_SLANG_DICTIONARY_1["formal"],
))
SLANG_DICTIONARY_2 = dict(zip(
  DATA_FRAME_SLANG_DICTIONARY_2["slang"],
  DATA_FRAME_SLANG_DICTIONARY_2["formal"],
))

Unnamed: 0,slang,formal
3258,boz,bos
3259,mayan,lumayan
3260,ribed,ribet
3261,ntapz,mantap
3262,ntaps,mantap


Unnamed: 0,slang,formal
1492,bajing,anjing
1493,ngentod,anjing
1494,trouble,masalah
1495,tengkyu,terima kasih
1496,thanks,terima kasih


# INTEGRASI KAMUS SLANG


In [16]:
def slang_dict_integration(text, dictionaries=None):
  """Replace slang words in ``text`` with their formal spelling.

  The text is split on whitespace and each word is looked up in the given
  dictionaries in order; the first dictionary that holds a string
  replacement wins. Words found in no dictionary are kept as they are.

  Args:
    text: Whitespace-separated words to standardize.
    dictionaries: Optional iterable of ``{slang: formal}`` mappings, in
      priority order. Defaults to ``SLANG_DICTIONARY_1`` followed by
      ``SLANG_DICTIONARY_2``.

  Returns:
    The standardized words joined by single spaces.
  """
  if dictionaries is None:
    # Default lookup order: the first dictionary takes precedence.
    dictionaries = (SLANG_DICTIONARY_1, SLANG_DICTIONARY_2)

  standardization_words = []
  for word in text.split():
    replacement = word
    for dictionary in dictionaries:
      candidate = dictionary.get(word)
      # Only accept real strings: an empty CSV cell is presumably loaded as
      # NaN (a float), which would make the final join raise TypeError.
      if isinstance(candidate, str):
        replacement = candidate
        break
    standardization_words.append(replacement)

  return " ".join(standardization_words)

# Standardize slang in every cleaned tweet and keep the result in its own column.
DATA_FRAME_DATASET["after_slang_text"] = DATA_FRAME_DATASET["cleaned_text"].map(slang_dict_integration)
display(DATA_FRAME_DATASET[["after_slang_text"]].tail())

Unnamed: 0,after_slang_text
4037,pemindahan ibu kota menggambarkan tekad indone...
4038,pemindahan ibu kota bukti bahwa kita serius da...
4039,pemindahan ibu kota nusantara akan membuka pel...
4040,presiden jokowi meyakini bahwa pemindahan ibu ...
4041,dukung pemindahan ibu kota nusantara untuk men...


# TRANSLATE


In [17]:
# Translate every slang-normalized tweet from Indonesian ('id') to English ('en').
# The column starts as a copy of the source text and is overwritten row by row.
DATA_FRAME_DATASET['translated_text'] = DATA_FRAME_DATASET['after_slang_text']
data = DATA_FRAME_DATASET

# Build the translator once instead of once per tweet / per sentence.
translator = GoogleTranslator(source='id', target='en')
total_tweets = len(data)

# .items() yields the real row label, so the write-back below does not depend
# on the index being exactly 0..n-1; `position` only drives the progress report.
for position, (row_label, source_text) in enumerate(data['after_slang_text'].items(), start=1):
    try:
        translated = translator.translate(source_text)
    except Exception as error:
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt
        # still stops this long-running loop.
        logger.warning("row %s: full-text translation failed (%s); retrying sentence by sentence",
                       row_label, error)
        translated_sentences = []
        for sentence in sent_tokenize(source_text):
            try:
                piece = translator.translate(sentence)
            except Exception as sentence_error:
                # Best effort: skip the sentence that cannot be translated.
                logger.warning("row %s: sentence skipped (%s)", row_label, sentence_error)
                continue
            if isinstance(piece, str):
                translated_sentences.append(piece)
        translated = " ".join(translated_sentences)

    # Never store a non-string result: the next cell feeds this column to regexes.
    if not isinstance(translated, str):
        logger.warning("row %s: translator returned no text; storing an empty string", row_label)
        translated = ""
    data.at[row_label, 'translated_text'] = translated

    # Progress report every 100 tweets, plus a final line when the total is
    # not a multiple of 100.
    if position % 100 == 0:
        print("{0} tweet translated out of {1} --- {2:.2f}%".format(position, total_tweets, (position / total_tweets) * 100))
    elif position == total_tweets:
        print("{0} tweet translated out of {1} --- 100%".format(position, total_tweets))


100 tweet translated out of 4042 --- 2.47%
200 tweet translated out of 4042 --- 4.95%
300 tweet translated out of 4042 --- 7.42%
400 tweet translated out of 4042 --- 9.90%
500 tweet translated out of 4042 --- 12.37%
600 tweet translated out of 4042 --- 14.84%
700 tweet translated out of 4042 --- 17.32%
800 tweet translated out of 4042 --- 19.79%
900 tweet translated out of 4042 --- 22.27%
1000 tweet translated out of 4042 --- 24.74%
1100 tweet translated out of 4042 --- 27.21%
1200 tweet translated out of 4042 --- 29.69%
1300 tweet translated out of 4042 --- 32.16%
1400 tweet translated out of 4042 --- 34.64%
1500 tweet translated out of 4042 --- 37.11%
1600 tweet translated out of 4042 --- 39.58%
1700 tweet translated out of 4042 --- 42.06%
1800 tweet translated out of 4042 --- 44.53%
1900 tweet translated out of 4042 --- 47.01%
2000 tweet translated out of 4042 --- 49.48%
2100 tweet translated out of 4042 --- 51.95%
2200 tweet translated out of 4042 --- 54.43%
2300 tweet translated o

In [22]:
# Preview the last five translated tweets as a one-column frame.
display(DATA_FRAME_DATASET[["translated_text"]].tail())

Unnamed: 0,translated_text
4037,The relocation of the capital city illustrates...
4038,relocating the capital is proof that we are se...
4039,The relocation of the Indonesian capital will ...
4040,President Jokowi believes that moving the capi...
4041,support the relocation of the Indonesian capit...


# CLEAN AGAIN


In [23]:
# Second cleaning pass: run the same cleaner over the English translations,
# overwriting the column in place.
# NOTE(review): clean_text also rewrites "ikn" into an Indonesian phrase, which
# here would be applied to English text - confirm that this is intended.
DATA_FRAME_DATASET["translated_text"] = DATA_FRAME_DATASET["translated_text"].map(clean_text)
display(DATA_FRAME_DATASET[["translated_text"]].tail())

Unnamed: 0,translated_text
4037,the relocation of the capital city illustrates...
4038,relocating the capital is proof that we are se...
4039,the relocation of the indonesian capital will ...
4040,president jokowi believes that moving the capi...
4041,support the relocation of the indonesian capit...


# INTEGRASI STOPWORDS DEFAULT NLTK


In [24]:
# NLTK's English stop-word list, stored as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

def drop_stopwords(text):
  """Return ``text`` with every whitespace-separated word found in ``stop_words`` removed.

  The comparison is exact (case-sensitive); the remaining words are joined
  back together with single spaces.
  """
  kept_words = (word for word in text.split() if word not in stop_words)
  return " ".join(kept_words)

# Strip stop words from the cleaned English text into a new column.
DATA_FRAME_DATASET["after_stopwords_text"] = DATA_FRAME_DATASET["translated_text"].map(drop_stopwords)
display(DATA_FRAME_DATASET[["after_stopwords_text"]].tail())

Unnamed: 0,after_stopwords_text
4037,relocation capital city illustrates indonesia'...
4038,relocating capital proof serious managing natu...
4039,relocation indonesian capital open new busines...
4040,president jokowi believes moving capital city ...
4041,support relocation indonesian capital optimize...


# LEMMATISASI


In [25]:
# One shared WordNet lemmatizer instance for the whole notebook.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
  """Lemmatize every whitespace-separated word of ``text`` and re-join with spaces.

  NOTE(review): no part-of-speech is passed to ``lemmatize``; the recorded
  output of this notebook shows "believes" becoming "belief", so verbs are
  apparently not treated as verbs - confirm whether POS tagging is wanted.
  """
  lemmas = map(lemmatizer.lemmatize, text.split())
  return " ".join(lemmas)

# Lemmatize the stop-word-free text into a new column.
DATA_FRAME_DATASET["after_lemmatized_text"] = DATA_FRAME_DATASET["after_stopwords_text"].map(lemmatize_text)
display(DATA_FRAME_DATASET[["after_lemmatized_text"]].tail())

Unnamed: 0,after_lemmatized_text
4037,relocation capital city illustrates indonesia'...
4038,relocating capital proof serious managing natu...
4039,relocation indonesian capital open new busines...
4040,president jokowi belief moving capital city ar...
4041,support relocation indonesian capital optimize...


# EXPORT ENGLISH PREPROCESSED DATASET


In [26]:
# Publish the lemmatized text under its final column name, then write the
# whole frame (all intermediate columns included) to CSV without the row index.
ENG_PREPROCESSED_OUTPUT_PATH = "outputs/eng-preprocessed.csv"
DATA_FRAME_DATASET["eng_preprocessed_text"] = DATA_FRAME_DATASET["after_lemmatized_text"]
DATA_FRAME_DATASET.to_csv(ENG_PREPROCESSED_OUTPUT_PATH, index=False)