In [1]:
import pandas as pd
import numpy as np
import preprocessor as p
import re
import string
import itertools
import requests

import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from deep_translator import GoogleTranslator

from tqdm.notebook import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_colwidth', None)

In [2]:
raw = pd.read_csv('data.csv')
raw.shape

(158493, 5)

In [3]:
raw.sample(5)

Unnamed: 0,created_at,author_id,username,text,reference_type
103744,2021-07-20T01:32:21.000Z,99229819,galathxia_,RT @Puspen_PKI: ricky elson disuruh pulang dahlan iskan dari jepang buat mengabdi ke negara. dia mimpin proyek mobil listrik. \n\nmobil itu dâ€¦,['retweeted']
140376,2021-11-13T00:38:58.000Z,934208481812344832,zhaokebao,Oh bukan gokar tapi mobil listrik gue rasa,['replied_to']
81192,2021-09-15T09:13:58.000Z,2748535542,ryolandafit,RT @OfficialiNewsTV: Dia menyakini hal itu karena cadangan nikel Indonesia merupakan yang terbesar di dunia. CEN\n\nSelengkapnya https://t.coâ€¦,['retweeted']
25513,2021-02-01T09:34:19.000Z,79650903,aprilanyta,"RT @ArsitektropiS: TANAH MINIMALIS, \nRUMAH MAKSIMALIS check âœ…\n\nJadi luas tanah CUMAN 100M2\nTapii isinya selevel RUMAH SULTAN\n\nðŸ›ŒðŸ›ŒðŸ›ŒðŸ›ŒðŸ›Œ 5 Kmr Tâ€¦",['retweeted']
2446,2021-03-22T01:06:43.000Z,2728984603,DodiJusra,"RT @KATADATAcoid: Pengamat menilai, rencana kenaikan pajak pembelian mobil listrik akan membuat kendaraan ini hanya menjadi angan-angan bagâ€¦",['retweeted']


In [4]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158493 entries, 0 to 158492
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   created_at      158493 non-null  object
 1   author_id       158493 non-null  int64 
 2   username        158493 non-null  object
 3   text            158493 non-null  object
 4   reference_type  114462 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.0+ MB


# 1. Data Preprocessing
---

## 1.1 Filter Tweets

Drop retweeted Tweets to diminish duplication

In [5]:
raw['reference_type'].value_counts()

['retweeted']               93146
['replied_to']              18263
['quoted']                   2853
['quoted', 'replied_to']      200
Name: reference_type, dtype: int64

We will drop 93146 retweeted Tweets

In [3]:
df = raw[raw['reference_type'] != "['retweeted']"].reset_index(drop=True)

In [4]:
df.shape

(65347, 5)

## 1.2 Text Preprocessing

In [5]:
def translating(text):
    translated = GoogleTranslator(source='auto', target='id').translate(text)
    return translated

def remove_repetitive_characters(text): # Menghapus huruf-huruf yang repetitive (berulang)
    return ''.join(''.join(s)[:1] for _, s in itertools.groupby(text))

def cleaningText(text):
    text = text.replace('\n', ' ').replace('\r', '') # replace new line into space
    text = p.clean(text)
    text = re.sub(r'(?<!\s)\.(?!\s)', ' ', text) # remove dots
    text = re.sub(r'(?<!\s)\,(?!\s)', ' ', text) # remove commas
    text = re.sub(r'[0-9]+', '', text) # remove numbers
    text = re.sub(r'#\w+','', text) # remove hash
    text = re.sub(r' +', ' ', text) # remove white space
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove all punctuations
    text = text.strip(' ') # remove characters space from both left and right text
    text = text.replace('htps','')
    return text

def casefoldingText(text): # Converting all the characters in a text into lower case and split it to tokens
    text = text.lower()
    word_tokens = word_tokenize(text) 
    return word_tokens

def filteringText(text): # Remove stopwords in a text
    f = open("tala-stopwords-indonesia.txt", "r")
    stopword_list = []
    for line in f:
        stripped_line = line.strip()
        line_list = stripped_line.split()
        stopword_list.append(line_list[0])
    f.close()

    factory = StopWordRemoverFactory().get_stop_words()
    stop_words = set(factory + stopword_list)
    stop_words.discard('tidak')
    filtered_text = [word for word in text if word not in stop_words]
    return filtered_text

def stemmingText(text): # Reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = [stemmer.stem(word) for word in text]
    return text

In [6]:
# Normalisasi Kata
url1 = (r'https://raw.githubusercontent.com/ksnugroho/klasifikasi-spam-sms/master/data/key_norm.csv')
url2 = (r'https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv')
url3 = (r'https://raw.githubusercontent.com/okkyibrohim/id-abusive-language-detection/master/kamusalay.csv')
url4 = (r'https://raw.githubusercontent.com/louisowen6/NLP_bahasa_resources/master/combined_slang_words.txt')
r = requests.get(url4)

kamus_normalisasi = pd.read_csv(url1)
kamus_alay1 = pd.read_csv(url2)
kamus_alay2 = pd.read_csv(url3, names=['slang','formal'])
kamus_slang = pd.json_normalize(r.json()).transpose().reset_index()
kamus_slang.columns = ['slang','formal']

def normalisasi(text):
  text = [kamus_normalisasi[kamus_normalisasi['singkat'] == word]['hasil'].values[0] if (kamus_normalisasi['singkat'] == word).any() else word for word in text]
  return text

def normalisasi_slang(text):
  text = [kamus_alay1[kamus_alay1['slang'] == word]['formal'].values[0] if (kamus_alay1['slang'] == word).any() else word for word in text]
  text = [kamus_alay2[kamus_alay2['slang'] == word]['formal'].values[0] if (kamus_alay2['slang'] == word).any() else word for word in text]
  text = [kamus_slang[kamus_slang['slang'] == word]['formal'].values[0] if (kamus_slang['slang'] == word).any() else word for word in text]
  return text

In [7]:
# Pipeline
def preprocessing(text):
  text = translating(text)
  text = remove_repetitive_characters(text)
  text = cleaningText(text)
  text = casefoldingText(text)
  text = normalisasi(text)
  text = normalisasi_slang(text)
  text = filteringText(text)
  text = stemmingText(text)
  return text

In [22]:
# Start preprocessing
df['tokenized'] = df['text'].progress_apply(preprocessing)

  0%|          | 0/65347 [00:00<?, ?it/s]

In [23]:
# Save hasil preprocessing
df.to_csv('data_preprocessed.csv')

In [9]:
df_preprocessed = pd.read_csv('data_preprocessed.csv').iloc[: , 1:]

In [10]:
df_preprocessed.head()

Unnamed: 0,created_at,author_id,username,text,reference_type,tokenized
0,2021-03-30T23:44:33.000Z,118646322,Bisniscom,"Kabar Emiten: ISAT Bakal Raih US$750 Juta, SLIS Terpantik Kendaraan Listrik https://t.co/3mPpB7zlE7",,"['kabar', 'emiten', 'isat', 'raih', 'us', 'juta', 'slis', 'pantik', 'kendara', 'listrik']"
1,2021-03-30T23:22:28.000Z,1285016293,Pemuda_Depok,BMW Persiapkan Waktu Untuk Beralih ke MobilÂ Listrik https://t.co/gN2E4cs1QW,,"['bmw', 'siap', 'alih', 'mobillistrik']"
2,2021-03-30T23:22:13.000Z,223119150,idaulat,BMW Persiapkan Waktu Untuk Beralih ke MobilÂ Listrik https://t.co/YlZjq5rMFw,,"['bmw', 'siap', 'alih', 'mobillistrik']"
3,2021-03-30T23:22:11.000Z,58936699,Telegraf_ID,BMW Persiapkan Waktu Untuk Beralih ke MobilÂ Listrik https://t.co/5ZTmVcAAv8,,"['bmw', 'siap', 'alih', 'mobillistrik']"
4,2021-03-30T23:01:16.000Z,151825438,zrjackspicer,Xiaomi produksi mobil listrik dengan investasi 10 miliar dolar - TEKNO YOGYA https://t.co/MdS9g6jmSN #Xiaomi #mobillistrik #ElectricVehicles,,"['xiaomi', 'produksi', 'mobil', 'listrik', 'investasi', 'miliar', 'dolar', 'tekno', 'yogya']"
