## Import Library

In [1]:
import csv
import re
import nltk
import Sastrawi
import sklearn
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np

import warnings
warnings.filterwarnings('ignore')

## Dataset

In [2]:
# Load the labelled tweet dataset.
dataset = pd.read_csv('data/twitterIKN-labelled_new.csv')
# Drop the columns that are not used by the preprocessing below.
# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
dataset = dataset.drop(columns=['Unnamed: 7', 'length of word', 'date', 'username', 'deEmoji', 'cleanTweet'])
# NOTE: `df` is a second name for the same DataFrame object, not a copy, so
# columns added to `df` later also appear on `dataset`.
df = dataset
df

Unnamed: 0,location,tweet,Label
0,"Bandung, Jawa Barat",Ridwan Kamil Pastikan Pelaksanaan PPDB 2022 Be...,Positif
1,"Bandung, Jawa Barat",Bewarajabar: Ridwan Kamil Pastikan Pelaksanaan...,Positif
2,"Majalengka, Indonesia",!smansa semangat yang ppdb,Positif
3,Bandung,"Sistem PPDB 2022, Ridwan Kamil: Adil dan Siste...",Positif
4,"Bandung, Indonesia",Hari Pertama PPDB di SMAN 4 Bandung Berjalan L...,Positif
...,...,...,...
94,"Bandung, Jawa Barat","Alur PPDB 2022 Jalur Afirmasi (KETM), Anak Keb...",Netral
95,"Cianjur,Jawa Barat",PPDB SMPN 3 Takokak TP 2022-2023 @ Sindangresm...,Netral
96,"Bandung, Jawa Barat",PPDB 2022: Jawa Barat Siapkan 12 Persen Kuota ...,Netral
97,"Bandung,Indonesia",✔️ Pendaftaran daring/luring oleh sekolah asal...,Netral


## Preprocessing

### Cleansing Data

In [3]:
import string 
import re #regex library

def remove_tweet_special(text):
    """Strip tweet-specific noise (escape markers, mentions, hashtags, URLs) from ``text``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with the noise removed. Non-ASCII characters are turned
        into ``'?'`` here rather than deleted.
    """
    # remove the literal two-character sequences "\t", "\n", "\u" (escaped text,
    # not real control characters), then any backslash that is left over
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # replace non ASCII characters (emoticons, Chinese words, etc.) with '?'
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention @
    text = re.sub(r"[@][\w_-]+", "", text)
    # remove remaining @/# tags and complete links; split/join also collapses
    # whitespace runs. Raw string: the pattern contains \w, \/ and \S, which are
    # invalid escape sequences in a normal string literal (SyntaxWarning on 3.12+).
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # remove incomplete URL (a bare scheme with nothing after it)
    return text.replace("http://", " ").replace("https://", " ")
                
# First cleansing step: run the tweet-specific cleanup on every raw tweet and
# store the result in a new 'cleansing' column (the 'tweet' column is kept as is).
df['cleansing'] = df['tweet'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in its place)."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Overwrite 'cleansing' in place with the digit-free text.
df['cleansing'] = df['cleansing'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    # Mapping a code point to None tells str.translate to delete that character.
    drop_table = {ord(mark): None for mark in string.punctuation}
    return text.translate(drop_table)

# Overwrite 'cleansing' in place with the punctuation-free text.
df['cleansing'] = df['cleansing'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    """Return ``text`` without its leading and trailing whitespace."""
    without_leading = text.lstrip()
    return without_leading.rstrip()

# Overwrite 'cleansing' in place with the trimmed text.
df['cleansing'] = df['cleansing'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    """Return ``text`` with every run of whitespace replaced by a single space.

    Leading and trailing runs are also reduced to one space, not removed.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning from Python 3.12).
    return re.sub(r'\s+', ' ', text)

# Overwrite 'cleansing' in place with the whitespace-normalised text.
df['cleansing'] = df['cleansing'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    """Return ``text`` with every stand-alone ASCII letter deleted.

    Only the letter itself is removed; the whitespace around it stays.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Last cleansing step: overwrite 'cleansing' with the text minus single letters.
df['cleansing'] = df['cleansing'].apply(remove_singl_char)

# remove duplicate (currently disabled: duplicate cleaned tweets are kept)
#df.drop_duplicates(subset=['cleansing'], inplace=True)

# Preview raw vs. cleaned text for the rows labelled 0 through 100.
df.loc[0:100,['tweet', 'cleansing']]

Unnamed: 0,tweet,cleansing
0,Ridwan Kamil Pastikan Pelaksanaan PPDB 2022 Be...,Ridwan Kamil Pastikan Pelaksanaan PPDB Berlang...
1,Bewarajabar: Ridwan Kamil Pastikan Pelaksanaan...,Bewarajabar Ridwan Kamil Pastikan Pelaksanaan ...
2,!smansa semangat yang ppdb,smansa semangat yang ppdb
3,"Sistem PPDB 2022, Ridwan Kamil: Adil dan Siste...",Sistem PPDB Ridwan Kamil Adil dan Sistemnya Ta...
4,Hari Pertama PPDB di SMAN 4 Bandung Berjalan L...,Hari Pertama PPDB di SMAN Bandung Berjalan Lancar
...,...,...
94,"Alur PPDB 2022 Jalur Afirmasi (KETM), Anak Keb...",Alur PPDB Jalur Afirmasi KETM Anak Kebutuhan K...
95,PPDB SMPN 3 Takokak TP 2022-2023 @ Sindangresm...,PPDB SMPN Takokak TP Sindangresmi
96,PPDB 2022: Jawa Barat Siapkan 12 Persen Kuota ...,PPDB Jawa Barat Siapkan Persen Kuota untuk Kel...
97,✔️ Pendaftaran daring/luring oleh sekolah asal...,Pendaftaran daringluring oleh sekolah asal Dat...


### Case Folding

In [4]:
# ------ Case Folding --------
# Lower-case the cleaned text into a new 'case_folding' column.
df['case_folding'] = df['cleansing'].str.lower()
# Preview cleaned vs. lower-cased text for the rows labelled 0 through 100.
df.loc[0:100,['cleansing', 'case_folding']]

Unnamed: 0,cleansing,case_folding
0,Ridwan Kamil Pastikan Pelaksanaan PPDB Berlang...,ridwan kamil pastikan pelaksanaan ppdb berlang...
1,Bewarajabar Ridwan Kamil Pastikan Pelaksanaan ...,bewarajabar ridwan kamil pastikan pelaksanaan ...
2,smansa semangat yang ppdb,smansa semangat yang ppdb
3,Sistem PPDB Ridwan Kamil Adil dan Sistemnya Ta...,sistem ppdb ridwan kamil adil dan sistemnya ta...
4,Hari Pertama PPDB di SMAN Bandung Berjalan Lancar,hari pertama ppdb di sman bandung berjalan lancar
...,...,...
94,Alur PPDB Jalur Afirmasi KETM Anak Kebutuhan K...,alur ppdb jalur afirmasi ketm anak kebutuhan k...
95,PPDB SMPN Takokak TP Sindangresmi,ppdb smpn takokak tp sindangresmi
96,PPDB Jawa Barat Siapkan Persen Kuota untuk Kel...,ppdb jawa barat siapkan persen kuota untuk kel...
97,Pendaftaran daringluring oleh sekolah asal Dat...,pendaftaran daringluring oleh sekolah asal dat...


### Tokenize

In [5]:
# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------

# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Return the result of NLTK's ``word_tokenize`` for ``text``.

    Exists so the tokenizer can be passed to ``Series.apply`` by name.
    """
    return word_tokenize(text)

# Tokenize the lower-cased text into a new 'tokenize' column.
df['tokenize'] = df['case_folding'].apply(word_tokenize_wrapper)

# Preview lower-cased text vs. its tokens for the rows labelled 0 through 100.
df.loc[0:100,['case_folding', 'tokenize']]

Unnamed: 0,case_folding,tokenize
0,ridwan kamil pastikan pelaksanaan ppdb berlang...,"[ridwan, kamil, pastikan, pelaksanaan, ppdb, b..."
1,bewarajabar ridwan kamil pastikan pelaksanaan ...,"[bewarajabar, ridwan, kamil, pastikan, pelaksa..."
2,smansa semangat yang ppdb,"[smansa, semangat, yang, ppdb]"
3,sistem ppdb ridwan kamil adil dan sistemnya ta...,"[sistem, ppdb, ridwan, kamil, adil, dan, siste..."
4,hari pertama ppdb di sman bandung berjalan lancar,"[hari, pertama, ppdb, di, sman, bandung, berja..."
...,...,...
94,alur ppdb jalur afirmasi ketm anak kebutuhan k...,"[alur, ppdb, jalur, afirmasi, ketm, anak, kebu..."
95,ppdb smpn takokak tp sindangresmi,"[ppdb, smpn, takokak, tp, sindangresmi]"
96,ppdb jawa barat siapkan persen kuota untuk kel...,"[ppdb, jawa, barat, siapkan, persen, kuota, un..."
97,pendaftaran daringluring oleh sekolah asal dat...,"[pendaftaran, daringluring, oleh, sekolah, asa..."


### Normalisasi

In [6]:
# Load the normalisation dictionary. The first column is used as the term to
# look up and the second column as its replacement.
normalized_word = pd.read_csv("References/NormalisasiKata/kamus_alay.csv")

normalized_word_dict = {}

# Read the two columns positionally with .iloc. The previous `row[0]` / `row[1]`
# on a label-indexed row relied on the positional fallback of
# Series.__getitem__, which pandas deprecated in 2.1.
for slang, formal in zip(normalized_word.iloc[:, 0], normalized_word.iloc[:, 1]):
    # setdefault keeps the first replacement seen when a term appears more than once
    normalized_word_dict.setdefault(slang, formal)

def normalized_term(document):
    """Return a new list where each term of ``document`` is replaced by its
    entry in ``normalized_word_dict``; terms without an entry are kept as is."""
    return [normalized_word_dict.get(term, term) for term in document]

# Apply the dictionary lookup to every token list into a new 'normalisasi' column.
df['normalisasi'] = df['tokenize'].apply(normalized_term)

# Preview tokens vs. normalised tokens for the rows labelled 0 through 100.
df.loc[0:100,['tokenize', 'normalisasi']]

Unnamed: 0,tokenize,normalisasi
0,"[ridwan, kamil, pastikan, pelaksanaan, ppdb, b...","[ridwan, kamil, pastikan, pelaksanaan, ppdb, b..."
1,"[bewarajabar, ridwan, kamil, pastikan, pelaksa...","[bewarajabar, ridwan, kamil, pastikan, pelaksa..."
2,"[smansa, semangat, yang, ppdb]","[smansa, semangat, yang, ppdb]"
3,"[sistem, ppdb, ridwan, kamil, adil, dan, siste...","[sistem, ppdb, ridwan, kamil, adil, dan, siste..."
4,"[hari, pertama, ppdb, di, sman, bandung, berja...","[hari, pertama, ppdb, di, sman, bandung, berja..."
...,...,...
94,"[alur, ppdb, jalur, afirmasi, ketm, anak, kebu...","[alur, ppdb, jalur, afirmasi, ketm, anak, kebu..."
95,"[ppdb, smpn, takokak, tp, sindangresmi]","[ppdb, smpn, takokak, tapi, sindangresmi]"
96,"[ppdb, jawa, barat, siapkan, persen, kuota, un...","[ppdb, jawa, barat, siapkan, persen, kuota, un..."
97,"[pendaftaran, daringluring, oleh, sekolah, asa...","[pendaftaran, daringluring, oleh, sekolah, asa..."


### Stopwords

In [7]:
nltk.download('stopwords')
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')

# ---------------------------- manually add stopword  -----------------------------------
# append additional stopword (duplicate 'rt' entry removed; the result is a set anyway)
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh',
                       '&amp', 'yah'])

# ----------------------- add stopword from txt file ------------------------------------
# read txt stopword using pandas into a single 'stopwords' column
txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# Add the whitespace-separated words of EVERY row. The previous code read only
# row 0, so any further lines in the file were silently ignored. For a file
# holding one space-separated line the added words are the same as before.
for line in txt_stopword["stopwords"].dropna().astype(str):
    list_stopwords.extend(line.split())

# ---------------------------------------------------------------------------------------

# convert list to set: removes duplicates and gives fast membership tests
list_stopwords = set(list_stopwords)


# remove stopwords from a list of tokens
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``, order preserved."""
    return list(filter(lambda token: token not in list_stopwords, words))

# Filter the normalised tokens into a new 'stopwords' column
# (the column holds the tokens that remain after stopword removal).
df['stopwords'] = df['normalisasi'].apply(stopwords_removal) 

# Preview tokens before vs. after stopword removal for the rows labelled 0 through 100.
df.loc[0:100,['normalisasi', 'stopwords']]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reikiko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,normalisasi,stopwords
0,"[ridwan, kamil, pastikan, pelaksanaan, ppdb, b...","[ridwan, kamil, pastikan, pelaksanaan, ppdb, a..."
1,"[bewarajabar, ridwan, kamil, pastikan, pelaksa...","[bewarajabar, ridwan, kamil, pastikan, pelaksa..."
2,"[smansa, semangat, yang, ppdb]","[smansa, semangat, ppdb]"
3,"[sistem, ppdb, ridwan, kamil, adil, dan, siste...","[sistem, ppdb, ridwan, kamil, adil, sistemnya,..."
4,"[hari, pertama, ppdb, di, sman, bandung, berja...","[ppdb, sman, bandung, berjalan, lancar]"
...,...,...
94,"[alur, ppdb, jalur, afirmasi, ketm, anak, kebu...","[alur, ppdb, jalur, afirmasi, ketm, anak, kebu..."
95,"[ppdb, smpn, takokak, tapi, sindangresmi]","[ppdb, smpn, takokak, sindangresmi]"
96,"[ppdb, jawa, barat, siapkan, persen, kuota, un...","[ppdb, jawa, barat, siapkan, persen, kuota, ke..."
97,"[pendaftaran, daringluring, oleh, sekolah, asa...","[pendaftaran, daringluring, sekolah, data, per..."


### Stemming

In [8]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# create stemmer
# One stemmer instance is built here and reused for every term below.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    """Return ``stemmer.stem(term)`` using the module-level ``stemmer``."""
    return stemmer.stem(term)

# Map every distinct token to its stem. Each distinct token is stemmed once,
# the first time it is encountered, however often it occurs in the corpus.
term_dict = {}

for document in df['stopwords']:
    for term in document:
        if term in term_dict:
            continue
        term_dict[term] = stemmed_wrapper(term)

# look up the precomputed stem of each token
def get_stemmed_term(document):
    """Return the list of ``term_dict`` stems for the tokens of ``document``.

    Raises ``KeyError`` for a token that has no entry in ``term_dict``.
    """
    return list(map(term_dict.__getitem__, document))

# Replace each token by its stem into a new 'stemming' column. The apply goes
# through the `.swifter` accessor registered by `import swifter`.
df['stemming'] = df['stopwords'].swifter.apply(get_stemmed_term)

# Preview tokens vs. stems for the rows labelled 0 through 100.
df.loc[0:100,['stopwords', 'stemming']]

Pandas Apply:   0%|          | 0/99 [00:00<?, ?it/s]

Unnamed: 0,stopwords,stemming
0,"[ridwan, kamil, pastikan, pelaksanaan, ppdb, a...","[ridwan, kamil, pasti, laksana, ppdb, adil, tr..."
1,"[bewarajabar, ridwan, kamil, pastikan, pelaksa...","[bewarajabar, ridwan, kamil, pasti, laksana, p..."
2,"[smansa, semangat, ppdb]","[smansa, semangat, ppdb]"
3,"[sistem, ppdb, ridwan, kamil, adil, sistemnya,...","[sistem, ppdb, ridwan, kamil, adil, sistem, ta..."
4,"[ppdb, sman, bandung, berjalan, lancar]","[ppdb, sman, bandung, jalan, lancar]"
...,...,...
94,"[alur, ppdb, jalur, afirmasi, ketm, anak, kebu...","[alur, ppdb, jalur, afirmasi, ketm, anak, butu..."
95,"[ppdb, smpn, takokak, sindangresmi]","[ppdb, smpn, takokak, sindangresmi]"
96,"[ppdb, jawa, barat, siapkan, persen, kuota, ke...","[ppdb, jawa, barat, siap, persen, kuota, kelua..."
97,"[pendaftaran, daringluring, sekolah, data, per...","[daftar, daringluring, sekolah, data, syarat, ..."


### Numeric

In [9]:
# Encode the sentiment labels as integers: Positif -> 1, Netral -> 0, Negatif -> -1.
# Values other than these three strings are left unchanged by replace.
df['Label'] = df['Label'].replace({'Positif': 1, 'Netral': 0, 'Negatif': -1})
df

Unnamed: 0,location,tweet,Label,cleansing,case_folding,tokenize,normalisasi,stopwords,stemming
0,"Bandung, Jawa Barat",Ridwan Kamil Pastikan Pelaksanaan PPDB 2022 Be...,1,Ridwan Kamil Pastikan Pelaksanaan PPDB Berlang...,ridwan kamil pastikan pelaksanaan ppdb berlang...,"[ridwan, kamil, pastikan, pelaksanaan, ppdb, b...","[ridwan, kamil, pastikan, pelaksanaan, ppdb, b...","[ridwan, kamil, pastikan, pelaksanaan, ppdb, a...","[ridwan, kamil, pasti, laksana, ppdb, adil, tr..."
1,"Bandung, Jawa Barat",Bewarajabar: Ridwan Kamil Pastikan Pelaksanaan...,1,Bewarajabar Ridwan Kamil Pastikan Pelaksanaan ...,bewarajabar ridwan kamil pastikan pelaksanaan ...,"[bewarajabar, ridwan, kamil, pastikan, pelaksa...","[bewarajabar, ridwan, kamil, pastikan, pelaksa...","[bewarajabar, ridwan, kamil, pastikan, pelaksa...","[bewarajabar, ridwan, kamil, pasti, laksana, p..."
2,"Majalengka, Indonesia",!smansa semangat yang ppdb,1,smansa semangat yang ppdb,smansa semangat yang ppdb,"[smansa, semangat, yang, ppdb]","[smansa, semangat, yang, ppdb]","[smansa, semangat, ppdb]","[smansa, semangat, ppdb]"
3,Bandung,"Sistem PPDB 2022, Ridwan Kamil: Adil dan Siste...",1,Sistem PPDB Ridwan Kamil Adil dan Sistemnya Ta...,sistem ppdb ridwan kamil adil dan sistemnya ta...,"[sistem, ppdb, ridwan, kamil, adil, dan, siste...","[sistem, ppdb, ridwan, kamil, adil, dan, siste...","[sistem, ppdb, ridwan, kamil, adil, sistemnya,...","[sistem, ppdb, ridwan, kamil, adil, sistem, ta..."
4,"Bandung, Indonesia",Hari Pertama PPDB di SMAN 4 Bandung Berjalan L...,1,Hari Pertama PPDB di SMAN Bandung Berjalan Lancar,hari pertama ppdb di sman bandung berjalan lancar,"[hari, pertama, ppdb, di, sman, bandung, berja...","[hari, pertama, ppdb, di, sman, bandung, berja...","[ppdb, sman, bandung, berjalan, lancar]","[ppdb, sman, bandung, jalan, lancar]"
...,...,...,...,...,...,...,...,...,...
94,"Bandung, Jawa Barat","Alur PPDB 2022 Jalur Afirmasi (KETM), Anak Keb...",0,Alur PPDB Jalur Afirmasi KETM Anak Kebutuhan K...,alur ppdb jalur afirmasi ketm anak kebutuhan k...,"[alur, ppdb, jalur, afirmasi, ketm, anak, kebu...","[alur, ppdb, jalur, afirmasi, ketm, anak, kebu...","[alur, ppdb, jalur, afirmasi, ketm, anak, kebu...","[alur, ppdb, jalur, afirmasi, ketm, anak, butu..."
95,"Cianjur,Jawa Barat",PPDB SMPN 3 Takokak TP 2022-2023 @ Sindangresm...,0,PPDB SMPN Takokak TP Sindangresmi,ppdb smpn takokak tp sindangresmi,"[ppdb, smpn, takokak, tp, sindangresmi]","[ppdb, smpn, takokak, tapi, sindangresmi]","[ppdb, smpn, takokak, sindangresmi]","[ppdb, smpn, takokak, sindangresmi]"
96,"Bandung, Jawa Barat",PPDB 2022: Jawa Barat Siapkan 12 Persen Kuota ...,0,PPDB Jawa Barat Siapkan Persen Kuota untuk Kel...,ppdb jawa barat siapkan persen kuota untuk kel...,"[ppdb, jawa, barat, siapkan, persen, kuota, un...","[ppdb, jawa, barat, siapkan, persen, kuota, un...","[ppdb, jawa, barat, siapkan, persen, kuota, ke...","[ppdb, jawa, barat, siap, persen, kuota, kelua..."
97,"Bandung,Indonesia",✔️ Pendaftaran daring/luring oleh sekolah asal...,0,Pendaftaran daringluring oleh sekolah asal Dat...,pendaftaran daringluring oleh sekolah asal dat...,"[pendaftaran, daringluring, oleh, sekolah, asa...","[pendaftaran, daringluring, oleh, sekolah, asa...","[pendaftaran, daringluring, sekolah, data, per...","[daftar, daringluring, sekolah, data, syarat, ..."


### Final Preprocessing

In [25]:
# Keep only the columns needed after preprocessing, as an independent (deep) copy of `df`.
df_post_prepro = df.loc[:, ['location', 'tweet', 'stemming', 'Label']].copy()

In [26]:
# Display the final preprocessed frame.
df_post_prepro

Unnamed: 0,location,tweet,stemming,Label
0,"Bandung, Jawa Barat",Ridwan Kamil Pastikan Pelaksanaan PPDB 2022 Be...,"[ridwan, kamil, pasti, laksana, ppdb, adil, tr...",1
1,"Bandung, Jawa Barat",Bewarajabar: Ridwan Kamil Pastikan Pelaksanaan...,"[bewarajabar, ridwan, kamil, pasti, laksana, p...",1
2,"Majalengka, Indonesia",!smansa semangat yang ppdb,"[smansa, semangat, ppdb]",1
3,Bandung,"Sistem PPDB 2022, Ridwan Kamil: Adil dan Siste...","[sistem, ppdb, ridwan, kamil, adil, sistem, ta...",1
4,"Bandung, Indonesia",Hari Pertama PPDB di SMAN 4 Bandung Berjalan L...,"[ppdb, sman, bandung, jalan, lancar]",1
...,...,...,...,...
94,"Bandung, Jawa Barat","Alur PPDB 2022 Jalur Afirmasi (KETM), Anak Keb...","[alur, ppdb, jalur, afirmasi, ketm, anak, butu...",0
95,"Cianjur,Jawa Barat",PPDB SMPN 3 Takokak TP 2022-2023 @ Sindangresm...,"[ppdb, smpn, takokak, sindangresmi]",0
96,"Bandung, Jawa Barat",PPDB 2022: Jawa Barat Siapkan 12 Persen Kuota ...,"[ppdb, jawa, barat, siap, persen, kuota, kelua...",0
97,"Bandung,Indonesia",✔️ Pendaftaran daring/luring oleh sekolah asal...,"[daftar, daringluring, sekolah, data, syarat, ...",0


In [34]:
#Export to CSV
# Write to a path relative to the notebook's working directory, in the same
# 'data/' folder the input was read from. The previous absolute path pointed
# into one user's home directory, so the cell failed on any other machine.
# NOTE(review): anything that still reads the old absolute location must be
# pointed at this file instead.
df_post_prepro.to_csv('data/post_prepro_dataset.csv', index=False)