In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
# Read the raw data set from a CSV file (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_csv('DataProcess/raw_data.csv') 
df.head()

Unnamed: 0,text,label
0,Menpora Dukung Pianis Cilik asal Kendal yang I...,0
1,10 Festival Unik dari Berbagai Belahan DuniaBe...,0
2,Merawat Pusaka TionghoaLie kim in. Tjia Gwan S...,0
3,Ziarah Riwayat Foramadiahi dan Kastela“Semanga...,0
4,Budaya Indonesia Pudar karena Ulah Kita Sendir...,0


# Preprocessing

In [3]:
def delNumber(text):
    """Return ``text`` with every run of digits deleted.

    Digits are removed outright (not replaced by a space), so the
    characters on either side of a number become adjacent.
    """
    digit_runs = re.compile(r"\d+")
    return digit_runs.sub("", text)

In [4]:
def delSymbol(text):
    """Replace punctuation, symbol and newline characters with spaces.

    Each character listed in ``symbols`` (ASCII punctuation, the
    typographic quotes, the backslash and the newline) is swapped for a
    single space, so the length of the text is unchanged.

    Args:
        text: Text to clean. It is coerced with ``str()`` first, like the
            other helpers in this notebook do.

    Returns:
        str: The cleaned text. (The previous implementation returned a
        0-d NumPy string array produced by ``np.char.replace``; a plain
        ``str`` stringifies and compares the same way.)
    """
    # The backslash is written as '\\' explicitly: the former "[\]" relied on
    # an invalid escape sequence that newer Python versions warn about.
    symbols = "!\"“”#$%&’'()*+-,./:;<=>?@[\\]^_`{|}~\n"
    # One translation pass instead of one full-string replace per symbol.
    table = str.maketrans(dict.fromkeys(symbols, " "))
    return str(text).translate(table)

In [5]:
def delWhiteSpace(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Args:
        text: Value to normalise; it is converted with ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        str: The text with each whitespace run replaced by one space.
        Leading and trailing whitespace is reduced to one space, not
        stripped.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about; the regex itself is unchanged.
    return re.sub(r'\s+', ' ', str(text))

In [6]:
def caseFolding(text):
    """Return ``text`` converted to lower case.

    Args:
        text: Text to fold. It is coerced with ``str()`` first, matching
            the convention of the other helpers in this notebook.

    Returns:
        str: The lower-cased text. (The previous implementation returned a
        0-d NumPy string array from ``np.char.lower``; a plain ``str``
        stringifies and compares the same way.)
    """
    return str(text).lower()

In [7]:
def tokenize(text):
    """Split ``text`` into word tokens with NLTK's ``word_tokenize``.

    The argument is passed through ``str()`` before tokenising, so
    non-string values are accepted.
    """
    as_string = str(text)
    return nltk.tokenize.word_tokenize(as_string)

In [8]:
def stopwordRemove(text, list_stopwords):
    """Return the tokens of ``text`` that are not in ``list_stopwords``.

    Order and duplicates of the remaining tokens are preserved.
    """
    kept = []
    for token in text:
        if token in list_stopwords:
            continue
        kept.append(token)
    return kept

In [9]:
# Data Cleansing: strip digits, then symbols, then collapse whitespace.
# The 'text' column is selected by name. The former row-wise
# `df.apply(lambda x: f(x[0]), axis=1)` relied on positional integer access
# into a label-indexed row, which pandas has deprecated; the displayed frames
# show 'text' as the first data column, so the same values are processed.
df['text'] = df['text'].apply(delNumber)
df['text'] = df['text'].apply(delSymbol)
df['text'] = df['text'].apply(delWhiteSpace)
# Raw strings keep the Windows backslashes literal without depending on
# invalid escape sequences such as '\J' or '\D'.
df.to_pickle(r'D:\JupyterNotebook\DataProcess\cleansing_df.pkl')
print("Data Cleansing done, save in cleansing_df.pkl")

# Case Folding
df['text'] = df['text'].apply(caseFolding)
df.to_pickle(r'D:\JupyterNotebook\DataProcess\casefolding_df.pkl')
print("Case Folding done, save in casefolding_df.pkl")

# Tokenization: each text becomes a list of tokens.
df['text'] = df['text'].apply(tokenize)
df.to_pickle(r'D:\JupyterNotebook\DataProcess\tokenization_df.pkl')
print("Tokenization done, save in tokenization_df.pkl")

# Stopwords Removal
list_stopwords = set(stopwords.words('indonesian')) # Set dictionary from library
df['text'] = df['text'].apply(lambda tokens: stopwordRemove(tokens, list_stopwords))
df.to_pickle(r'D:\JupyterNotebook\DataProcess\stopwords_df.pkl')
print("Stopwords Removal done, save in stopwords_df.pkl")

Data Cleansing done, save in cleansing_df.pkl
Case Folding done, save in casefolding_df.pkl
Tokenization done, save in tokenization_df.pkl
Stopwords Removal done, save in stopwords_df.pkl


In [2]:
def stemmingSastrawi(text):
    """Stem every token in ``text`` and return the results as a new list.

    Uses the module-level ``stemmer`` object, which must exist before this
    function is called.
    """
    return [stemmer.stem(kata) for kata in text]

In [3]:
# Stemming: build the Sastrawi stemmer. The resulting global `stemmer` is the
# object that `stemmingSastrawi` and the batch stemming loop call `.stem()` on.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [27]:
# Load a previously saved DataFrame from a pickle file and preview it.
# NOTE(review): the file name suggests a checkpoint in which rows up to 49
# were already stemmed -- confirm.
# Pickle files can execute arbitrary code when loaded; only read trusted files.
dfX = pd.read_pickle(r'D:\JupyterNotebook\DataProcess\stemming_49.pkl')
dfX.head()

Unnamed: 0,text,label
0,"[menpora, dukung, pianis, cilik, kendal, pecah...",0
1,"[festival, unik, bahan, duniaberbagai, negara,...",0
2,"[rawat, pusaka, tionghoalie, kim, in, tjia, gw...",0
3,"[ziarah, riwayat, foramadiahi, kastela, semang...",0
4,"[budaya, indonesia, pudar, ulah, sendiribudaya...",0


In [75]:
# Row-label bounds for stemming the data in batches. Each entry is written as
# "<first row>-<last row>  llN = <last row>"; the stemming loop uses one llN as
# its start value and the next one as its inclusive end value.
# Entries that appear only as comments are not assigned in this cell --
# presumably those batches were already processed; confirm before re-running.
# 0-9 ll1 = 9
# 10-19 ll2 = 19
# 20-29 ll3 = 29
# 30-39 ll4 = 39
# 40-49 ll5 = 49
# 50-59 ll6 = 59
# 60-69 ll7 = 69
# 70-79 ll8 = 79
# 80-89 ll9 = 89
# 90-99 ll10 = 99
# 100-119 ll11 = 119
# 120-139 ll12 = 139
# 140-159 ll13 = 159
# 160-179 ll14 = 179
# 180-199 ll15 = 199
# 200-219 ll16 = 219
# 220-239 ll17 = 239
# 240-259 ll18 = 259
# 260-279 ll19 = 279
# 280-299
ll20 = 299
# 300-319
ll21 = 319
# 320-339
ll22 = 339
# 340-365
ll23 = 365

In [69]:
# Work on an independent copy: a plain `df3 = dfX` only binds a second name to
# the same DataFrame, so stemmed values written to df3 would also replace the
# unstemmed values in dfX and the two could no longer be compared.
df3 = dfX.copy()
# Show (rows, columns) as a quick sanity check.
df3.shape

(366, 2)

In [84]:
%%time
# DONE
counter = ll22
while(counter <= ll23):
    newData = []
    for data in dfX['text'].loc[counter]:
        newData.append(stemmer.stem(data))
    df3['text'].loc[counter] = newData
    counter += 1

Wall time: 1min 5s


In [91]:
# Save the stemmed DataFrame. A raw string keeps the Windows backslashes
# literal instead of relying on invalid escape sequences such as '\J' or '\s'.
df3.to_pickle(r'D:\JupyterNotebook\DataProcess\stemming_365.pkl')
print("Stemming done, save in stemming_365.pkl")

Stemming done, save in stemming_365.pkl


In [89]:
# Print the token list stored in row 365 of dfX.
print(dfX.loc[365, 'text'])

['destinasi', 'wisata', 'mengerikan', 'berenang', 'buaya', 'kolam', 'setanguys', 'bosan', 'liburan', 'mencoba', 'beda', 'salahnya', 'mencoba', 'ekstrim', 'pengalaman', 'terlupakan', 'dibanding', 'berbaring', 'pantai', 'minggu', 'menguji', 'adrenalin', 'menemukan', 'terduga', 'destinasi', 'ekstrim', 'menunggumu', 'dilansir', 'tribuntravel', 'com', 'laman', 'herbeauty', 'com', 'rekomendasi', 'cage', 'of', 'death', 'australiapantai', 'australia', 'dikenal', 'sebagia', 'indah', 'dunia', 'bayangkan', 'bentangan', 'pasir', 'putih', 'bermil', 'mil', 'jauhnya', 'sunset', 'indah', 'ombak', 'cocok', 'berselancar', 'destinasi', 'sempurna', 'mengunjungi', 'australia', 'liburan', 'berbeda', 'coba', 'menyelam', 'menit', 'air', 'buaya', 'raksasa', 'mematikan', 'berenang', 'bebas', 'sekitarmu', 'cage', 'of', 'death', 'menawarkan', 'fasilitas', 'berenang', 'buaya', 'mencoba', 'ditempatkan', 'kandang', 'akrilik', 'tembus', 'pandang', 'dirancang', 'orang', 'derajat', 'cage', 'of', 'death', 'beroperasi', 

In [90]:
# Print the token list stored in row 365 of df3.
print(df3.loc[365, 'text'])

['destinasi', 'wisata', 'keri', 'renang', 'buaya', 'kolam', 'setanguys', 'bosan', 'libur', 'coba', 'beda', 'salah', 'coba', 'ekstrim', 'alam', 'lupa', 'banding', 'baring', 'pantai', 'minggu', 'uji', 'adrenalin', 'temu', 'duga', 'destinasi', 'ekstrim', 'tunggu', 'lansir', 'tribuntravel', 'com', 'laman', 'herbeauty', 'com', 'rekomendasi', 'cage', 'of', 'death', 'australiapantai', 'australia', 'kenal', 'sebagia', 'indah', 'dunia', 'bayang', 'bentang', 'pasir', 'putih', 'mil', 'mil', 'jauh', 'sunset', 'indah', 'ombak', 'cocok', 'selancar', 'destinasi', 'sempurna', 'unjung', 'australia', 'libur', 'beda', 'coba', 'selam', 'menit', 'air', 'buaya', 'raksasa', 'mati', 'renang', 'bebas', 'sekitar', 'cage', 'of', 'death', 'tawar', 'fasilitas', 'renang', 'buaya', 'coba', 'tempat', 'kandang', 'akrilik', 'tembus', 'pandang', 'rancang', 'orang', 'derajat', 'cage', 'of', 'death', 'operasi', 'kali', 'tiket', 'usd', 'tara', 'rp', 'juta', 'orang', 'bungee', 'jump', 'tinggi', 'dunia', 'chinabungee', 'jum'