In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
df = pd.DataFrame(columns=['text','label'])
df2 = pd.DataFrame(columns=['text','label'])
folder = ['Budaya','Ekonomi','Entertaiment','Hukum','Kesehatan','Lifestyle',
          'Otomotif','Pendidikan','Politik','Sport','Tekno','Wisata']

In [3]:
# Fungsi membaca file dari folder
def savetodf(all_files,folder,label):
  dfn = pd.DataFrame(columns=['text','label'])
  for i in range(len(all_files)):
    path_txt_file = 'Dataset/'+folder+'/'+all_files[i]
    text = pd.read_fwf(path_txt_file, sep='\n', header=None)[0].str.cat()
    dfn.loc[i] = [text,label]
  return dfn

In [4]:
# Membaca semua data dari semua folder
for i in range(len(folder)):
  all_files = os.listdir("Dataset/"+folder[i]+"/")
  label = int(i)
  dfX = savetodf(all_files,folder[i],label)
  frames = [df, dfX] 
  df = pd.concat(frames).drop_duplicates().reset_index(drop=True)

In [5]:
df.head()

Unnamed: 0,text,label
0,Menpora Dukung Pianis Cilik asal Kendal yang I...,0
1,10 Festival Unik dari Berbagai Belahan DuniaBe...,0
2,Merawat Pusaka TionghoaLie kim in. Tjia Gwan S...,0
3,Ziarah Riwayat Foramadiahi dan Kastela“Semanga...,0
4,Budaya Indonesia Pudar karena Ulah Kita Sendir...,0


# Preproccesing

In [6]:
def delNumber(text):
    noNum = re.sub(r"\d+", "", text)
    return noNum

In [7]:
def delSymbol(text):
    symbols = "!\"“”#$%&’'()*+-,./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        text = np.char.replace(text, i, ' ')
    return text

In [8]:
def delWhiteSpace(text):
    text = re.sub('\s+',' ',str(text))
    return text

In [9]:
def caseFolding(text):
    text = np.char.lower(text)
    return text

In [10]:
def tokenize(text):
    text = nltk.tokenize.word_tokenize(str(text))
    return text

In [11]:
def stopwordRemove(text, list_stopwords):
    text = [word for word in text if not word in list_stopwords]
    return text

In [12]:
# Data Cleansing
df['text']=df.apply(lambda x: delNumber(x[0]), axis=1)
df['text']=df.apply(lambda x: delSymbol(x[0]), axis=1)
df['text']=df.apply(lambda x: delWhiteSpace(x[0]), axis=1)

# Case Folding
df['text']=df.apply(lambda x: caseFolding(x[0]), axis=1)

# Tokenization
df['text']=df.apply(lambda x: tokenize(x[0]), axis=1)

# Stopwords Removal
list_stopwords = set(stopwords.words('indonesian')) # Set dictionary from library
df['text']=df.apply(lambda x: stopwordRemove(x[0],list_stopwords), axis=1)

In [13]:
def stemmingSastrawi(text):
    newtext = []
    for kata in text:
        newtext.append(stemmer.stem(kata))
    return newtext

In [14]:
# Steeming
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [15]:
%%time
for data in range(len(df)):
    newData = []
    for kata in df['text'].loc[data]:
        newData.append(stemmer.stem(kata))
    df2['text'].loc[data] = newData

KeyboardInterrupt: 

In [46]:
print(df2['text'].loc[163])

['susah', 'pasang', 'jelek', 'initeknologi', 'cipta', 'mudah', 'hidup', 'manusia', 'kecuali', 'tinder', 'gadang', 'gadang', 'cupid', 'dunia', 'maya', 'kelas', 'cinta', 'anjur', 'murid', 'jomblo', 'sejati', 'gerilya', 'situs', 'aplikasi', 'kencan', 'online', 'temu', 'jakarta', 'senin', 'lex', 'depraxis', 'alas', 'pria', 'wanita', 'susah', 'pasang', 'jelek', 'alas', 'gaul', 'kunci', 'hubungandia', 'lanjut', 'main', 'kencan', 'online', 'malas', 'gaul', 'cari', 'pasang', 'aplikasi', 'situs', 'kencan', 'online', 'kelas', 'cinta', 'saran', 'gerilya', 'gaul', 'dunia', 'nyata', 'bergabungbahkan', 'lex', 'gabung', 'komunitas', 'temu', 'cinta', 'kelas', 'beda', 'suka', 'main', 'badminton', 'gabung', 'klub', 'rumah', 'klub', 'kantor', 'tuju', 'gabung', 'komunitas', 'tambah', 'hobi', 'mampu', 'main', 'badminton', 'luas', 'gaul', 'temu', 'orang', 'orang', 'pasang', 'shierine', 'wangsa', 'wibawa', 'kompas', 'com']


In [40]:
print(df['text'].loc[163])

['susah', 'pasangan', 'jelek', 'initeknologi', 'diciptakan', 'mempermudah', 'kehidupan', 'manusia', 'terkecuali', 'tinder', 'digadang', 'gadang', 'cupid', 'dunia', 'maya', 'kelas', 'cinta', 'menganjurkan', 'muridnya', 'jomblo', 'sejati', 'bergerilya', 'situs', 'aplikasi', 'kencan', 'online', 'ditemui', 'jakarta', 'senin', 'lex', 'depraxis', 'alasannya', 'pria', 'wanita', 'susah', 'pasangan', 'jelek', 'alasan', 'bergaul', 'kunci', 'hubungandia', 'melanjutkan', 'bermain', 'kencan', 'online', 'malas', 'bergaul', 'mencari', 'pasangan', 'aplikasi', 'situs', 'kencan', 'online', 'kelas', 'cinta', 'menyarankan', 'bergerilya', 'bergaul', 'dunia', 'nyata', 'bergabungbahkan', 'lex', 'bergabung', 'komunitas', 'menemukan', 'cinta', 'kelas', 'berbeda', 'suka', 'bermain', 'badminton', 'bergabung', 'klub', 'perumahan', 'klub', 'kantor', 'tujuan', 'bergabung', 'komunitas', 'menambah', 'hobi', 'kemampuan', 'bermain', 'badminton', 'memperluas', 'pergaulan', 'bertemu', 'orang', 'orang', 'pasangan', 'shier

In [16]:
listnya = ["di","ke","dari","yang"]
kalimat = ["di","pasar","mama","ke","dari","pergi","kemana","ya","namun"]
kalimat = [word for word in kalimat if not word in list_stopwords]
print(kalimat)

['pasar', 'mama', 'pergi', 'kemana', 'ya']


In [17]:
kalimat1 = ["memakan","jambu","bersepeda","berbeda"]
text = [stemmer.stem(kata) for kata in kalimat1]
new = []
for kata in kalimat1:
    new.append(stemmer.stem(kata))
print(kalimat1)
print(new)

['memakan', 'jambu', 'bersepeda', 'berbeda']
['makan', 'jambu', 'sepeda', 'beda']
