In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mpt

In [2]:
df = pd.read_excel('../dataset/tittle200.xlsx')
df.head()

Unnamed: 0,title,label
0,"Ngatiyana Terjun Langsung Menemui Warga, Menam...",1
1,Spesifikasi Redmi 10C HP Gaming Murah Rp 1 Jutaan,0
2,Puluhan Rumah di Perum Griya Zavira Cilawu Ter...,2
3,Prakiraan Cuaca Kota-kota Besar di Indonesia u...,0
4,"Kurang dari 24 Jam, Pelaku Pembunuh Remaja Dic...",2


# Case Folding

In [3]:
import re
def casefolding(kalimat):
    kalimat = kalimat.strip()
    kalimat = kalimat.lower()
    kalimat = re.sub(r'[|?|$|.|!_:")(-+,)]','', kalimat)
    return kalimat

In [4]:

df['title'] = df['title'].apply(casefolding)
df.head()

Unnamed: 0,title,label
0,ngatiyana terjun langsung menemui warga menamp...,1
1,spesifikasi redmi 10c hp gaming murah rp 1 jutaan,0
2,puluhan rumah di perum griya zavira cilawu ter...,2
3,prakiraan cuaca kota-kota besar di indonesia u...,0
4,kurang dari 24 jam pelaku pembunuh remaja dici...,2


# Tokenizing

In [5]:
from nltk.tokenize import RegexpTokenizer
regexp = RegexpTokenizer(r'\w+|$[0-9]+|\S')

In [6]:
token = df["title"].apply(regexp.tokenize)
df.insert(df.columns.get_loc("label"),"Token", token)

In [7]:
df.head(3)

Unnamed: 0,title,Token,label
0,ngatiyana terjun langsung menemui warga menamp...,"[ngatiyana, terjun, langsung, menemui, warga, ...",1
1,spesifikasi redmi 10c hp gaming murah rp 1 jutaan,"[spesifikasi, redmi, 10c, hp, gaming, murah, r...",0
2,puluhan rumah di perum griya zavira cilawu ter...,"[puluhan, rumah, di, perum, griya, zavira, cil...",2


# Stop Word

In [8]:
import nltk as nk 
nk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AdmiN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from nltk.corpus import stopwords
stopword = stopwords.words('indonesian')
txt_stopword = pd.read_csv('../text/stopword.txt', names=['stopword'], header=None)
stopword.extend(['wkwk','hahahaha','haha','yang','yoi','yoyoy', 'mm', 'mk', 'bandung', 'subang','wartakinico','galamedianews'])
stopword.extend(txt_stopword["stopword"][0].split('\n'))
stopword = set(stopword)

def stopwords(text):
    text = [word for word in text if word not in stopword]
    return text

In [10]:
stopword = df['Token'].apply(stopwords)
df.insert(df.columns.get_loc("label"),"stopword", stopword)

In [11]:
df.head(3)

Unnamed: 0,title,Token,stopword,label
0,ngatiyana terjun langsung menemui warga menamp...,"[ngatiyana, terjun, langsung, menemui, warga, ...","[ngatiyana, terjun, langsung, menemui, warga, ...",1
1,spesifikasi redmi 10c hp gaming murah rp 1 jutaan,"[spesifikasi, redmi, 10c, hp, gaming, murah, r...","[spesifikasi, redmi, 10c, hp, gaming, murah, r...",0
2,puluhan rumah di perum griya zavira cilawu ter...,"[puluhan, rumah, di, perum, griya, zavira, cil...","[puluhan, rumah, perum, griya, zavira, cilawu,...",2


In [12]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [13]:
def stemming(konten):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    do = []
    for w in konten:
        dt = stemmer.stem(w)
        do.append(dt)
    
    d_clean = []
    d_clean = " ".join(do)
    return d_clean

In [14]:
stemmed = df['stopword'].apply(stemming)
df.insert(df.columns.get_loc("label"),"stemmed", stemmed)

In [15]:
df.head()

Unnamed: 0,title,Token,stopword,stemmed,label
0,ngatiyana terjun langsung menemui warga menamp...,"[ngatiyana, terjun, langsung, menemui, warga, ...","[ngatiyana, terjun, langsung, menemui, warga, ...",ngatiyana terjun langsung temu warga tampung a...,1
1,spesifikasi redmi 10c hp gaming murah rp 1 jutaan,"[spesifikasi, redmi, 10c, hp, gaming, murah, r...","[spesifikasi, redmi, 10c, hp, gaming, murah, r...",spesifikasi redmi 10c hp gaming murah rp 1 juta,0
2,puluhan rumah di perum griya zavira cilawu ter...,"[puluhan, rumah, di, perum, griya, zavira, cil...","[puluhan, rumah, perum, griya, zavira, cilawu,...",puluh rumah perum griya zavira cilawu ancam ba...,2
3,prakiraan cuaca kota-kota besar di indonesia u...,"[prakiraan, cuaca, kota, -, kota, besar, di, i...","[prakiraan, cuaca, kota, -, kota, indonesia, s...",prakira cuaca kota - kota indonesia senin 25 j...,0
4,kurang dari 24 jam pelaku pembunuh remaja dici...,"[kurang, dari, 24, jam, pelaku, pembunuh, rema...","[24, jam, pelaku, pembunuh, remaja, diciduk, c...",24 jam laku bunuh remaja ciduk coba lari,2


In [16]:
df.to_csv('../dataset/titleDataClean200.csv', index=False, header=True)