In [None]:
# Library yang digunakan
import re
from collections import Counter

import nltk
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the dataset: the sheet is read without a header row and its
# columns are labelled explicitly as text / category.
columns_name = ['text', 'category']
data = pd.read_excel(
    '1000 data dongeng.xlsx',
    header=None,
    names=columns_name,
)
data

Unnamed: 0,text,category
0,"Ketika kedua kalinya sang ibu menghampirinya, ...",marah
1,Pangeran Empang Kuala murka mendengar pinangan...,marah
2,Bujang Kelana akan menantang Pendekar Katung u...,marah
3,Pendekar Katung yang tidak terima dengan kekal...,marah
4,Ibunya sangat marah. ‘’ Dasar bodoh! Baju kesa...,marah
...,...,...
995,"Akan tetapi, ia dan rombongan terkejut sebab d...",terkejut
996,"Mendengar hal itu, Halimah sangat kaget dan te...",terkejut
997,Ketika aku memperhatikan domba-domba yang seda...,terkejut
998,Pada suatu hari beliau duduk di gua Hira dan t...,terkejut


In [None]:
# Raw sentences (features) and their emotion labels (targets).
X = data['text']
y = data['category']

In [None]:
# Holds the (stemmer, stop-word remover, lemmatizer) tuple once it has been
# built, so the factories run a single time instead of once per input text.
_TEXT_TOOLS_CACHE = []


def _get_text_tools():
    """Return the shared (stemmer, stop-word remover, lemmatizer), building them on first use."""
    if not _TEXT_TOOLS_CACHE:
        # Build the full tuple before storing it, so a failure in any
        # constructor never leaves a half-initialised cache behind.
        text_tools = (
            StemmerFactory().create_stemmer(),
            StopWordRemoverFactory().create_stop_word_remover(),
            WordNetLemmatizer(),
        )
        _TEXT_TOOLS_CACHE.append(text_tools)
    return _TEXT_TOOLS_CACHE[0]


def clean_textFunc(text):
    """Normalise one text and run it through the preprocessing steps.

    The text is first stripped of non-word characters and digits and its
    whitespace is collapsed. Three single-step results are then computed
    from that normalised text, followed by the full pipeline
    (lemmatize -> stop-word removal -> stemming -> tokenization).

    Parameters
    ----------
    text : str
        Text to clean. Case is not changed here; callers lower-case the
        text themselves if they need to.

    Returns
    -------
    tuple
        ``(katadasar, stopword, lemma, text3, text4)`` where

        * ``katadasar`` - normalised text after stemming only,
        * ``stopword``  - normalised text after stop-word removal only,
        * ``lemma``     - normalised text after lemmatization only,
        * ``text3``     - text after lemmatize -> stop-word removal -> stemming,
        * ``text4``     - ``text3`` split into tokens by ``word_tokenize``.
    """
    stemmer, stopword_func, lemmatizer = _get_text_tools()

    # Replace every non-word character and every digit with a space, then
    # collapse runs of whitespace into single spaces.
    join_text = ' '.join(re.sub(r'[\W\d]', ' ', text).split())

    # Single-step results, each computed from the normalised text.
    katadasar = stemmer.stem(join_text)
    stopword = stopword_func.remove(join_text)
    # NOTE(review): the whole sentence is passed to lemmatize() as one
    # string; WordNetLemmatizer presumably expects a single word, so this
    # step may leave the text unchanged - confirm it is still wanted.
    lemma = lemmatizer.lemmatize(join_text)

    # Full pipeline. Its first stage is the lemma computed above, so that
    # result is reused instead of lemmatizing the same text a second time.
    text2 = stopword_func.remove(lemma)
    text3 = stemmer.stem(text2)
    text4 = nltk.tokenize.word_tokenize(text3)

    return katadasar, stopword, lemma, text3, text4

In [None]:
# Run the cleaning pipeline over every raw text, lower-cased first.
# clean_textFunc returns a 5-tuple; only element 3 (the fully processed
# string) and element 4 (its token list) are kept.
cleaned_results = [clean_textFunc(raw_text.lower()) for raw_text in X]

process_text = [result[3] for result in cleaned_results]
tokens = [result[4] for result in cleaned_results]

In [None]:
# Assemble the working frame in one step: raw text, label, cleaned text and
# token list side by side. Building it from the zipped records avoids
# growing the frame row by row with DataFrame.append, which copies the whole
# frame on every iteration and no longer exists in pandas 2.x.
# zip() stops at the shortest input, as the original row-by-row loop did.
my_data = pd.DataFrame(
    list(zip(X, y, process_text, tokens)),
    columns=['text', 'kategori', 'clean', 'token'],
)

In [None]:
# Display the assembled frame (columns: text, kategori, clean, token).
my_data

Unnamed: 0,text,kategori,clean,token
0,"Ketika kedua kalinya sang ibu menghampirinya, ...",marah,dua kali sang ibu hampir sang putri justru ben...,"[dua, kali, sang, ibu, hampir, sang, putri, ju..."
1,Pangeran Empang Kuala murka mendengar pinangan...,marah,pangeran empang kuala murka dengar pinang tolak,"[pangeran, empang, kuala, murka, dengar, pinan..."
2,Bujang Kelana akan menantang Pendekar Katung u...,marah,bujang kelana tantang pendekar katung sabung ayam,"[bujang, kelana, tantang, pendekar, katung, sa..."
3,Pendekar Katung yang tidak terima dengan kekal...,marah,pendekar katung tidak terima kalah perintah aw...,"[pendekar, katung, tidak, terima, kalah, perin..."
4,Ibunya sangat marah. ‘’ Dasar bodoh! Baju kesa...,marah,ibu sangat marah dasar bodoh baju sayang harga...,"[ibu, sangat, marah, dasar, bodoh, baju, sayan..."
...,...,...,...,...
995,"Akan tetapi, ia dan rombongan terkejut sebab d...",terkejut,tetapi dan rombong kejut samping huma buka pra...,"[tetapi, dan, rombong, kejut, samping, huma, b..."
996,"Mendengar hal itu, Halimah sangat kaget dan te...",terkejut,dengar itu halimah sangat kaget pukul,"[dengar, itu, halimah, sangat, kaget, pukul]"
997,Ketika aku memperhatikan domba-domba yang seda...,terkejut,aku perhati domba domba sedang main aku kaget ...,"[aku, perhati, domba, domba, sedang, main, aku..."
998,Pada suatu hari beliau duduk di gua Hira dan t...,terkejut,suatu hari beliau duduk gua hira tiba tiba bel...,"[suatu, hari, beliau, duduk, gua, hira, tiba, ..."


In [None]:
# Model inputs are the cleaned sentences; targets are the emotion labels.
my_X = my_data['clean']
my_y = my_data['kategori']

In [None]:
# Hold out 20% of the cleaned sentences for evaluation. The split is seeded
# (same seed value as the classifiers below) so that a re-run of the
# notebook produces the same partition and therefore comparable metrics.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    my_X, my_y, test_size=0.2, random_state=0
)

# Bag-of-words features. The vocabulary is learned from the training
# sentences only; the test sentences are encoded with that vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
X_train

<800x1572 sparse matrix of type '<class 'numpy.int64'>'
	with 7143 stored elements in Compressed Sparse Row format>

# DECISION TREE

In [None]:
# Train a seeded decision tree on the bag-of-words features, then predict
# labels for the held-out sentences.
classifier_decisiontree = DecisionTreeClassifier(random_state=0)
classifier_decisiontree_model = classifier_decisiontree.fit(
    X_train,
    y_train,
)
prediction_decisiontree = classifier_decisiontree_model.predict(X_test)

In [None]:
# Desired output: confusion matrix, per-class report and overall accuracy
# of the decision tree on the held-out set.
decisiontree_confusion = confusion_matrix(y_test, prediction_decisiontree)
decisiontree_report = classification_report(y_test, prediction_decisiontree)
decisiontree_accuracy = accuracy_score(y_test, prediction_decisiontree)

print('confusion_matrix\n')
print(decisiontree_confusion, end='\n\n')
print('classification_report\n')
print(decisiontree_report)
print('Accuracy: ', decisiontree_accuracy)

confusion_matrix

[[ 6  0  2  0  0  1]
 [ 1 31  1  0  0  0]
 [ 4  1 35  1  1  0]
 [ 0  2  6 43  0  1]
 [ 3  0  3  0 26  0]
 [ 0  0  1  0  1 30]]

classification_report

              precision    recall  f1-score   support

       jijik       0.43      0.67      0.52         9
       marah       0.91      0.94      0.93        33
       sedih       0.73      0.83      0.78        42
      senang       0.98      0.83      0.90        52
       takut       0.93      0.81      0.87        32
    terkejut       0.94      0.94      0.94        32

    accuracy                           0.85       200
   macro avg       0.82      0.84      0.82       200
weighted avg       0.88      0.85      0.86       200

Accuracy:  0.855


# RANDOM FOREST

In [None]:
# Train a seeded random forest of 1000 trees on the same bag-of-words
# features, then predict labels for the held-out sentences.
classifier_random = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier_random.fit(X_train, y_train)

y_pred = classifier_random.predict(X_test)

In [None]:
# Confusion matrix, per-class report and overall accuracy of the random
# forest on the held-out set. The accuracy line carries the same
# 'Accuracy: ' label as the decision-tree report so the two read alike.
print('confusion_matrix\n')
print(confusion_matrix(y_test, y_pred))
print()
print('classification_report\n')
print(classification_report(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))

confusion_matrix

[[ 6  1  1  1  0  0]
 [ 0 29  0  4  0  0]
 [ 0  2 35  5  0  0]
 [ 0  1  2 49  0  0]
 [ 0  0  0  4 27  1]
 [ 0  0  0  2  1 29]]

classification_report

              precision    recall  f1-score   support

       jijik       1.00      0.67      0.80         9
       marah       0.88      0.88      0.88        33
       sedih       0.92      0.83      0.88        42
      senang       0.75      0.94      0.84        52
       takut       0.96      0.84      0.90        32
    terkejut       0.97      0.91      0.94        32

    accuracy                           0.88       200
   macro avg       0.91      0.85      0.87       200
weighted avg       0.89      0.88      0.88       200

0.875
