1. PENGUMPULAN DATA

data berasal dari penelitian di perpustakaan SD Negeri Pledokan, Kecamatan Sumowono, Kabupaten Semarang,

Link google drive dataset: https://drive.google.com/file/d/1tHY4XNBxwgd8b3B1XUKBZbgbuXSqGq_S/view?usp=sharing

Link google drive data rules: https://drive.google.com/file/d/12xxfndiKe9OuI4oRe3-6YRnnOBCIuDYu/view?usp=sharing

2. MENELAAH DATA

In [1]:
#library
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import re
import pickle


In [2]:
#load dataset
books = pd.read_csv('databuku.csv')

In [3]:
books.head()

Unnamed: 0,Nomor Klasifikasi,Tanggal,Judul Buku,Kode Judul Buku,Kategori,Asal/Sumber,Kode Pengarang,Pengarang,Penerbit,Tahun Terbit,Jumlah Buku,Harga,Keterangan
0,500,2022-10-14,Muatan Lokal Ensiklopedia IPTEK 7,M,Ensiklopedia Umum,-,DOD,Dody Hidayat dan Imam Firdaus,Penerbit Lentera Abadi,2017,2,0,Baik
1,500,2022-10-04,Cakrawala Sains Serba Serbi Energi,C,Fisika,-,OKT,Dian Malini Oktovina,Nobel Edumedia,2008,6,0,Baik
2,500,2022-10-04,Mengenal Energi,M,Fisika,-,SAF,Desi Safitri,PT Graha Bandung Kencana,2017,3,0,Baik
3,500,2022-10-04,Panas dan Energi,P,Fisika,-,SEA,Bobbi Searle,PT Gading Inti Prima,2017,6,0,Baik
4,500,2022-10-04,Mari Bereksperimen dengan Sains,M,Fisika,-,NUR,R Nurfadilah,PT Musi Perkasa Utama,2017,6,0,Baik


3. VALIDASI DATA

In [4]:
#cek data NaN
books.isnull().sum()


Nomor Klasifikasi    0
Tanggal              0
Judul Buku           0
Kode Judul Buku      0
Kategori             0
Asal/Sumber          0
Kode Pengarang       0
Pengarang            0
Penerbit             0
Tahun Terbit         0
Jumlah Buku          0
Harga                0
Keterangan           0
dtype: int64

In [5]:
books.dtypes

Nomor Klasifikasi     int64
Tanggal              object
Judul Buku           object
Kode Judul Buku      object
Kategori             object
Asal/Sumber          object
Kode Pengarang       object
Pengarang            object
Penerbit             object
Tahun Terbit          int64
Jumlah Buku           int64
Harga                 int64
Keterangan           object
dtype: object

In [6]:
books = books.astype({'Judul Buku': 'string'})
books = books.astype({'Nomor Klasifikasi': 'category'})

books.dtypes


Nomor Klasifikasi          category
Tanggal                      object
Judul Buku           string[python]
Kode Judul Buku              object
Kategori                     object
Asal/Sumber                  object
Kode Pengarang               object
Pengarang                    object
Penerbit                     object
Tahun Terbit                  int64
Jumlah Buku                   int64
Harga                         int64
Keterangan                   object
dtype: object

4. MENENTUKAN OBJECT DATA

In [7]:
#memilih 2 fitur yang akan digunakan
books = pd.DataFrame(books, columns=['Nomor Klasifikasi', 'Judul Buku'])
books


Unnamed: 0,Nomor Klasifikasi,Judul Buku
0,500,Muatan Lokal Ensiklopedia IPTEK 7
1,500,Cakrawala Sains Serba Serbi Energi
2,500,Mengenal Energi
3,500,Panas dan Energi
4,500,Mari Bereksperimen dengan Sains
...,...,...
1430,800,Ada Apa Sih? Ada Apa Sih?
1431,800,Bubur Bassang
1432,800,Randi dan Hutan Bukit Gir
1433,800,Bayoi


In [8]:
#menambahkan data rules
rules = pd.read_csv('data_rules.csv')

#gabungkan dataset dengan data rules
books = pd.concat([books, rules], ignore_index=True)

books

Unnamed: 0,Nomor Klasifikasi,Judul Buku
0,500,Muatan Lokal Ensiklopedia IPTEK 7
1,500,Cakrawala Sains Serba Serbi Energi
2,500,Mengenal Energi
3,500,Panas dan Energi
4,500,Mari Bereksperimen dengan Sains
...,...,...
1983,900,sejarah umum eropa
1984,900,sejarah umum asia
1985,900,sejarah umum afrika
1986,900,sejarah umum amerika utara dan amerika selatan


In [9]:
column_mapping = {
    'Nomor Klasifikasi': 'genre',
    'Judul Buku': 'title'
}
books.rename(columns=column_mapping, inplace=True)

books


Unnamed: 0,genre,title
0,500,Muatan Lokal Ensiklopedia IPTEK 7
1,500,Cakrawala Sains Serba Serbi Energi
2,500,Mengenal Energi
3,500,Panas dan Energi
4,500,Mari Bereksperimen dengan Sains
...,...,...
1983,900,sejarah umum eropa
1984,900,sejarah umum asia
1985,900,sejarah umum afrika
1986,900,sejarah umum amerika utara dan amerika selatan


In [10]:
books['genre'].value_counts()


genre
800    359
600    319
500    316
300    263
700    203
200    149
0      134
100    117
900     81
400     47
Name: count, dtype: int64

5. MEMBERSIHKAN DATA

In [11]:
#cek data NaN
books_clean = books.dropna(how='all')

books.isnull().sum()


genre    0
title    0
dtype: int64

In [12]:
books_clean['title'].fillna('placeholder_word', inplace=True)

In [13]:
books_clean.dropna(subset=['title'], inplace=True)

6. KONSTRUKSI DATA

In [14]:
#melakukan preprocessing pada kolom title (judul buku)

#menampilkan "judul buku" sebelum di preprocessing
books_clean['title']


0                    Muatan Lokal Ensiklopedia IPTEK 7
1                   Cakrawala Sains Serba Serbi Energi
2                                      Mengenal Energi
3                                     Panas dan Energi
4                      Mari Bereksperimen dengan Sains
                             ...                      
1983                                sejarah umum eropa
1984                                 sejarah umum asia
1985                               sejarah umum afrika
1986    sejarah umum amerika utara dan amerika selatan
1987               sejarah umum bagian lain dari bumi 
Name: title, Length: 1988, dtype: object

In [15]:
#case folding
def clean(text):
    #menghapus garis miring dan tanda kutip
    text = re.sub("\'", "", str(text))
    #menghapus semuanya kecuali huruf
    text = re.sub("[^a-zA-Z]", " ", text)
    #menghapus spasi
    text = ' '.join(text.split())
    #mengubah text menjadi huruf kecil
    text = text.lower()
    return text


books_clean.loc[:, 'title'] = books_clean.loc[:,'title'].apply(lambda x: clean(x))

#menampilkan hasil text-cleaning
books_clean['title'].head()


0       muatan lokal ensiklopedia iptek
1    cakrawala sains serba serbi energi
2                       mengenal energi
3                      panas dan energi
4       mari bereksperimen dengan sains
Name: title, dtype: object

In [16]:
#tokenizing
def token(title):
  nstr = title.split(' ')
  dat = []
  a = -1
  for hu in nstr:
    a = a + 1
  if hu == '':
    dat.append(a)
  p = 0
  b = 0
  for q in dat:
    b = q - p
    del nstr[b]
    p = p + 1
  return nstr


books_clean['title'] = books_clean['title'].apply(token)

#menampilkan hasil tokenizing
books_clean['title'].head()


0        [muatan, lokal, ensiklopedia, iptek]
1    [cakrawala, sains, serba, serbi, energi]
2                          [mengenal, energi]
3                        [panas, dan, energi]
4        [mari, bereksperimen, dengan, sains]
Name: title, dtype: object

In [17]:
#filtering
#removal of stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')


def stopword_removal(title):
  filtering = stopwords.words('indonesian', 'english')
  x = []
  data = []

  def nyFunc(x):
    if x in filtering:
      return False
    else:
      return True
  fit = filter(nyFunc, title)
  for x in fit:
    data.append(x)
  return data


books_clean['title'] = books_clean['title'].apply(stopword_removal)

#menampilkan hasil filtering
books_clean['title'].head()


[nltk_data] Downloading package stopwords to C:\Users\win
[nltk_data]     X\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        [muatan, lokal, ensiklopedia, iptek]
1    [cakrawala, sains, serba, serbi, energi]
2                          [mengenal, energi]
3                             [panas, energi]
4                [mari, bereksperimen, sains]
Name: title, dtype: object

In [18]:
#stemming
from sklearn.pipeline import Pipeline
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()
def stemming(sentence):
    stemSentence = ""
    for word in sentence:
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

books_clean['title'] = books_clean['title'].apply(lambda x: stemming(x))

#menampilkan hasil stemming
books_clean['title'].head()

0         muat lokal ensiklopedia iptek
1    cakrawala sains serba serbi energi
2                          kenal energi
3                          panas energi
4                 mari eksperimen sains
Name: title, dtype: object

In [19]:
books_clean.dtypes

genre     int64
title    object
dtype: object

In [20]:
books_clean = books_clean.astype({'title':'string'})
books_clean = books_clean.astype({'genre': 'category'})

books_clean.dtypes

genre          category
title    string[python]
dtype: object

7. MODELLING

In [21]:
#proses TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(books_clean['title'])


In [22]:
#proses WORD2VEC
import gensim

sentences = [title.split() for title in books_clean['title']]
model = gensim.models.Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=0)


In [23]:
#proses kombinasi TF-IDF dan Word2Vec
# Fungsi untuk menghitung vektor rata-rata dari kata dalam judul buku
def get_average_word2vec(title, model, tfidf_vectorizer):
    words = title.split()
    vectors = []
    for word in words:
        if word in model.wv:
            word_vector = model.wv[word]
            tfidf_value = tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_.get(word, 0)]
            vectors.append(word_vector * tfidf_value)
    if len(vectors) > 0:
        return sum(vectors) / len(vectors)
    else:
        return [0.0] * model.vector_size

X_word2vec = [get_average_word2vec(title, model, tfidf_vectorizer) for title in books['title']]

# Gabungkan vektor TF-IDF dan Word2Vec
X_combined = [list(tfidf_vector.toarray()[0]) + list(word2vec) for tfidf_vector, word2vec in zip(X_tfidf, X_word2vec)]


In [33]:
#fitting the models -- 80-20% split
x_train, x_test, y_train, y_test = train_test_split(X_combined, books_clean['genre'], train_size=0.8, test_size=0.2, random_state=100)

In [34]:
#proses oversampling
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=100)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

print('original sample: ', np.unique(y_train, return_counts=True))
print('after oversampling: ', np.unique(y_train_res, return_counts=True))

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


original sample:  (array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900], dtype=int64), array([107,  91, 121, 218,  37, 262, 259, 153, 283,  59], dtype=int64))
after oversampling:  (array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900], dtype=int64), array([283, 283, 283, 283, 283, 283, 283, 283, 283, 283], dtype=int64))


In [35]:
#SVM
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

svc = svm.SVC(kernel='linear', C=1, decision_function_shape='ovr').fit(
    x_train_res, y_train_res)
predicted = svc.predict(x_test)
print("SVM Accuracy: ", accuracy_score(y_test, predicted)*100)
print("\n", classification_report(y_test, predicted, zero_division=0))


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


SVM Accuracy:  72.11055276381909

               precision    recall  f1-score   support

           0       1.00      0.85      0.92        27
         100       0.81      0.50      0.62        26
         200       0.94      0.57      0.71        28
         300       0.74      0.62      0.67        45
         400       1.00      0.90      0.95        10
         500       0.83      0.72      0.77        54
         600       0.72      0.78      0.75        60
         700       0.94      0.68      0.79        50
         800       0.49      0.84      0.62        76
         900       0.82      0.64      0.72        22

    accuracy                           0.72       398
   macro avg       0.83      0.71      0.75       398
weighted avg       0.78      0.72      0.73       398



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [32]:
#uji coba
judul_buku = "mencegah penyakit diabetes"

# Proses judul buku dengan TF-IDF dan Word2Vec
tfidf_vector_judul = tfidf_vectorizer.transform([judul_buku])
word2vec_judul = get_average_word2vec(judul_buku, model, tfidf_vectorizer)

# Gabungkan vektor TF-IDF dan Word2Vec dari judul buku
X_combined_judul = list(tfidf_vector_judul.toarray()[0]) + list(word2vec_judul)

# Gunakan model SVM yang telah dilatih untuk memprediksi genre buku
predicted_genre = svc.predict([X_combined_judul])

print(f"Prediksi Genre Buku: {predicted_genre[0]}")

Prediksi Genre Buku: 800


In [36]:
with open('model.pkl', 'wb') as files:
    pickle.dump(svc,files)

In [26]:
with open('tfidf.pkl', 'wb') as files:
    pickle.dump(tfidf_vectorizer,files)

In [28]:
with open('word2vec.pkl', 'wb') as files:
    pickle.dump(model,files)