**Model Pengkategorisasian Secara Otomatis**

In [3378]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')

**Persiapkan Dataset**

In [3379]:
data = pd.read_csv(r'C:\Users\jovan\OneDrive\Documents\Vs Code\Python VSCODE\Bangkit\Capstone\Data\dataset_final.csv')
# Memilih kolom tertentu dari dataframe
df = data[['nama', 'kategori']]

# Kolom 'Lainnya'
df['Lainnya'] = pd.DataFrame(df['kategori']).apply(lambda row: 1 if 'Lainnya' in row.values else 0, axis=1)

# Kolom 'Makanan dan Minuman'
df['Makanan dan Minuman'] = pd.DataFrame(df['kategori']).apply(lambda row: 1 if 'Makanan dan Minuman' in row.values else 0, axis=1)

# Kolom 'Belanja'
df['Belanja'] = pd.DataFrame(df['kategori']).apply(lambda row: 1 if 'Belanja' in row.values else 0, axis=1)

# Kolom 'Hiburan'
df['Hiburan'] = pd.DataFrame(df['kategori']).apply(lambda row: 1 if 'Hiburan' in row.values else 0, axis=1)

In [3380]:
df

Unnamed: 0,nama,kategori,Lainnya,Makanan dan Minuman,Belanja,Hiburan
0,Bensin,Lainnya,1,0,0,0
1,Bayar Listrik,Lainnya,1,0,0,0
2,Bayar Air,Lainnya,1,0,0,0
3,Pajak Motor,Lainnya,1,0,0,0
4,Pajak Mobil,Lainnya,1,0,0,0
...,...,...,...,...,...,...
6006,Penghapus,Belanja,0,0,1,0
6007,Penggaris,Belanja,0,0,1,0
6008,Rautan Pensil,Belanja,0,0,1,0
6009,Pensil Warna,Belanja,0,0,1,0


**Split Dataset**

In [3381]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.utils import _safe_indexing
from sklearn.utils.validation import _num_samples
from sklearn.model_selection._split import _validate_shuffle_split

In [3382]:
def multilabel_train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None):
    if stratify is None:
        return train_test_split(*arrays, test_size=test_size, train_size=train_size, random_state=random_state, stratify=None, shuffle=shuffle)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=0.25)

    cv = MultilabelStratifiedShuffleSplit(test_size=n_test, train_size=n_train, random_state=123)
    train, test = next(cv.split(X=arrays[0], y=stratify))

    return [(_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays]


In [3383]:
df = df.drop_duplicates()

print(df[df.index.duplicated()])


Empty DataFrame
Columns: [nama, kategori, Lainnya, Makanan dan Minuman, Belanja, Hiburan]
Index: []


In [3384]:
X = df['nama']
Y = df.drop(columns=['nama','kategori'])
X_train,X_test,Y_train,Y_test = multilabel_train_test_split(np.array(X), np.array(Y),test_size=0.2)

**Vectorized Dataset**

In [3385]:
# Create vectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Vectorizer X_train
X_train_values = tfidf_vectorizer.fit_transform(X_train)

# Vectorizer X_test
X_test_values = tfidf_vectorizer.transform(X_test)


In [3386]:
joblib.dump(tfidf_vectorizer,'tfidfvectorizer.pkl')

['tfidfvectorizer.pkl']

In [3387]:
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier


**SVC**

In [3388]:
Model = LinearSVC(random_state=42)
ModelSVC = MultiOutputClassifier(Model)
ModelSVC.fit(X_train_values, Y_train)

In [3389]:
predict = ModelSVC.predict(X_test_values)

In [3390]:
accuracy_score(predict,Y_test)

0.892271662763466

In [3391]:
f1_score(predict,Y_test,average='micro')

0.8985849056603774

In [3392]:
recall_score(predict,Y_test,average='micro')

0.9049881235154394

In [3393]:
precision_score(predict,Y_test,average='micro')

0.892271662763466

In [3394]:
classification_report(predict,Y_test)

'              precision    recall  f1-score   support\n\n           0       0.68      1.00      0.81        27\n           1       1.00      0.87      0.93       306\n           2       0.00      0.00      0.00         0\n           3       0.77      1.00      0.87        88\n\n   micro avg       0.89      0.90      0.90       421\n   macro avg       0.61      0.72      0.65       421\nweighted avg       0.93      0.90      0.91       421\n samples avg       0.89      0.89      0.89       421\n'

In [3395]:
joblib.dump(ModelSVC,'ModelFinal.pkl')


['ModelFinal.pkl']

**Result**

In [3396]:
Y_train_columns = Y.columns


joblib.dump(Y_train_columns,'Y_train_columns.pkl')

['Y_train_columns.pkl']

In [3412]:
import re

def modelFunction(nama, tfidf_vectorizer):
    m = joblib.load('ModelFinal.pkl')

    # Jika input kosong atau tidak sesuai pola
    

    text_vectorizer = tfidf_vectorizer.transform([nama])
    prediction = m.predict(text_vectorizer)

    result = []

    # Cek apakah hasil prediksi memiliki nilai 1
    if 1 in prediction:
        for pre in np.where(prediction == 1)[1]:
            result.append(Y_train_columns[pre])
    elif not nama or not re.match("^[a-zA-Z0-9]+$", nama):
        return ["Lainnya"]

    return result


In [3413]:
j = modelFunction("Siomay Rp.5000",tfidf_vectorizer)
j

['Makanan dan Minuman']

In [3415]:
# Minta input dari pengguna untuk teks yang ingin diprediksi
input_user = input("Masukkan teks yang ingin diprediksi: ")
user_input = modelFunction(input_user, tfidf_vectorizer)
print("Hasil Prediksi:", user_input)

Hasil Prediksi: ['Lainnya']


In [3400]:
print(df.columns)
print(df.shape)
print(df.head)

Index(['nama', 'kategori', 'Lainnya', 'Makanan dan Minuman', 'Belanja',
       'Hiburan'],
      dtype='object')
(2134, 6)
<bound method NDFrame.head of                nama kategori  Lainnya  Makanan dan Minuman  Belanja  Hiburan
0            Bensin  Lainnya        1                    0        0        0
1     Bayar Listrik  Lainnya        1                    0        0        0
2         Bayar Air  Lainnya        1                    0        0        0
3       Pajak Motor  Lainnya        1                    0        0        0
4       Pajak Mobil  Lainnya        1                    0        0        0
...             ...      ...      ...                  ...      ...      ...
6006      Penghapus  Belanja        0                    0        1        0
6007      Penggaris  Belanja        0                    0        1        0
6008  Rautan Pensil  Belanja        0                    0        1        0
6009   Pensil Warna  Belanja        0                    0        1        0
