In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint




In [None]:
# 1. Load Dataset
# Read all_dataset.csv (path is relative to the notebook's working directory)
# into the raw frame `df`; later cells only derive new frames from it.
df = pd.read_csv("all_dataset.csv")
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,seller_id,product_name,product_id,buyer_id,product_rating,product_price
0,457,40 Set Menu Sehari-hari Hits di Instagram ala ...,30070,1504,4.9,142800
1,457,7 Hari Belajar Drone Photography (Edisi Revisi),30079,1435,5.0,102000
2,457,Akasha : City Hunter - Complete Edition 03,30121,1560,4.9,49300
3,457,AKASHA : FATE/APOCRYPHA 02,30163,271,4.8,38250
4,457,AKASHA : RECORD OF RAGNAROK 03,30219,6,5.0,38250
...,...,...,...,...,...,...
49995,105,Samyang V-AF 24mm T1.9 Lens for Sony FE Samyan...,23658,47,5.0,8979000
49996,135,Godox S60 LED Focusing 3-Light Kit,29682,132,5.0,18849000
49997,134,Memory 128GB SF-M Tough Series UHS-II SDXC Mem...,28802,250,5.0,1599000
49998,4,Insta360 One X3 Sticky Lensguard,20253,248,5.0,539000


In [None]:
# 2. Remove repeated listings of the same product in the same store:
# keep one row per distinct combination of the five selected columns.
df_unique = df.loc[
    :, ['product_id', 'product_name', 'seller_id', 'product_rating', 'product_price']
].drop_duplicates()
df_unique

Unnamed: 0,product_id,product_name,seller_id,product_rating,product_price
0,30070,40 Set Menu Sehari-hari Hits di Instagram ala ...,457,4.9,142800
1,30079,7 Hari Belajar Drone Photography (Edisi Revisi),457,5.0,102000
2,30121,Akasha : City Hunter - Complete Edition 03,457,4.9,49300
3,30163,AKASHA : FATE/APOCRYPHA 02,457,4.8,38250
4,30219,AKASHA : RECORD OF RAGNAROK 03,457,5.0,38250
...,...,...,...,...,...
49994,24137,Keychron K12-H2 RGB Backlight Gateron Hot-swap...,104,5.0,1399000
49995,23658,Samyang V-AF 24mm T1.9 Lens for Sony FE Samyan...,105,5.0,8979000
49997,28802,Memory 128GB SF-M Tough Series UHS-II SDXC Mem...,134,5.0,1599000
49998,20253,Insta360 One X3 Sticky Lensguard,4,5.0,539000


In [None]:
# 3. Assign Category
def assign_category(pid):
    """Return the category name implied by the first character of a product id.

    The id is converted with ``str()`` and only its leading character is
    inspected: '1' -> 'alat_masak', '2' -> 'kamera', '3' -> 'buku'.
    Any other leading character (or an empty string) yields 'lainnya'.
    """
    leading_char_to_category = {
        '1': 'alat_masak',
        '2': 'kamera',
        '3': 'buku',
    }
    # [:1] instead of [0] so an empty string falls through to the default.
    return leading_char_to_category.get(str(pid)[:1], 'lainnya')

# Tag every row with the category that assign_category derives from its product_id.
df_unique['category'] = df_unique['product_id'].apply(assign_category)

In [None]:
# 4. Text preprocessing
# Lower-case the product names, then turn non-breaking spaces (U+00A0) and
# hair spaces (U+200A) into ordinary spaces. The cleaned text is stored in a
# separate column so the original product_name stays available for display.
df_unique['product_name2'] = (
    df_unique['product_name']
    .str.lower()
    .str.replace(u'\xa0', ' ')
    .str.replace(u'\u200a', ' ')
)
# Plain Python list of the cleaned names.
corpus = df_unique['product_name2'].tolist()


In [None]:
# 5. Map category names to integer labels
kategori_map = {
    'alat_masak': 0,
    'kamera': 1,
    'buku': 2
}

df_unique['category_label'] = df_unique['category'].map(kategori_map)

# Series.map leaves NaN for any category that has no entry in kategori_map
# (for example the 'lainnya' fallback returned by assign_category). A NaN label
# also turns the whole column into floats, which cannot be used as the one-hot
# index further down, so stop here with an explicit message instead.
unmapped_categories = df_unique.loc[df_unique['category_label'].isna(), 'category'].unique()
if len(unmapped_categories) > 0:
    raise ValueError(
        "No label defined in kategori_map for categories: "
        f"{sorted(map(str, unmapped_categories))}"
    )

In [None]:
# 6. Fit TF-IDF on the product names
# TfidfVectorizer is used with its default settings. fit_transform learns the
# vocabulary from the preprocessed names and returns the TF-IDF matrix, whose
# rows follow the row order of df_unique (later cells index both by position).
vectorizer = TfidfVectorizer()
tfidf_matrix_produk = vectorizer.fit_transform(df_unique['product_name2'])

In [None]:
# Width of the one-hot category block. Derived from kategori_map rather than
# hard-coded so it cannot drift when a category is added to or removed from the map.
num_categories = len(kategori_map)
# Width of the TF-IDF block: number of columns of the fitted TF-IDF matrix.
produk_vec_dim = tfidf_matrix_produk.shape[1]

In [9]:
from sklearn.model_selection import train_test_split


In [None]:
# Build the model inputs in one vectorised step instead of a per-row Python loop.
# Each row of X is [one-hot category | dense TF-IDF vector] for the product at
# the same position in df_unique / tfidf_matrix_produk.
category_labels = df_unique['category_label'].to_numpy()
# Indexing the identity matrix with the label array yields one one-hot row per product.
category_one_hot = np.eye(num_categories)[category_labels]
X = np.hstack([category_one_hot, tfidf_matrix_produk.toarray()]).astype('float32')
y = df_unique['category_label'].values

# Binary target: 1.0 where the label is 1 (the value kategori_map gives 'kamera'), else 0.0.
# NOTE(review): the target is derived from category_label, which is also one-hot
# encoded in the first num_categories columns of X, so the classifier can read
# the answer directly from its input. Confirm this is intended.
y_binary = (y == 1).astype(np.float32)

# Train/test split (70/30, fixed seed for reproducibility).
# NOTE(review): X_test / y_test are not used by any cell visible in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.3, random_state=42)

In [None]:
# 7. Build the neural network
# Feed-forward binary classifier assembled layer by layer:
#   input  : num_categories + produk_vec_dim features
#   hidden : 64 units, ReLU
#   output : 1 unit, sigmoid
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(num_categories + produk_vec_dim,)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [None]:
# 8. Callbacks: early stopping and best-model checkpoint
# Both callbacks watch the training metric 'accuracy'.
model_path = "models/model_final2.h5"
early_stop = EarlyStopping(
    monitor='accuracy',
    patience=3,
    restore_best_weights=True,
)
checkpoint = ModelCheckpoint(
    filepath=model_path,
    monitor='accuracy',
    save_best_only=True,
)

In [None]:
# 9. Train model
# Up to 10 epochs with batch size 32; early_stop may end training sooner and
# checkpoint writes the best model to model_path. No validation data is passed,
# so both callbacks act on the training accuracy.
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1, callbacks=[early_stop, checkpoint])

Epoch 1/10


Epoch 2/10


  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.src.callbacks.History at 0x25fabb4dc50>

In [None]:
# 10. Build the recommendation function
def recommend_with_nn(keyword, df_produk, vectorizer, tfidf_matrix_produk, model,
                      top_n=30, n_candidates=20):
    """Recommend products for a search keyword.

    The keyword is lower-cased and vectorised with ``vectorizer``; the
    ``n_candidates`` rows of ``tfidf_matrix_produk`` with the highest cosine
    similarity to it form the candidate pool. Every candidate is turned into
    the feature row ``[one-hot category_label | dense TF-IDF vector]``, scored
    by ``model`` in a single batched ``predict`` call, and the candidates are
    returned ordered by descending score.

    Parameters
    ----------
    keyword : str
        Search text typed by the user.
    df_produk : pandas.DataFrame
        Product table whose row order matches ``tfidf_matrix_produk``. Must
        contain ``product_name`` and ``category_label``; ``seller_id`` is
        optional and reported as ``'N/A'`` when the column is missing.
    vectorizer
        Fitted vectorizer providing ``transform``.
    tfidf_matrix_produk
        Sparse TF-IDF matrix with one row per row of ``df_produk``.
    model
        Object providing ``predict(x, verbose=0)``; the first output column is
        used as the ranking score.
    top_n : int, default 30
        Maximum number of rows to return. The result never holds more rows
        than the candidate pool, so raise ``n_candidates`` as well to actually
        receive more than 20 rows.
    n_candidates : int, default 20
        Size of the candidate pool taken by cosine similarity.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_name`` and ``seller_id`` (the score itself is not
        included), at most ``top_n`` rows. The index holds each row's rank in
        the similarity-ordered candidate pool.

    Notes
    -----
    Reads the module-level ``num_categories`` for the width of the one-hot block.
    """
    query_vec = vectorizer.transform([keyword.lower()])
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix_produk).flatten()

    # Most similar first. Reversing before slicing (rather than [-k:][::-1])
    # also behaves for k == 0, where [-0:] would select every row.
    # NOTE(review): candidates are taken purely by rank, so rows with zero
    # similarity are included when fewer than n_candidates rows match the query.
    candidate_indices = cosine_sim.argsort()[::-1][:max(n_candidates, 0)]

    kandidat = df_produk.iloc[candidate_indices]
    df_hasil = pd.DataFrame({
        'product_name': kandidat['product_name'].to_numpy(),
        'seller_id': (kandidat['seller_id'].to_numpy()
                      if 'seller_id' in kandidat.columns else 'N/A'),
    })
    if df_hasil.empty:
        # Nothing to score; return the empty frame with the expected columns.
        return df_hasil

    # Assemble all candidate feature rows at once and score them in one batch
    # instead of issuing separate single-row predict calls per candidate.
    labels = kandidat['category_label'].to_numpy().astype(int)
    category_one_hot = np.eye(num_categories)[labels]
    produk_tfidf = tfidf_matrix_produk[candidate_indices].toarray()
    x_input = np.hstack([category_one_hot, produk_tfidf]).astype('float32')
    scores = np.asarray(model.predict(x_input, verbose=0))[:, 0]

    # Stable sort: candidates with equal scores keep their similarity order.
    order = np.argsort(-scores, kind='stable')
    return df_hasil.iloc[order].head(top_n)



In [19]:
# Reload the model from the checkpoint file (same path as model_path above).
# From here on `model` is the version stored on disk, not the in-memory
# object that was trained.
model = tf.keras.models.load_model("models/model_final2.h5")


In [20]:
# Ask for a search keyword (the cell blocks until one is typed), then rank the
# catalogue for it using top_n's default value.
query = input("Cari produk: ")
hasil_rekomendasi = recommend_with_nn(query, df_unique, vectorizer, tfidf_matrix_produk, model)

# Plain-text dump of the resulting frame.
print(hasil_rekomendasi)


                                         product_name  seller_id
0   Sony ZV-1 II Vlogging Camera Sony ZV1 II Sony ...        133
2   Sony ZV-1 II Vlogging Camera Sony ZV1 II Sony ...        172
3   Sony ZV-1 II Vlogging Camera Sony ZV1 II Sony ...         40
4   Sony ZV-1 II Vlogging Camera Sony ZV1 II Sony ...         31
1   Sony ZV-1 II Vlogging Camera Sony ZV1 II Sony ...         95
8   Sony A7IV Sony a74 Sony A7 IV Mirrorless Camer...         49
9   Sony A7IV Sony a74 Sony A7 IV Mirrorless Camer...        186
10  Sony A7IV Sony a74 Sony A7 IV Mirrorless Camer...        154
7   Sony A7IV Sony a74 Sony A7 IV Mirrorless Camer...        189
6   Sony A7IV Sony a74 Sony A7 IV Mirrorless Camer...         42
5   Sony A7IV Sony a74 Sony A7 IV Mirrorless Camer...         28
14  Sony Alpha A7CR Sony A7C R Sony A7CR Mirrorles...         97
15  Sony Alpha A7CR Sony A7C R Sony A7CR Mirrorles...         19
16  Sony Alpha A7CR Sony A7C R Sony A7CR Mirrorles...         59
17  Sony Alpha A7CR Sony 

In [21]:
# Save the fitted vectorizer
import pickle
# Binary mode is required by pickle; the `with` block closes the file.
# Only unpickle this file from a trusted location: loading a pickle can run code.
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [22]:
from scipy.sparse import save_npz
# Persist the sparse TF-IDF matrix to tfidf_matrix.npz; its rows follow the
# row order of df_unique.
save_npz("tfidf_matrix.npz", tfidf_matrix_produk)

In [1]:
pip freeze > requirements.txt


Note: you may need to restart the kernel to use updated packages.


