## Content-Based Filtering (Rekomendasi Destinasi)

In [None]:
# %% [markdown]
# # TRAVELMATE - Rekomendasi Wisata Yogyakarta
# **Teknologi:** Content-Based Filtering
# **Dataset:** tourism_with_id.csv

# %% [markdown]
# ## 1. Persiapan Environment
# **Pastikan struktur folder:**
# ```
# Project/
# ├── Datasets/
# │   └── tourism_with_id.csv
# └── Notebook/
#     └── TravelMate_Jogja.ipynb
# ```

# %% [markdown]
# ## 2. Import Library
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# %% [markdown]
# ## 3. Load Data dengan Validasi
# Load the tourism dataset. The relative path matches the folder layout
# described above (notebook in Notebook/, CSV in Datasets/).
try:
    df = pd.read_csv('../Datasets/tourism_with_id.csv')
except FileNotFoundError:
    print("❌ Error: File tidak ditemukan. Pastikan:")
    print("1. Folder 'Datasets' ada di direktori parent")
    print("2. Nama file 'tourism_with_id.csv' benar")
    print(f"Direktori saat ini: {os.getcwd()}")
    # Re-raise after printing the hints: every later cell needs `df`, so
    # continuing would only fail further down with a less helpful NameError.
    raise
else:
    # Only the read itself is guarded; the summary runs once loading succeeded.
    print("✅ Dataset berhasil dimuat")
    print(f"Jumlah data: {len(df)}")
    print(f"Kolom yang tersedia: {list(df.columns)}")

# %% [markdown]
# ## 4. Filter Data Yogyakarta
# Indonesian stop words removed before TF-IDF weighting. They are passed as an
# explicit list because the vectorizer only accepts 'english', a list, or None.
indonesian_stop_words = [
    'yang', 'di', 'ke', 'dari', 'dan', 'untuk', 'pada', 'dengan',
    'ini', 'itu', 'atau', 'juga', 'dalam', 'tidak', 'akan', 'ada'
]

# Defaults so that later cells can safely test `df_jogja.empty` even when the
# 'City' column is missing or no row matches the filter.
df_jogja = pd.DataFrame()
tfidf = None
cosine_sim = None

# Keep only Yogyakarta rows
if 'City' in df.columns:
    # reset_index makes the index labels equal to the row positions, so they
    # line up with the rows/columns of the positional `cosine_sim` matrix.
    df_jogja = (
        df[df['City'].str.contains('Yogyakarta|DIY', case=False, na=False)]
        .copy()
        .reset_index(drop=True)
    )

    print("\n🔎 Data Yogyakarta:")
    print(f"Jumlah destinasi: {len(df_jogja)}")
    print(f"Kategori unik: {df_jogja['Category'].unique()}")

    # Vectorizing an empty corpus raises, so skip it when nothing matched.
    if not df_jogja.empty:
        # Feature engineering: one text document per destination.
        # Missing text fields are replaced by '' because NaN + str is NaN,
        # which the vectorizer rejects as a document.
        df_jogja['features'] = (
            df_jogja['Place_Name'].fillna('').astype(str) + ' ' +
            df_jogja['Category'].fillna('').astype(str) + ' ' +
            df_jogja['Price'].astype(str) + ' ' +
            df_jogja['Rating'].astype(str)
        )

        # TF-IDF over unigrams and bigrams, capped at 500 features
        tfidf = TfidfVectorizer(
            stop_words=indonesian_stop_words,
            ngram_range=(1, 2),
            max_features=500
        )
        tfidf_matrix = tfidf.fit_transform(df_jogja['features'])

        # Pairwise similarity between all destinations (row i <-> df_jogja row i)
        cosine_sim = cosine_similarity(tfidf_matrix)

else:
    print("❌ Kolom 'City' tidak ditemukan dalam dataset")

# %% [markdown]
# ## 5. Fungsi Rekomendasi dengan Validasi
def rekomendasi_jogja(place_id, n=5):
    """Return the `n` Yogyakarta destinations most similar to `place_id`.

    Similarity scores are read from the module-level `cosine_sim` matrix,
    whose rows and columns follow the row order of the module-level
    `df_jogja`.

    Args:
        place_id: Value to look up in `df_jogja['Place_Id']`.
        n: Maximum number of recommendations to return (default 5).

    Returns:
        A DataFrame with the columns Place_Id, Place_Name, Category, Rating
        and Price, ordered from most to least similar. If `place_id` is not
        present, the string "⚠️ ID tidak valid" is returned instead; if any
        other exception occurs, a string starting with "🚨 Error: ".
    """
    try:
        id_matches = (df_jogja['Place_Id'] == place_id).to_numpy().nonzero()[0]
        if len(id_matches) == 0:
            return "⚠️ ID tidak valid"

        # Use the row *position*, not the index label: `cosine_sim` is a plain
        # positional matrix, while `df_jogja` may carry index labels inherited
        # from the unfiltered dataset.
        pos = int(id_matches[0])

        # Drop the query row explicitly instead of assuming it sorts first;
        # an item with identical features would tie with it at the top.
        candidates = [
            (i, score) for i, score in enumerate(cosine_sim[pos]) if i != pos
        ]
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        top_positions = [i for i, _ in candidates[:max(n, 0)]]

        result = df_jogja.iloc[top_positions][[
            'Place_Id', 'Place_Name', 'Category', 'Rating', 'Price'
        ]]

        return result

    except Exception as e:
        # Kept for backward compatibility: callers print the returned value.
        return f"🚨 Error: {str(e)}"

# %% [markdown]
# ## 6. Contoh Penggunaan
# Demo: show recommendations for the first Yogyakarta destination, if any.
if df_jogja.empty:
    print("\nTidak ada data Yogyakarta yang tersedia")
else:
    sample_id = df_jogja['Place_Id'].iloc[0]
    print(f"\nContoh rekomendasi untuk ID {sample_id}:")
    print(rekomendasi_jogja(sample_id))

# %% [markdown]
# ## 7. Simpan Model
# Persist the fitted vectorizer, the similarity matrix and the id/name lookup
# table together in a single joblib file (skipped when there is no data).
if not df_jogja.empty:
    joblib.dump(
        value=dict(
            tfidf=tfidf,
            cosine_sim=cosine_sim,
            metadata=df_jogja[['Place_Id', 'Place_Name']],
        ),
        filename='travelmate_jogja_model.joblib',
    )
    print("\n💾 Model berhasil disimpan")

Kota yang tersedia: ['Jakarta' 'Yogyakarta' 'Bandung' 'Semarang' 'Surabaya']

Jumlah Data Yogyakarta: 126


InvalidParameterError: The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got 'indonesian' instead.