# Get dataset from GitHub

In [1]:
!wget https://raw.githubusercontent.com/siskasimandalahi/DestiGo/Machine-Learning/Dataset/data_destinasi_wisata.csv

--2023-06-06 14:26:31--  https://raw.githubusercontent.com/siskasimandalahi/DestiGo/Machine-Learning/Dataset/data_destinasi_wisata.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 378078 (369K) [text/plain]
Saving to: ‘data_destinasi_wisata.csv’


2023-06-06 14:26:31 (14.5 MB/s) - ‘data_destinasi_wisata.csv’ saved [378078/378078]



# **Import the required Library**
**Don't forget the important library and mandatory -> TensorFlow** ❤

In [2]:
# for modelling
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for data processing
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Understanding
this is section that read the data and we make a upload manual the csv file.
We make a plan that the data is put into github so we not upload every time reconnect the runtike of colab

In [4]:
# read the data
all_data = pd.read_csv('data_destinasi_wisata.csv')

In [5]:
all_data.sample(5)

Unnamed: 0,id,nama_tempat,kategori,kota,alamat,deskripsi,lat,long
224,225,Istana Negara Republik Indonesia,Budaya,Jakarta Pusat,"Jl. Veteran No.16, RT.2/RW.3, Gambir, Kecamata...",Istana Negara merupakan pusat kegiatan pemerin...,-6.168069,106.823973
75,76,Taman Budaya Jawa Barat,Budaya,Bandung,"Jl. Bukit Dago Selatan No.53 A, Dago, Kecamata...",Taman Budaya Jawa Barat adalah sebuah kompleks...,-6.870099,107.616439
20,21,Kolam Renang Priangan Tirta,Taman Hiburan,Bandung,"Jl. Bojong Cipandan No.9, Cikasungka, Kec. Cik...",Kolam Renang Priangan Tirta adalah sebuah fasi...,-7.011109,107.822986
413,414,G. Walk,Kuliner,Surabaya,"G Walk Citraland, Ruko Taman Gapura, Jl. Niaga...",Sentra Wisata Kuliner Surabaya G-Walk adalah s...,-7.291548,112.654563
479,480,Mirota Batik & Handicraft Surabaya,Pusat Perbelanjaan,Surabaya,"Jl. Sulawesi No.24, Ngagel, Kec. Wonokromo, Su...",Mirota Handicraft Shop atau Mirota Batik merup...,-7.276939,112.74713


In [6]:
print(f"Number of places in the datasets : {len(all_data.id.unique())}")

Number of places in the datasets : 515


# Exploratory Data Analysis
In this section, we analysis the data that we get

In [7]:
# check info of dataset such as how many column and what is the type
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515 entries, 0 to 514
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           515 non-null    int64  
 1   nama_tempat  515 non-null    object 
 2   kategori     515 non-null    object 
 3   kota         515 non-null    object 
 4   alamat       515 non-null    object 
 5   deskripsi    514 non-null    object 
 6   lat          515 non-null    float64
 7   long         515 non-null    float64
dtypes: float64(2), int64(1), object(5)
memory usage: 32.3+ KB


In [8]:
# check if there is null
all_data.isnull().sum()

id             0
nama_tempat    0
kategori       0
kota           0
alamat         0
deskripsi      1
lat            0
long           0
dtype: int64

In [10]:
# drop the null data
all_data = all_data.dropna(subset=['deskripsi'])

In [11]:
# check the data unique of column 'Kategori'
all_data.kategori.unique()

array(['Taman Hiburan', 'Budaya', 'Alam', 'Tempat Ibadah', 'Kuliner',
       'Bahari', 'Pusat Perbelanjaan'], dtype=object)

# Data Preprocessing
Clean and preprocess the collected data to prepare it for further analysis. This may involve removing duplicates, handling missing values, normalizing text, and feature engineering.

In [12]:
# see an overview of the data
all_data.head()

Unnamed: 0,id,nama_tempat,kategori,kota,alamat,deskripsi,lat,long
0,1,Trans Studio Bandung,Taman Hiburan,Bandung,"Jl. Gatot Subroto No.289A, Cibangkong, Kec. Ba...",Trans Studio Bandung adalah sebuah kompleks ta...,-6.925094,107.636494
1,2,Farm House Susu Lembang,Taman Hiburan,Bandung,"Jl. Raya Lembang No.108, Gudangkahuripan, Kec....",Farm House Susu Lembang adalah destinasi yang ...,-6.832969,107.605618
2,3,Dusun Bambu,Taman Hiburan,Bandung,"Jl. Kolonel Masturi No.KM. 11, Kertawangi, Kec...",Dusun Bambu adalah destinasi wisata yang menaw...,-6.789715,107.579163
3,4,Kebun Binatang Bandung,Taman Hiburan,Bandung,"Jl. Kebun Binatang No.6, Lb. Siliwangi, Kecama...","Kebun Binatang Bandung, juga dikenal sebagai K...",-6.889718,107.607728
4,5,Dago Dreampark,Taman Hiburan,Bandung,"Jl. Dago Giri No.Km. 2.2, Pagerwangi, Kec. Lem...",Dago Dreampark adalah kompleks rekreasi yang t...,-6.848642,107.625939


In [14]:
# make just column 'id', 'nama_tempat', 'kategori', 'kota', 'deskripsi
new_data = all_data.drop('alamat', axis=1)
new_data

Unnamed: 0,id,nama_tempat,kategori,kota,deskripsi,lat,long
0,1,Trans Studio Bandung,Taman Hiburan,Bandung,Trans Studio Bandung adalah sebuah kompleks ta...,-6.925094,107.636494
1,2,Farm House Susu Lembang,Taman Hiburan,Bandung,Farm House Susu Lembang adalah destinasi yang ...,-6.832969,107.605618
2,3,Dusun Bambu,Taman Hiburan,Bandung,Dusun Bambu adalah destinasi wisata yang menaw...,-6.789715,107.579163
3,4,Kebun Binatang Bandung,Taman Hiburan,Bandung,"Kebun Binatang Bandung, juga dikenal sebagai K...",-6.889718,107.607728
4,5,Dago Dreampark,Taman Hiburan,Bandung,Dago Dreampark adalah kompleks rekreasi yang t...,-6.848642,107.625939
...,...,...,...,...,...,...,...
510,511,Klenteng Pak Kik Bio,Tempat Ibadah,Surabaya,Kelenteng Pak KIk Bio adalah sebuah kelenteng ...,-7.247918,112.744513
511,512,Klenteng Sanggar Agung,Tempat Ibadah,Surabaya,Kelenteng Sanggar Agung atau Klenteng Hong San...,-7.246944,112.802222
512,513,Masjid Agung Sunan Ampel,Tempat Ibadah,Surabaya,Masjid Agung Sunan Ampel adalah sebuah masjid ...,-7.230321,112.742911
513,514,Masjid Muhammad Cheng Hoo,Tempat Ibadah,Surabaya,Masjid Cheng Hoo Surabaya adalah Masjid bernua...,-7.252275,112.746880


In [15]:
# new_data.info()
# Define columns with float type, which is lat and long
float_cols = new_data.select_dtypes(include=['float']).columns

In [16]:
# Delete columns with float type
new_data = new_data.drop(columns=float_cols)

In [17]:
# make a single column of "kategori" column and "kota" column because later we want to retrieve unique words from categories and cities
new_data['category_city'] = new_data[['kategori','kota']].agg(' '.join,axis=1)

In [18]:
# Remove duplicates
new_data_prep = new_data.drop_duplicates('id')

In [20]:
# converts the columns of a DataFrame object into a Python list.
dest_id = new_data_prep.id.tolist()
dest_name = new_data_prep.nama_tempat.tolist()
dest_desc = new_data_prep.deskripsi.tolist()
dest_category = new_data_prep.kategori.tolist()
dest_city = new_data_prep.kota.tolist()
category_city = new_data_prep.category_city.tolist()

In [21]:
# make a new header in new_data
tourist_data = pd.DataFrame({
    "id" : dest_id,
    "name" : dest_name,
    "category" : dest_category,
    "city" : dest_city,
    "description" : dest_desc,
    "category_city" : category_city
})

tourist_data

Unnamed: 0,id,name,category,city,description,category_city
0,1,Trans Studio Bandung,Taman Hiburan,Bandung,Trans Studio Bandung adalah sebuah kompleks ta...,Taman Hiburan Bandung
1,2,Farm House Susu Lembang,Taman Hiburan,Bandung,Farm House Susu Lembang adalah destinasi yang ...,Taman Hiburan Bandung
2,3,Dusun Bambu,Taman Hiburan,Bandung,Dusun Bambu adalah destinasi wisata yang menaw...,Taman Hiburan Bandung
3,4,Kebun Binatang Bandung,Taman Hiburan,Bandung,"Kebun Binatang Bandung, juga dikenal sebagai K...",Taman Hiburan Bandung
4,5,Dago Dreampark,Taman Hiburan,Bandung,Dago Dreampark adalah kompleks rekreasi yang t...,Taman Hiburan Bandung
...,...,...,...,...,...,...
509,511,Klenteng Pak Kik Bio,Tempat Ibadah,Surabaya,Kelenteng Pak KIk Bio adalah sebuah kelenteng ...,Tempat Ibadah Surabaya
510,512,Klenteng Sanggar Agung,Tempat Ibadah,Surabaya,Kelenteng Sanggar Agung atau Klenteng Hong San...,Tempat Ibadah Surabaya
511,513,Masjid Agung Sunan Ampel,Tempat Ibadah,Surabaya,Masjid Agung Sunan Ampel adalah sebuah masjid ...,Tempat Ibadah Surabaya
512,514,Masjid Muhammad Cheng Hoo,Tempat Ibadah,Surabaya,Masjid Cheng Hoo Surabaya adalah Masjid bernua...,Tempat Ibadah Surabaya


# Content Based Filtering
technique used in recommender systems to recommend items or content to users based on the characteristics or properties of the items themselves.

*make a modelling data using content-based filtering technique*

In [22]:
data = tourist_data
data.sample(5)

Unnamed: 0,id,name,category,city,description,category_city
187,189,Atmosphere Resort Cafe,Kuliner,Bandung,Atmosphere Resort Cafe adalah sebuah kafe yang...,Kuliner Bandung
264,266,Dante Coffee,Kuliner,Jakarta Pusat,Dante Coffee ini terletak di dalam Liberty Hot...,Kuliner Jakarta Pusat
313,315,Le Mint Indian & Chinese Restaurant,Kuliner,Jakarta Utara,"Le Mint Indian & Chinese Restaurant, tempat tr...",Kuliner Jakarta Utara
249,251,Carl's Jr,Kuliner,Jakarta Barat,Jika ingin mencari restoran cepat saji di Peta...,Kuliner Jakarta Barat
220,222,Museum Wayang,Budaya,Jakarta Barat,Museum Wayang adalah sebuah museum yang berlok...,Budaya Jakarta Barat


# TF-IDF Vectorizer
TF-IDF Vectorizer is a commonly used technique in natural language processing (NLP) and information retrieval to convert text documents into numerical feature vectors. TF-IDF stands for "Term Frequency-Inverse Document Frequency."

In [23]:
cv = CountVectorizer()
cv.fit(data['category_city'])

print("Features Name: ", list(cv.vocabulary_.keys()))

Features Name:  ['taman', 'hiburan', 'bandung', 'budaya', 'alam', 'tempat', 'ibadah', 'kuliner', 'bahari', 'jakarta', 'utara', 'barat', 'pusat', 'selatan', 'timur', 'perbelanjaan', 'surabaya']


In [24]:
cv_matrix = cv.transform(data['category_city']) 
 
cv_matrix.shape 

(514, 17)

In [25]:
# change that to one-hot encoding
cv_matrix.todense()

matrix([[0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 0]])

In [26]:
# show it in the table
pd.DataFrame(
    cv_matrix.todense(),
    columns = list(cv.vocabulary_.keys()),
    index = data.name
).sample(5)

Unnamed: 0_level_0,taman,hiburan,bandung,budaya,alam,tempat,ibadah,kuliner,bahari,jakarta,utara,barat,pusat,selatan,timur,perbelanjaan,surabaya
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
The Coffee Bean & Tea Leaf,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0
Kolam Renang Priangan Tirta,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0
Bakso Mesir,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0
Coffee by Taboo,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
Wisata Batu Kuda,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Calculate the cosine similarity

In [27]:
cosine_sim = cosine_similarity(cv_matrix)
cosine_sim

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.]])

In [28]:
# show table of comparison
cosine_sim_df = pd.DataFrame(cosine_sim,index=data['name'],columns=data['name'])
cosine_sim_df.sample(5,axis=1).sample(10,axis=0)

name,Restoran Kehidupan Tak Pernah Berakhir,Dim Sum Inc.,Cafe Palalangon,Nasi Pecel Bu Djoyo,Taman Film
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kadipaten Surabaya,0.0,0.0,0.0,0.5,0.5
Stone Garden Citatah,0.408248,0.0,0.408248,0.0,0.408248
Kolam Renang Priangan Tirta,0.408248,0.0,0.408248,0.0,0.408248
Gua Belanda,0.5,0.0,0.5,0.0,1.0
Observatorium Bosscha,0.408248,0.0,0.408248,0.0,0.408248
Gereja Katolik Kelahiran Santa Perawan Maria,0.0,0.0,0.0,0.408248,0.0
Dapur Solo Resto & Lunch Box - Matraman,0.408248,0.666667,0.408248,0.408248,0.0
Rawon Pak Pangat,0.5,0.408248,0.5,1.0,0.0
Javakarta Resto,0.408248,0.666667,0.408248,0.408248,0.0
Taman Buah Surabaya,0.0,0.0,0.0,0.408248,0.0


# Recommendation


In [29]:
def tourism_recommendations(place_name,similarity_data=cosine_sim_df,items=data[['name','category','description','city']],k=5):
    index = similarity_data.loc[:,place_name].to_numpy().argpartition(range(-1,-k,-1))
    
    closest = similarity_data.columns[index[-1:-(k+2):-1]]
    
    closest = closest.drop(place_name,errors='ignore')
    
    return pd.DataFrame(closest).merge(items).head(k)

In [30]:
tourism_recommendations("Monumen Bandung Lautan Api")

Unnamed: 0,name,category,description,city
0,Batu Cinta,Budaya,Batu Cinta Patenggang adalah sebuah objek wis...,Bandung
1,Roemah Seni Sarasvati,Budaya,Roemah Seni Sarasvati adalah sebuah pusat seni...,Bandung
2,Taman Budaya Jawa Barat,Budaya,Taman Budaya Jawa Barat adalah sebuah kompleks...,Bandung
3,Museum Pos Indonesia,Budaya,Museum ini tidak hanya menampilkan perangko te...,Bandung
4,Taman Sejarah Bandung,Budaya,Taman Sejarah Bandung adalah sebuah taman yang...,Bandung


In [31]:
tourism_recommendations("Rawon Setan")

Unnamed: 0,name,category,description,city
0,Kafe Bromo,Kuliner,Terletak di kompleks Sheraton Surabaya Hotel &...,Surabaya
1,Sentra Wisata Kuliner Dharmahusada,Kuliner,Sentra wisata kuliner ini memiliki jam operasi...,Surabaya
2,Sentra Wisata Kuliner Gayungan,Kuliner,Kota Surabaya Surabaya memiliki beberapa Sentr...,Surabaya
3,Sentra Wisata Kuliner Dharmawangsa,Kuliner,Sentra wisata kuliner dharmawangsa merupakan t...,Surabaya
4,Sentra Wisata Kuliner Embong Sawo,Kuliner,Sentra Kuliner Binaan Pemkot Surabaya yang dim...,Surabaya


# Succeded
success make a recommendation but we have to build a model using TensorFlow

This is the recommendation based on "Kategori" and "Kota"