# Get dataset from GitHub

In [1]:
!wget https://raw.githubusercontent.com/siskasimandalahi/DestiGo/Machine-Learning/Dataset/data_destinasi_wisata.csv

--2023-06-06 14:35:19--  https://raw.githubusercontent.com/siskasimandalahi/DestiGo/Machine-Learning/Dataset/data_destinasi_wisata.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 378078 (369K) [text/plain]
Saving to: ‘data_destinasi_wisata.csv’


2023-06-06 14:35:19 (8.43 MB/s) - ‘data_destinasi_wisata.csv’ saved [378078/378078]



# **Import the required Library**
**Don't forget the important library and mandatory -> TensorFlow** ❤

**FYI**

Sastrawi is an open-source library for the Indonesian language stemming in the field of natural language processing (NLP). Stemming is the process of reducing words to their base or root form, which helps in normalizing and simplifying the text for further analysis or processing.

The Sastrawi library is specifically designed for the Indonesian language and provides various stemming algorithms and functions. It allows developers and researchers to perform tasks such as tokenization (splitting text into individual words or tokens), stemming, and stop-word removal.

In [2]:
#using Sastrawi
!pip install Sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [3]:
# for modelling
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for data processing
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.metrics.pairwise import cosine_similarity

# Initialize
tv = TfidfVectorizer(max_features=5000)
stem = StemmerFactory().create_stemmer()
stopword = StopWordRemoverFactory().create_stop_word_remover()

# Data Understanding
this is section that read the data and we make a upload manual the csv file.
We make a plan that the data is put into github so we not upload every time reconnect the runtike of colab

In [4]:
# read the data
all_data = pd.read_csv('data_destinasi_wisata.csv')

In [5]:
all_data.sample(5)

Unnamed: 0,id,nama_tempat,kategori,kota,alamat,deskripsi,lat,long
21,22,Happyfarm Ciwidey,Taman Hiburan,Bandung,"Cikembang, Panundaan, Kec. Ciwidey, Kabupaten ...",Happyfarm Ciwidey adalah destinasi wisata yang...,-7.114788,107.438086
287,288,Nampan Bistro,Kuliner,Jakarta Selatan,Jl Raya Pasar Minggu KM 18.2 No 72R RT.6/RW.1 ...,Indonesian Western Fusion Bistro menyediakan t...,-6.267507,106.845548
34,35,Selasar Sunaryo Art Space,Taman Hiburan,Bandung,"Jl. Bukit Pakar Timur No.100, Ciburial, Kec. C...",Selasar Sunaryo Art Space adalah sebuah galeri...,-6.858541,107.636549
296,297,Handayani Prima,Kuliner,Jakarta Timur,"Jl. Matraman Raya No.45, RT.1/RW.3, Palmeriam,...",Handayani Prima merupakan sebuah restoran yang...,-6.201739,106.856766
103,104,Babakan Siliwangi City Forest Path Bandung,Alam,Bandung,"Jl. Tamansari No.90, Lb. Siliwangi, Kecamatan ...",Babakan Siliwangi City Forest Path adalah sebu...,-6.885375,107.61119


In [6]:
print(f"Number of places in the datasets : {len(all_data.id.unique())}")

Number of places in the datasets : 515


# Exploratory Data Analysis
In this section, we analysis the data that we get

In [7]:
# check info of dataset such as how many column and what is the type
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515 entries, 0 to 514
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           515 non-null    int64  
 1   nama_tempat  515 non-null    object 
 2   kategori     515 non-null    object 
 3   kota         515 non-null    object 
 4   alamat       515 non-null    object 
 5   deskripsi    514 non-null    object 
 6   lat          515 non-null    float64
 7   long         515 non-null    float64
dtypes: float64(2), int64(1), object(5)
memory usage: 32.3+ KB


In [8]:
# check if there is null
all_data.isnull().sum()

id             0
nama_tempat    0
kategori       0
kota           0
alamat         0
deskripsi      1
lat            0
long           0
dtype: int64

In [9]:
# drop the null data
all_data = all_data.dropna(subset=['deskripsi'])

In [10]:
# check the data unique of column 'Kategori'
all_data.kategori.unique()

array(['Taman Hiburan', 'Budaya', 'Alam', 'Tempat Ibadah', 'Kuliner',
       'Bahari', 'Pusat Perbelanjaan'], dtype=object)

# Data Preprocessing
Clean and preprocess the collected data to prepare it for further analysis. This may involve removing duplicates, handling missing values, normalizing text, and feature engineering.

In [11]:
# see an overview of the data
all_data.head()

Unnamed: 0,id,nama_tempat,kategori,kota,alamat,deskripsi,lat,long
0,1,Trans Studio Bandung,Taman Hiburan,Bandung,"Jl. Gatot Subroto No.289A, Cibangkong, Kec. Ba...",Trans Studio Bandung adalah sebuah kompleks ta...,-6.925094,107.636494
1,2,Farm House Susu Lembang,Taman Hiburan,Bandung,"Jl. Raya Lembang No.108, Gudangkahuripan, Kec....",Farm House Susu Lembang adalah destinasi yang ...,-6.832969,107.605618
2,3,Dusun Bambu,Taman Hiburan,Bandung,"Jl. Kolonel Masturi No.KM. 11, Kertawangi, Kec...",Dusun Bambu adalah destinasi wisata yang menaw...,-6.789715,107.579163
3,4,Kebun Binatang Bandung,Taman Hiburan,Bandung,"Jl. Kebun Binatang No.6, Lb. Siliwangi, Kecama...","Kebun Binatang Bandung, juga dikenal sebagai K...",-6.889718,107.607728
4,5,Dago Dreampark,Taman Hiburan,Bandung,"Jl. Dago Giri No.Km. 2.2, Pagerwangi, Kec. Lem...",Dago Dreampark adalah kompleks rekreasi yang t...,-6.848642,107.625939


In [12]:
# make just column 'id', 'nama_tempat', 'kategori', 'kota', 'deskripsi'
# so drop the column 'alamat', 'lat', 'long'
all_data.drop(['alamat', 'lat', 'long'], axis=1, inplace=True)
all_data

Unnamed: 0,id,nama_tempat,kategori,kota,deskripsi
0,1,Trans Studio Bandung,Taman Hiburan,Bandung,Trans Studio Bandung adalah sebuah kompleks ta...
1,2,Farm House Susu Lembang,Taman Hiburan,Bandung,Farm House Susu Lembang adalah destinasi yang ...
2,3,Dusun Bambu,Taman Hiburan,Bandung,Dusun Bambu adalah destinasi wisata yang menaw...
3,4,Kebun Binatang Bandung,Taman Hiburan,Bandung,"Kebun Binatang Bandung, juga dikenal sebagai K..."
4,5,Dago Dreampark,Taman Hiburan,Bandung,Dago Dreampark adalah kompleks rekreasi yang t...
...,...,...,...,...,...
510,511,Klenteng Pak Kik Bio,Tempat Ibadah,Surabaya,Kelenteng Pak KIk Bio adalah sebuah kelenteng ...
511,512,Klenteng Sanggar Agung,Tempat Ibadah,Surabaya,Kelenteng Sanggar Agung atau Klenteng Hong San...
512,513,Masjid Agung Sunan Ampel,Tempat Ibadah,Surabaya,Masjid Agung Sunan Ampel adalah sebuah masjid ...
513,514,Masjid Muhammad Cheng Hoo,Tempat Ibadah,Surabaya,Masjid Cheng Hoo Surabaya adalah Masjid bernua...


In [None]:
# optional, if the text code in above cannot run
# make just column 'id', 'nama_tempat', 'kategori', 'deskripsi, 'kota'
# new_data = all_data.drop('alamat', axis=1)
# new_data

In [None]:
# new_data.info()
# Define columns with float type, which is lat and long
# float_cols = new_data.select_dtypes(include=['float']).columns

# Delete columns with float type
# new_data = new_data.drop(columns=float_cols)

# Content Based Filtering
technique used in recommender systems to recommend items or content to users based on the characteristics or properties of the items themselves.

*make a modelling data using content-based filtering technique*

In [13]:
# Modelling with content based filtering
def preprocessing(data):
    data = str(data).lower()
    data = stem.stem(data)
    data = stopword.remove(data)
    return data

In [14]:
# make a copy of "all_data" that retrieve column 'id', 'nama_tempat', 'tags' -> merge of 'deskripsi' and 'kategori'
data_content_based_filtering = all_data.copy()
data_content_based_filtering['tags'] = data_content_based_filtering['deskripsi'] + ' ' + data_content_based_filtering['kota']
data_content_based_filtering.drop(['kota', 'kategori', 'deskripsi'], axis=1, inplace=True)
data_content_based_filtering

Unnamed: 0,id,nama_tempat,tags
0,1,Trans Studio Bandung,Trans Studio Bandung adalah sebuah kompleks ta...
1,2,Farm House Susu Lembang,Farm House Susu Lembang adalah destinasi yang ...
2,3,Dusun Bambu,Dusun Bambu adalah destinasi wisata yang menaw...
3,4,Kebun Binatang Bandung,"Kebun Binatang Bandung, juga dikenal sebagai K..."
4,5,Dago Dreampark,Dago Dreampark adalah kompleks rekreasi yang t...
...,...,...,...
510,511,Klenteng Pak Kik Bio,Kelenteng Pak KIk Bio adalah sebuah kelenteng ...
511,512,Klenteng Sanggar Agung,Kelenteng Sanggar Agung atau Klenteng Hong San...
512,513,Masjid Agung Sunan Ampel,Masjid Agung Sunan Ampel adalah sebuah masjid ...
513,514,Masjid Muhammad Cheng Hoo,Masjid Cheng Hoo Surabaya adalah Masjid bernua...


In [15]:
data_content_based_filtering.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 514 entries, 0 to 514
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           514 non-null    int64 
 1   nama_tempat  514 non-null    object
 2   tags         514 non-null    object
dtypes: int64(1), object(2)
memory usage: 16.1+ KB


In [16]:
# preprocessing column tags in the data copy of "all_data"
data_content_based_filtering.tags = data_content_based_filtering.tags.apply(preprocessing)
data_content_based_filtering

Unnamed: 0,id,nama_tempat,tags
0,1,Trans Studio Bandung,trans studio bandung buah kompleks taman hibur...
1,2,Farm House Susu Lembang,farm house susu lembang destinasi tawar alam t...
2,3,Dusun Bambu,dusun bambu destinasi wisata tawar alam satu a...
3,4,Kebun Binatang Bandung,kebun binatang bandung kenal bagai kebun binat...
4,5,Dago Dreampark,dago dreampark kompleks rekreasi letak dago ba...
...,...,...,...
510,511,Klenteng Pak Kik Bio,kelenteng pak kik bio buah kelenteng letak dae...
511,512,Klenteng Sanggar Agung,kelenteng sanggar agung klenteng hong san tang...
512,513,Masjid Agung Sunan Ampel,masjid agung sunan ampel buah masjid kuno leta...
513,514,Masjid Muhammad Cheng Hoo,masjid cheng hoo surabaya masjid nuansa muslim...


# TF-IDF Vectorizer
TF-IDF Vectorizer is a commonly used technique in natural language processing (NLP) and information retrieval to convert text documents into numerical feature vectors. TF-IDF stands for "Term Frequency-Inverse Document Frequency."

In [17]:
# change that to one-hot encoding
vectors = tv.fit_transform(data_content_based_filtering.tags).toarray()
vectors

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.04168378, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

Calculate the cosine similarity

In [18]:
similarity = cosine_similarity(vectors)
similarity[0][1:10]

array([0.06014466, 0.07571811, 0.06575713, 0.15694725, 0.06763835,
       0.08839171, 0.08618244, 0.09639942, 0.21235394])

In [20]:
nama_tempat = data_content_based_filtering['nama_tempat'].tolist()[:5]
cosine_sim_df = pd.DataFrame(similarity[:5, :5], index=nama_tempat, columns=nama_tempat)
print(cosine_sim_df)


                         Trans Studio Bandung  Farm House Susu Lembang  \
Trans Studio Bandung                 1.000000                 0.060145   
Farm House Susu Lembang              0.060145                 1.000000   
Dusun Bambu                          0.075718                 0.141027   
Kebun Binatang Bandung               0.065757                 0.112839   
Dago Dreampark                       0.156947                 0.117568   

                         Dusun Bambu  Kebun Binatang Bandung  Dago Dreampark  
Trans Studio Bandung        0.075718                0.065757        0.156947  
Farm House Susu Lembang     0.141027                0.112839        0.117568  
Dusun Bambu                 1.000000                0.090674        0.096211  
Kebun Binatang Bandung      0.090674                1.000000        0.068844  
Dago Dreampark              0.096211                0.068844        1.000000  


# Recommendation


In [23]:
def recommend(place_name):
    place_name_index = data_content_based_filtering[data_content_based_filtering['nama_tempat']==place_name].index[0]
    distancess = similarity[place_name_index]
    place_name_list = sorted(list(enumerate(distancess)),key=lambda x: x[1],reverse=True)[1:10]
    
    recommended = []
    for i in place_name_list:
        recommended.append([data_content_based_filtering.iloc[i[0]].nama_tempat]+[i[1]])
        # print(nama_tempats.iloc[i[0]].original_title)
        
    return recommended

In [24]:
recommend("Pantai Ancol")

[['Taman Impian Jaya Ancol', 0.33969271423803293],
 ['Pulau Tidung', 0.23888008669183597],
 ['Pulau Pari', 0.2107826477604192],
 ['Dunia Fantasi', 0.2058831075322734],
 ['Pantai Ancol', 0.2013618410222182],
 ['Waterboom PIK (Pantai Indah Kapuk)', 0.19342386984638593],
 ['Pulau Semak Daun', 0.19162166530861197],
 ['Pulau Pramuka', 0.18893691888216033],
 ['Atlantis Water Adventure', 0.18771101763628004]]

In [25]:
recommend("Curug Siliwangi")

[['Curug Dago', 0.48538865869640985],
 ['Curug Bugbrug', 0.44678123864504626],
 ['Curug Cinulang', 0.441104835840292],
 ['Curug Aseupan', 0.4368517475811361],
 ['Curug Cipanji', 0.4276211106346095],
 ['Curug Cipanas', 0.4062423223327241],
 ['Curug Anom', 0.40476923523769637],
 ['Curug Luhur Waterfall', 0.3749256710467978],
 ['Curug Cilengkrang', 0.3579332592250124]]

## Succeded

This is the recommendation based on "Kategori" and "Deskripsi"