# Installation and Setup

In [164]:
!pip install gensim sentence-transformers



In [165]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used later in the notebook: the stopword lists read
# by `stopwords.words('english')`, and the tokenizer data presumably required
# by `word_tokenize` (TODO confirm `punkt_tab` matches the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Download and Import Dataset

In [166]:
# NOTE(review): `df` is overwritten by the next load and is not referenced by
# any later cell visible here; the same file is re-read into `df_tourist`
# below via a relative path. Looks like a leftover cell — confirm before removing.
df = pd.read_csv('/content/HandsOnTask1.csv', sep=',')

In [167]:
# NOTE(review): `df` is not referenced by any later cell visible here; the same
# file is re-read into `df_travel` below via a relative path. Looks like a
# leftover cell — confirm before removing.
df = pd.read_csv('/content/world_tourism_economy_data.csv', sep=',')

In [168]:
# Load the country table (name, capital, population, area, ...) and preview
# its first rows.
df_tourist = pd.read_csv('HandsOnTask1.csv')
df_tourist.head()

Unnamed: 0,name,capital,population,area,name_clean,capital_clean,name_lower,capital_lower
0,Andorra,Andorra la Vella,84000,468.0,andorra,andorra la vella,andorra,andorra la vella
1,United Arab Emirates,Abu Dhabi,4975593,82880.0,unit arab emir,abu dhabi,united arab emirates,abu dhabi
2,Afghanistan,Kabul,29121286,647500.0,afghanistan,kabul,afghanistan,kabul
3,Antigua and Barbuda,St. John's,86754,443.0,antigua barbuda,st john,antigua and barbuda,st. john's
4,Anguilla,The Valley,13254,102.0,anguilla,valley,anguilla,the valley


In [169]:
# Load the per-country, per-year tourism/economy table and preview its first rows.
df_travel = pd.read_csv('world_tourism_economy_data.csv', sep=',')
display(df_travel.head())

Unnamed: 0,country,country_code,year,tourism_receipts,tourism_arrivals,tourism_exports,tourism_departures,tourism_expenditures,gdp,inflation,unemployment
0,Aruba,ABW,1999,782000000.0,972000.0,62.542949,,9.495387,1722905000.0,2.280372,
1,Africa Eastern and Southern,AFE,1999,8034209000.0,15309380.0,12.20403,,7.760536,265429300000.0,7.819865,
2,Afghanistan,AFG,1999,,,,,,,,
3,Africa Western and Central,AFW,1999,1443613000.0,3897975.0,3.974476,,6.147291,139468300000.0,0.372266,
4,Angola,AGO,1999,31000000.0,45000.0,0.583858,,2.489638,6152923000.0,248.195902,


# Text Processing

In [170]:
# Build one free-text field per country from its descriptive columns.
# Every operand must come from `df_tourist`; no `df_course` frame exists in
# this notebook, so referencing it fails on a fresh kernel.
# NOTE(review): string concatenation propagates NaN, so a row with a missing
# name or capital ends up with a NaN `text` — confirm whether that needs handling.
df_tourist['text'] = (
    df_tourist['name']
    + ' ' + df_tourist['capital']
    + ' ' + df_tourist['population'].astype(str)
    + ' ' + df_tourist['area'].astype(str)
)

# Keep only the label and the combined text. `.copy()` makes the subset an
# independent frame, so the later in-place assignment to `text` does not
# trigger pandas' SettingWithCopyWarning.
df_tourist = df_tourist[['name', 'text']].copy()
display(df_tourist.head())

Unnamed: 0,name,text
0,Andorra,Andorra Andorra la Vella 84000 468.0
1,United Arab Emirates,United Arab Emirates Abu Dhabi 4975593 82880.0
2,Afghanistan,Afghanistan Kabul 29121286 647500.0
3,Antigua and Barbuda,Antigua and Barbuda St. John's 86754 443.0
4,Anguilla,Anguilla The Valley 13254 102.0


In [171]:
# Build one free-text field per row from country name, ISO code and year.
df_travel['text2'] = (
    df_travel['country']
    + ' ' + df_travel['country_code']
    + ' ' + df_travel['year'].astype(str)
)

# Keep only the label and the combined text. `.copy()` makes the subset an
# independent frame, so the later in-place assignment to `text2` does not
# trigger pandas' SettingWithCopyWarning.
df_travel = df_travel[['country', 'text2']].copy()
display(df_travel.head())

Unnamed: 0,country,text2
0,Aruba,Aruba ABW 1999
1,Africa Eastern and Southern,Africa Eastern and Southern AFE 1999
2,Afghanistan,Afghanistan AFG 1999
3,Africa Western and Central,Africa Western and Central AFW 1999
4,Angola,Angola AGO 1999


In [172]:
def clean_noise(text):
  """Replace markup and non-letter noise in `text` with single spaces.

  The substitutions run in a fixed order: HTML tags, URLs, hashtags,
  punctuation/special characters, then digit runs. Finally every run of
  whitespace is collapsed to one space and the ends are trimmed.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string (may be empty).
  """
  noise_patterns = (
    r'<.*?>',                  # HTML tags
    r'https?://\S+|www\.\S+',  # URLs
    r'#\w+',                   # hashtags
    r'[^\w\s]',                # punctuation and special characters
    r'\d+',                    # numbers
  )
  for pattern in noise_patterns:
    text = re.sub(pattern, ' ', text)
  # Collapse the whitespace left behind by the substitutions above.
  return re.sub(r'\s+', ' ', text).strip()

# English stopword set, loaded on the first call to remove_stopwords() and then
# reused, so the corpus is not re-read for every row passed through `.apply`.
_ENGLISH_STOPWORDS = None

def remove_stopwords(text):
  """Drop English stopwords from a whitespace-separated string.

  Words are compared case-insensitively against NLTK's English stopword
  list; the words that are kept retain their original casing and order.

  Args:
    text: the string to filter.

  Returns:
    The remaining words joined by single spaces.
  """
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return ' '.join(
    word for word in text.split() if word.lower() not in _ENGLISH_STOPWORDS
  )

def stem_text(text):
  """Stem every whitespace-separated word of `text` with a Porter stemmer.

  Args:
    text: the string whose words should be stemmed.

  Returns:
    The stemmed words joined by single spaces.
  """
  stem = PorterStemmer().stem
  return ' '.join(map(stem, text.split()))

def process(text):
    """Run the full cleaning pipeline on one value.

    The value is converted to `str` (so non-string inputs are accepted),
    lowercased, and then passed through `clean_noise`, `remove_stopwords`
    and `stem_text`, in that order.

    Args:
        text: the value to normalise.

    Returns:
        The normalised string.
    """
    normalised = str(text).lower()
    for step in (clean_noise, remove_stopwords, stem_text):
        normalised = step(normalised)
    return normalised

In [173]:
# Overwrite the combined country text with its normalised form (see `process`)
# and preview the result.
df_tourist['text'] = df_tourist['text'].apply(process)
display(df_tourist.head())

Unnamed: 0,name,text
0,Andorra,andorra andorra la vella
1,United Arab Emirates,unit arab emir abu dhabi
2,Afghanistan,afghanistan kabul
3,Antigua and Barbuda,antigua barbuda st john
4,Anguilla,anguilla valley


In [174]:
# Overwrite the combined travel text with its normalised form (see `process`)
# and preview the result.
df_travel['text2'] = df_travel['text2'].apply(process)
display(df_travel.head())

Unnamed: 0,country,text2
0,Aruba,aruba abw
1,Africa Eastern and Southern,africa eastern southern afe
2,Afghanistan,afghanistan afg
3,Africa Western and Central,africa western central afw
4,Angola,angola ago


# Data Vectorization

# Bag-of-Words (BoW)

In [175]:
# Initialise the bag-of-words vectorizer (vocabulary capped at 5000 terms,
# scikit-learn's built-in English stopword list removed).
cv = CountVectorizer(max_features=5000, stop_words='english')

# Learn the vocabulary from the cleaned country text and build the
# document-term count matrix.
course_vectors = cv.fit_transform(df_tourist['text'])

# Show the features (unique words).
print("Fitur (Kata unik):")
print(cv.get_feature_names_out())

# Show the feature matrix as a dense array.
print("\nMatriks BoW:")
print(course_vectors.toarray())

# Save THIS cell's matrix to CSV. The table is built here from `course_vectors`
# rather than read from a `df_bow` created in a later cell, which is undefined
# on a fresh kernel and stale otherwise.
df_bow_tourist = pd.DataFrame(course_vectors.toarray(), columns=cv.get_feature_names_out())
df_bow_tourist.to_csv('hasil_bow_tourist.csv', index=False)

Fitur (Kata unik):
['aaiún' 'ababa' 'abu' 'abuja' 'accra' 'adamstown' 'addi' 'afghanistan'
 'africa' 'african' 'air' 'albania' 'algeria' 'algier' 'alofa' 'alofi'
 'amali' 'american' 'amman' 'amsterdam' 'andorra' 'angola' 'anguilla'
 'ankara' 'antananarivo' 'antigua' 'apia' 'arab' 'arabia' 'argentina'
 'armenia' 'aruba' 'ashgabat' 'asmara' 'astana' 'asunción' 'athen' 'au'
 'australia' 'austria' 'aux' 'avarua' 'azerbaijan' 'baghdad' 'bahama'
 'bahrain' 'baku' 'bamako' 'bandar' 'bangkok' 'bangladesh' 'bangui'
 'barbado' 'barbuda' 'barthélemi' 'bass' 'basseterr' 'bathurst' 'bator'
 'begawan' 'beij' 'beirut' 'belaru' 'belgium' 'belgrad' 'beliz' 'belmopan'
 'benin' 'berlin' 'bermuda' 'bern' 'bhutan' 'bishkek' 'bissau' 'bogotá'
 'bolivia' 'bonair' 'bosnia' 'botswana' 'brasília' 'bratislava' 'brazil'
 'brazzavil' 'bridgetown' 'british' 'brunei' 'brussel' 'bucharest'
 'budapest' 'bueno' 'bujumbura' 'bulgaria' 'burkina' 'burma' 'burundi'
 'caico' 'cairo' 'caledonia' 'cambodia' 'cameroon' 'canada

In [176]:
# Initialise the bag-of-words vectorizer (vocabulary capped at 5000 terms,
# scikit-learn's built-in English stopword list removed).
cv = CountVectorizer(max_features=5000, stop_words='english')

# Learn the vocabulary from the cleaned travel text and build the
# document-term count matrix.
course_vectors = cv.fit_transform(df_travel['text2'])

# Show the features (unique words).
print("Fitur (Kata unik):")
print(cv.get_feature_names_out())

# Show the feature matrix as a dense array.
print("\nMatriks BoW:")
print(course_vectors.toarray())

# Save THIS cell's matrix to CSV. The table is built here from `course_vectors`
# rather than read from a `df_bow` created in a later cell, which is undefined
# on a fresh kernel and holds a different dataset otherwise.
df_bow_travel = pd.DataFrame(course_vectors.toarray(), columns=cv.get_feature_names_out())
df_bow_travel.to_csv('hasil_bow_travel.csv', index=False)

Fitur (Kata unik):
['abw' 'afe' 'affect' 'afg' 'afghanistan' 'africa' 'african' 'afw' 'ago'
 'alb' 'albania' 'algeria' 'america' 'american' 'andorra' 'angola'
 'antigua' 'arab' 'arabia' 'arb' 'area' 'arg' 'argentina' 'arm' 'armenia'
 'aruba' 'asia' 'asm' 'atg' 'au' 'australia' 'austria' 'aut' 'aze'
 'azerbaijan' 'bahama' 'bahrain' 'baltic' 'bangladesh' 'bank' 'barbado'
 'barbuda' 'bdi' 'bel' 'belaru' 'belgium' 'beliz' 'ben' 'benin' 'bermuda'
 'bfa' 'bgd' 'bgr' 'bh' 'bhr' 'bhutan' 'bih' 'bissau' 'blend' 'blr' 'blz'
 'bmu' 'bol' 'bolivia' 'bosnia' 'botswana' 'bra' 'brazil' 'brb' 'british'
 'brn' 'brunei' 'btn' 'bulgaria' 'burkina' 'burundi' 'bwa' 'cabo' 'caf'
 'caico' 'caledonia' 'cambodia' 'cameroon' 'canada' 'caribbean' 'cayman'
 'ceb' 'central' 'chad' 'channel' 'che' 'chi' 'chile' 'china' 'chl' 'chn'
 'civ' 'classif' 'classifi' 'cmr' 'cod' 'cog' 'col' 'colombia' 'com'
 'comoro' 'conflict' 'congo' 'costa' 'cote' 'countri' 'cpv' 'cri'
 'croatia' 'css' 'cub' 'cuba' 'curacao' 'cuw' 'cym' 

In [177]:
# Preprocessing function
def preprocess(text):
    """Lowercase and tokenise `text`, keeping only alphabetic non-stopword tokens.

    Args:
        text: the string to normalise.

    Returns:
        The kept tokens joined by single spaces, i.e. a string again so it
        can be fed straight to a vectorizer.
    """
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    # A token survives only if it is purely alphabetic (drops punctuation and
    # numbers) and is not an English stopword.
    kept = [token for token in tokens if token.isalpha() and token not in stop_words]
    return ' '.join(kept)

# Run the tokenising normaliser over the already-cleaned country text
df_tourist['clean_text'] = df_tourist['text'].apply(preprocess)

# Initialise a CountVectorizer with default settings and fit it
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df_tourist['clean_text'])

# Convert to a DataFrame (one column per vocabulary term) so it can be inspected
df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())

# Show the first rows of the BoW result
print(df_bow.head())

# Save to CSV
df_bow.to_csv('hasil_bow.csv', index=False)

   aaiún  ababa  abu  abuja  accra  adamstown  addi  afghanistan  africa  \
0      0      0    0      0      0          0     0            0       0   
1      0      0    1      0      0          0     0            0       0   
2      0      0    0      0      0          0     0            1       0   
3      0      0    0      0      0          0     0            0       0   
4      0      0    0      0      0          0     0            0       0   

   african  ...  yamoussoukro  yaoundé  yaren  yemen  yerevan  zagreb  zambia  \
0        0  ...             0        0      0      0        0       0       0   
1        0  ...             0        0      0      0        0       0       0   
2        0  ...             0        0      0      0        0       0       0   
3        0  ...             0        0      0      0        0       0       0   
4        0  ...             0        0      0      0        0       0       0   

   zealand  zimbabw  åland  
0        0        0      0 

In [72]:
# Preprocessing function
# NOTE(review): this is a second, identical definition of `preprocess`; the
# earlier BoW cell already defines it, and this one silently replaces it when run.
def preprocess(text):
    """Lowercase and tokenise `text`, keeping only alphabetic non-stopword tokens.

    Returns the kept tokens joined by single spaces.
    """
    # Lowercase
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Drop tokens that are not purely alphabetic (punctuation and numbers)
    tokens = [word for word in tokens if word.isalpha()]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)  # Return a string again for the vectorizer

# Run the tokenising normaliser over the already-cleaned travel text
df_travel['clean_text2'] = df_travel['text2'].apply(preprocess)

# Initialise a CountVectorizer with default settings and fit it
# NOTE(review): rebinds `vectorizer`, `X_bow` and `df_bow`, replacing the
# tourist versions built in the previous cell.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df_travel['clean_text2'])

# Convert to a DataFrame (one column per vocabulary term) so it can be inspected
df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())

# Show the first rows of the BoW result
print(df_bow.head())

   abw  afe  affect  afg  afghanistan  africa  african  afw  ago  alb  ...  \
0    1    0       0    0            0       0        0    0    0    0  ...   
1    0    1       0    0            0       1        0    0    0    0  ...   
2    0    0       0    1            1       0        0    0    0    0  ...   
3    0    0       0    0            0       1        0    1    0    0  ...   
4    0    0       0    0            0       0        0    0    1    0  ...   

   wsm  xkx  yem  yemen  zaf  zambia  zealand  zimbabw  zmb  zwe  
0    0    0    0      0    0       0        0        0    0    0  
1    0    0    0      0    0       0        0        0    0    0  
2    0    0    0      0    0       0        0        0    0    0  
3    0    0    0      0    0       0        0        0    0    0  
4    0    0    0      0    0       0        0        0    0    0  

[5 rows x 568 columns]


# TF-IDF

Metode ini mirip dengan metode sebelumnya; bedanya, bobot setiap kata dihitung dengan Term Frequency-Inverse Document Frequency (TF-IDF), bukan sekadar jumlah kemunculannya. Seperti sebelumnya, kita hanya perlu menginisialisasi vectorizer lalu menerapkannya ke dataset teks. Berikut parameter dasar yang kita gunakan pada vectorizer ini:

- ngram_range: Rentang jumlah kata per fitur (n-gram), yang memberikan konteks tambahan. Karena batas bawahnya 1 dan batas atasnya 2, kita menggunakan unigram dan bigram (kata tunggal dan pasangan kata).

- max_features: Jumlah maksimum fitur (kosakata) yang disimpan, dipilih dari kata dengan frekuensi kemunculan tertinggi di seluruh korpus.

In [178]:
# Initialise the TF-IDF vectorizer (unigrams and bigrams, vocabulary capped at
# 5000 features).
tfidfv = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Learn the vocabulary from the cleaned country text and build the TF-IDF matrix.
course_vectors = tfidfv.fit_transform(df_tourist['text'])

# Show the features (unique words and word pairs).
print("Fitur (Kata unik):")
print(tfidfv.get_feature_names_out())

# Show the feature matrix as a dense array.
print("\nMatriks TF-IDF:")
print(course_vectors.toarray())

# Save THIS cell's matrix to CSV. The table is built here from `course_vectors`
# rather than read from a `df_tfidf` created in a later cell, which is
# undefined on a fresh kernel and stale otherwise.
df_tfidf_tourist = pd.DataFrame(course_vectors.toarray(), columns=tfidfv.get_feature_names_out())
df_tfidf_tourist.to_csv('hasil_tfidf_tourist.csv', index=False)

Fitur (Kata unik):
['aaiún' 'ababa' 'abu' 'abu dhabi' 'abuja' 'accra' 'adamstown' 'addi'
 'addi ababa' 'afghanistan' 'afghanistan kabul' 'africa' 'africa pretoria'
 'african' 'african republ' 'air' 'albania' 'albania tirana' 'algeria'
 'algeria algier' 'algier' 'alofa' 'alofi' 'amali' 'american'
 'american samoa' 'amman' 'amsterdam' 'andorra' 'andorra andorra'
 'andorra la' 'angola' 'angola luanda' 'anguilla' 'anguilla valley'
 'ankara' 'antananarivo' 'antigua' 'antigua barbuda' 'apia' 'arab'
 'arab emir' 'arabia' 'arabia riyadh' 'argentina' 'argentina bueno'
 'armenia' 'armenia yerevan' 'aruba' 'aruba oranjestad' 'ashgabat'
 'asmara' 'astana' 'asunción' 'athen' 'au' 'au princ' 'australia'
 'australia canberra' 'austria' 'austria vienna' 'aux' 'aux françai'
 'avarua' 'azerbaijan' 'azerbaijan baku' 'baghdad' 'bahama'
 'bahama nassau' 'bahrain' 'bahrain manama' 'baku' 'bamako' 'bandar'
 'bandar seri' 'bangkok' 'bangladesh' 'bangladesh dhaka' 'bangui'
 'barbado' 'barbado bridgetown' 'barb

In [179]:
# Initialise the TF-IDF vectorizer (unigrams and bigrams, vocabulary capped at
# 5000 features).
tfidfv = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Learn the vocabulary from the cleaned travel text and build the TF-IDF matrix.
course_vectors = tfidfv.fit_transform(df_travel['text2'])

# Show the features (unique words and word pairs).
print("Fitur (Kata unik):")
print(tfidfv.get_feature_names_out())

# Show the feature matrix as a dense array.
print("\nMatriks TF-IDF:")
print(course_vectors.toarray())

# Save THIS cell's matrix to CSV. The table is built here from `course_vectors`
# rather than read from a `df_tfidf` created in a later cell, which is
# undefined on a fresh kernel and holds a different dataset otherwise.
df_tfidf_travel = pd.DataFrame(course_vectors.toarray(), columns=tfidfv.get_feature_names_out())
df_tfidf_travel.to_csv('hasil_tfidf_travel.csv', index=False)

Fitur (Kata unik):
['abw' 'afe' 'affect' 'affect situat' 'afg' 'afghanistan'
 'afghanistan afg' 'africa' 'africa eastern' 'africa exclud' 'africa ida'
 'africa mea' 'africa ssf' 'africa western' 'africa zaf' 'african'
 'african republ' 'afw' 'ago' 'alb' 'albania' 'albania alb' 'algeria'
 'algeria dza' 'america' 'america caribbean' 'america nac' 'american'
 'american samoa' 'andorra' 'angola' 'angola ago' 'antigua'
 'antigua barbuda' 'arab' 'arab emir' 'arab rep' 'arab republ'
 'arab world' 'arabia' 'arabia sau' 'arb' 'area' 'area emu' 'arg'
 'argentina' 'argentina arg' 'arm' 'armenia' 'armenia arm' 'aruba'
 'aruba abw' 'asia' 'asia ec' 'asia exclud' 'asia ida' 'asia pacif'
 'asia sa' 'asm' 'atg' 'au' 'australia' 'australia au' 'austria'
 'austria aut' 'aut' 'aze' 'azerbaijan' 'azerbaijan aze' 'bahama'
 'bahama bh' 'bahrain' 'bahrain bhr' 'baltic' 'baltic ceb' 'bangladesh'
 'bangladesh bgd' 'bank' 'bank gaza' 'barbado' 'barbado brb' 'barbuda'
 'barbuda atg' 'bdi' 'bel' 'belaru' 'belaru 

In [75]:
# Initialise a TF-IDF vectorizer with default settings and fit it on the
# tokenised country text (`clean_text` is created in the BoW DataFrame cell).
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df_tourist['clean_text'])

# Convert the result to a DataFrame (one column per vocabulary term)
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Show the first rows of the TF-IDF result
print(df_tfidf.head())

   aaiún  ababa      abu  abuja  accra  adamstown  addi  afghanistan  africa  \
0    0.0    0.0  0.00000    0.0    0.0        0.0   0.0     0.000000     0.0   
1    0.0    0.0  0.45756    0.0    0.0        0.0   0.0     0.000000     0.0   
2    0.0    0.0  0.00000    0.0    0.0        0.0   0.0     0.707107     0.0   
3    0.0    0.0  0.00000    0.0    0.0        0.0   0.0     0.000000     0.0   
4    0.0    0.0  0.00000    0.0    0.0        0.0   0.0     0.000000     0.0   

   african  ...  yamoussoukro  yaoundé  yaren  yemen  yerevan  zagreb  zambia  \
0      0.0  ...           0.0      0.0    0.0    0.0      0.0     0.0     0.0   
1      0.0  ...           0.0      0.0    0.0    0.0      0.0     0.0     0.0   
2      0.0  ...           0.0      0.0    0.0    0.0      0.0     0.0     0.0   
3      0.0  ...           0.0      0.0    0.0    0.0      0.0     0.0     0.0   
4      0.0  ...           0.0      0.0    0.0    0.0      0.0     0.0     0.0   

   zealand  zimbabw  åland  
0  

In [76]:
# Initialise a TF-IDF vectorizer with default settings and fit it on the
# tokenised travel text (`clean_text2` is created in the BoW DataFrame cell).
# NOTE(review): rebinds `tfidf_vectorizer`, `X_tfidf` and `df_tfidf`, replacing
# the tourist versions built in the previous cell.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df_travel['clean_text2'])

# Convert the result to a DataFrame (one column per vocabulary term)
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Show the first rows of the TF-IDF result
print(df_tfidf.head())

        abw       afe  affect       afg  afghanistan    africa  african  \
0  0.707107  0.000000     0.0  0.000000     0.000000  0.000000      0.0   
1  0.000000  0.538512     0.0  0.000000     0.000000  0.360575      0.0   
2  0.000000  0.000000     0.0  0.707107     0.707107  0.000000      0.0   
3  0.000000  0.000000     0.0  0.000000     0.000000  0.387681      0.0   
4  0.000000  0.000000     0.0  0.000000     0.000000  0.000000      0.0   

        afw       ago  alb  ...  wsm  xkx  yem  yemen  zaf  zambia  zealand  \
0  0.000000  0.000000  0.0  ...  0.0  0.0  0.0    0.0  0.0     0.0      0.0   
1  0.000000  0.000000  0.0  ...  0.0  0.0  0.0    0.0  0.0     0.0      0.0   
2  0.000000  0.000000  0.0  ...  0.0  0.0  0.0    0.0  0.0     0.0      0.0   
3  0.578995  0.000000  0.0  ...  0.0  0.0  0.0    0.0  0.0     0.0      0.0   
4  0.000000  0.707107  0.0  ...  0.0  0.0  0.0    0.0  0.0     0.0      0.0   

   zimbabw  zmb  zwe  
0      0.0  0.0  0.0  
1      0.0  0.0  0.0  
2    

# Word2Vec

Metode Word2Vec cukup berbeda dari dua metode sebelumnya. Word2Vec adalah algoritma yang mengubah kata-kata menjadi vektor berdimensi tetap. Hasilnya adalah word embeddings, yaitu representasi kata dalam ruang vektor. Untuk menggunakannya, teks harus sudah ditokenisasi menjadi list of words. Oleh karena itu, kita menggunakan fungsi word_tokenize() terlebih dahulu, lalu melatih Word2Vec pada hasil tokenisasi tersebut untuk memperoleh representasi vektornya. Berikut parameter dasar yang kita gunakan dalam Word2Vec:

- sentences: Data teks yang sudah ditokenisasi.
- vector_size: Ukuran vektor untuk setiap kata.
- window: Jumlah maksimum kata di kiri dan kanan sebuah kata yang dianggap sebagai konteksnya.
- min_count: Frekuensi minimum kemunculan sebuah kata agar disertakan sebagai fitur.

In [181]:
# Tokenise every cleaned country text into a list of words
course_tokens = [word_tokenize(document) for document in df_tourist['text']]

# Train the Word2Vec model (100-dimensional vectors, context window of 3,
# every word kept regardless of frequency)
model_w2v_course = Word2Vec(sentences=course_tokens, vector_size=100, window=3, min_count=1)

# Usage example: look up the vector of the word "peru"
print("Vektor kata 'peru':")
print(model_w2v_course.wv["peru"])

# Table of all word vectors: one row per vocabulary word, one column per
# embedding dimension
df_word_vectors = pd.DataFrame(
    model_w2v_course.wv.vectors,
    index=model_w2v_course.wv.index_to_key,
)

# Save to CSV
df_word_vectors.to_csv("word2vec_vectors_tourist.csv")

Vektor kata 'peru':
[ 0.00344073  0.00088623  0.00214528 -0.00830133  0.0038985  -0.0045264
  0.00738437 -0.00457292  0.00541834  0.00015152  0.00839598  0.00922447
  0.00626449  0.00608065  0.00759118 -0.00895653 -0.00960823 -0.00488193
  0.00591414  0.00592615  0.00423593  0.00210737  0.00139578 -0.00932677
 -0.00156331 -0.00079554  0.00702137  0.00464328  0.00044908  0.00748728
 -0.00293589 -0.00480084 -0.00537472 -0.00476303  0.00094143  0.00472604
 -0.00190794 -0.0052977  -0.00571465 -0.00110519 -0.00645853  0.00560531
  0.001537   -0.00956677  0.00520407 -0.00408182 -0.00770374 -0.008489
  0.00808304  0.00806224  0.00035403 -0.00842858 -0.00366496  0.00699191
 -0.00068922 -0.00450112  0.00053201  0.00218022  0.00196655 -0.00246105
  0.0095953   0.00253001 -0.00616662 -0.00558303  0.00093463  0.00373404
 -0.00475616  0.00018334  0.00658571  0.00603787  0.00326106  0.00071078
  0.00897497  0.00586676  0.00805542  0.00582407 -0.005734   -0.0023239
 -0.00443875 -0.00789794 -0.0091400

In [182]:
# Tokenise every cleaned travel text into a list of words
course_tokens = [word_tokenize(document) for document in df_travel['text2']]

# Train the Word2Vec model (100-dimensional vectors, context window of 3,
# every word kept regardless of frequency)
model_w2v_course = Word2Vec(sentences=course_tokens, vector_size=100, window=3, min_count=1)

# Usage example: look up the vector of the word "peru"
print("Vektor kata 'peru':")
print(model_w2v_course.wv["peru"])

# Save all word vectors to CSV: one row per vocabulary word, one column per
# embedding dimension
df_word_vectors = pd.DataFrame(
    model_w2v_course.wv.vectors,
    index=model_w2v_course.wv.index_to_key,
)
df_word_vectors.to_csv("word2vec_vectors_travel.csv")


Vektor kata 'peru':
[-0.00849793  0.04205229  0.03078141  0.00986291  0.00309279 -0.05342143
  0.03211753  0.03178703 -0.03004361 -0.01744129 -0.0120101  -0.04099682
  0.01770283  0.01744247  0.00417975 -0.02601542  0.01414264 -0.03153427
 -0.00535037 -0.07310891 -0.0020177   0.02918163  0.03159295 -0.02202705
 -0.02539517 -0.0120494  -0.04854174 -0.01105958 -0.03788699  0.02410604
  0.03967522  0.03180631  0.00131236 -0.00076978 -0.0312478   0.04943949
 -0.01158261 -0.0246421  -0.03042184 -0.04808255  0.00073796 -0.02641534
 -0.01021316 -0.00459709  0.03038757 -0.0217098  -0.01075645 -0.020036
  0.00546562  0.00089631  0.01100837 -0.0438608  -0.02646026 -0.01505228
 -0.04090481  0.00444675  0.03357971 -0.00081019 -0.03774981 -0.0149395
  0.02181588 -0.00480074 -0.00538403 -0.01953777 -0.03904467  0.01477305
  0.01869062  0.01270324 -0.03634322  0.02733567 -0.02325479 -0.0037838
  0.01501731  0.00433838  0.02418676  0.00298437  0.00827655 -0.00562895
 -0.04676383  0.03048865 -0.0013214

# BERT/Sentence Transformers

In [183]:
# Load the pretrained sentence-embedding model
model_bert = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode the cleaned country text into vectors. A plain list is passed because
# `encode` is documented for str / list-of-str input; a pandas Series is
# indexed by label inside `encode`, which only lines up with positions when the
# frame has a default RangeIndex.
tourist_vectors = model_bert.encode(df_tourist['text'].tolist())

# Show the embedding matrix
print("\nMatriks BERT:")
print(tourist_vectors)

# Save to CSV (one row per country, one column per embedding dimension)
df_bert_vectors = pd.DataFrame(tourist_vectors)
df_bert_vectors.to_csv("bert_vectors_tourist.csv", index=False)


Matriks BERT:
[[ 0.25501588 -0.12288816 -0.59135586 ... -0.16181467  0.3869061
  -0.6524838 ]
 [ 0.09220998  0.9132193   0.20019507 ... -0.39736706  0.20830427
  -0.17475186]
 [-0.3915762   0.5534272   0.8118701  ...  0.16449413 -0.079286
   0.8935574 ]
 ...
 [-0.54784024  0.707515   -1.2233012  ... -0.08478893 -1.0603138
  -0.3474168 ]
 [-0.8383358   0.17590444 -0.72274613 ...  0.15481684 -0.5079393
   0.2850133 ]
 [-1.1128788   0.6254533  -0.7193723  ... -0.71741605 -0.20445171
   0.330011  ]]


In [184]:
# Encode the cleaned travel text into vectors. A plain list is passed because
# `encode` is documented for str / list-of-str input.
travel_vectors = model_bert.encode(df_travel['text2'].tolist())

# Show the embedding matrix
print("\nMatriks BERT:")
print(travel_vectors)

# Save to CSV. The table must be built from `travel_vectors`; building it from
# `tourist_vectors` would write the tourist embeddings into the travel file.
df_bert_vectors = pd.DataFrame(travel_vectors)
df_bert_vectors.to_csv("bert_vectors_travel.csv", index=False)


Matriks BERT:
[[ 0.00774575  0.51757276  0.14068587 ...  0.16140512 -0.05169243
   0.219724  ]
 [ 0.13712236  0.5385743  -0.44658104 ...  0.04463444 -0.5124917
   0.08244359]
 [-0.31444305  0.4746231   0.1567239  ... -0.24312644  0.19480337
   0.9577109 ]
 ...
 [-0.78343606  0.4212806  -1.0354271  ... -0.11129626 -0.32959387
  -0.49453047]
 [-1.0343096   0.30945167 -1.2272873  ... -0.04939453 -0.40174228
  -0.46828407]
 [-1.4554045   0.5991408  -0.7384674  ... -0.5634479  -0.40319693
  -0.19068038]]


# 2. Similarity Calculation

In [86]:
# Cosine similarity between every tourist embedding (rows) and every travel
# embedding (columns)
similarity = cosine_similarity(tourist_vectors, travel_vectors)

# Show the resulting similarity matrix
print("Matrix Cosine Similarity:")
print(similarity)

Matrix Cosine Similarity:
[[ 0.1563937   0.24368821  0.01573665 ...  0.1970882   0.03783054
  -0.02522401]
 [ 0.1496972   0.35193318  0.35861653 ...  0.3174108   0.1363554
   0.12019108]
 [ 0.06524061  0.09153728  0.6705103  ... -0.06774826  0.07457025
  -0.04474084]
 ...
 [ 0.10553943  0.66223234  0.1196847  ...  0.76139176  0.5810523
   0.39696878]
 [ 0.11158752  0.3883838   0.22333536 ...  0.4776153   0.71951973
   0.36167067]
 [ 0.1871346   0.39064297  0.20441777 ...  0.5575026   0.62880945
   0.83172756]]


# 3. Build Recommender System

In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer used by the recommender
# NOTE(review): rebinds `tfidf_vectorizer`, which earlier cells fitted on the
# `clean_text` / `clean_text2` columns.
tfidf_vectorizer = TfidfVectorizer()

# Fit on the travel text and transform it into the reference TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df_travel['text2'])

In [122]:
def recommend(text, top_n=5):
    """Return up to `top_n` distinct `text2` entries most similar to `text`.

    `text` is vectorised with the fitted `tfidf_vectorizer` and compared by
    cosine similarity against every row of `tfidf_matrix`. Candidates are
    taken from highest to lowest score; entries equal to `text` itself and
    repeated entries are skipped.

    NOTE(review): there is no minimum-score filter, so entries with a
    similarity of 0 can be returned when nothing better exists — confirm
    whether that is intended.

    Args:
        text: query string, expected in the same cleaned form as `text2`.
        top_n: maximum number of recommendations (default 5).

    Returns:
        A list of at most `top_n` strings. If scoring fails for any reason,
        a RuntimeWarning is emitted and a random sample of `text2` values is
        returned instead (best-effort fallback).
    """
    import warnings  # local import: only used on the fallback path

    top_n = max(int(top_n), 0)
    try:
        text_vector = tfidf_vectorizer.transform([text])
        similarity_scores = cosine_similarity(text_vector, tfidf_matrix)[0]

        # Row positions ordered from highest to lowest score.
        idx_sorted = similarity_scores.argsort()[::-1]

        candidates = df_travel['text2']
        picked = []
        seen = {text}  # pre-seeded with the query so exact self-matches are skipped
        for i in idx_sorted:
            if len(picked) >= top_n:
                break
            item = candidates.iloc[i]
            if item in seen:
                continue
            seen.add(item)
            picked.append(item)
        return picked
    except Exception as exc:
        # Keep the best-effort random fallback, but make it visible instead of
        # silently hiding the underlying error.
        warnings.warn(
            f"recommend() fell back to random suggestions: {exc!r}",
            RuntimeWarning,
        )
        # Never ask for more rows than exist (sampling without replacement).
        sample_size = min(top_n, len(df_travel))
        idx = np.random.choice(len(df_travel), sample_size, replace=False)
        return df_travel.iloc[idx]['text2'].drop_duplicates().tolist()

In [123]:
# Print the recommendations for the first five `text2` entries (change the
# count to show more).
for country in df_travel['text2'].head(5):
    recommended_places = recommend(country)
    print(f"Rekomendasi tempat wisata untuk: {country}")
    for rank, place in enumerate(recommended_places, start=1):
        print(f"{rank}. {place}")
    print()

Rekomendasi tempat wisata untuk: aruba abw
1. hong kong sar china hkg
2. ghana gha
3. high incom hic
4. gibraltar gib
5. guyana guy

Rekomendasi tempat wisata untuk: africa eastern southern afe
1. south africa zaf
2. sub saharan africa ssf
3. africa western central afw
4. middl east north africa mea
5. sub saharan africa ida ibrd countri tss

Rekomendasi tempat wisata untuk: afghanistan afg
1. gibraltar gib
2. georgia geo
3. guinea gin
4. ghana gha
5. zimbabw zwe

Rekomendasi tempat wisata untuk: africa western central afw
1. europ central asia ec
2. south africa zaf
3. central african republ caf
4. central europ baltic ceb
5. europ central asia ida ibrd countri tec

Rekomendasi tempat wisata untuk: angola ago
1. gambia gmb
2. gibraltar gib
3. guinea gin
4. zimbabw zwe
5. grenada grd



# 4. Evaluation

# 4.1 Manual Inspection

In [130]:
# Hand-picked queries, written in the same cleaned form as `text2`.
queries = [
    "afghanistan afg",
    "aruba abw",
    "angola ago",
    "africa western central afw",
]

for country_name in queries:
    recommended_places = recommend(country_name)
    print(f"Rekomendasi tempat wisata untuk: {country_name}")
    for rank, place in enumerate(recommended_places, 1):
        print(f"{rank}. {place}")
    print()

Rekomendasi tempat wisata untuk: afghanistan afg
1. gibraltar gib
2. georgia geo
3. guinea gin
4. ghana gha
5. zimbabw zwe

Rekomendasi tempat wisata untuk: aruba abw
1. hong kong sar china hkg
2. ghana gha
3. high incom hic
4. gibraltar gib
5. guyana guy

Rekomendasi tempat wisata untuk: angola ago
1. gambia gmb
2. gibraltar gib
3. guinea gin
4. zimbabw zwe
5. grenada grd

Rekomendasi tempat wisata untuk: africa western central afw
1. europ central asia ec
2. south africa zaf
3. central african republ caf
4. central europ baltic ceb
5. europ central asia ida ibrd countri tec



# Ground Truth

In [135]:
# Ground-truth mapping: key = a travel row id, value = list of tourist row ids
# considered relevant for it.
# NOTE(review): presumably these are positions in df_travel['text2'] and
# df_tourist['text'] respectively — confirm.
ground_truth = {
    0: [10, 25, 50],
    1: [5, 15, 30],
    2: [12, 28],
}

# Flatten the mapping into one record per (travel, tourist) pair and build the
# ground-truth DataFrame from it.
ground_truth_list = [
    {'travel_id': travel_key, 'tourist_id': tourist_key, 'relevance': 1}
    for travel_key, tourist_keys in ground_truth.items()
    for tourist_key in tourist_keys
]
df_ground_truth = pd.DataFrame(ground_truth_list)
df_ground_truth

Unnamed: 0,travel_id,tourist_id,relevance
0,0,10,1
1,0,25,1
2,0,50,1
3,1,5,1
4,1,15,1
5,1,30,1
6,2,12,1
7,2,28,1


In [154]:
# ----- Simulasi data wisata (df_tourist) -----
import pandas as pd

# Simulated list of tourist places.
# NOTE(review): this rebinds `df_tourist`, replacing the country table loaded
# and processed earlier in the notebook.
df_tourist = pd.DataFrame({
    'Place': [
        "Eagle Beach", "Arikok National Park", "California Lighthouse",
        "Oranjestad Market", "Palm Beach", "Baby Beach", "Natural Pool",
        "Alto Vista Chapel", "Butterfly Farm", "Bushiribana Gold Mill Ruins"
    ]
})

# ----- Simulated travel data (df_travel) -----
# NOTE(review): this rebinds `df_travel` as well, replacing the tourism table
# that the TF-IDF `recommend` defined earlier reads from.
df_travel = pd.DataFrame({
    'Country Name': ["aruba abw"]
})

# ----- Dummy recommendation function -----
# Stand-in for the TF-IDF based version; it only knows one hard-coded query.
# NOTE(review): shares its name with the TF-IDF `recommend` defined earlier in
# the notebook and replaces it once this cell runs.
def recommend(travel_title):
    """Return a fixed list of five places for "aruba abw", else an empty list.

    The comparison is case-insensitive.
    """
    if travel_title.lower() != "aruba abw":
        return []
    return [
        "Eagle Beach", "Palm Beach", "Natural Pool", "Butterfly Farm", "Oranjestad Market"
    ]

# ----- Relevant places (ground truth) -----
ground_truth_places = [
    "Eagle Beach", "California Lighthouse", "Arikok National Park"
]

# ----- Precision@5 evaluation -----
travel_title = "aruba abw"
top_k = 5

# Keep at most `top_k` recommendations, then map both the recommendations and
# the ground truth to row labels of df_tourist.
recommended_places = recommend(travel_title)[:top_k]
is_recommended = df_tourist['Place'].isin(recommended_places)
is_relevant = df_tourist['Place'].isin(ground_truth_places)
recommended_indices = df_tourist.index[is_recommended].tolist()
ground_truth_indices = df_tourist.index[is_relevant].tolist()

# A hit is a recommended row that is also in the ground truth; precision
# divides by `top_k` even when fewer items were recommended.
hits = sum(idx in ground_truth_indices for idx in recommended_indices)
precision_at_5 = hits / top_k

# ----- Output -----
print(f'Precision@5 untuk travel \"{travel_title}\": {precision_at_5:.2f}')
print(f'Index hasil rekomendasi (df_tourist): {recommended_indices}')
print(f'Index tempat relevan dari ground truth: {ground_truth_indices}')

Precision@5 untuk travel "aruba abw": 0.20
Index hasil rekomendasi (df_tourist): [0, 3, 4, 6, 8]
Index tempat relevan dari ground truth: [0, 1, 2]
