In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Mount Google Drive inside the Colab VM; the dataset is read from it further down.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
import os

# Drive is mounted at '/content/drive'. Anchoring the project path there (rather than
# the relative 'drive/My Drive/...') keeps it valid even if the working directory is
# changed later in the session.
GOOGLE_DRIVE_ROOT = '/content/drive'
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'SKN'  # project folder inside "My Drive"
GOOGLE_DRIVE_PATH = os.path.join(GOOGLE_DRIVE_ROOT, 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)

# Sanity check: the listing should contain travel_dataset.csv, which is loaded next.
print(os.listdir(GOOGLE_DRIVE_PATH))

['travel_recommend.ipynb', '01_seq2seq_translation.ipynb', '01_transformer_by_pytorch.ipynb', '02_BERT_transfer_learning.ipynb', 'travel_dataset.csv']


In [136]:
# Read the travel dataset from the project folder and print a column/dtype summary.
dataset_csv = f'{GOOGLE_DRIVE_PATH}/travel_dataset.csv'
df = pd.read_csv(dataset_csv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        1200 non-null   object
 1   name                      1200 non-null   object
 2   city                      1200 non-null   object
 3   country                   1200 non-null   object
 4   continent                 1200 non-null   object
 5   climate                   1200 non-null   object
 6   budget                    1200 non-null   object
 7   group_size                1200 non-null   object
 8   vibe                      1200 non-null   object
 9   best_months               1200 non-null   object
 10  typical_cost_per_day_usd  1200 non-null   int64 
 11  activities                1200 non-null   object
 12  description               1200 non-null   object
 13  tags                      1200 non-null   object
dtypes: int64(1), object(13)


In [148]:
def create_description_concise(row):
    """Build a short, three-sentence description for one destination.

    Args:
        row: Mapping-like object (e.g. a DataFrame row or a dict) providing the keys
            'id', 'vibe', 'climate', 'group_size', 'activities' and 'best_months'.

    Returns:
        str: "<id> blends <vibe> and <climate> vibes—great for <group_size> trips.
        Expect <activities>. Best in <best_months>." on a single line.

    Raises:
        KeyError: if `row` lacks one of the keys listed above.
    """
    template = (
        "{id} blends {vibe} and {climate} vibes—great for {group_size} trips. "
        "Expect {activities}. "
        "Best in {best_months}."
    )
    field_names = ('id', 'vibe', 'climate', 'group_size', 'activities', 'best_months')
    # Look up only the fields the template needs; extra keys on `row` are ignored.
    values = {name: row[name] for name in field_names}
    return template.format(**values)

In [145]:
# Show the dataset's original description for the row labelled 0.
df.loc[0, 'description']

'Bali, Indonesia blends relaxation and tropical vibes—great for family trips. Expect spas, slow cafés. Best in April, December, March.'

In [154]:
# Collapse the fine-grained climates into coarser groups. Climates that are not keys
# of this mapping are left unchanged.
CLIMATE_GROUPS = {
    'Tropical': 'Hot',
    'Desert': 'Hot',
    'Mediterranean': 'Temperate',
}
df['climate'] = df['climate'].replace(CLIMATE_GROUPS)

# Build the descriptions only after regrouping, so their text uses the grouped climate.
df['new_description'] = df.apply(create_description_concise, axis=1)

# Display the regrouped column as the cell output.
df['climate']

Unnamed: 0,climate
0,Hot
1,Hot
2,Temperate
3,Hot
4,Hot
...,...
1195,Hot
1196,Temperate
1197,Hot
1198,Temperate


In [6]:
# Features are the generated descriptions, targets are the destination ids.
X, y = df['new_description'], df['id']

# Split stratified by city with a fixed seed, then turn each part into a plain list.
# NOTE(review): none of the cells visible below use these four lists.
_split_parts = train_test_split(X, y, stratify=df['city'], random_state=42)
X_train, X_test, y_train, y_test = (part.tolist() for part in _split_parts)

### Model: sentence embeddings and zero-shot climate classifier

In [7]:
# Uncomment on a fresh runtime that lacks the package:
# %pip install -q -U sentence-transformers



In [150]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used for both the destination descriptions and, later, the user query.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
emb_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Encode every description in df row order, so row i of emb_vecs belongs to row i of df.
new_desc = df['new_description'].tolist()
emb_vecs = emb_model.encode(new_desc)

print(emb_vecs.shape)

(1200, 384)


In [151]:
from transformers import pipeline

# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint;
# the cells below use it to score the climate labels against a free-text query.
ZERO_SHOT_MODEL_NAME = 'facebook/bart-large-mnli'
classifier = pipeline('zero-shot-classification', model=ZERO_SHOT_MODEL_NAME)
# generator = pipeline('text-generation', model='gpt2')

Device set to use cuda:0


In [152]:
# Candidate labels for the zero-shot classifier: the distinct climate groups in the data.
# Series.unique() returns a NumPy array; convert it to a plain list of str, which is the
# documented type for the pipeline's candidate labels.
climate_labels = df['climate'].unique().tolist()

# Quick probe with a query that mentions no climate at all, to see the raw score spread.
query = 'what a sound'

results = classifier(query, climate_labels)
print(results)


{'sequence': 'what a sound', 'labels': ['Hot', 'Temperate', 'Cold'], 'scores': [0.44440263509750366, 0.40680935978889465, 0.14878803491592407]}


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Tunables for the climate pre-filter and the number of suggestions.
KEEP_BEST_CLIMATE_MIN_SCORE = 0.8   # best label at least this confident -> keep only that climate
DROP_WORST_CLIMATE_MAX_SCORE = 0.1  # worst label at most this likely -> exclude that climate
TOP_K = 3                           # number of destinations to suggest

user_query = 'not temperate places in September'

# The logic below treats labels/scores as ordered best-first (index 0 is the best match),
# which is how the printed result is ordered.
climate_result = classifier(user_query, climate_labels)
best_label, best_score = climate_result['labels'][0], climate_result['scores'][0]
# Use [-1] rather than [2]: the least likely label is the last one however many
# climate groups exist.
worst_label, worst_score = climate_result['labels'][-1], climate_result['scores'][-1]

# Positional boolean mask over the rows of df. A mask (instead of index labels fed to
# .iloc) keeps df and emb_vecs aligned even when df does not have a default RangeIndex.
if best_score >= KEEP_BEST_CLIMATE_MIN_SCORE:
    keep = (df['climate'] == best_label).to_numpy()
elif worst_score <= DROP_WORST_CLIMATE_MAX_SCORE:
    keep = (df['climate'] != worst_label).to_numpy()
else:
    keep = np.ones(len(df), dtype=bool)

# Index labels of the retained rows (name kept for anything that still refers to it).
idx = df.index[keep]

print(climate_result)

# Row i of emb_vecs must be the embedding of row i of df for the mask to line up.
assert len(emb_vecs) == len(df), 'emb_vecs is out of sync with df; re-run the embedding cell'

query_vector = emb_model.encode([user_query])

# One similarity per retained row, in retained-row order.
similarities = cosine_similarity(query_vector, emb_vecs[keep]).reshape(-1)

print(similarities.shape)

# Positions (within the retained rows) of the best matches, most similar first.
# Slicing after the reversal copes with fewer than TOP_K candidates.
sim_idx = similarities.argsort()[::-1][:TOP_K]

print(sim_idx)

# Retained rows renumbered 0..n-1 so that sim_idx can address them positionally.
ndf = df.loc[keep].reset_index(drop=True)

recommended_city = ndf.iloc[sim_idx]['id']
recommended_desc = ndf.iloc[sim_idx]['new_description']
similarity_score = similarities[sim_idx]

print('user query:', user_query)
print('recommended city:', recommended_city.values)
print('recommended description:', recommended_desc.values)
print('similarity score:', similarity_score)

# Pad with None so first/second/third always exist, even with fewer than three matches.
cities = recommended_city.tolist()
first, second, third = (cities + [None] * 3)[:3]
first_desc = recommended_desc.values[0]

message = f'''
If you want {user_query} places, what about {first}?
{first_desc}.'''

# Mention the runner-ups only when there are any.
runner_ups = ' and '.join(str(city) for city in cities[1:])
if runner_ups:
    message += f'''
What about {runner_ups}?'''

print(message)


{'sequence': 'not temperate places in September', 'labels': ['Cold', 'Hot', 'Temperate'], 'scores': [0.6488567590713501, 0.2900884449481964, 0.061054788529872894]}
(625,)
[612 210 220]
user query: not temperate places in September
recommended city: ['Tulum-Mexico-677320' 'Goa-India-313886' 'Malé-Maldives-539179']
recommended description: ['Tulum-Mexico-677320 blends Historic and Temperate vibes—great for Group trips. Expect old towns, ancient sites. Best in February, October, December.'
 'Goa-India-313886 blends Culture and Temperate vibes—great for Solo trips. Expect theaters, museums. Best in July, August, March.'
 'Malé-Maldives-539179 blends Relaxation and Hot vibes—great for Group trips. Expect spas, sunbathing. Best in February, September, August.']
similarity score: [0.49303234 0.48721647 0.48601907]

If you want not temperate places in September places, what about Tulum-Mexico-677320?
Tulum-Mexico-677320 blends Historic and Temperate vibes—great for Group trips. Expect old town