### **1. 필요 라이브러리 설치 및 임포트**

In [None]:
!pip install -U pymilvus
!pip install --upgrade openai
!pip install -U sentence-transformers

Collecting pymilvus
  Downloading pymilvus-2.4.9-py3-none-any.whl.metadata (5.6 kB)
Collecting environs<=9.5.0 (from pymilvus)
  Downloading environs-9.5.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting milvus-lite<2.5.0,>=2.4.0 (from pymilvus)
  Downloading milvus_lite-2.4.10-py3-none-manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting marshmallow>=3.0.0 (from environs<=9.5.0->pymilvus)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting python-dotenv (from environs<=9.5.0->pymilvus)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading pymilvus-2.4.9-py3-none-any.whl (201 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.1/201.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading environs-9.5.0-py2.py3-none-any.whl (12 kB)
Downloading milvus_lite-2.4.10-p

In [None]:
from pymilvus import MilvusClient
from pymilvus import FieldSchema, DataType
from pymilvus import FieldSchema, CollectionSchema

import pandas as pd
import numpy as np
import time
import openai
from openai import OpenAI
import os

### **2. DB 생성 및 컬렉션 생성**

In [None]:
# Index type applied to the "embedding" vector field (handed to MakeCollections).
INDEX_TYPE = "FLAT"
# Dimension of the FLOAT_VECTOR "embedding" field declared in the schema.
DIMENSION = 1024
# Similarity metric configured on the vector index.
METRIC_TYPE = "COSINE"
# Partition count passed as num_partitions when a collection is created.
NUM_PARTITIONS = 17

class MakeCollections:
    """Create travel-site collections on a Milvus client with shared settings.

    The same index type, metric type and vector dimension are reused for
    every collection created through one instance.
    """

    def __init__(self, client, index_type, metric_type, dimension):
        """Store the client and the index/vector settings for later calls."""
        self.client = client
        self.index_type = index_type
        self.metric_type = metric_type
        self.dimension = dimension

    def create_schema(self):
        """Return the CollectionSchema shared by the travel-site collections.

        The schema uses "id" as the primary key and "area_name" as the
        partition key field; "embedding" is a float vector of
        ``self.dimension`` elements.
        """
        # The four boolean flags differ only by name and description.
        flag_fields = [
            FieldSchema(name=flag_name, dtype=DataType.BOOL, description=flag_description)
            for flag_name, flag_description in (
                ("parking", "parking available"),
                ("reservation", "reservation available"),
                ("child", "children can enter"),
                ("animal", "can be accompanied by pets"),
            )
        ]

        field_defs = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="place_name", dtype=DataType.VARCHAR, max_length=100, description="the name of place"),
            FieldSchema(name="area_name", dtype=DataType.VARCHAR, max_length=100, description="the name of administrative district"),
            FieldSchema(name="mood", dtype=DataType.INT16, description="the mood of the place"),
            *flag_fields,
            FieldSchema(name="walking_distance", dtype=DataType.VARCHAR, max_length=30, description="walking distance from a nearby stop"),
            FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2000, description="elements of travel sites"),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=self.dimension, description="vector"),
        ]
        return CollectionSchema(
            fields=field_defs,
            description="travel sites",
            partition_key_field="area_name",
        )

    def create_index(self):
        """Return index params that index the "embedding" field."""
        params = self.client.prepare_index_params()
        params.add_index(
            field_name="embedding",
            index_type=self.index_type,
            metric_type=self.metric_type,
        )
        return params

    def create_collection(self, collection_name):
        """Create ``collection_name``, print its load state, return the client.

        The partition count comes from the module-level NUM_PARTITIONS.
        Note that the return value is the client this instance was built
        with, not a collection object.
        """
        self.client.create_collection(
            collection_name=collection_name,
            schema=self.create_schema(),
            index_params=self.create_index(),
            num_partitions=NUM_PARTITIONS,
        )

        # Pause briefly before asking for the load state.
        time.sleep(2)

        load_state = self.client.get_load_state(collection_name=collection_name)
        print(load_state)

        return self.client


In [None]:
# 서버 경로 입력
# Milvus server address (placeholder value; fill in the real host and port)
url = "http://ip:port"

# Connect to the database
client = MilvusClient(url)

In [None]:
# Builder that applies the shared index/metric/dimension settings.
collection = MakeCollections(client, INDEX_TYPE, METRIC_TYPE, DIMENSION)

# If either collection already exists, drop both so they are rebuilt from scratch.
if any(
    client.has_collection(collection_name=existing)
    for existing in ("kstartup_travel_sites", "nowlocal_travel_sites")
):
    for stale in ("kstartup_travel_sites", "nowlocal_travel_sites"):
        client.drop_collection(collection_name=stale)

# create_collection returns the client it was constructed with,
# so both names below refer to that same client object.
kstartup_collection = collection.create_collection("kstartup_travel_sites")
nowlocal_collection = collection.create_collection("nowlocal_travel_sites")

DEBUG:pymilvus.milvus_client.milvus_client:Successfully created collection: kstartup_travel_sites
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: kstartup_travel_sites
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created collection: nowlocal_travel_sites


{'state': <LoadState: Loaded>}


DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: nowlocal_travel_sites


{'state': <LoadState: Loaded>}


In [None]:
# List the collections that now exist
client.list_collections()

['kstartup_travel_sites', 'nowlocal_travel_sites']

In [None]:
# Inspect the details of one of the created collections
res = client.describe_collection(
    collection_name="kstartup_travel_sites"
)
res

In [None]:
collection_list = ['kstartup_travel_sites', 'nowlocal_travel_sites']

# ------- RAG test logic ---------
# Load every collection.
for collection_name in collection_list:
    # replica_number is intentionally left unset. Original note:
    # replica_number=1 -- number of replicas to create on query nodes. Max value
    # is 1 for Milvus Standalone, and no greater than `queryNode.replicas` for
    # Milvus Cluster.
    client.load_collection(collection_name=collection_name)

In [None]:
# Confirm that each collection has been loaded
for collection_name in collection_list:
    res = client.get_load_state(collection_name=collection_name)
    print(res)

{'state': <LoadState: Loaded>}
{'state': <LoadState: Loaded>}


### **3. 강원도 지역의 장소 데이터 셋 생성 및 입력**


In [None]:
# Load the embedding model
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("upskyy/bge-m3-korean")

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [None]:
# Source workbook for each collection
data_list = {
    # collection name: name of the data file to read
    'kstartup_travel_sites': 'kang_kstartup.xlsx',
    'nowlocal_travel_sites': 'kang_nowLocal.xlsx',
}

try:
    collection_list = ['kstartup_travel_sites', 'nowlocal_travel_sites']
    # Load each collection and print its load state.
    for collection_name in collection_list:
        # replica_number is intentionally left unset. Original note:
        # replica_number=1 -- number of replicas to create on query nodes. Max
        # value is 1 for Milvus Standalone, and no greater than
        # `queryNode.replicas` for Milvus Cluster.
        client.load_collection(collection_name=collection_name)
        res = client.get_load_state(collection_name=collection_name)
        print(res)
except Exception as err:
    # Best effort: report the failure and let the rest of the cell run.
    print("오류: ", err)


def make_dataset_2(row):
    """Turn one spreadsheet row into an entity dict for the travel-site collections.

    The scalar fields are copied from the row, a descriptive text document is
    assembled from the remaining columns, and that document is encoded with
    the module-level ``model`` to produce the embedding.

    Args:
        row: Row indexed by the spreadsheet's column names.

    Returns:
        dict with keys id, place_name, area_name, mood, parking, reservation,
        child, animal, walking_distance, text and embedding.
    """
    record = {
        'id': row['store ID'],
        'place_name': row['상호명'],
        # The first whitespace-separated token of the address is the area name.
        'area_name': row['주소1'].split()[0],
        'mood': row['분위기 분류 코드'],
        'parking': row['근처 주차 가능'],
        'reservation': row['예약 가능'],
        'child': row['아이 입장 가능'],
        'animal': row['반려동물 동반 가능'],
        'walking_distance': row['근처 정류장과의 도보 거리'],
    }

    record['text'] = f"""- 장소명: {row['상호명']}
카테고리: {row['카테고리']}
장소 설명:{row['키워드']}
장소 분위기: {row['분위기']}
위치: {row['주소2']}
서비스: {row['서비스']}
휠체어 이용 가능 여부 : {row['휠체어 이용 가능 여부']}
근처 정류장과의 도보 거리: {row['근처 정류장과의 도보 거리']}
장소 특징: {row['특징']}
1인 평균 소비 금액: {row['1인 객단가']}
해시태그: {row['해시태그']}"""

    record['embedding'] = model.encode(record['text'])

    return record


def make_document_hashtag_2(row):
    """Build the hashtag line that is appended to a place's document.

    The line starts with the area tag, continues with the place's own tags
    and ends with two fixed recommendation tags. Every tag carries exactly
    one leading '#', including the first one, and blank tags are dropped.

    Args:
        row: Row providing '주소1' (address; its second whitespace-separated
            token becomes the area tag) and '해시태그' (comma-separated tags,
            or '' when the place has none).

    Returns:
        str: Space-separated hashtags.

    Raises:
        IndexError: If '주소1' has fewer than two whitespace-separated tokens.
    """
    area_1 = row['주소1'].split()[1]

    raw_tags = row['해시태그']
    place_tags = []
    if raw_tags != '':
        for piece in raw_tags.split(','):
            # Drop any '#' already present so a tag is never prefixed twice.
            tag = piece.strip().lstrip('#').strip()
            if tag:
                place_tags.append(f"#{tag}")

    return " ".join([f"#{area_1}", *place_tags, "#추천 여행지", "#가볼 만한 곳"])


def make_features(row):
    """Describe a place's yes/no attributes as a comma-separated phrase list.

    The four flag columns are translated into phrases, the Wi-Fi column is
    inserted as-is, and empty entries are left out of the result.

    A flag whose value is neither true nor false (for example '' left behind
    when NaN cells are blanked out) contributes nothing instead of raising
    KeyError.

    Args:
        row: Row providing '반려동물 동반 가능', '아이 입장 가능',
            '근처 주차 가능', '예약 가능' and '와이파이 여부'.

    Returns:
        str: The non-empty phrases joined with ", ".
    """
    # Phrase to use for each flag column, keyed by the flag's value.
    status_mapping = {
        True: {
            '반려동물 동반 가능': '반려동물 입장 가능',
            '아이 입장 가능': '아이 입장 가능',
            '근처 주차 가능': '근처 주차 가능',
            '예약 가능': '예약 가능',
        },
        False: {
            '반려동물 동반 가능': '반려동물 입장 금지',
            '아이 입장 가능': '아이 입장 금지',
            '근처 주차 가능': '근처 주차 불가',
            '예약 가능': '예약 불가능',
        },
    }

    def phrase_for(column):
        # Same dict lookup as before for True/False; any other value maps
        # to '' so it is filtered out below.
        return status_mapping.get(row[column], {}).get(column, '')

    animal = phrase_for('반려동물 동반 가능')
    child = phrase_for('아이 입장 가능')
    parking = phrase_for('근처 주차 가능')
    reservation = phrase_for('예약 가능')

    words = [animal, child, row['와이파이 여부'], parking, reservation]
    # Per-row trace output kept from the original implementation.
    print('words: ', words)

    return ", ".join(word for word in words if word != '')


for collection_name, file_name in data_list.items():
    # Read the Excel workbook and discard the columns that are not needed.
    columns_to_drop = ['설명', '운영정보']
    original_df = pd.read_excel(file_name, engine='openpyxl').drop(columns=columns_to_drop)

    # Work on a copy in which NaN cells are replaced by empty strings.
    copy_df = original_df.copy().replace(np.nan, '')

    # Keep only the rows whose address contains "강원특별자치도".
    in_region = copy_df['주소1'].str.contains("강원특별자치도")
    section_df = copy_df[in_region].copy()

    # Derive the extra hashtag line first, then the feature phrases.
    for column, builder in (
        ("해시태그", make_document_hashtag_2),
        ("특징", make_features),
    ):
        section_df[column] = section_df.apply(builder, axis=1)

    # Build one entity dict per row and write them to the collection.
    dataset = section_df.apply(make_dataset_2, axis=1).tolist()
    client.upsert(collection_name=collection_name, data=dataset)


### **4. 삽입된 데이터 결과 확인**




In [None]:
collection_list = ['kstartup_travel_sites', 'nowlocal_travel_sites']

# For each collection, print its indexed fields and its count(*) query result.
for collection_name in collection_list:
    print(f'-- {collection_name} --')
    res = client.list_indexes(collection_name=collection_name)
    print(f'인덱스 지정된 필드 리스트: {res}')

    # Query count(*) with an empty filter expression.
    res = client.query(collection_name=collection_name, filter="", output_fields=["count(*)"])
    print(f'삽입 데이터 개수: {res}')
    print()


-- kstartup_travel_sites --
인덱스 지정된 필드 리스트: ['embedding']
삽입 데이터 개수: data: ["{'count(*)': 229}"] 

-- nowlocal_travel_sites --
인덱스 지정된 필드 리스트: ['embedding']
삽입 데이터 개수: data: ["{'count(*)': 150}"] 



In [None]:
# Show the index details of each collection
for collection_name in collection_list:
    res = client.describe_index(collection_name=collection_name, index_name="embedding")
    print(res)

    # 6. Drop index (disabled; kept for reference only):
    # client.drop_index(
    #     collection_name=collection_name,
    #     index_name="embedding"
    # )


{'index_type': 'FLAT', 'metric_type': 'COSINE', 'field_name': 'embedding', 'index_name': 'embedding', 'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0, 'state': 'Finished'}
{'index_type': 'FLAT', 'metric_type': 'COSINE', 'field_name': 'embedding', 'index_name': 'embedding', 'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0, 'state': 'Finished'}


### **5. DB 연결 끊기**

In [None]:
# Close the client connection now that the work is done
client.close()