In [None]:
# Install/upgrade the Milvus Python client and the OpenAI SDK in the notebook runtime.
# Neither package is version-pinned, so a re-run may pull newer releases than the recorded output shows.
!pip install -U pymilvus
!pip install --upgrade openai

Collecting pymilvus
  Downloading pymilvus-2.4.4-py3-none-any.whl.metadata (5.4 kB)
Collecting grpcio<=1.63.0,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.63.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting environs<=9.5.0 (from pymilvus)
  Downloading environs-9.5.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting milvus-lite<2.5.0,>=2.4.0 (from pymilvus)
  Downloading milvus_lite-2.4.8-py3-none-manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting marshmallow>=3.0.0 (from environs<=9.5.0->pymilvus)
  Downloading marshmallow-3.21.3-py3-none-any.whl.metadata (7.1 kB)
Collecting python-dotenv (from environs<=9.5.0->pymilvus)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading pymilvus-2.4.4-py3-none-any.whl (196 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [None]:
from pymilvus import MilvusClient
from pymilvus import FieldSchema, DataType
from pymilvus import FieldSchema, CollectionSchema

import pandas as pd
import numpy as np
import time
import openai
from openai import OpenAI
import os

In [None]:
# Settings shared by every collection created in this notebook; they are handed to MakeCollections.
# Index type applied to the "embedding" vector field.
INDEX_TYPE = "FLAT"
# Length of the "embedding" vectors; embed_string requests the same size (dimensions=768) from the embeddings API.
DIMENSION = 768
# Metric type applied to the "embedding" vector index.
METRIC_TYPE = "IP"

class MakeCollections:
  """Creates Milvus collections that all share one schema and one index layout.

  Attributes:
    client: client object used for every Milvus call.
    index_type: index type applied to the "embedding" field.
    metric_type: metric type applied to the "embedding" index.
    dimension: length of the vectors stored in the "embedding" field.
  """

  def __init__(self, client, index_type, metric_type, dimension):
    self.client, self.index_type = client, index_type
    self.metric_type, self.dimension = metric_type, dimension

  # Build the schema
  def create_schema(self):
    """Return the CollectionSchema used for every travel-site collection.

    Fields, in order: auto-generated INT64 primary key "id", VARCHAR
    "area_name" (max 100), VARCHAR "text" (max 500), FLOAT_VECTOR
    "embedding" (dim = self.dimension) and VARCHAR "operation" (max 300).
    """
    primary_key = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
    area_field = FieldSchema(name="area_name", dtype=DataType.VARCHAR, max_length=100, description="the name of administrative district")
    text_field = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=500, description="elements of travel sites")
    vector_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=self.dimension, description="vector")
    operation_field = FieldSchema(name="operation", dtype=DataType.VARCHAR, max_length=300, description="operation information")

    return CollectionSchema(
      fields=[primary_key, area_field, text_field, vector_field, operation_field],
      auto_id=True,
      description="travel sites"
    )

  # Build the index parameters
  def create_index(self):
    """Return index parameters for the "area_name" and "embedding" fields.

    Despite its name, this method only prepares the parameter object; the
    indexes themselves are created when it is passed to create_collection.
    """
    params = self.client.prepare_index_params()

    # Empty index type on the scalar field: the choice is left to Milvus
    # (the recorded notebook output reports a 'Trie' index for area_name).
    params.add_index(field_name="area_name", index_type="")

    # Vector index configured from the values given to the constructor.
    params.add_index(field_name="embedding", index_type=self.index_type, metric_type=self.metric_type)

    return params

  # Create the collection
  def create_collection(self, collection_name):
    """Create `collection_name`, print its load state and return the client.

    Args:
      collection_name: name of the collection to create.

    Returns:
      self.client -- the same client object passed to the constructor,
      not a handle specific to the new collection.
    """
    schema = self.create_schema()
    index_params = self.create_index()
    self.client.create_collection(
      collection_name=collection_name,
      schema=schema,
      index_params=index_params
    )

    # Fixed two-second pause before asking for the load state
    # (presumably to give the collection time to finish loading).
    time.sleep(2)

    load_state = self.client.get_load_state(collection_name=collection_name)
    print(load_state)

    return self.client


In [None]:
# Create collections: first open the Milvus client that every later cell uses.
# NOTE(review): passing a ".db" file name presumably selects the embedded Milvus Lite backend -- confirm.

client = MilvusClient("milvus_scalar_index2.db")

DEBUG:pymilvus.milvus_client.milvus_client:Created new connection using: de1a23307ebd46a8bd08b8eddcfefb29


In [None]:
# Create the three travel-site collections with the shared schema and index settings.
# create_collection returns the client it was constructed with, so the three names
# below all refer to the same `client` object rather than to per-collection handles.
collection = MakeCollections(client, INDEX_TYPE, METRIC_TYPE, DIMENSION)
kstartup_collection = collection.create_collection("kstartup_travel_sites")
nowlocal_collection = collection.create_collection("nowlocal_travel_sites")
nature_collection = collection.create_collection("nature_travel_sites")

DEBUG:pymilvus.milvus_client.milvus_client:Successfully created collection: kstartup_travel_sites
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: kstartup_travel_sites
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: kstartup_travel_sites
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created collection: nowlocal_travel_sites


{'state': <LoadState: Loaded>}


DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: nowlocal_travel_sites
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: nowlocal_travel_sites
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created collection: nature_travel_sites


{'state': <LoadState: Loaded>}


DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: nature_travel_sites
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: nature_travel_sites


{'state': <LoadState: Loaded>}


In [None]:
# List the collections that now exist on the client
client.list_collections()

['kstartup_travel_sites', 'nature_travel_sites', 'nowlocal_travel_sites']

In [None]:
# Inspect one of the created collections (schema, fields, properties)
res = client.describe_collection(
    collection_name="kstartup_travel_sites"
)
res

{'collection_name': 'kstartup_travel_sites',
 'auto_id': True,
 'num_shards': 0,
 'description': 'travel sites',
 'fields': [{'field_id': 100,
   'name': 'id',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {},
   'auto_id': True,
   'is_primary': True},
  {'field_id': 101,
   'name': 'area_name',
   'description': 'the name of administrative district',
   'type': <DataType.VARCHAR: 21>,
   'params': {'max_length': 100}},
  {'field_id': 102,
   'name': 'text',
   'description': 'elements of travel sites',
   'type': <DataType.VARCHAR: 21>,
   'params': {'max_length': 500}},
  {'field_id': 103,
   'name': 'embedding',
   'description': 'vector',
   'type': <DataType.FLOAT_VECTOR: 101>,
   'params': {'dim': 768}},
  {'field_id': 104,
   'name': 'operation',
   'description': 'operation information',
   'type': <DataType.VARCHAR: 21>,
   'params': {'max_length': 300}}],
 'aliases': [],
 'collection_id': 0,
 'consistency_level': 0,
 'properties': {},
 'num_partitions': 

In [None]:
#-------- Insert data ---------
from google.colab import userdata

# Set the environment variable required to use the OpenAI API.
# The key is read from the Colab user secret named 'EMBEDDINGS_KEY'.
EMBEDDINGS_KEY = userdata.get('EMBEDDINGS_KEY')
os.environ["OPENAI_API_KEY"] = EMBEDDINGS_KEY

# Client used by embed_string for every embeddings request.
openAI_api = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [None]:
# Embedding helper
def embed_string(string, model="text-embedding-3-small", dimensions=768):
    """Return the embedding vector for `string` from the OpenAI embeddings API.

    Args:
        string: text to embed.
        model: embeddings model name; defaults to the model this notebook
            has always used.
        dimensions: requested vector length. The default of 768 equals the
            DIMENSION constant used for the collections' "embedding" field;
            the two must stay equal for vectors to be insertable.

    Returns:
        The embedding of the first (only) item in the API response.
    """
    response = openAI_api.embeddings.create(
        input=string,
        model=model,
        dimensions=dimensions,
    )
    return response.data[0].embedding

In [None]:
# Source files to load, keyed by target collection
data_list = {
    # collection name : name of the data file to read
    'kstartup_travel_sites' : 'data_kstartup_time.xlsx',
    'nowlocal_travel_sites' : 'data_nowlocal_time.xlsx',
    'nature_travel_sites' : 'data_natural_attractions.xlsx'
}


# Administrative regions; the ingestion loop matches each one against the '주소1' column
sections = ["서울특별시", "부산광역시", "인천광역시", "대구광역시", "대전광역시",
            "광주광역시", "울산광역시", "세종특별자치시", "경기도", "충청북도",
            "충청남도", "전라남도", "경상북도", "경상남도", "강원특별자치도",
            "전북특별자치도", "제주특별자치도"]


#sections = ["광주광역시", "경기도", "제주특별자치도"]

def make_dataset(row):
    """Turn one spreadsheet row into a record ready for insertion.

    The row's name, category, keywords, location and hashtag columns are
    rendered into a text document, which is embedded with embed_string.

    Args:
        row: mapping-like row providing the '상호명', '카테고리', '키워드',
            '주소2', '해시태그', '주소1' and '운영정보' entries.

    Returns:
        dict with keys 'text' (the document), 'area_name' (first
        whitespace-separated token of '주소1'), 'embedding' (the vector)
        and 'operation' (the '운영정보' value).
    """
    document = f"""장소명: {row['상호명']}
카테고리: {row['카테고리']}
장소 키워드: {row['키워드']}
위치: {row['주소2']}
해시태그: {row['해시태그']}
"""
    # Embed first, then derive the region name, mirroring the original call order.
    document_vector = embed_string(document)
    region = row['주소1'].split()[0]

    return dict(
        text=document,
        area_name=region,
        embedding=document_vector,
        operation=row['운영정보'],
    )


def make_document_hashtag(row):
  """Build the comma-separated keyword string stored in the '해시태그' column.

  The result lists, in order: the first two characters of '주소2', the second
  whitespace-separated token of '주소1', that same token without its last
  character (presumably dropping the administrative suffix -- confirm), the
  '카테고리' value, and three fixed recommendation phrases.

  When '주소1' has no second token, the two district keywords are left out
  instead of raising IndexError.

  Args:
    row: mapping-like row providing '주소1', '주소2' and '카테고리'.

  Returns:
    The keywords joined with ", ".
  """
  address_tokens = row['주소1'].split()  # split once instead of twice

  keywords = [f"{row['주소2'][0:2]}"]
  if len(address_tokens) > 1:
    district = address_tokens[1]
    keywords.extend([district, district[0:-1]])
  keywords.append(f"{row['카테고리']}")
  keywords.extend(["추천 여행지", "놀만한 곳", "가볼 만한 곳"])

  return ", ".join(keywords)


# Load each spreadsheet, embed its rows region by region and insert them
# into the matching collection.
for collection_name, file_name in data_list.items():
  # Read the Excel file
  original_df = pd.read_excel(file_name, engine='openpyxl')

  # Drop the columns that are not needed
  columns_to_drop = ['설명','(네이버)평점', '(카카오맵)평점']
  original_df = original_df.drop(columns=columns_to_drop)

  # Work on a copy of the source frame
  copy_df = original_df.copy()

  for section in sections:
    # Literal substring match; rows whose address is missing count as
    # "no match" instead of producing a NaN mask that breaks the indexing.
    section_df = copy_df[copy_df['주소1'].str.contains(section, na=False, regex=False)].copy()

    # Nothing to embed or insert for a region without rows; on an empty frame
    # the row-wise apply calls below do not return the per-row results they rely on.
    if section_df.empty:
      continue

    # Generate the extra document keywords
    section_df["해시태그"] = section_df.apply(make_document_hashtag, axis=1)

    dataset = section_df.apply(make_dataset, axis=1).tolist()

    client.insert(
      collection_name=collection_name,
      data=dataset
    )

In [None]:
# NOTE: rebinds data_list from the {collection: file} dict used for ingestion to a plain list of collection names.
data_list = ['kstartup_travel_sites', 'nowlocal_travel_sites', 'nature_travel_sites']

# Check per-collection details (indexed fields, number of inserted rows)
for collection_name in data_list:
  print(f'-- {collection_name} --')
  res = client.list_indexes(collection_name=collection_name)
  # Printed label means "list of indexed fields"
  print(f'인덱스 지정된 필드 리스트: {res}')

  # Empty filter with count(*) as the only output field: counts the collection's rows
  res = client.query(
      collection_name=collection_name,
      filter="",
      output_fields=["count(*)"]
  )
  # Printed label means "number of inserted rows"
  print(f'삽입 데이터 개수: {res}')
  print()


-- kstartup_travel_sites --
인덱스 지정된 필드 리스트: ['area_name', 'embedding']
삽입 데이터 개수: data: ["{'count(*)': 244}"] , extra_info: {'cost': 0}

-- nowlocal_travel_sites --
인덱스 지정된 필드 리스트: ['area_name', 'embedding']
삽입 데이터 개수: data: ["{'count(*)': 154}"] , extra_info: {'cost': 0}

-- nature_travel_sites --
인덱스 지정된 필드 리스트: ['area_name', 'embedding']
삽입 데이터 개수: data: ["{'count(*)': 68}"] , extra_info: {'cost': 0}



In [None]:
data_list = ['kstartup_travel_sites', 'nowlocal_travel_sites', 'nature_travel_sites']

# Show the metadata of the "area_name" index for each collection
for collection_name in data_list:
  res = client.describe_index(
    collection_name=collection_name,
    index_name="area_name"
  )
  print(res)


{'index_type': 'Trie', 'max_length': '100', 'field_name': 'area_name', 'index_name': 'area_name'}
{'index_type': 'Trie', 'max_length': '100', 'field_name': 'area_name', 'index_name': 'area_name'}
{'index_type': 'Trie', 'max_length': '100', 'field_name': 'area_name', 'index_name': 'area_name'}
