In [1]:

from pymilvus import MilvusClient, DataType, model
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
client = MilvusClient("sfc_syllabus.db")

In [3]:
# embeddingモデル定義
embedding_fn = model.dense.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
    device="cpu",
)
dim = embedding_fn.dim

In [4]:
# スキーマ定義
schema = MilvusClient.create_schema(
    auto_id=True,
    enable_dynamic_field=False,
)

schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True) # ID
schema.add_field(field_name="subject_name", datatype=DataType.VARCHAR, max_length=64) # 科目名
schema.add_field(field_name="faculty", datatype=DataType.BOOL) # 学部
schema.add_field(field_name="category", datatype=DataType.VARCHAR, max_length=32) # 分野
schema.add_field(field_name="credits", datatype=DataType.INT8) # 単位
schema.add_field(field_name="year", datatype=DataType.INT16) # 年度
schema.add_field(field_name="semester", datatype=DataType.VARCHAR, max_length=1) # 学期
# schema.add_field(field_name="day", datatype=DataType.ARRAY, element_type=DataType.VARCHAR, max_capacity=4, max_length=1, nullable=True) # 曜日
# schema.add_field(field_name="period", datatype=DataType.ARRAY, element_type=DataType.INT8, max_capacity=4, nullable=True) # 時限
schema.add_field(field_name="delivery_mode", datatype=DataType.VARCHAR, max_length=8) # 実施形態
schema.add_field(field_name="language", datatype=DataType.VARCHAR, max_length=16) # 言語
schema.add_field(field_name="english_support", datatype=DataType.BOOL) # 英語サポート
schema.add_field(field_name="selection", datatype=DataType.VARCHAR, max_length=4) # 履修選抜
schema.add_field(field_name="giga", datatype=DataType.BOOL) # GIGA
schema.add_field(field_name="summary", datatype=DataType.FLOAT_VECTOR, dim=dim) # 講義概要
schema.add_field(field_name="goals", datatype=DataType.FLOAT_VECTOR, dim=dim) # 主題と目標
schema.add_field(field_name="schedule", datatype=DataType.FLOAT_VECTOR, dim=dim) # 授業計画
schema.add_field(field_name="url", datatype=DataType.VARCHAR, max_length=64) # URL

{'auto_id': True, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'subject_name', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 64}}, {'name': 'faculty', 'description': '', 'type': <DataType.BOOL: 1>}, {'name': 'category', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 32}}, {'name': 'credits', 'description': '', 'type': <DataType.INT8: 2>}, {'name': 'year', 'description': '', 'type': <DataType.INT16: 3>}, {'name': 'semester', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1}}, {'name': 'delivery_mode', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 8}}, {'name': 'language', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 16}}, {'name': 'english_support', 'description': '', 'type': <DataType.BOOL: 1>}, {'name': 'selection', 'description': '', 'type': <

In [5]:
# インデックス定義
index_params = client.prepare_index_params()

index_params.add_index(
    field_name="summary",
    metric_type="COSINE",
    index_type="FLAT",
)

In [6]:
# コレクション作成
collection_name = "sfc_syllabus_collection"

if client.has_collection(collection_name=collection_name):
    client.drop_collection(collection_name=collection_name)

client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params,
)

In [7]:
df = pd.read_csv("../csvs/sfc_syllabus.csv")

data_list = []

for index, row in df.iterrows():
    if row["学部・研究科"] not in ("総合政策・環境情報学部", "政策・メディア研究科"):
        print(f"error {index}")
        continue

    # 欠損値を除外
    if pd.isna(row["授業概要"]) or pd.isna(row["主題と目標"]) or pd.isna(row["授業計画"]):
        print(f"error {index}")
        continue


    docs = [row["授業概要"], row["主題と目標"], row["授業計画"]]
    vectors = embedding_fn.encode_documents(docs)

    data = {
        "subject_name": row["科目名"],
        "faculty": True if row["学部・研究科"] == "総合政策・環境情報学部" else False,
        "category": row["分野"],
        "credits": int(row["単位"][0]),
        "year": int(row["開講年度・学期"].split()[0]),
        "semester": row["開講年度・学期"].split()[1][0],
        "delivery_mode": row["実施形態"],
        "language": row["授業で使う言語"],
        "english_support": True if row["英語サポート"] == "あり" else False,
        "selection": row["履修制限"],
        "giga": True if row["GIGA"] == "対象" else False,
        "url": row["URL"],
        "summary": vectors[0],
        "goals": vectors[1],
        "schedule": vectors[2]
    }

    data_list.append(data)

res = client.insert(collection_name=collection_name, data=data_list)
print(res)

error 634
error 635
error 636
error 637
error 803
error 1489
error 1490
error 1491
error 1492
error 1688
{'insert_count': 1679, 'ids': [457043186518327296, 457043186518327297, 457043186518327298, 457043186518327299, 457043186518327300, 457043186518327301, 457043186518327302, 457043186518327303, 457043186518327304, 457043186518327305, 457043186518327306, 457043186518327307, 457043186518327308, 457043186518327309, 457043186518327310, 457043186518327311, 457043186518327312, 457043186518327313, 457043186518327314, 457043186518327315, 457043186518327316, 457043186518327317, 457043186518327318, 457043186518327319, 457043186518327320, 457043186518327321, 457043186518327322, 457043186518327323, 457043186518327324, 457043186518327325, 457043186518327326, 457043186518327327, 457043186518327328, 457043186518327329, 457043186518327330, 457043186518327331, 457043186518327332, 457043186518327333, 457043186518327334, 457043186518327335, 457043186518327336, 457043186518327337, 457043186518327338, 4570

In [8]:
# 検索クエリ
queries = ["微分積分"]
query_vectors = embedding_fn.encode_queries(queries)
res = client.search(
    collection_name=collection_name,
    anns_field="summary",
    data=query_vectors,
    limit=5,
    search_params={"metric_type": "COSINE"},
    output_fields=["subject_name", "url"],
)

for hits in res:
    for hit in hits:
        print(hit)

{'id': 457043186518328739, 'distance': 0.5229434967041016, 'entity': {'subject_name': '質量分析', 'url': 'https://syllabus.sfc.keio.ac.jp/courses/2024_25397?locale=ja'}}
{'id': 457043186518327666, 'distance': 0.5175166130065918, 'entity': {'subject_name': '統計基礎', 'url': 'https://syllabus.sfc.keio.ac.jp/courses/2024_25495?locale=ja'}}
{'id': 457043186518328099, 'distance': 0.4779036343097687, 'entity': {'subject_name': '研究会Ａ', 'url': 'https://syllabus.sfc.keio.ac.jp/courses/2024_46931?locale=ja'}}
{'id': 457043186518327339, 'distance': 0.4779036343097687, 'entity': {'subject_name': '研究会Ａ', 'url': 'https://syllabus.sfc.keio.ac.jp/courses/2024_46930?locale=ja'}}
{'id': 457043186518328740, 'distance': 0.4761720299720764, 'entity': {'subject_name': '生命分子構造', 'url': 'https://syllabus.sfc.keio.ac.jp/courses/2024_25398?locale=ja'}}
