## connect to weaviate

In [1]:
import weaviate
import os
from dotenv import load_dotenv
load_dotenv(override=True  )
headers = {
    "X-JinaAI-Api-Key": os.getenv("JINAAI_API_KEY")
}

client = weaviate.connect_to_local(headers=headers)



## Check existing collections

In [2]:
## get all collections
client.collections.list_all()

{'Business_data_collection': _CollectionConfigSimple(name='Business_data_collection', description=None, generative_config=None, properties=[_Property(name='text', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-jinaai': _PropertyVectorizerConfig(skip=False, vectorize_property_name=False)}), _Property(name='source', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-jinaai': _PropertyVectorizerConfig(skip=False, vectorize_property_name=False)})], references=[], reranker_config=_RerankerConfig(model={'model': 'jina-reranker-v2-base-multilingual'}, reranker=<Rerankers.J

In [None]:
## connect single collection
collection = client.collections.get("DemoCollection")
print("config of collection:",collection.config.get())
print("properties :",collection.config.get().properties)

config of collection: _CollectionConfig(name='DemoCollection', description=None, generative_config=None, inverted_index_config=_InvertedIndexConfig(bm25=_BM25Config(b=0.75, k1=1.2), cleanup_interval_seconds=60, index_null_state=False, index_property_length=False, index_timestamps=False, stopwords=_StopwordsConfig(preset=<StopwordsPreset.EN: 'en'>, additions=None, removals=None)), multi_tenancy_config=_MultiTenancyConfig(enabled=False, auto_tenant_creation=False, auto_tenant_activation=False), properties=[], references=[], replication_config=_ReplicationConfig(factor=1, async_enabled=False, deletion_strategy=<ReplicationDeletionStrategy.NO_AUTOMATED_RESOLUTION: 'NoAutomatedResolution'>), reranker_config=None, sharding_config=_ShardingConfig(virtual_per_physical=128, desired_count=1, actual_count=1, desired_virtual_count=128, actual_virtual_count=128, key='_id', strategy='hash', function='murmur3'), vector_index_config=None, vector_index_type=None, vectorizer_config=None, vectorizer=None

## Create connection
Weaviate follows GraphQL naming conventions. 
- Start collection names with an upper case letter.
- Start property names with a lower case letter.

N.T: if mismatch GraphQL format, weaviate chates it internally.

### Production ready collections
1. **Manually define you data schema:-** Avoid using the auto-schema feature, instead, manually define the properties for your collection.
2. **Avoid creating too many collections:-** Using too many collections can lead to scalability issues like high memory usage and degraded query performance. Instead, consider using multi-tenancy, where a single collection is subdivided into multiple tenants.

In [None]:
import weaviate.classes.config as wc
collection  = client.collections.create(
    name="DemoCollection1",
    properties=[
            wc.Property(name="name", data_type=wc.DataType.TEXT),
            wc.Property(name="age", data_type=wc.DataType.INT),
            wc.Property(name="email", data_type=wc.DataType.TEXT),
        ],
    vector_config=wc.Configure.Vectors.text2vec_transformers(
        name="text_vector",
        source_properties=["text"]
    )
    )
collection.config.get()

_CollectionConfig(name='DemoCollection1', description=None, generative_config=None, inverted_index_config=_InvertedIndexConfig(bm25=_BM25Config(b=0.75, k1=1.2), cleanup_interval_seconds=60, index_null_state=False, index_property_length=False, index_timestamps=False, stopwords=_StopwordsConfig(preset=<StopwordsPreset.EN: 'en'>, additions=None, removals=None)), multi_tenancy_config=_MultiTenancyConfig(enabled=False, auto_tenant_creation=False, auto_tenant_activation=False), properties=[_Property(name='name', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-transformers': _PropertyVectorizerConfig(skip=False, vectorize_property_name=False)}), _Property(name='age', description=None, data_type=<DataType.INT: 'int'>, index_filterable=True, index_range_filters=False, index_searchabl

In [23]:
### Add a new property to the collection
import weaviate.classes.config as wc
collection.config.add_property(
    wc.Property(
        name="new_",
        data_type=wc.DataType.TEXT,
        description="A new property added to the collection"
    )
)
print("properties :",collection.config.get().properties)

properties : [_Property(name='name', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-transformers': _PropertyVectorizerConfig(skip=False, vectorize_property_name=False)}), _Property(name='age', description=None, data_type=<DataType.INT: 'int'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_properties=None, tokenization=None, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-transformers': _PropertyVectorizerConfig(skip=False, vectorize_property_name=False)}), _Property(name='email', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectoriz

In [None]:
#### update the collection definition

import weaviate.classes.config as wc

articles = client.collections.get("DemoCollection1")

# Update the collection definition
articles.config.update(
    description="An updated collection description.",
    # property_descriptions={
    #     "title": "The updated title description for article",
    # },  # Available from Weaviate v1.31.0
    inverted_index_config=wc.Reconfigure.inverted_index(bm25_b=0.75,  
        bm25_k1=1.5,  
    ),
 
    vector_config=wc.Reconfigure.Vectors.update(
        name="text_vector",
        vector_index_config=wc.Reconfigure.VectorIndex.hnsw(
            filter_strategy=wc.VectorFilterStrategy.ACORN  # Available from Weaviate v1.27.0
        ),
    ),
    replication_config=wc.Reconfigure.replication(
        deletion_strategy=wc.ReplicationDeletionStrategy.TIME_BASED_RESOLUTION  # Available from Weaviate v1.28.0
    ),
)


In [None]:
articles = client.collections.get("DemoCollection1")

article_shards = articles.config.update_shards(
    status="READY",
    # shard_names=["shard_names"],  # The names (List[str]) of the shard to update (or a shard name)
)
print(article_shards)

{'36h64iLx7vL9': 'READY'}


## Delete Collections

In [34]:
## single collection delete
client.collections.delete("DemoCollection1")
## delete all collections
client.collections.delete_all()

In [35]:
client.collections.list_all()

{}

## test weaviate dataset by viewing

In [15]:
import weaviate
import os
from dotenv import load_dotenv
load_dotenv(override=True  )
headers = {
    "X-JinaAI-Api-Key": os.getenv("JINAAI_API_KEY")
}

client = weaviate.connect_to_local(headers=headers)

In [17]:
client.collections.list_all()

{'Business_data_collection': _CollectionConfigSimple(name='Business_data_collection', description=None, generative_config=None, properties=[_Property(name='text', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-jinaai': _PropertyVectorizerConfig(skip=False, vectorize_property_name=False)}), _Property(name='source', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-jinaai': _PropertyVectorizerConfig(skip=False, vectorize_property_name=False)})], references=[], reranker_config=_RerankerConfig(model={'model': 'jina-reranker-v2-base-multilingual'}, reranker=<Rerankers.J

In [19]:
collection_name= "Business_data_collection1" 
collection = client.collections.get(collection_name)
collection.config.get()

            Please make sure to close the connection using `client.close()`.
  for group in groupby(strings, lambda s: s[0] == first[0])) \


UnexpectedStatusCodeError: Collection configuration could not be retrieved.! Unexpected status code: 404, with response body: None.

In [18]:
import json
collection_name= "Business_data_collection1"  # Replace with your actual collection name
# Replace "YourCollection" with the actual name of your collection
collection = client.collections.get(collection_name)

# Iterate through all objects in the collection
print(f"Retrieving all objects from collection: {collection_name}")
for item in collection.iterator(include_vector=True):
    print(json.dumps(item.properties, indent=2))
    # Check if vector is included
    # if item.vector:
    #     print(f"  Vector: {item.vector[:5]}...") # print first 5 elements of vector
print("Finished retrieving objects.")

Retrieving all objects from collection: Business_data_collection1


WeaviateQueryError: Query call with protocol GRPC search failed with message could not find class Business_data_collection1 in schema.

In [6]:
collection.config.get()

_CollectionConfig(name='Business_data_collection', description=None, generative_config=None, inverted_index_config=_InvertedIndexConfig(bm25=_BM25Config(b=0.75, k1=1.2), cleanup_interval_seconds=60, index_null_state=False, index_property_length=False, index_timestamps=False, stopwords=_StopwordsConfig(preset=<StopwordsPreset.EN: 'en'>, additions=None, removals=None)), multi_tenancy_config=_MultiTenancyConfig(enabled=False, auto_tenant_creation=False, auto_tenant_activation=False), properties=[_Property(name='text', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-jinaai': _PropertyVectorizerConfig(skip=False, vectorize_property_name=False)}), _Property(name='source', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_s

In [11]:
import weaviate
import os
from weaviate.classes.query import MetadataQuery
from weaviate.classes.query import Rerank
response = collection.query.near_text(
    query="MechanicsPOM cove",
    limit=5,
    rerank=Rerank(
        prop="text",
    ),
    return_metadata=MetadataQuery(distance=True),
    return_properties=["text","source"],
)
for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)
    print(o.metadata.rerank_score)

{'text': '## 8photonics pricelist for Sevensix\n\nlogo\n\n<!-- image -->\n\nlogo\n\n<!-- image -->\n\nother\n\n<!-- image -->\n\nlogo\n\n<!-- image -->\n\nlogo\n\n<!-- image -->\n\nlogo\n\n<!-- image -->\n\nlogo\n\n<!-- image -->\n\nlogo\n\n<!-- image -->\n\nother\n\n<!-- image -->\n\nlogo\n\n<!-- image -->\n\nlogo\n\n<!-- image -->\n\nother\n\n<!-- image -->\n\nother\n\n<!-- image -->\n\nother\n\n<!-- image -->\n\n| Category                                                        | Short Description  SKU   | List Price End Customer   | Sevensix Price (-20%)   |\n|-----------------------------------------------------------------|--------------------------|---------------------------|-------------------------|\n| Mechanics Small Grid-Kit                                        | GKITS                    | EUR 773.00                | EUR 618.40              |\n| Mechanics  Medium Grid-Kit                                      | GKITM                    | EUR 1’610.00              | EUR 1’28

In [None]:
{
  "metadata": {
    "history": {
      "value": "平成11年11月、科学技術振興事業団（現：国立研究開発法人科学技術振興機構）の新規事業志向型研究開発成果展開事業（プレ・ベンチャー事業）の研究課題に選出され、実用化を目指して平成14年4月に法人化。以降、複数回の本店移転や代表者交代、社名変更（光コム→XTIA→OptoComb）を経て現在に至る。",
      "confidence": null,
      "chunk_references": [
        "fef25e10-aa5f-4b7c-9cd7-a91a8f844a17",
        "56ecb6ad-7305-4412-b4c2-4fecc9e68e6c",
        "aaf96f59-d4d6-41f7-997d-229df01b19ec"
      ]
    },
    "sales_trends": {
      "value": "近年は研究開発投資を優先し、毎期多額の赤字を計上している。令和1年6月期から令和5年6月期までの当期純損益は連続して赤字（例：令和5年6月期△338,712千円）。売上高や業績の詳細は非開示だが、資金調達により資金繰りは維持されている。",
      "confidence": null,
      "chunk_references": [
        "8e71546d-ea28-43b2-9c53-3e69a32e5f90",
        "60ea9de4-58f3-43f1-a1b3-aa32294210a1",
        "bd456916-9c60-47e6-bce2-9addf7315caf"
      ]
    },
    "group_companies": {
      "value": "",
      "confidence": null,
      "chunk_references": []
    },
    "company_overview": {
      "value": "株式会社OptoCombは、光コム技術を用いた各種測定装置・検査装置（ハードウェア）の企画・開発・製造・販売、並びにソフトウェアの開発・販売を手掛けている企業である。光コムを用いた測定装置・検査装置類は、主に自動車や輸送機器等の製造ラインに組み込まれるほか、装置単独での使用も可能である。世界で唯一この技術を産業応用に成功したとされ、国内外の大手自動車メーカー等が主要顧客である。",
      "confidence": null,
      "chunk_references": [
        "abd21c67-cd3d-4dc6-ad5e-c00f31e9c338",
        "b97d0eba-efe0-430e-949f-2ede568c771d",
        "34c550f4-6b3f-4ccb-b22a-c2e7657a7b82"
      ]
    },
    "organization_name": {
      "value": "株式会社OptoComb",
      "confidence": null,
      "chunk_references": [
        "646851b3-4043-4608-ac8d-12d33e42582a",
        "412f013d-f1cb-4d7f-a5e3-8202f2e1b5f8"
      ]
    },
    "past_transactions": {
      "value": "令和2年2月にニコン（8億円）、INCJ（6億円）、JUKI（2億円）、双日（1億円）の4社と業務提携を合意し、各社を引受先とした第三者割当増資を実施。同年9月に双日を引受先とする第三者割当で4億円の増資。令和5年9月には第三者割当増資で総額約4.8億円を調達。令和6年3月にも増資で約1億円を調達した見込み。",
      "confidence": null,
      "chunk_references": [
        "367e40e8-9c88-4ad3-b895-1723d7344a9e",
        "39a65c3e-a0e9-48e4-9ac4-1543489dc0c6"
      ]
    },
    "interview_articles": {
      "value": "",
      "confidence": null,
      "chunk_references": []
    },
    "business_activities": {
      "value": "光コム技術を用いた測定装置・検査装置の企画・開発・製造・販売、及び関連ソフトウェアの開発・販売。主な製品は「OptoComb Sensors」「OptoComb SA」「OptoComb HS」「OptoComb ATC S」「OptoComb ATC R」など。顧客の用途に合わせたカスタム品が主体で、一部汎用品もある。自動車・輸送機器、精密・医療機器業界向けが中心。",
      "confidence": null,
      "chunk_references": [
        "abd21c67-cd3d-4dc6-ad5e-c00f31e9c338",
        "378a31c5-5efa-44a7-997f-a68cfcec3a26",
        "3ab6fdeb-d5e3-4c24-9f89-e2840ebcb06e"
      ]
    }
  },
  "extraction": {
    "organization_name": "株式会社OptoComb",
    "company_overview": "株式会社OptoCombは、光コム技術を用いた各種測定装置・検査装置（ハードウェア）の企画・開発・製造・販売、並びにソフトウェアの開発・販売を手掛けている企業である。光コムを用いた測定装置・検査装置類は、主に自動車や輸送機器等の製造ラインに組み込まれるほか、装置単独での使用も可能である。世界で唯一この技術を産業応用に成功したとされ、国内外の大手自動車メーカー等が主要顧客である。",
    "business_activities": "光コム技術を用いた測定装置・検査装置の企画・開発・製造・販売、及び関連ソフトウェアの開発・販売。主な製品は「OptoComb Sensors」「OptoComb SA」「OptoComb HS」「OptoComb ATC S」「OptoComb ATC R」など。顧客の用途に合わせたカスタム品が主体で、一部汎用品もある。自動車・輸送機器、精密・医療機器業界向けが中心。",
    "sales_trends": "近年は研究開発投資を優先し、毎期多額の赤字を計上している。令和1年6月期から令和5年6月期までの当期純損益は連続して赤字（例：令和5年6月期△338,712千円）。売上高や業績の詳細は非開示だが、資金調達により資金繰りは維持されている。",
    "past_transactions": "令和2年2月にニコン（8億円）、INCJ（6億円）、JUKI（2億円）、双日（1億円）の4社と業務提携を合意し、各社を引受先とした第三者割当増資を実施。同年9月に双日を引受先とする第三者割当で4億円の増資。令和5年9月には第三者割当増資で総額約4.8億円を調達。令和6年3月にも増資で約1億円を調達した見込み。",
    "interview_articles": "",
    "history": "平成11年11月、科学技術振興事業団（現：国立研究開発法人科学技術振興機構）の新規事業志向型研究開発成果展開事業（プレ・ベンチャー事業）の研究課題に選出され、実用化を目指して平成14年4月に法人化。以降、複数回の本店移転や代表者交代、社名変更（光コム→XTIA→OptoComb）を経て現在に至る。",
    "group_companies": ""
  }
}