In [189]:
import requests
import markdown
from bs4 import BeautifulSoup
import pandas as pd
import google.generativeai as genai

In [190]:
md_list = ['https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/collections.md',
           'https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/explore.md',
           'https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/filtering.md',
           'https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/hybrid-queries.md',
           'https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/indexing.md',
           'https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/optimizer.md',
           'https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/payload.md',
           'https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/search.md',
           'https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/points.md',
           'https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/snapshots.md',
           'https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/storage.md',
           'https://raw.githubusercontent.com/qdrant/landing_page/master/qdrant-landing/content/documentation/concepts/vectors.md']

In [199]:
def extract_text_from_md(url):
    # Загрузка содержимого Markdown с URL
    response = requests.get(url)
    response.raise_for_status()  # Проверка на наличие ошибок при загрузке

    # Конвертация содержимого Markdown в HTML
    html_content = markdown.markdown(response.text)

    # Использование BeautifulSoup для извлечения текста
    soup = BeautifulSoup(html_content, features="html.parser")
    text = soup.get_text()

    # Удаление лишних пробелов и переносов строк
    cleaned_text = ' '.join(text.split())

    return cleaned_text

# Инициализация списка для хранения результатов
results = []

for url in md_list:
    text_only = extract_text_from_md(url)
    name = url.split('concepts/')[1].split('.md')[0]
    if name != 'collections':
        text_only = text_only.split('/'+name)[1]
    results.append({'name': name, 'text': text_only})

# Создание DataFrame из результатов
df = pd.DataFrame(results)

df.iloc[0, 1] = df['text'][0].split('/ Collections ')[1]

# Сохранение в файл CSV
df.to_csv('extracted_texts.csv', index=False)

# Вывод результатов
df

Unnamed: 0,name,text
0,collections,A collection is a named set of points (vectors...
1,explore,Explore the data After mastering the concepts...
2,filtering,"Filtering With Qdrant, you can set conditions..."
3,hybrid-queries,"hideInSidebar: false # Optional. If true, the..."
4,indexing,Indexing A key feature of Qdrant is the effec...
5,optimizer,Optimizer It is much more efficient to apply ...
6,payload,Payload One of the significant features of Qd...
7,search,Similarity search Searching for the nearest v...
8,points,Points The points are the central entity that...
9,snapshots,Snapshots Available as of v0.8.4 Snapshots ar...


## Разметим датасет

In [201]:
data = {
    "questions": [
        {
            "question": "What is a collection in the context of Qdrant?",
            "answer": "A collection is a named set of points (vectors with payload) among which search operations can be performed."
        },
        {
            "question": "What must be true about the dimensionality of vectors within a single collection?",
            "answer": "Vectors for each element within a single collection must have the same dimensionality and be compared using a selected metric."
        },
        {
            "question": "How does Qdrant support different metrics for comparing vectors?",
            "answer": "Qdrant supports various popular metrics such as dot product, cosine similarity, Euclidean distance, and Manhattan distance."
        },
        {
            "question": "When should multiple collections be created instead of just one?",
            "answer": "Multiple collections should be created when there is a limited number of users and isolation is necessary, though this can be more resource-intensive."
        },
        {
            "question": "What are some parameters that can be tuned for a collection?",
            "answer": "Parameters for tuning a collection include optimization settings, index building, and managing data write operations to disk."
        },
        {
            "question": "What happens if different types of vectors are used within a collection?",
            "answer": "A single collection can accommodate multiple vectors, each assigned a unique name, along with corresponding dimensionality and metric settings."
        },
        {
            "question": "How can the existence of a collection in Qdrant be checked?",
            "answer": "The existence of a collection can be verified using the GET method, typically through the path /collections/{collection_name}."
        },
        {
            "question": "What is the purpose of the payload in vectors?",
            "answer": "The payload in vectors allows for the inclusion of additional data associated with each point, which can be useful for filtering and returning relevant results."
        },
        {
            "question": "Can collections be updated after their creation?",
            "answer": "Yes, collections can be updated, including modifications to their parameters and settings as the needs of the application evolve."
        },
        {
            "question": "How does the choice of metric influence search results in Qdrant?",
            "answer": "The choice of metric directly affects how similarity between vectors is determined, impacting the results returned from search queries based on the geometric properties defined by the selected metric."
        }
    ]
}
pd.DataFrame(data['questions'])

Unnamed: 0,question,answer
0,What is a collection in the context of Qdrant?,A collection is a named set of points (vectors...
1,What must be true about the dimensionality of ...,Vectors for each element within a single colle...
2,How does Qdrant support different metrics for ...,Qdrant supports various popular metrics such a...
3,When should multiple collections be created in...,Multiple collections should be created when th...
4,What are some parameters that can be tuned for...,Parameters for tuning a collection include opt...
5,What happens if different types of vectors are...,A single collection can accommodate multiple v...
6,How can the existence of a collection in Qdran...,The existence of a collection can be verified ...
7,What is the purpose of the payload in vectors?,The payload in vectors allows for the inclusio...
8,Can collections be updated after their creation?,"Yes, collections can be updated, including mod..."
9,How does the choice of metric influence search...,The choice of metric directly affects how simi...


In [202]:
df_collection = pd.concat([pd.concat([df.head(1)] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [203]:
data = {
  "questions": [
    {
      "question": "What Stack of APIs does Qdrant provide for data exploration?",
      "answer": "Qdrant provides a stack of APIs that allow users to find similar and dissimilar vectors, useful for recommendation systems, data exploration, and data cleaning."
    },
    {
      "question": "How does the Recommendation API enhance the search functionality?",
      "answer": "The Recommendation API allows searching based on multiple positive and negative examples, leveraging either point IDs or raw vectors."
    },
    {
      "question": "What is the default strategy for recommendations in Qdrant?",
      "answer": "The default strategy is called 'average_vector,' which preprocesses input examples to create a single vector for search, ensuring fast performance."
    },
    {
      "question": "What is the purpose of the 'best_score' strategy introduced in Qdrant v1.6.0?",
      "answer": "The 'best_score' strategy finds similar vectors by measuring each candidate against positive and negative examples and selecting the best scores, balancing similarities to improve recommendations."
    },
    {
      "question": "How can users find the most dissimilar vectors using only negative examples?",
      "answer": "Users can employ the 'best_score' strategy with only negative examples, enabling them to identify outliers or the most dissimilar vectors in their datasets."
    },
    {
      "question": "What can be specified in the recommendation request when a collection is created with multiple vectors?",
      "answer": "The name of the specific vector to use for recommendation must be specified in the request using the 'using' parameter."
    },
    {
      "question": "What is the function of the 'lookup_from' parameter in the recommendation request?",
      "answer": "The 'lookup_from' parameter allows users to find recommendations in one collection based on vectors from another collection with the same dimensionality."
    },
    {
      "question": "How does the Discovery API differ from the Recommendation API?",
      "answer": "The Discovery API uses context (positive-negative pairs) to split the space and return points similar to a target while being constrained by this context, enabling more complex conditions for searching."
    },
    {
      "question": "What is the significance of using a context in Discovery search?",
      "answer": "Context allows operators to split space into positive and negative zones, influencing the search to favor points within these zones, enhancing the ability to apply complex conditions."
    },
    {
      "question": "Can the Distance Matrix API be used for clustering similar vectors?",
      "answer": "Yes, the Distance Matrix API is designed to compute distances between sampled pairs of vectors and return results in formats suitable for clustering and visualizing connections among data points."
    }
  ]
}

In [204]:
df_explore = pd.concat([pd.concat([df.iloc[[1]]] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [205]:
data = {
  "questions": [
    {
      "question": "What types of conditions can you set when filtering points in Qdrant?",
      "answer": "You can impose conditions on both the payload and the id of the point, allowing for business-specific filters like stock availability, user location, or desired price range."
    },
    {
      "question": "What logical operations are available when combining filtering conditions in Qdrant?",
      "answer": "Qdrant allows the use of logical operations such as OR, AND, and NOT, and these clauses can be recursively nested."
    },
    {
      "question": "What is the function of the 'must' clause in Qdrant filtering?",
      "answer": "'Must' requires every condition listed inside to be satisfied, functioning similarly to the AND operator."
    },
    {
      "question": "What does the 'should' clause do in Qdrant filtering?",
      "answer": "'Should' means the clause becomes true if at least one condition listed inside is satisfied, analogous to the OR operator."
    },
    {
      "question": "How does the 'must_not' clause work in Qdrant?",
      "answer": "'Must_not' becomes true if none of the conditions listed inside are satisfied, working like the expression (NOT A) AND (NOT B)."
    },
    {
      "question": "Can you filter using nested fields in Qdrant, and how?",
      "answer": "Yes, you can filter using nested fields with dot notation or by focusing on specific payload elements using nested object filters."
    },
    {
      "question": "What condition would you use to check if a field has multiple values in Qdrant?",
      "answer": "You would use the 'values_count' condition to filter by the amount of values stored within a field."
    },
    {
      "question": "How can you check if a field exists with no value in Qdrant?",
      "answer": "You can use the 'IsEmpty' condition to filter out records that have either a null value or an empty array."
    },
    {
      "question": "What is the purpose of the 'has_id' condition in Qdrant?",
      "answer": "The 'has_id' condition is used to filter results based on specific IDs, allowing users to focus on certain search results."
    },
    {
      "question": "How do you filter records using geographic conditions in Qdrant?",
      "answer": "Geographic conditions such as 'geo_bounding_box', 'geo_radius', and 'geo_polygon' allow for filtering points based on their geographical locations."
    }
  ]
}

In [206]:
df_filtering = pd.concat([pd.concat([df.iloc[[2]]] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [207]:
data = {
  "questions": [
    {
      "question": "What new feature was introduced in Qdrant version 1.10.0?",
      "answer": "The introduction of Hybrid and Multi-Stage Queries."
    },
    {
      "question": "What is the role of the prefetch parameter in Qdrant's Query API?",
      "answer": "It enables making sub-requests, allowing for the performance of prefetch queries before applying the main query over the results."
    },
    {
      "question": "What are the two fusion methods available in Qdrant for hybrid searches?",
      "answer": "Reciprocal Rank Fusion (rrf) and Distribution-Based Score Fusion (dbsf)."
    },
    {
      "question": "What is the benefit of using multi-stage queries in Qdrant?",
      "answer": "They allow for using a smaller vector representation to get a large list of candidates first, which are then re-scored using a larger, more accurate representation."
    },
    {
      "question": "How does one filter points in Qdrant queries based on payload values?",
      "answer": "By applying filters to the payload fields to only retrieve points that match the specified filter conditions."
    },
    {
      "question": "What is the purpose of the group_by field in Qdrant queries?",
      "answer": "It is used to group results by a certain field to avoid redundancy of items in the results."
    },
    {
      "question": "How can you reference a point ID from a different collection in Qdrant?",
      "answer": "By using the lookup_from parameter to specify the collection and the vector name of the other collection."
    },
    {
      "question": "Can you perform a query using a point ID as an input in Qdrant?",
      "answer": "Yes, you can use a point ID to fetch the default vector and use it as the query vector."
    },
    {
      "question": "What is the maximum number of groups you can limit a query to in Qdrant?",
      "answer": "The limit can be set to a specific number when querying groups, but the actual number varies based on the specific implementation in the query."
    },
    {
      "question": "What is the command for fetching results using the default vector from a point ID in Qdrant?",
      "answer": "You can use the command to perform a query with the specific point ID in the request body."
    }
  ]
}

In [208]:
df_hybrid = pd.concat([pd.concat([df.iloc[[3]]] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [209]:
data = {
  "questions": [
    {
      "question": "What is a key feature of Qdrant related to indexing?",
      "answer": "The effective combination of vector and traditional indexes."
    },
    {
      "question": "Why is a vector index not sufficient for effective filtering in Qdrant?",
      "answer": "A vector index alone cannot speed up filtering; it must be combined with payload indexes to enhance performance when also applying filters."
    },
    {
      "question": "What does the payload index in Qdrant resemble?",
      "answer": "It resembles the index in conventional document-oriented databases."
    },
    {
      "question": "How are indexes configured in Qdrant?",
      "answer": "The parameters of the indexes are configured for the whole collection and not all segments automatically have indexes."
    },
    {
      "question": "What types of payload column types are available in Qdrant for indexing?",
      "answer": "The available column types include keyword, integer, float, boolean, geo, datetime, text, and uuid."
    },
    {
      "question": "What is the purpose of the full-text index in Qdrant?",
      "answer": "The full-text index allows filtering points by the presence of a word or phrase in the payload field."
    },
    {
      "question": "What is the IDF modifier used for in Qdrant?",
      "answer": "The IDF modifier is used to consider how often an item occurs in a collection to rank search results based on the rarity of a word."
    },
    {
      "question": "What type of index does Qdrant use for efficient querying of similar vectors?",
      "answer": "Qdrant currently only uses HNSW (Hierarchical Navigable Small World Graph) as its dense vector index."
    },
    {
      "question": "What does the 'on_disk' parameter do when configuring indexes in Qdrant?",
      "answer": "The 'on_disk' parameter determines whether the index is stored on disk, which can save memory but may slow down search performance."
    },
    {
      "question": "What type of optimization does the tenant index allow in a multi-tenant scenario?",
      "answer": "The tenant index optimizes storage to allow for faster searching specific to tenant data by structuring data storage based on tenant identification."
    }
  ]
}

df_indexing = pd.concat([pd.concat([df.iloc[[4]]] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [210]:
data = {
  "questions": [
    {
      "question": "Why is it more efficient to apply changes in batches in Qdrant?",
      "answer": "It is more efficient because many other databases perform each change individually, which is less effective."
    },
    {
      "question": "What does Qdrant use to handle data changes during segment optimization?",
      "answer": "Qdrant wraps the segment into a proxy that transparently manages data changes."
    },
    {
      "question": "What is the purpose of the Vacuum Optimizer in Qdrant?",
      "answer": "The Vacuum Optimizer is used to remove accumulated deleted records in segments to prevent memory consumption and system slow down."
    },
    {
      "question": "What criteria determine when to trigger the Vacuum Optimizer?",
      "answer": "The criteria are defined in the configuration file, based on the fraction of deleted vectors and the minimum number of vectors in a segment."
    },
    {
      "question": "What does the Merge Optimizer do in Qdrant?",
      "answer": "The Merge Optimizer reduces the number of segments to improve search performance by merging small segments together."
    },
    {
      "question": "How does the Indexing Optimizer determine when to enable indexes in Qdrant?",
      "answer": "The Indexing Optimizer is triggered when the number of records exceeds a certain threshold, making indexes more efficient."
    },
    {
      "question": "What happens to segments larger than the specified memmap_threshold in Qdrant?",
      "answer": "Segments larger than the memmap_threshold will be stored as read-only memory-mapped files."
    },
    {
      "question": "What can the user configure in the Qdrant configuration file related to optimizers?",
      "answer": "The user can configure parameters such as deleted_threshold, vacuum_min_vector_number, default_segment_number, max_segment_size_kb, memmap_threshold, and indexing_threshold_kb."
    },
    {
      "question": "Why might a user choose to disable indexing during initial data loading in Qdrant?",
      "answer": "Disabling indexing during the upload process can save extra computational resources, allowing for a more efficient initial loading of points."
    },
    {
      "question": "What is the primary challenge faced by Qdrant regarding deleted records?",
      "answer": "The challenge is that deleted records accumulate over time, occupying memory and slowing down the system."
    }
  ]
}

df_optimizer = pd.concat([pd.concat([df.iloc[[5]]] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [211]:
data = {
  "questions": [
    {
      "question": "What is the term used in Qdrant for storing additional information along with vectors?",
      "answer": "The term is 'payload'."
    },
    {
      "question": "What data format does Qdrant allow for the representation of payload information?",
      "answer": "Qdrant allows any information that can be represented using JSON."
    },
    {
      "question": "What happens during filtering if the stored value type does not fit the filtering condition in Qdrant?",
      "answer": "It will be considered not satisfied, and you will get an empty output."
    },
    {
      "question": "What type of numbers does Qdrant support for integer values?",
      "answer": "Qdrant supports 64-bit integers in the range from -9223372036854775808 to 9223372036854775807."
    },
    {
      "question": "What are the two methods to update payloads mentioned in the text?",
      "answer": "The two methods are 'set payload' to update specific fields and 'overwrite' to replace the entire payload."
    },
    {
      "question": "Which method is used to remove all payload keys from specified points in Qdrant?",
      "answer": "The 'clear payload' method is used."
    },
    {
      "question": "What type of indexing does Qdrant allow for payload fields to improve search efficiency?",
      "answer": "Qdrant allows creating indexes for payload fields."
    },
    {
      "question": "What is faceting in the context of Qdrant, and what can it be used for?",
      "answer": "Faceting is a counting technique used for knowing unique values for a payload key, the number of points for each unique value, and understanding how restrictive a filter could be."
    },
    {
      "question": "What should you do before using faceting on a field in Qdrant?",
      "answer": "You should create a field index that supports MatchValue conditions for the field."
    },
    {
      "question": "What will the response contain when performing a facet count for a field?",
      "answer": "The response will contain the counts for each unique value in the field."
    }
  ]
}

df_payload = pd.concat([pd.concat([df.iloc[[6]]] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [212]:
data = {
  "questions": [
    {
      "question": "What is the primary function of the Qdrant Query API?",
      "answer": "The Qdrant Query API provides a single interface for all kinds of search and exploration requests."
    },
    {
      "question": "Which similarity search method is referred to as k-NN?",
      "answer": "Nearest Neighbors Search is also known as k-NN."
    },
    {
      "question": "What types of metrics does Qdrant support for estimating vector similarity?",
      "answer": "Qdrant supports Dot Product, Cosine Similarity, Euclidean Distance, and Manhattan Distance as metrics."
    },
    {
      "question": "What does the 'limit' parameter specify in a search query?",
      "answer": "The 'limit' parameter specifies the number of most similar results to retrieve."
    },
    {
      "question": "In which version of Qdrant is the batch search API available?",
      "answer": "The batch search API is available as of v0.10.0."
    },
    {
      "question": "How can you filter search results based on a specific payload key using the Qdrant search API?",
      "answer": "You can filter search results using the 'filter' parameter with conditions that specify the key and matching value."
    },
    {
      "question": "What is the default scoring metric for sparse queries in Qdrant?",
      "answer": "The default scoring metric for sparse queries is Dot Product."
    },
    {
      "question": "What is the purpose of the 'with_lookup' parameter in the grouping API?",
      "answer": "The 'with_lookup' parameter is used to bring information from points in another collection into each group."
    },
    {
      "question": "What does the 'offset' parameter do when searching with Qdrant?",
      "answer": "The 'offset' parameter allows for skipping a specified number of results, effectively enabling pagination."
    },
    {
      "question": "What is a unique feature of the random sampling API in Qdrant?",
      "answer": "The random sampling API allows retrieving a random sample of points from the collection for debugging, testing, or exploration."
    }
  ]
}

df_search = pd.concat([pd.concat([df.iloc[[7]]] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [213]:
data = {
  "questions": [
    {
      "question": "What is a point in Qdrant?",
      "answer": "A point is a record consisting of a vector and an optional payload."
    },
    {
      "question": "What identifier types does Qdrant support for points?",
      "answer": "Qdrant supports both 64-bit unsigned integers and UUID as identifiers for points."
    },
    {
      "question": "How can you modify a point in Qdrant?",
      "answer": "Point modification operations are asynchronous and take place in two steps, where the operation is first written to the Write-ahead-log."
    },
    {
      "question": "What types of vectors does Qdrant support?",
      "answer": "Qdrant supports Dense Vectors, Sparse Vectors, and MultiVectors."
    },
    {
      "question": "Can multiple types of vectors be attached to a single point in Qdrant?",
      "answer": "Yes, it is possible to attach more than one type of vector to a single point, referred to as Named Vectors."
    },
    {
      "question": "What does the batch loading feature in Qdrant do?",
      "answer": "The batch loading feature in Qdrant allows for loading several points into the service in one API call to optimize performance."
    },
    {
      "question": "What is the purpose of the update_vectors method in Qdrant?",
      "answer": "The update_vectors method updates the specified vectors on the given points while keeping unspecified vectors unchanged."
    },
    {
      "question": "How does Qdrant handle deleting vectors?",
      "answer": "Qdrant provides a method to delete just the specified vectors from the given points while keeping other vectors unchanged."
    },
    {
      "question": "What response is received when an API call is made with wait=false in Qdrant?",
      "answer": "The response indicates acknowledgment of receiving data, but it does not mean that the data is available for retrieval yet."
    },
    {
      "question": "What is the Scroll API used for in Qdrant?",
      "answer": "The Scroll API is used to retrieve all stored points without knowing their IDs or to iterate over points that correspond to a specific filter."
    }
  ]
}

df_point = pd.concat([pd.concat([df.iloc[[8]]] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [214]:
data = {
  "questions": [
    {
      "question": "What are snapshots in Qdrant?",
      "answer": "Snapshots are tar archive files that contain data and configuration of a specific collection on a specific node at a specific time."
    },
    {
      "question": "How are snapshots created in a distributed deployment?",
      "answer": "In a distributed deployment, you must create snapshots for each node separately when dealing with a single collection."
    },
    {
      "question": "What is the purpose of using snapshots?",
      "answer": "Snapshots can be used to archive data or easily replicate an existing deployment."
    },
    {
      "question": "What is the difference between snapshots and backups in Qdrant Cloud?",
      "answer": "Snapshots are used for archiving and replication, while backups are physical disk-level copies of the data preferred for disaster recovery."
    },
    {
      "question": "What does the API endpoint /collections/{collection_name}/snapshots do?",
      "answer": "This endpoint is used to create a new snapshot for an existing collection."
    },
    {
      "question": "What command is used to list snapshots for a collection?",
      "answer": "The command used is `http GET /collections/{collection_name}/snapshots`."
    },
    {
      "question": "What limitations exist when restoring snapshots?",
      "answer": "Snapshots generated in one Qdrant cluster can only be restored to other clusters that share the same minor version."
    },
    {
      "question": "How can you recover a snapshot from a URL?",
      "answer": "To recover from a URL, you use the endpoint `http PUT /collections/{collection_name}/snapshots/recover`, specifying the URL of the snapshot."
    },
    {
      "question": "What is the significance of snapshot priority during recovery?",
      "answer": "The snapshot priority setting controls how Qdrant handles conflicts between the snapshot data and existing data, with options for preference to existing data or snapshot data."
    },
    {
      "question": "Where are snapshots stored by default?",
      "answer": "By default, snapshots are stored at ./snapshots or at /qdrant/snapshots when using the Docker image."
    }
  ]
}

df_snapshot = pd.concat([pd.concat([df.iloc[[9]]] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [215]:
data = {
  "questions": [
    {
      "question": "What is the structure of data storage within a collection in Qdrant?",
      "answer": "All data within one collection is divided into segments, each with independent vector and payload storage, indexes, and an id mapper."
    },
    {
      "question": "What types of segments are there in Qdrant, and what operations can be performed on them?",
      "answer": "Segments can be appendable or non-appendable. In appendable segments, you can freely add, delete, and query data, while in non-appendable segments, you can only read and delete data."
    },
    {
      "question": "What is the difference between in-memory storage and memmap storage in Qdrant?",
      "answer": "In-memory storage stores all vectors in RAM and has the highest speed, while memmap storage uses a virtual address space associated with a file on disk, allowing for flexible memory use."
    },
    {
      "question": "What parameter is used to configure memmap storage for vectors during collection creation?",
      "answer": "The parameter to configure memmap storage is the `on_disk` option set to true during collection creation."
    },
    {
      "question": "What is the recommended approach for using memmap storage in Qdrant?",
      "answer": "It is recommended to create a collection with vectors stored in memmap storage if the Qdrant instance operates with fast disks and the collection is large."
    },
    {
      "question": "How does Qdrant handle the versioning of data and ensure data integrity?",
      "answer": "Qdrant performs data changes in two stages, first writing to a Write-ahead-log (WAL) to order operations, followed by storing changes in segments, which keeps the last version of each change."
    },
    {
      "question": "What types of payload storage does Qdrant support, and what are their characteristics?",
      "answer": "Qdrant supports InMemory and OnDisk payload storage. InMemory stores payloads in RAM for fast access, while OnDisk reads and writes payloads directly to RocksDB, requiring less RAM but with higher access latency."
    },
    {
      "question": "What should be done if large payload values are attached in Qdrant?",
      "answer": "If payload values are large, it is better to use OnDisk payload storage and consider creating a payload index for filtering conditions to avoid excessive disk access."
    },
    {
      "question": "What configuration parameter is used to specify the type of payload storage during collection creation?",
      "answer": "The desired type of payload storage can be specified with the `on_disk_payload` parameter during collection creation."
    },
    {
      "question": "What is the purpose of the memmap_threshold option in Qdrant?",
      "answer": "The memmap_threshold option sets the threshold after which the segment will be converted to memmap storage, optimizing performance based on usage scenarios."
    }
  ]
}

df_storage = pd.concat([pd.concat([df.iloc[[10]]] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [216]:
data = {
  "questions": [
    {
      "question": "What are vectors in the context of Qdrant Vector Search engine?",
      "answer": "Vectors (or embeddings) are the core concept of the Qdrant Vector Search engine. Vectors define the similarity between objects in the vector space."
    },
    {
      "question": "How does a vector representation relate to the similarity of objects?",
      "answer": "If a pair of vectors are similar in vector space, it means that the objects they represent are similar in some way."
    },
    {
      "question": "What types of vectors does Qdrant support?",
      "answer": "Qdrant supports dense vectors, sparse vectors, multivectors, and named vectors, among others."
    },
    {
      "question": "What is the difference between dense vectors and sparse vectors?",
      "answer": "Dense vectors are simple lists of numbers with fixed length, while sparse vectors contain many zeros and therefore have a dynamic length during insertion."
    },
    {
      "question": "What configuration must be set to create a collection with sparse vectors in Qdrant?",
      "answer": "To create a collection with sparse vectors, you need to provide a list of non-zero elements and their indexes, such as: { 'indexes': [1, 3, 5, 7], 'values': [0.1, 0.2, 0.3, 0.4] }."
    },
    {
      "question": "What are multivectors and what scenarios are they useful in?",
      "answer": "Multivectors are used to store a variable amount of the same-shaped dense vectors in a single point, useful for multiple representations of the same object or late interaction embeddings."
    },
    {
      "question": "What datatype is used by default for vectors in Qdrant?",
      "answer": "The default datatype for vectors in Qdrant is Float32."
    },
    {
      "question": "How does Qdrant optimize memory usage for large-dimensionality vectors?",
      "answer": "Qdrant supports different datatypes such as Float16 and Uint8 to optimize memory usage while maintaining precision."
    },
    {
      "question": "What is the purpose of quantization in Qdrant?",
      "answer": "Quantization is used to create quantized representations of vectors alongside the original ones to enable quicker candidate selection for rescoring with the original vectors."
    },
    {
      "question": "What trade-offs must be considered when using different storage options in Qdrant?",
      "answer": "You will have to trade off between search speed and the size of RAM used when choosing storage options in Qdrant."
    }
  ]
}

df_vectors = pd.concat([pd.concat([df.iloc[[11]]] * len(pd.DataFrame(data['questions'])), ignore_index=True),
           pd.DataFrame(data['questions'])], axis=1)

In [217]:
df = pd.concat([df_collection,
               df_explore,
               df_filtering,
               df_hybrid,
               df_hybrid,
               df_optimizer,
               df_payload,
               df_search,
               df_point,
               df_snapshot,
               df_storage,
               df_vectors])

In [218]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,name,text,question,answer
0,collections,A collection is a named set of points (vectors...,What is a collection in the context of Qdrant?,A collection is a named set of points (vectors...
1,collections,A collection is a named set of points (vectors...,What must be true about the dimensionality of ...,Vectors for each element within a single colle...
2,collections,A collection is a named set of points (vectors...,How does Qdrant support different metrics for ...,Qdrant supports various popular metrics such a...
3,collections,A collection is a named set of points (vectors...,When should multiple collections be created in...,Multiple collections should be created when th...
4,collections,A collection is a named set of points (vectors...,What are some parameters that can be tuned for...,Parameters for tuning a collection include opt...


In [230]:
df = df.replace(';', ',', regex=True)

df[['question', 'answer']].to_csv('texts_with_answers.csv', sep = ',', index=False)