In [1]:
import re
from collections import defaultdict
import numpy as np
import pandas as pd 

### Loading logs to Milvus

In [2]:
log_data = 'metadata_report.log.txt'

# Function to chunk the log data based on log level
def chunk_logs_by_level(file_path):
    """Group the lines of a log file by severity level.

    A line is kept when it contains `` - INFO - ``, `` - WARNING - `` or
    `` - ERROR - `` (any single whitespace character around the dashes);
    every other line is ignored.

    Args:
        file_path: Path of the log file to read.

    Returns:
        A ``defaultdict(list)`` mapping each level found to the matching
        lines, unmodified and in file order. It is empty when the file is
        missing, empty, or contains no matching line; in those cases a
        diagnostic message is printed instead of raising.
    """
    level_pattern = re.compile(r'\s-\s(INFO|WARNING|ERROR)\s-')
    grouped = defaultdict(list)
    try:
        with open(file_path, 'r') as handle:
            # Stays at 0 when the file yields no lines at all.
            total_lines = 0
            for total_lines, raw_line in enumerate(handle, start=1):
                found = level_pattern.search(raw_line)
                if found is not None:
                    grouped[found.group(1)].append(raw_line)
            if total_lines == 0:
                print("The file is empty.")
            elif not grouped:
                print("No log levels matched. Please check the regex and log format.")
    except FileNotFoundError:
        print("File not found. Please check the file path.")
    except Exception as e:
        # Best effort: report the problem and return whatever was collected.
        print(f"An error occurred: {e}")
    return grouped

In [3]:
# Usage of the function
logs_by_level = chunk_logs_by_level(log_data)

In [4]:
# Dump every captured line, one severity group after another.
if not logs_by_level:
    print("No data to display.")
else:
    for level, entries in logs_by_level.items():
        for entry in entries:
            # Each entry still carries its own trailing newline from the file.
            print(entry)
        # Extra blank lines separate one level's group from the next.
        print("\n")

2024-05-14 14:05:16,680 - INFO - Checking metadata quality for table: analyse risque

2024-05-14 14:05:16,693 - INFO - Status of table 'analyse risque' is ACTIVE.

2024-05-14 14:05:16,694 - INFO - Raw Data Size for table 'analyse risque': 4440

2024-05-14 14:05:16,694 - INFO - Number of Files for table 'analyse risque': 1

2024-05-14 14:05:16,694 - INFO - Classification Names for attribute 'date_creation' in table 'analyse risque': Secret

2024-05-14 14:05:16,695 - INFO - Classification Names for attribute 'niveau' in table 'analyse risque': Restreint

2024-05-14 14:05:16,695 - INFO - Classification Names for attribute 'description' in table 'analyse risque': Confidentiel

2024-05-14 14:05:16,695 - INFO - Classification Names for attribute 'type_risque' in table 'analyse risque': Confidentiel

2024-05-14 14:05:16,695 - INFO - Classification Names for attribute 'id' in table 'analyse risque': Restreint

2024-05-14 14:05:16,695 - INFO - Checking metadata quality for table: @annonces ata


In [5]:
import torch
from transformers import BertModel, BertTokenizer
from collections import defaultdict
import re

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load pre-trained model and tokenizer
# Both objects are bound to notebook-level names (tokenizer, model); the
# embedding helpers defined later read them as globals.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


#### Save chunks with their embeddings

In [23]:
from collections import defaultdict
import re
import torch
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

In [49]:
# Open the "default" connection to the Milvus server on localhost.
connections.connect("default", host="localhost", port="19530")

# Schema: a caller-supplied INT64 primary key, a 768-wide float vector and
# the text chunk the vector was computed from (VARCHAR, max_length 1024).
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768),
    FieldSchema(name="chunk", dtype=DataType.VARCHAR, max_length=1024)
]
schema = CollectionSchema(fields, "Log embeddings collection")

# Reuse the collection when it already exists; otherwise create it with the
# schema defined above.
collection_name = "log_embeddings"
if utility.has_collection(collection_name):
    collection = Collection(collection_name)
else:
    collection = Collection(collection_name, schema)

# Build an IVF_FLAT / L2 index on the vector field, then load the collection.
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 28}
}
collection.create_index(field_name="embedding", index_params=index_params)
collection.load()

In [50]:
def get_embedding_for_group(text):
    """Embed ``text`` with the notebook-level ``tokenizer`` and ``model``.

    The text is tokenized with truncation at 512 tokens and run through the
    model with gradient tracking disabled.

    Args:
        text: Input passed straight to ``tokenizer``.

    Returns:
        The slice ``last_hidden_state[:, 0, :]`` of the model output, i.e.
        the hidden state at the first token position of each sequence.
    """
    encoded = tokenizer(text, truncation=True, padding=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Keep only the first token position of every sequence in the batch.
    return hidden_states[:, 0, :]

def save_embeddings_to_milvus(embedding, chunk, id, max_chunk_bytes=1024):
    """Insert one (id, embedding, chunk) row into the notebook-level ``collection``.

    Args:
        embedding: Tensor-like object; ``embedding.squeeze().tolist()`` must
            yield the flat vector to store.
        chunk: Text stored next to the vector.
        id: Primary-key value for the row. The name shadows the builtin but
            is kept so existing callers keep working.
        max_chunk_bytes: Largest size, in UTF-8 bytes, the stored chunk may
            have. The default of 1024 mirrors ``max_length`` of the "chunk"
            VARCHAR field declared in the collection schema.

    Returns:
        The object returned by ``collection.insert``.
    """
    # Convert tensor to a plain list of floats.
    embedding_list = embedding.squeeze().tolist()

    # Milvus counts VARCHAR max_length in bytes, not characters, so text with
    # multi-byte characters (e.g. accented letters) must be cut on its
    # encoded form. errors="ignore" drops a character split by the cut.
    encoded_chunk = chunk.encode("utf-8")
    if len(encoded_chunk) > max_chunk_bytes:
        chunk = encoded_chunk[:max_chunk_bytes].decode("utf-8", errors="ignore")

    # Column-oriented payload: one single-element list per schema field.
    entities = [
        [id],               # List of IDs
        [embedding_list],   # List of embeddings
        [chunk],            # List of text chunks
    ]
    mr = collection.insert(entities)
    collection.flush()
    print(f"Embedding and chunk saved to Milvus with ID {id}.")
    return mr

def process_logs(file_path):
    """Embed the log text of each severity level and store it in Milvus.

    Lines containing `` - INFO - ``, `` - WARNING - `` or `` - ERROR - `` are
    stripped and concatenated (space separated) into one string per level.
    Each non-empty string is embedded with ``get_embedding_for_group`` and
    saved with ``save_embeddings_to_milvus`` under IDs 1, 2, ... in the order
    the levels first appear in the file.

    Args:
        file_path: Path of the log file to read.

    Returns:
        None. Progress and errors are reported with ``print``; a missing file
        or any other exception is reported and swallowed.
    """
    level_pattern = r'\s-\s(INFO|WARNING|ERROR)\s-'
    text_by_level = defaultdict(str)
    try:
        with open(file_path, 'r') as handle:
            for raw_line in handle:
                found = re.search(level_pattern, raw_line)
                if not found:
                    continue
                text_by_level[found.group(1)] += raw_line.strip() + " "

        # IDs are generated manually, counting only the levels that have text.
        populated = ((lvl, txt) for lvl, txt in text_by_level.items() if txt)
        for next_id, (level, text) in enumerate(populated, start=1):
            print(f"Processing {level} logs...")
            embedding = get_embedding_for_group(text)
            print(f"Generated embedding for {level} level with shape: {embedding.shape}")
            mr = save_embeddings_to_milvus(embedding, text, next_id)
            print(f"Insert result: {mr.insert_count}")

    except FileNotFoundError:
        print("File not found. Please check the file path.")
    except Exception as e:
        print(f"An error occurred: {e}")

def display_collection_data(limit=10):
    """Print up to ``limit`` rows stored in the notebook-level ``collection``.

    The previous implementation called ``collection.search`` with an empty
    vector list and without the required ``param`` argument, so it could not
    run. Listing stored rows needs no query vector, so a scalar ``query`` on
    the primary key is used instead.

    Args:
        limit: Maximum number of rows to fetch and print (default 10).

    Returns:
        None. The row count and each row (a dict with "id" and "chunk") are
        printed.
    """
    # The collection must be loaded before it can be queried.
    collection.load()
    # "id >= 0" matches every row written by process_logs, whose IDs start at 1.
    rows = collection.query(expr="id >= 0", output_fields=["id", "chunk"], limit=limit)
    print(f"Number of results in collection: {len(rows)}")
    for row in rows:
        print(row)

In [51]:
# Read and process the log file
process_logs(log_data)

Processing INFO logs...
Generated embedding for INFO level with shape: torch.Size([1, 768])
Embedding and chunk saved to Milvus with ID 1.
Insert result: 1
Embedding and chunk saved to Milvus with ID 2.
Insert result: 1
Processing ERROR logs...
Generated embedding for ERROR level with shape: torch.Size([1, 768])
Embedding and chunk saved to Milvus with ID 3.
Insert result: 1


### Query Testing

In [52]:
import os
import time

import google.generativeai as genai

# The API key is read from the environment so that it is never stored in the
# notebook or its version history.
# NOTE(review): a key was previously hard-coded in this cell; treat it as
# leaked and rotate it.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=_api_key)

In [53]:
# Generation settings handed to the Gemini model constructor below.
generation_config = dict(
    temperature=0.01,
    top_p=0.75,
    max_output_tokens=10000,
)

In [54]:
# One entry per harm category, every one of them with the "BLOCK_NONE" threshold.
safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

In [55]:
# Gemini client used by get_answer; built from the generation and safety
# settings defined in the previous cells.
gemini_model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [56]:
# Connect to Milvus
connections.connect(host='localhost', port='19530')

# Define collection name
collection_name = 'log_embeddings'

# Fail fast if the collection is missing; this cell never creates it.
if not utility.has_collection(collection_name):
    raise Exception(f"Collection {collection_name} does not exist.")

# Open a handle to the existing collection and bind it to the notebook-level
# name read by the query helpers (no load() call is made here).
collection = Collection(collection_name)

In [78]:
from IPython.display import display
from IPython.display import Markdown
import textwrap


def to_markdown(text):
  """Wrap ``text`` in a ``Markdown`` object rendered as a block quote.

  Every '•' is replaced by '  *', then each line (blank ones included) is
  prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  # The always-true predicate makes indent() prefix whitespace-only lines too.
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

In [57]:
# Load pre-trained model and tokenizer
# NOTE(review): this repeats the load done earlier in the notebook (same
# model_name, same from_pretrained calls). Running it rebinds the
# notebook-level tokenizer/model that get_answer reads as globals.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [81]:
def get_answer(query):
    """Answer ``query`` from the stored log chunks most similar to it.

    Steps:
        1. Embed the query with the notebook-level ``tokenizer`` / ``model``
           (hidden state at the first token position).
        2. Search the notebook-level Milvus ``collection`` for the 5 nearest
           vectors (L2 metric).
        3. Fetch the text of each hit with ``get_document_text_by_id``.
        4. Send the query plus the joined texts to ``gemini_model``.

    Args:
        query: Question to answer.

    Returns:
        The ``text`` of the Gemini response, or the string
        "No similar documents found." when the search yields no hit.
    """
    # Step 1: Generate embedding for the query
    inputs = tokenizer(query, truncation=True, padding=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy().astype(np.float32).tolist()

    # Debug: Print the embedding type and shape
    print(f"Embedding type: {type(embedding)}, Embedding length: {len(embedding)}")

    # Step 2: Perform similarity check in Milvus
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    results = collection.search(
        data=[embedding],
        anns_field='embedding',
        param=search_params,
        limit=5,
        expr=None
    )

    # Step 3: Extract the most similar document texts.
    # The search returns one hit list per query vector. That list can exist
    # yet be empty, in which case there is no context to send to the model.
    hits = results[0] if results else []
    if len(hits) == 0:
        return "No similar documents found."
    document_texts = [get_document_text_by_id(hit.id) for hit in hits]

    # Combine the texts of the retrieved documents
    combined_text = "\n".join(document_texts)

    # Step 4: Use the generative model to generate an answer
    template = f"Instruction:\n{query}\nINPUTDATA:{combined_text}\nResponse:\n"
    response = gemini_model.generate_content(template)

    # Extract the answer from the response
    return response.text

# Function to retrieve document text by ID from the Milvus collection
def get_document_text_by_id(doc_id):
    """Return the "chunk" text stored under primary key ``doc_id``.

    Args:
        doc_id: Value interpolated into the filter expression ``id == <doc_id>``.

    Returns:
        The "chunk" field of the first matching row, or the string
        "No document found for the given ID." when nothing matches.
    """
    # The collection is (re)loaded on every call before it is queried.
    collection.load()
    rows = collection.query(expr=f"id == {doc_id}", output_fields=["chunk"])
    if not rows:
        return "No document found for the given ID."
    return rows[0]["chunk"]

In [82]:
# Example usage: the answer text is kept in `answer` so that a later cell can
# render it; only a label is printed here.
query = "List of errors ?"
answer = get_answer(query)
print("Answer")

Embedding type: <class 'list'>, Embedding length: 768


In [84]:
to_markdown(answer)

> Let's break down these errors and warnings to understand what's going wrong in your system.
> 
> **Database and Table Structure Issues**
> 
> * **Invalid Database and Table Names:**
>     *  `Invalid database name: 'default bcp'` suggests a problem with spaces or special characters in the database name. Database names should typically adhere to strict naming conventions.
>     * `Invalid table name 'analyse risque'` and `Invalid table name '@annonces ata'` indicate similar issues with spaces and special characters in table names.
> 
> * **Table Structure Mismatches:**
>     * `Number of Attributes '0' for table '@annonces ata' does not match the actual number of columns '4'` and `Number of Attributes '3' for table 'personne' does not match the actual number of columns '6'` highlight discrepancies between the defined table structure (metadata) and the actual data within the tables. This can lead to data corruption or inability to access data correctly.
> 
> **Metadata and Data Quality Problems**
> 
> * **Missing Classification Names:** Warnings like `No classification names found for table analyse risque` suggest issues with data governance and security. Classification names (e.g., "Secret," "Confidential") are crucial for controlling access to sensitive data.
> 
> * **Metadata Quality Check Failures:**
>     * `Table 'analyse_risque' metadata quality check failed` indicates a broader problem with the table's metadata. This could be due to inconsistencies, missing information, or incorrect data types.
> 
> * **Inconsistent Timestamps:**
>     * `Create Time '2027-05-02 14:28:14' for table 'analyse risque' is in the future` and `lastAccessTime '2024-05-02 14:28:14' for table 'analyse risque' is before its Create Time '2027-05-02 14:28:14'` point to serious issues with how your system is handling time. This could be a configuration error or a bug in the software.
> 
> * **Invalid Data Values:**
>     * Errors like `Total Size for table 'analyse risque' has an invalid value: -4490` and `Number of Rows for table 'analyse risque' has an invalid value: -50` indicate corrupted or nonsensical data in your metadata. Negative sizes and row counts are impossible.
>     * The errors related to the 'page_view' table (`Total Size`, `Number of Rows`, `Raw Data Size`, `Number of Files` all having invalid values of 'N/A') suggest a potentially serious problem with this table. It might be completely inaccessible or corrupted.
> 
> **Recommendations**
> 
> 1. **Review Naming Conventions:**  Ensure your database and table names comply with your system's requirements, avoiding spaces and special characters.
> 
> 2. **Reconcile Table Structures:** Carefully examine the definitions of your tables ('@annonces ata' and 'personne') and make sure the number of attributes in the metadata matches the actual number of columns in the data.
> 
> 3. **Investigate Missing Classifications:** Implement or correct data classification procedures to ensure all sensitive data is appropriately tagged for security and compliance.
> 
> 4. **Address Metadata Quality:**  Thoroughly investigate the reasons behind the metadata quality check failure for the 'analyse_risque' table. This might involve correcting inconsistencies, adding missing information, or fixing incorrect data types.
> 
> 5. **Fix Timestamp Issues:**  Determine the root cause of the incorrect timestamps. This could involve checking system clocks, time zone settings, or investigating potential software bugs.
> 
> 6. **Handle Invalid Data:**  Address the invalid values for table sizes and row counts. This might require data restoration from backups, data repair tools, or a deeper analysis of the data corruption.
> 
> 7. **Focus on 'page_view' Table:** The 'page_view' table seems to have significant issues. Prioritize investigating and resolving the 'N/A' values, as this could indicate severe data corruption or inaccessibility.
> 
> These errors and warnings indicate potential data integrity and security risks. It's crucial to address them promptly to ensure the reliability and security of your data. 
