In [1]:
import json
import re
import os
import ast
import hashlib
import weaviate
from tqdm.asyncio import tqdm_asyncio
import asyncio
from weaviate.classes.init import Auth
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Tuple
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
import time
import streamlit as st
from sentence_transformers import SentenceTransformer

In [2]:
def load_model():
    return SentenceTransformer("JALLAJ/5epo")


embed_model = load_model()


def split_text(md_text, file_path):

    text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
        ("#", 1), ("##", 2), ("###", 3)
    ])

    chunks = text_splitter.split_text(md_text)

    splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=400)
    final_chunks = []
    for chunk in chunks:
        text = chunk.page_content
        if len(text) > 2500:
            final_chunks.extend(splitter.split_text(text))
        else:
            final_chunks.append(chunk.page_content)

    return final_chunks


def text_hash(text):
    return hashlib.sha256(text.encode('utf-8')).hexdigest()
    

def merge_json_files(folder_path):
    merged_data = {}

    for filename in os.listdir(folder_path):

        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                    merged_data.update(data)
            except Exception as e:
                print(f"Error processing file {filename}: {e}")

    return merged_data

folder_path = "papers" 
merged_data = merge_json_files(folder_path)

all_documents = []

for real_title, data in merged_data.items():
    text = data.get("text", "")
    references = data.get("references", [])
    figs = data.get("figs", [])

    ref_dict = {str(k): v for ref in references for k, v in ref.items()}
    chunks = split_text(text, real_title)

    for chunk in chunks:
        all_documents.append(Document(
            page_content=chunk,
            metadata={
                "title": real_title,
                "references": ref_dict,
                "figs": figs
            }
        ))

text_data = []
text_dict = {}

for doc_id, chunk in enumerate(all_documents):
    text = chunk.page_content
    id = doc_id
    text_data.append({
        "text": text,
        "id": id,
    })
    key = text_hash(text)
    text_dict[key] = id

In [3]:
def is_title(line):
    return re.match(r'^\s*#{1,6}\s+', line)


def is_non_empty_non_title(line):
    return line.strip() and not is_title(line)


def text_split(text):
    lines = text.splitlines()
    blocks = []
    buffer = []          
    title_buffer = []    

    for line in lines:
        if is_title(line):
            if buffer:
                blocks.append('\n'.join(buffer).strip())
                buffer = []
            title_buffer.append(line)
        elif is_non_empty_non_title(line):
            buffer.extend(title_buffer)
            title_buffer = []
            buffer.append(line)
        else:
            if title_buffer:
                title_buffer.append(line)
            else:
                buffer.append(line)
                
    if title_buffer:
        buffer.extend(title_buffer)
    if buffer:
        blocks.append('\n'.join(buffer).strip())

    return blocks


def parse_ref_block(block: str) -> set:
    refs = set()
    parts = re.split(r"[,\s]+", block.strip())
    for part in parts:
        if "-" in part:
            try:
                start, end = map(int, part.split("-"))
                refs.update(range(start, end + 1))
            except:
                continue
        else:
            try:
                refs.add(int(part))
            except:
                continue
    return refs


def extract_references_from_chunk(doc: Document) -> Dict[str, str]:
    text = doc.page_content
    references = doc.metadata.get("references", {})
    ref_blocks = re.findall(r"\[([\d,\-\s]+)\]", text)

    all_refs = set()
    for block in ref_blocks:
        all_refs.update(parse_ref_block(block))

    all_refs = sorted(all_refs)
    matched_refs = {str(ref): references.get(str(ref)) for ref in all_refs}
    return matched_refs

In [15]:
def number_documents(doc_list):
    return {i + 1: doc for i, doc in enumerate(doc_list)}


def retriever(docs, wcd_url, wcd_api_key, huggingface_key, query, limit=5):
    query_vec = embed_model.encode(query, normalize_embeddings=True).tolist()
    response = docs.query.hybrid(
        query=query,
        vector=query_vec,
        alpha=0.3,
        limit=limit,
    )

    retrieved_docs = []
    for obj in response.objects:
        content = obj.properties.get("text", "")
        if content:
            retrieved_docs.append(content)

    return retrieved_docs


def extract_indices_from_output(output_text):  
    try:
        start = output_text.find('[')
        end = output_text.find(']', start) + 1
        list_str = output_text[start:end]
        return ast.literal_eval(list_str)
    except Exception as e:
        print("Failed to parse indices:", e)
        return []


def generate_with_context(llm, question, retrieved_docs):
    """
    用检索到的文档生成基础回答
    """
    contexts = number_documents(retrieved_docs)

    template = """
You are a professional expert in optical frequency combs.

You are asked the following question:
{question}

You are also given a set of retrieved documents. 
Use these documents as helpful context, but also draw on your own knowledge to provide a complete, accurate, and nuanced answer. 
But DO NOT output the original text and references(like[1]) from this contexts directly.
Generate a introduction first, than answer the question point by point.
You MUST use appropriate headings and subheadings (label '#' and serial numbers) to better clarify this question. 

Retrieved documents:
{contexts}

Now provide your answer:
    """
    prompt = ChatPromptTemplate.from_template(template)
    formatted_prompt = prompt.format_messages(
        question=question,
        contexts=contexts
    )
    response = llm.invoke(formatted_prompt)
    return response.content


def rerank(llm, paragraph, docs):
    contexts = number_documents(docs)

    template = '''
You are a professional expert in optical frequency combs.

You are given a paragraph that is part of a technical answer with a heading or subheading. 

Your task is to identify which documents are most relevant to supporting or enriching the heading and the content of this paragraph.

You will also be provided with a list of candidate documents. Each document has a number indicating its position in the list.

Select the **top 3 documents** that are most relevant to the given paragraph, considering their technical alignment, contextual consistency, and potential to enhance or validate the claims made.

Only output the indices (starting from 1) of the most relevant documents in a **Python-style list**, such as: [2, 4, 5].

Do not explain your choices.
    
Paragraph: {answer}

Contexts: {contexts}
    '''
    prompt = ChatPromptTemplate.from_template(template)
    formatted_prompt = prompt.format_messages(answer=paragraph, contexts=contexts)
    response = llm.invoke(formatted_prompt)

    nums = extract_indices_from_output(response.content)
    docs_out = [contexts[i] for i in nums if i in contexts]

In [12]:
def final_retriever(llm, question, wcd_url, wcd_api_key, huggingface_key, limit=7):

    headers = {
        "X-HuggingFace-Api-Key": huggingface_key,
    }
    
    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=wcd_url,                      
        auth_credentials=Auth.api_key(wcd_api_key),      
        headers=headers
    )

    client.connect()
    docs = client.collections.get("Chunk2")

    retrieved_docs = retriever(docs, wcd_url, wcd_api_key, huggingface_key, question, limit=limit)
    text = generate_with_context(llm, question, retrieved_docs)
    
    blocks = text_split(text)

    doc_list = []

    for i, doc in enumerate(blocks):
        retrieved_docs = []
        query_vec = embed_model.encode(doc, normalize_embeddings=True).tolist()
        response = docs.query.hybrid(
            query=doc,
            vector=query_vec,
            alpha=0.3,
            limit=limit,
        )

        for obj in response.objects:
            all_response = json.dumps(obj.properties, indent=2)
            all_response = json.loads(all_response)
            content = all_response.get("text", "")
            retrieved_docs.append(content)
            
        doc_list.append({
            "answer": doc,
            "docs": retrieved_docs,
        })

    client.close()

    return doc_list, text


def number_extraction(text):
    # === 提取 Numbers of the useful contexts（增强冒号容错） ===
    contexts_match = re.search(
        r"\*{0,2}Numbers of the useful contexts\*{0,2}\s*[:：]?\s*(\[[^\]]*\])",
        text,
        re.IGNORECASE
    )
    try:
        contexts = ast.literal_eval(contexts_match.group(1)) if contexts_match else []
    except Exception as e:
        print(f"[number_extraction] context list parse error: {e}")
        contexts = []

    # === 提取 Analyze 块（支持加粗、空格、大小写、冒号容错）===
    analyze_match = re.search(
        r"\*{0,2}Analyze\*{0,2}\s*[:：]\s*\n((?:.|\n)*?)\s*$",
        text.strip(),
        re.IGNORECASE | re.DOTALL
    )
    analyze_block = analyze_match.group(1).strip() if analyze_match else ""

    # === 提取每条 Context 分析项（增强格式兼容性）===
    analyze_items = []
    if analyze_block:
        pattern = (
            r"(?:-?\s*)?"                 # 可选的 `- ` 开头
            r"(?:\*\*)?"                  # 可选的 `**`
            r"Context\s+(\d+)"            # 匹配 `Context 1`
            r"(?:\*\*)?"                  # 可选的 `**`
            r"\s*[:：]?\s*"               # 冒号可选，前后有空格
            r"(.*?)(?="                   # 匹配内容直到下一个 Context 或结尾
            r"(?:\n(?:-?\s*)?(?:\*\*)?Context\s+\d+(?:\*\*)?\s*[:：]?)|\Z)"
        )
        matches = re.findall(pattern, analyze_block, re.IGNORECASE | re.DOTALL)
        clean_content = lambda c: re.sub(r'^(\*+)', '', c.strip())
        analyze_items = [f"Context {num}: {clean_content(content)}" for num, content in matches]

    # === 清除原始块 ===
    if contexts_match:
        text = text.replace(contexts_match.group(0), '')
    if analyze_match:
        text = text.replace(analyze_match.group(0), '')

    cleaned_text = re.sub(r'\n{2,}', '\n\n', text).strip()

    return cleaned_text, contexts, analyze_items

In [17]:
from langchain_openai import ChatOpenAI

os.environ["OPENAI_API_KEY"] = "sk-8eca9400641949349ba17730270b006b"
llm = ChatOpenAI(model="qwen-plus-2025-07-14", base_url="https://dashscope.aliyuncs.com/compatible-mode/v1")

wcd_url = "https://lmdh2o3lsz6vmqdbfop0w.c0.asia-southeast1.gcp.weaviate.cloud"
wcd_api_key = "UXZaNyttL2FYd3RmRG50Yl9rRXRweWhhNURIU29LYkFBYy9FK1ptTHhqUGx1am1EYUdJNlpOYVBFaktvPV92MjAw"
huggingface_key = "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"

question = 'What are the limitations of Kerr frequency combs when use them as laser carriers for WDM optical communications?'
doc_list, text = final_retriever(llm, question, wcd_url, wcd_api_key, huggingface_key, limit=5)

In [18]:
print(text)

# Limitations of Kerr Frequency Combs in WDM Optical Communications

Kerr frequency combs, due to their unique properties such as equidistant frequency spacing and strong correlation of carrier fluctuations, offer significant promise for high-capacity optical communications through wavelength division multiplexing (WDM). However, despite their advantages, there are certain limitations when using Kerr frequency combs as laser carriers for WDM optical communications. This discussion will outline these limitations systematically.

## 1. Phase Noise and Stability Challenges

### High Phase Noise
One of the primary challenges of Kerr frequency combs is the presence of high phase noise, which arises from the intrinsic nonlinear dynamics within the microcavities. This phase noise can degrade the performance of high-speed data transmission systems, as it leads to increased bit error rates and reduced signal quality. The phase noise is particularly problematic in dense WDM systems where precise

In [16]:
print(doc_list)

[{'answer': '# Introduction\n\nKerr frequency combs have emerged as a promising technology for wavelength-division multiplexing (WDM) optical communications, offering the potential to overcome many of the limitations posed by traditional laser arrays. These combs are generated using the Kerr effect in nonlinear microresonators and provide a multitude of equidistant spectral lines that can be modulated individually. However, despite their advantages, there are several limitations associated with Kerr frequency combs when used as laser carriers in WDM systems. This document explores these limitations point by point.', 'docs': ["Optical carriers for WDM transmission are commonly generated by distributed feedback (DFB) laser arrays. Chip-scale transmitter systems with DFB lasers have been realized on InP substrates, showing potential for simultaneous operation of 40 channels15. However, these approaches cannot be transferred directly to the silicon photonic platform, as combining conventio

In [19]:
print(doc_list[1])

{'answer': '## 1. Phase Noise and Stability Challenges\n\n### High Phase Noise\nOne of the primary challenges of Kerr frequency combs is the presence of high phase noise, which arises from the intrinsic nonlinear dynamics within the microcavities. This phase noise can degrade the performance of high-speed data transmission systems, as it leads to increased bit error rates and reduced signal quality. The phase noise is particularly problematic in dense WDM systems where precise frequency stability is critical for maintaining the integrity of individual channels.', 'docs': ['While dense WDM (DWDM) analog coherent links in the Oband benefit from using suppressed-carrier modulation formats, they are not able to exploit some Kerr nonlinearity-mitigating factors present in traditional C-band DWDM coherent systems. For example, in contemporary dispersion-unmanaged C-band DWDM coherent systems, XPM effects are reduced by pulse walk-off [11] and FWM effects are suppressed because dispersion des

In [26]:
from pydantic import BaseModel, Field


class AnswerOutput(BaseModel):
    answer_text: str = Field(..., description="The rewritten answer without extra labels")
    numbers_list: List[int] = Field(..., description="List of useful context numbers, e.g. [1,2]")
    analyze: List[str] = Field(..., description="Detailed analysis of how each context contributes")

llm = ChatOpenAI(model="qwen-plus-2025-07-14", base_url="https://dashscope.aliyuncs.com/compatible-mode/v1")

template = '''
You are a professional expert of optical frequency comb. 
Your task is to make correction and supplement of the given answer fragment of the question with the help of the given contexts,
give the serial number of the useful contexts, and give a detailed analyze that how these contexts contribute to the answer.
You should first determine if the contexts are useful, then use this useful contexts to rewrite the answer.
But DO NOT output the original text and references(like[1]) from this contexts directly.

Pay attention to the subheadings in this answer fragment and make sure that the rewritten content is consistent with the title.
The length of the rewritten answer is best to be a little longer than the original one.

DO NOT mention phrases such as "the context", "in the picture", or references in the given context.
MUST keep the headings and subheadings of each answer fragment.  But DO NOT generate new headings or subheadings.
⚠️ Any new heading beginning with '#' (except the origin heading) will be considered invalid output and discarded. 

Give the numbers of the useful contexts and the analyze how these contexts contribute to the answer at the end of your answer and in the following format:

Numbers_list: [number1, number2, ...] (Please limit the sequence number range of the context to [1, 2, 3] based on the order)

Analyze: 
- Context 1: ...

- Context 2: ... 

Output strictly in JSON with the following fields and do not generate extra content:
- answer_text: string
- numbers_list: list of integers
- analyze: list of strings

Question: {question}
Answer Fragment: {answer}
Contexts: {contexts}
'''

formatted_prompt = template.format(
    question=question,
    answer=doc_list[1]['answer'],
    contexts=doc_list[1]['docs'][:3],
)

response = llm.invoke(formatted_prompt)

try:
    parsed = AnswerOutput.parse_raw(response.content)
except Exception as e:
    print("Validation error:", e)

print(parsed)

answer_text='## 1. Phase Noise and Stability Challenges\n\n### High Phase Noise\nOne of the principal limitations of Kerr frequency combs in their application as laser carriers for wavelength-division multiplexing (WDM) optical communications is the challenge posed by phase noise, which directly affects signal fidelity and system performance. The origin of this phase noise lies in the nonlinear dynamics inherent to the microresonators used to generate the combs. Depending on the operational regime—such as primary combs, chaotic combs, or stable soliton combs—the level of phase noise can vary significantly. Chaotic combs, for instance, exhibit high levels of phase noise due to multiple comb lines oscillating within a single resonance, leading to unstable output and degraded coherence. In contrast, soliton-based Kerr combs offer substantially lower phase noise by achieving a balance between anomalous dispersion and Kerr nonlinearity, making them more suitable for coherent communication s

C:\Users\Admin\miniconda3\lib\site-packages\pydantic\main.py:1179: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
C:\Users\Admin\miniconda3\lib\site-packages\pydantic\main.py:1187: PydanticDeprecatedSince20: `load_str_bytes` is deprecated. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
  obj = parse.load_str_bytes(


In [27]:
print(response.content)

{
  "answer_text": "## 1. Phase Noise and Stability Challenges\n\n### High Phase Noise\nOne of the principal limitations of Kerr frequency combs in their application as laser carriers for wavelength-division multiplexing (WDM) optical communications is the challenge posed by phase noise, which directly affects signal fidelity and system performance. The origin of this phase noise lies in the nonlinear dynamics inherent to the microresonators used to generate the combs. Depending on the operational regime—such as primary combs, chaotic combs, or stable soliton combs—the level of phase noise can vary significantly. Chaotic combs, for instance, exhibit high levels of phase noise due to multiple comb lines oscillating within a single resonance, leading to unstable output and degraded coherence. In contrast, soliton-based Kerr combs offer substantially lower phase noise by achieving a balance between anomalous dispersion and Kerr nonlinearity, making them more suitable for coherent communic