# Nature Protocol Manuscript Conversion
Code to convert the xQTL pipeline notebooks into the manuscript format required for a Nature Protocols paper.  
Details on the required format are available [here](https://www.nature.com/nprot/for-authors/protocols).

In [222]:
import json
import re
import yaml
import nbformat
from pathlib import Path

## Setup

In [223]:
def append_to_markdown(output_markdown_file, content):
    """Append a block of text, followed by a blank line, to the Markdown output file.

    Args:
        output_markdown_file: Path of the Markdown file to append to. The file
            is created if it does not exist.
        content: Text to write. Two newline characters are written after it so
            that consecutive blocks are separated by a blank line.
    """
    # The encoding is explicit so the write does not depend on the platform's
    # locale default; the LaTeX conversion step reads this same file as UTF-8.
    with open(output_markdown_file, 'a', encoding='utf-8') as md_file:
        md_file.write(content + '\n\n')


In [224]:
# Template notebook that defines the manuscript layout, and the two files
# generated from it by the conversion cells below.
manuscript_format_notebook = "example_manuscript.ipynb"
output_markdown_file = "output_markdown.md"
output_latex_file = "output_latex.tex"

In [225]:
# Captions of the table-of-contents parts to include in the manuscript. Parts
# whose caption is not listed here are skipped when major_section_dict is built.
major_sections_keep = [
    'Reference data',
    'Molecular Phenotypes',
    'Data Pre-processing',
    'QTL Association Testing',
    'Multivariate Mixture Model',
    'Multiomics Regression Models',
    'GWAS Integration',
    'Enrichment and Validation'
]

# Miniprotocol notebooks to include. Table-of-contents chapters whose path
# (prefixed with "../../") is not listed here are left out of both
# major_section_dict and miniprotocol_dict.
miniprotocol_keep = [
    '../../code/reference_data/reference_data.ipynb',
    '../../code/molecular_phenotypes/bulk_expression.ipynb',
    '../../code/molecular_phenotypes/splicing.ipynb',
    '../../code/data_preprocessing/genotype_preprocessing.ipynb',
    '../../code/data_preprocessing/phenotype_preprocessing.ipynb',
    '../../code/data_preprocessing/covariate_preprocessing.ipynb',
    '../../code/association_scan/qtl_association_testing.ipynb',
    '../../code/multivariate_genome/multivariate_mixture_vignette.ipynb',
    '../../code/mnm_analysis/mnm_miniprotocol.ipynb',
    '../../code/pecotmr_integration/SuSiE_enloc.ipynb',
    '../../code/pecotmr_integration/twas_ctwas.ipynb',
    '../../code/mnm_analysis/mnm_methods/colocboost.ipynb',
    '../../code/enrichment/eoo_enrichment.ipynb'
]

In [226]:
# Module notebooks to leave out of the manuscript. These paths are removed from
# every miniprotocol's module list when miniprotocol_dict is built.
module_skip = [
    '../../code/association_scan/APEX/APEX.ipynb',
    #'../../code/data_preprocessing/genotype/genotype_formatting.ipynb',
    '../../code/data_preprocessing/genotype/GRM.ipynb',
    '../../code/data_preprocessing/genotype/GWAS_QC.ipynb',
    '../../code/data_preprocessing/genotype/PCA.ipynb'
]

In [227]:

# Miniprotocols to abbreviate in the manuscript. For a notebook listed here the
# experimental-design and anticipated-results sections emit a pointer to the
# protocol website instead of extracted text, and the figures section skips it.
# The only entry is commented out, so nothing is abbreviated at present.
miniprotocol_simplify = [
    #'../../code/reference_data/reference_data.ipynb',
]

In [228]:

# Path to the table-of-contents YAML file, relative to the notebook's working directory
yaml_file_path = "../_toc.yml"

# Parse the YAML file into plain Python objects; the cells below read its
# 'parts' -> 'chapters' -> 'sections' hierarchy.
with open(yaml_file_path, "r") as file:
    yaml_data = yaml.safe_load(file)

In [229]:
# Maps each major section caption to the list of miniprotocol notebooks that
# belong to it. The notebook paths stored here are the same strings used as
# keys of 'miniprotocol_dict' below.
major_section_dict = {}
for part in yaml_data['parts']:
    caption = part['caption']
    # only sections selected for the manuscript are kept
    if caption not in major_sections_keep:
        continue
    print(caption)
    print(part['chapters'])
    # chapter paths in the table of contents are relative to the repository
    # root; prefix them so they resolve from this notebook's directory
    candidate_paths = (f"../../{chapter['file']}" for chapter in part['chapters'])
    miniprotocols = [path for path in candidate_paths if path in miniprotocol_keep]
    major_section_dict[caption] = miniprotocols
major_section_dict

###used to be in this format:
#major_section_dict =  {
#    "Molecular Phenotype Quantification": [
#        f"{WRKDIR}/bulk_expression/bulk_expression.ipynb",
#        f"{WRKDIR}/splicing/splicing.ipynb"
#    ],
#    "Data Pre-Processing":[
#        f"{WRKDIR}/data_preprocessing/covariate/covariate_preprocessing.ipynb",
#        ],
#    "QTL Association Analysis":[],
#    "Integrative Analysis":[]
#}

Reference data
[{'file': 'code/reference_data/reference_data.ipynb', 'sections': [{'file': 'code/reference_data/reference_data_preparation.ipynb'}, {'file': 'code/reference_data/generalized_TADB.ipynb'}, {'file': 'code/reference_data/ld_prune_reference.ipynb'}, {'file': 'code/reference_data/ld_reference_generation.ipynb'}]}]
Molecular Phenotypes
[{'file': 'code/molecular_phenotypes/bulk_expression.ipynb', 'sections': [{'file': 'code/molecular_phenotypes/calling/RNA_calling.ipynb'}, {'file': 'code/molecular_phenotypes/QC/bulk_expression_QC.ipynb'}, {'file': 'code/molecular_phenotypes/QC/bulk_expression_normalization.ipynb'}]}, {'file': 'code/molecular_phenotypes/methylation.ipynb', 'sections': [{'file': 'code/molecular_phenotypes/calling/methylation_calling.ipynb'}]}, {'file': 'code/molecular_phenotypes/splicing.ipynb', 'sections': [{'file': 'code/molecular_phenotypes/calling/splicing_calling.ipynb'}, {'file': 'code/molecular_phenotypes/QC/splicing_normalization.ipynb'}]}]
Data Pre-proc

{'Reference data': ['../../code/reference_data/reference_data.ipynb'],
 'Molecular Phenotypes': ['../../code/molecular_phenotypes/bulk_expression.ipynb',
  '../../code/molecular_phenotypes/splicing.ipynb'],
 'Data Pre-processing': ['../../code/data_preprocessing/genotype_preprocessing.ipynb',
  '../../code/data_preprocessing/phenotype_preprocessing.ipynb',
  '../../code/data_preprocessing/covariate_preprocessing.ipynb'],
 'QTL Association Testing': ['../../code/association_scan/qtl_association_testing.ipynb'],
 'Multivariate Mixture Model': ['../../code/multivariate_genome/multivariate_mixture_vignette.ipynb'],
 'Multiomics Regression Models': ['../../code/mnm_analysis/mnm_miniprotocol.ipynb'],
 'GWAS Integration': ['../../code/pecotmr_integration/SuSiE_enloc.ipynb',
  '../../code/pecotmr_integration/twas_ctwas.ipynb',
  '../../code/mnm_analysis/mnm_methods/colocboost.ipynb'],
 'Enrichment and Validation': ['../../code/enrichment/eoo_enrichment.ipynb']}

In [230]:
# Maps each kept miniprotocol notebook to the list of module notebooks it is
# built from. Chapters without a 'sections' entry map to an empty list, and
# modules listed in 'module_skip' are left out.
miniprotocol_dict = {}
for part in yaml_data['parts']:
    for chapter in part['chapters']:
        miniprotocol = f"../../{chapter['file']}"
        # only miniprotocols selected for the manuscript are kept
        if miniprotocol not in miniprotocol_keep:
            continue
        # 'sections' may be missing (or empty) for a chapter with no modules
        module_paths = [f"../../{module['file']}" for module in (chapter.get('sections') or [])]
        miniprotocol_dict[miniprotocol] = [path for path in module_paths if path not in module_skip]

miniprotocol_dict
###used to be in this format:
#miniprotocol_dict = {
#    f"{WRKDIR}/bulk_expression/bulk_expression.ipynb":[
#        f"{WRKDIR}/bulk_expression/RNA_calling.ipynb",
#        f"{WRKDIR}/bulk_expression/bulk_expression_QC.ipynb",
#        f"{WRKDIR}/bulk_expression/bulk_expression_normalization.ipynb"
#    ],
#    f"{WRKDIR}/splicing/splicing.ipynb":[
#        f"{WRKDIR}/splicing/splicing_calling.ipynb",
#        f"{WRKDIR}/splicing/splicing_normalization.ipynb"
#    ],
#    f"{WRKDIR}/data_preprocessing/covariate/covariate_preprocessing.ipynb":[
#        f"{WRKDIR}/data_preprocessing/covariate/covariate_formatting.ipynb",
#        f"{WRKDIR}/data_preprocessing/covariate/covariate_hidden_factor.ipynb"
#    ]
#}        

{'../../code/reference_data/reference_data.ipynb': ['../../code/reference_data/reference_data_preparation.ipynb',
  '../../code/reference_data/generalized_TADB.ipynb',
  '../../code/reference_data/ld_prune_reference.ipynb',
  '../../code/reference_data/ld_reference_generation.ipynb'],
 '../../code/molecular_phenotypes/bulk_expression.ipynb': ['../../code/molecular_phenotypes/calling/RNA_calling.ipynb',
  '../../code/molecular_phenotypes/QC/bulk_expression_QC.ipynb',
  '../../code/molecular_phenotypes/QC/bulk_expression_normalization.ipynb'],
 '../../code/molecular_phenotypes/splicing.ipynb': ['../../code/molecular_phenotypes/calling/splicing_calling.ipynb',
  '../../code/molecular_phenotypes/QC/splicing_normalization.ipynb'],
 '../../code/data_preprocessing/genotype_preprocessing.ipynb': ['../../code/data_preprocessing/genotype/VCF_QC.ipynb',
  '../../code/data_preprocessing/genotype/genotype_formatting.ipynb'],
 '../../code/data_preprocessing/phenotype_preprocessing.ipynb': ['../../co

In [231]:
def get_miniprot_notebook_title(notebook_str):
    """Return the title of a notebook, taken from its first level-1 Markdown heading.

    The notebook's cells are scanned in order and the first markdown cell whose
    first source line starts with "# " supplies the title.

    Args:
        notebook_str: Path to the ``.ipynb`` file.

    Returns:
        The heading line with its leading "#" removed. The space that followed
        the "#" and any trailing newline are kept, because callers embed the
        value as-is. Returns None when no such heading exists.
    """
    # Notebook files are JSON; read them as UTF-8 instead of the locale default
    with open(notebook_str, 'r', encoding='utf-8') as miniprot_content:
        miniprot_notebook = json.load(miniprot_content)

    for cell in miniprot_notebook["cells"]:
        if cell["cell_type"] != "markdown":
            continue
        source = cell["source"]
        # A cell's source may be stored as one string instead of a list of lines
        if isinstance(source, str):
            source = source.splitlines(keepends=True)
        if len(source) == 0:
            continue
        first_line = source[0]
        if first_line.startswith("# "):
            # Drop only the heading marker so a '#' inside the title survives
            miniprot_title = first_line[1:]
            print(miniprot_title)
            return miniprot_title
    return None

## Experimental Design Conversion

In [232]:
def _description_cells(notebook_path):
    """Collect the markdown cells that follow a "## Description" header in a notebook.

    Collection starts at the markdown cell after one whose first line starts
    with "## Description" and stops at the next markdown cell whose first line
    starts with "##". Non-markdown cells and empty markdown cells are ignored.

    Args:
        notebook_path: Path to the ``.ipynb`` file.

    Returns:
        A list with one string per collected cell: a newline followed by the
        cell's source lines joined with newlines.
    """
    with open(notebook_path, 'r', encoding='utf-8') as notebook_file:
        notebook = json.load(notebook_file)

    collected = []
    in_description = False
    for cell in notebook["cells"]:
        if cell["cell_type"] != "markdown" or len(cell["source"]) == 0:
            continue
        first_line = cell["source"][0]
        # any "##..." header closes a description that is currently open
        if first_line.startswith("##") and in_description:
            in_description = False
        if in_description:
            collected.append("\n" + "\n".join(cell["source"]))
        # checked last so the header cell itself is never collected
        if first_line.startswith("## Description"):
            in_description = True
    return collected


def content_for_exp_design():
    """Build the Markdown text of the manuscript's experimental design section.

    Each major section becomes a "#### <name> (Step N)" heading and each of its
    miniprotocols a lettered "##### A. <title>" heading. Under a miniprotocol
    heading goes one of:

    * a pointer to the protocol website, when the miniprotocol is listed in
      ``miniprotocol_simplify``;
    * the description cells of the miniprotocol notebook itself, for the
      "Advanced cis-QTL Analysis" section;
    * otherwise the description cells of each of its module notebooks.

    Returns:
        The generated pieces joined with newlines.
    """
    return_content = []
    for major_step, major_section in enumerate(major_section_dict, start=1):
        return_content.append(f"#### {major_section} (Step {major_step})")

        for miniprot_step, miniprot in enumerate(major_section_dict[major_section], start=1):
            # '@' precedes 'A' in ASCII, so step 1 -> 'A', step 2 -> 'B', ...
            step_letter = chr(ord('@') + miniprot_step)
            return_content.append(f"##### {step_letter}. {get_miniprot_notebook_title(miniprot)}")

            if miniprot in miniprotocol_simplify:
                return_content.append("Please refer to the protocol website for more information on this miniprotocol.")
            elif major_section == "Advanced cis-QTL Analysis":
                # this section is described in the miniprotocol notebook itself
                return_content.extend(_description_cells(miniprot))
            else:
                for module in miniprotocol_dict[miniprot]:
                    return_content.extend(_description_cells(module))
    return "\n".join(return_content)

## Procedure Conversion

In [233]:
def content_for_procedure():
    """Build the Markdown text of the manuscript's procedure section.

    Every major section yields a numbered "### N. <name>" heading. Each of its
    miniprotocol notebooks is then scanned cell by cell:

    * markdown cells contribute their lines that start with "Timing" and their
      level-1 headings (emitted with "###" prepended);
    * code cells that start with "sos run", "!sos run" or "echo", do not end
      with " -h" and directly follow a markdown cell contribute that markdown
      cell (links reduced to their label, "##" prepended) followed by the
      command in a fenced code block.

    Returns:
        The generated pieces joined with newlines.
    """
    command_prefixes = ("sos run", "!sos run", "echo")
    markdown_link = r'\[([^\]]+)\]\([^)]+\)'
    pieces = []

    for section_number, major_section in enumerate(major_section_dict.keys(), start=1):
        pieces.append(f"### {section_number}. {major_section}\n")

        for miniprot in major_section_dict[major_section]:
            with open(miniprot, "r", encoding="utf-8") as nb_file:
                nb_content = nbformat.read(nb_file, as_version=4)
            cells = nb_content.cells

            for index, cell in enumerate(cells):
                if cell.cell_type == "markdown":
                    for line in cell.source.splitlines():
                        trimmed = line.strip()
                        if trimmed.startswith("Timing"):
                            pieces.append(line)
                        if trimmed.startswith("# "):
                            pieces.append("###" + line)
                    continue

                if cell.cell_type != "code":
                    continue

                command = cell.source
                # help invocations ("... -h") are not protocol steps
                if not command.startswith(command_prefixes) or command.endswith(" -h"):
                    continue
                # a command is only emitted when a markdown cell right before it
                # can serve as its step title
                if index == 0 or cells[index - 1].cell_type != "markdown":
                    continue

                step_title = re.sub(markdown_link, r'\1', cells[index - 1].source.strip())
                pieces.append("##" + step_title + "\n\n")
                pieces.append("```python\n")
                pieces.append(command.strip() + "\n")
                pieces.append("```\n\n\n")

    return "\n".join(pieces)

## Timing Conversion

In [234]:
def content_for_timing():
    """Build the Markdown timing table with one placeholder row per major section.

    Returns:
        A two-column Markdown table (Step, Time) as a newline-joined string.
        Every major section gets the placeholder duration "X minutes".
    """
    table_lines = ["| Step | Time|", "|------|-----|"]
    for major_section in major_section_dict.keys():
        # a newline inside a section name would break the table row
        table_lines.append(f"|{major_section}|X minutes|".replace('\n', ''))
    return "\n".join(table_lines)
    

In [235]:
# Earlier version of the timing section builder: one table row per miniprotocol,
# with the duration read from the miniprotocol notebook itself.
def content_for_timing_old():
    """Build a three-column Markdown timing table (major section, miniprotocol, time).

    For each miniprotocol, the notebook's markdown cells whose first line starts
    with "#### Miniprotocol Timing" are searched for a source line starting with
    "Timing"; the remainder of that line becomes the Time column. A row is only
    emitted when such a line is found.

    Returns:
        The table rows joined with newlines.
    """
    return_content = []
    return_content.append(f"| Step(Major Section) | Substep(Miniprotocol) | Time|")
    return_content.append(f"|------|-----|----|")
    
    #iterate over major sections dict
    for major_section in major_section_dict.keys():
        table_row = ""
        table_row = table_row + f"|{major_section}"
        #iterate over miniprotocols in each major section
        for miniprot in major_section_dict[major_section]:
            with open(miniprot, 'r') as miniprot_content:
                #get the title of the miniprotocol
                miniprot_title = f"{get_miniprot_notebook_title(miniprot)}".replace("\n","")
                table_row = table_row + f"|{miniprot_title}"

                miniprot_notebook = json.load(miniprot_content)
                for i, cell in enumerate(miniprot_notebook["cells"]):
                    if cell["cell_type"] == "markdown":
                        if len(cell["source"]) >0:
                            content = cell["source"][0]
                            if content.startswith("#### Miniprotocol Timing"):
                                for c in cell["source"]:
                                    if c.startswith("Timing"):
                                        #complete the row with the duration and emit it
                                        table_row = table_row +f"|{c.replace('Timing','')}|"
                                        return_content.append(table_row.replace('\n',''))
                                        #only the first "Timing" line of the cell is used
                                        break
                #rows for later miniprotocols of the same section start with a blank first column
                table_row = "| "
        table_row = ""
    return "\n".join(return_content)

## Anticipated Results Conversion

In [236]:
def content_for_anticipated_results():
    """Build the Markdown text of the manuscript's anticipated results section.

    Each miniprotocol contributes a "#### <title>" heading followed by either a
    pointer to the protocol website (when it is listed in
    ``miniprotocol_simplify``) or, for every markdown cell whose first line
    starts with "## Anticipated Results", the first line of the next non-empty
    markdown cell.

    Returns:
        The generated pieces joined with newlines.
    """
    return_content = []

    for major_section in major_section_dict.keys():
        for miniprot in major_section_dict[major_section]:
            return_content.append(f"#### {get_miniprot_notebook_title(miniprot)}")

            if miniprot in miniprotocol_simplify:
                return_content.append("Please refer to the protocol website for more information on these results.")
                continue

            with open(miniprot, 'r', encoding='utf-8') as miniprot_content:
                miniprot_notebook = json.load(miniprot_content)

            # Reset for every notebook: a results header at the very end of one
            # notebook must not capture the first cell of the next notebook.
            on_results = False
            for cell in miniprot_notebook["cells"]:
                if cell["cell_type"] != "markdown" or len(cell["source"]) == 0:
                    continue
                content = cell["source"][0]
                if on_results:
                    return_content.append(content)
                    on_results = False
                # checked after the capture so the header cell itself is skipped
                if content.startswith("## Anticipated Results"):
                    on_results = True
    return "\n".join(return_content)

## Figures Conversion

In [237]:
def content_for_figures():
    """Collect the figure cells from the anticipated results part of each miniprotocol.

    Miniprotocols listed in ``miniprotocol_simplify`` are skipped. In every
    other notebook, markdown cells located after an "## Anticipated Results"
    header and before the next "##..." header are collected when their source
    mentions "png".

    Returns:
        The collected cells joined with newlines; each cell is a newline
        followed by its source lines joined with newlines.
    """
    figure_cells = []
    selected_notebooks = [
        miniprot
        for major_section in major_section_dict.keys()
        for miniprot in major_section_dict[major_section]
        if miniprot not in miniprotocol_simplify
    ]

    for miniprot in selected_notebooks:
        with open(miniprot, 'r') as miniprot_content:
            miniprot_notebook = json.load(miniprot_content)

        capturing = False
        for cell in miniprot_notebook["cells"]:
            if cell["cell_type"] != "markdown":
                continue
            source_lines = cell["source"]
            if len(source_lines) == 0:
                continue
            first_line = source_lines[0]

            # the next "##..." header ends the anticipated results part
            if capturing and first_line.startswith("##"):
                capturing = False
            if capturing and "png" in str(source_lines):
                figure_cells.append("\n" + "\n".join(source_lines))
            # checked last so the header cell itself is never collected
            if first_line.startswith("## Anticipated Results"):
                capturing = True

    return "\n".join(figure_cells)

## References Conversion

In [238]:
def content_for_references(text=None):
    """Build a numbered reference list from the "[cf. ...](url)" citations in a text.

    Citations are expected to look like
    ``[cf. Signal et al (2022)](https://doi.org/10.1186/s12859-022-04572-7)``.
    Each distinct citation becomes one line "N. <author> et al. <year>. <url> ",
    numbered in order of first appearance.

    Args:
        text: Markdown text to scan. When omitted, the experimental design text
            returned by ``content_for_exp_design()`` is scanned.

    Returns:
        The reference lines joined with newlines; an empty string when the text
        contains no citation.
    """
    if text is None:
        # for now only the experimental design text is searched for citations
        text = content_for_exp_design()

    citation_pattern = r'\[cf(.*?)\]\((.*?)\)'

    # entries in order of first appearance; also used to drop duplicates
    ref_list = []
    for ref_text, doi in re.findall(citation_pattern, text):
        year_match = re.search(r'\b\d{4}\b', ref_text)
        author_match = re.search(r'\s([a-zA-Z-]+)\s', ref_text)
        if year_match and author_match:
            entry = f"{author_match.group(1)} et al. {year_match.group(0)}. {doi}"
        else:
            # A citation without a recognisable author or year is kept verbatim
            # rather than aborting the whole conversion with an IndexError.
            entry = f"{ref_text.strip(' .')}. {doi}"
        if entry not in ref_list:
            ref_list.append(entry)

    return "\n".join(f"{ref_num}. {ref} " for ref_num, ref in enumerate(ref_list, start=1))

    
    

# Do the conversion to markdown
This cell walks through `example_manuscript.ipynb` section by section and fills each section with content extracted from the miniprotocol and module notebooks, writing the result to a Markdown file.

In [239]:
# Load the manuscript template notebook; its headings drive the output below.
with open(manuscript_format_notebook, 'r') as manuscript_format:
    notebook = json.load(manuscript_format)

# Start from an empty Markdown file, because every write below appends.
open(output_markdown_file, 'w').close()

# True while inside the template's Procedure section. Its "### " subsections are
# skipped there because the procedure content is generated programmatically.
in_procedure = False

for i, cell in enumerate(notebook["cells"]):
    # only non-empty markdown cells of the template carry structure
    if cell["cell_type"] != "markdown" or len(cell["source"]) == 0:
        continue

    content = cell["source"][0]

    if content.startswith("## "):
        # one of the main sections (Title, Abstract, Procedure, etc...)
        section_title = ''.join(cell["source"]) + '\n\n'
        append_to_markdown(output_markdown_file, section_title)

        # any new main section ends the procedure unless it is the procedure itself
        in_procedure = content.startswith("## Procedure")

        if in_procedure:
            proc = content_for_procedure()
            print(proc)
            append_to_markdown(output_markdown_file, proc)
        elif content.startswith("## Timing"):
            timing = content_for_timing()
            append_to_markdown(output_markdown_file, timing)
        elif content.startswith("## Anticipated Results"):
            antires = content_for_anticipated_results()
            append_to_markdown(output_markdown_file, antires)
        elif content.startswith("## Figures"):
            # figures are collected but currently not written to the output
            figures = content_for_figures()
        elif content.startswith("## References"):
            ref = content_for_references()
            append_to_markdown(output_markdown_file, ref)

    elif content.startswith("### ") and not in_procedure:
        # other sub sections are copied over from the template
        section_title = ''.join(cell["source"]) + '\n\n'
        append_to_markdown(output_markdown_file, section_title)

        # experimental design subsection of Introduction
        if content.startswith("### Experimental Design"):
            exp = content_for_exp_design()
            append_to_markdown(output_markdown_file, exp)


 Reference Data

 RNA-seq expression

 Alternative splicing from RNA-seq data

 Genotype data preprocessing

 Phenotype data preprocessing

 Covariate Data Preprocessing

 QTL Association Analysis

  Mixture Multivariate Distribution Estimate

 Integrative Analysis with High-Dimensional Regression

 xQTL-GWAS pairwise enrichment and colocalization
 TWAS, cTWAS and MR

 Multi-trait colocalization using ColocBoost
 Chromosome-Specific Enrichment Analysis of Annotations Using Block Jackknife
### 1. Reference data

#### Reference Data
Timing ~4 hours
##### i. Download Reference Data


```python

sos run pipeline/reference_data_preparation.ipynb download_hg_reference --cwd reference_data
sos run pipeline/reference_data_preparation.ipynb download_gene_annotation --cwd reference_data
sos run pipeline/reference_data_preparation.ipynb download_ercc_reference --cwd reference_data
sos run pipeline/reference_data_preparation.ipynb download_dbsnp --cwd reference_data

```



##### ii. Format Refere

In [240]:
# LaTeX preamble written at the top of the generated .tex file. It loads
# hyperref (links) and listings (code blocks), gives listings a light grey
# background with line breaking, and opens the document body; the closing
# \end{document} is written by the conversion cell.
header = r"""\documentclass[12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{color}

\definecolor{codegray}{gray}{0.95}
\lstset{
  backgroundcolor=\color{codegray},
  basicstyle=\ttfamily\footnotesize,
  breaklines=true,
  frame=single,
  columns=fullflexible
}

\begin{document}
"""

In [272]:
import re

def markdown_links_to_latex(text):
    r"""Convert Markdown links ``[label](url)`` to LaTeX ``\href{url}{label}``.

    Underscores in the URL are escaped as ``\_``; the label is inserted
    unchanged. Text outside of links is returned as-is.

    Args:
        text: A string that may contain any number of Markdown links.

    Returns:
        The string with every Markdown link replaced by its ``\href`` form.
    """
    # The docstring above is a raw string: in a regular string literal "\h"
    # would be an invalid escape sequence.
    def replacer(match):
        label = match.group(1)
        url = match.group(2).replace('_', r'\_')  # escape underscores
        return f'\\href{{{url}}}{{{label}}}'

    # [label](url): the label may not contain ']' and the url may not contain ')'
    return re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replacer, text)

In [273]:
def convert_table_to_latex(table_lines):
    """Convert buffered Markdown table lines to a LaTeX ``tabular`` environment.

    Args:
        table_lines: The table's lines in order: header row, separator row,
            then the data rows. Cells are separated by '|'.

    Returns:
        The LaTeX source of the table with left-aligned, ruled columns, or an
        empty string when fewer than two lines are given (not a table).
    """
    if len(table_lines) < 2:
        return ""  # not a table

    def split_row(line):
        """Split one Markdown row into stripped cells, keeping empty cells."""
        text = line.strip()
        # drop only the outer pipes so an empty cell in the middle (or at the
        # end) of a row keeps its column position
        if text.startswith('|'):
            text = text[1:]
        if text.endswith('|'):
            text = text[:-1]
        return [cell.strip() for cell in text.split('|')]

    header_cells = split_row(table_lines[0])
    # the separator row (index 1) only carries alignment, which is ignored:
    # every column is left-aligned
    data_rows = [split_row(line) for line in table_lines[2:]]

    # pad short rows so every row has one cell per column; a tabular row with
    # a differing cell count is misaligned or rejected by LaTeX
    n_columns = max(len(row) for row in [header_cells] + data_rows)

    def render(row):
        padded = row + [""] * (n_columns - len(row))
        return " & ".join(padded) + " \\\\\n"

    parts = ["\\begin{tabular}{|" + "|".join("l" * n_columns) + "|}\n"]
    parts.append("\\hline\n")
    parts.append(render(header_cells))
    parts.append("\\hline\n")
    parts.extend(render(row) for row in data_rows)
    parts.append("\\hline\n\\end{tabular}\n")
    return "".join(parts)


In [274]:
def _escape_underscores(text):
    """Escape underscores, which LaTeX otherwise treats as subscript markers."""
    return text.replace("_", r'\_')


def _inline_markdown_to_latex(text):
    """Escape plain text and convert Markdown links, escaping each URL only once.

    Escaping the whole line first and converting links afterwards would escape
    the underscores of every URL twice, so the text between links, the link
    labels and the URLs are handled separately.
    """
    link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    pieces = []
    cursor = 0
    for link in link_pattern.finditer(text):
        pieces.append(_escape_underscores(text[cursor:link.start()]))
        label = _escape_underscores(link.group(1))
        # markdown_links_to_latex escapes the URL itself, so it is passed raw
        pieces.append(markdown_links_to_latex(f"[{label}]({link.group(2)})"))
        cursor = link.end()
    pieces.append(_escape_underscores(text[cursor:]))
    return "".join(pieces)


def _flush_table(latex_output, table_buffer):
    """Write any buffered Markdown table rows as a LaTeX table and empty the buffer."""
    if table_buffer:
        latex_output.write(convert_table_to_latex(table_buffer))
        table_buffer.clear()


# Markdown heading prefixes and the unnumbered LaTeX sectioning command for each.
# The trailing space makes the prefixes mutually exclusive.
_HEADING_COMMANDS = (
    ("## ", "section"),
    ("### ", "subsection"),
    ("#### ", "subsubsection"),
    ("##### ", "paragraph"),
)

# Convert the generated Markdown file line by line into a LaTeX document.
with open(output_latex_file, "w", encoding="utf-8") as latex_output:
    latex_output.write(header)
    with open(output_markdown_file, 'r', encoding="utf-8") as markdown_content:
        in_code_block = False
        in_references = False
        found_reference = False  # True once \begin{enumerate} has been written
        table_buffer = []        # Markdown table rows waiting to be converted

        for line in markdown_content:
            stripped = line.strip()

            # handle code blocks
            if stripped.startswith("```python"):
                _flush_table(latex_output, table_buffer)
                in_code_block = True
                latex_output.write(r"""\noindent""")
                latex_output.write("\n\\begin{lstlisting}[language=Python]\n")
                continue
            elif stripped.startswith("```") and in_code_block:
                in_code_block = False
                latex_output.write("\\end{lstlisting}\n")
                continue

            if in_code_block:
                # code is copied verbatim, including its original indentation
                latex_output.write(line)
                continue

            # detect start of references
            if stripped.startswith("## References"):
                _flush_table(latex_output, table_buffer)
                if in_references and found_reference:
                    latex_output.write("\\end{enumerate}\n")
                in_references = True
                found_reference = False
                # \begin{enumerate} is written with the first item, because an
                # enumerate without any \item does not compile
                latex_output.write("\n\\section*{References}\n")
                continue

            if in_references:
                if stripped.startswith("#"):
                    # a new heading ends the reference list; the heading itself
                    # is handled by the normal processing below
                    if found_reference:
                        latex_output.write("\\end{enumerate}\n")
                    in_references = False
                    found_reference = False
                else:
                    if stripped and stripped[0].isdigit():
                        if not found_reference:
                            latex_output.write("\\begin{enumerate}\n")
                            found_reference = True
                        # drop the "N." numbering; enumerate numbers the items
                        _, separator, rest = stripped.partition(".")
                        if not separator:
                            rest = stripped
                        if "http" in rest:
                            parts = rest.split("http", 1)
                            text = _escape_underscores(parts[0].strip())
                            url = "http" + parts[1].strip()
                            latex_output.write(f"  \\item {text} \\url{{{url}}}\n")
                        else:
                            rest_edit = _escape_underscores(rest.strip())
                            latex_output.write(f"  \\item {rest_edit}\n")
                    elif found_reference and not stripped:
                        # the first blank line after the items ends the list
                        in_references = False
                        found_reference = False
                        latex_output.write("\\end{enumerate}\n")
                    continue

            # handle tables: any non-heading line containing '|' is a table row
            if '|' in stripped and not stripped.startswith('#'):
                table_buffer.append(stripped)
                continue
            # the first line after a table triggers its conversion
            _flush_table(latex_output, table_buffer)

            # handle headers
            for prefix, command in _HEADING_COMMANDS:
                if stripped.startswith(prefix):
                    title = _escape_underscores(stripped.replace("#", "").strip())
                    latex_output.write("\n\\" + command + "*{" + title + "}\n")
                    break
            else:
                # normal text
                latex_output.write(_inline_markdown_to_latex(stripped) + "\n")

        # close whatever is still open when the Markdown file ends
        if in_code_block:
            latex_output.write("\\end{lstlisting}\n")
        _flush_table(latex_output, table_buffer)
        if in_references and found_reference:
            latex_output.write("\\end{enumerate}\n")

    latex_output.write("\n\\end{document}")
