# `.ipynb` to `.md` Converter

In [None]:
# Convert every notebook matched by *.ipynb in the working directory to Markdown.
! jupyter nbconvert --to markdown *.ipynb

[NbConvertApp] Converting notebook 1-Converter.ipynb to markdown
[NbConvertApp] Writing 3173 bytes to 1-Converter.md
[NbConvertApp] Converting notebook 1-Index.ipynb to markdown
  {%- elif type == 'text/vnd.mermaid' -%}
[NbConvertApp] Writing 50061 bytes to 1-Index.md
[NbConvertApp] Converting notebook 2-Algebraic-and-Number-Theoritic.ipynb to markdown
[NbConvertApp] Writing 5343 bytes to 2-Algebraic-and-Number-Theoritic.md
[NbConvertApp] Converting notebook 2.1-Factoring.ipynb to markdown
[NbConvertApp] Writing 8367 bytes to 2.1-Factoring.md
[NbConvertApp] Converting notebook 2.10-Matrix Elements and Multiplicity Coefficients of Group Representations.ipynb to markdown
[NbConvertApp] Writing 7397 bytes to 2.10-Matrix Elements and Multiplicity Coefficients of Group Representations.md
[NbConvertApp] Converting notebook 2.11-Verifying Matrix Products.ipynb to markdown
[NbConvertApp] Writing 6240 bytes to 2.11-Verifying Matrix Products.md
[NbConvertApp] Converting notebook 2.12-Subset-sum.

## Combine all `.md` files to create `book.md`

In [3]:
# Remove the outputs of a previous run first: the shell expands `*.md` before it
# opens the redirect target, so a stale `book.md` / `book_cleaned.md` would
# otherwise be concatenated into the new book and duplicate its content.
! rm -f book.md book_cleaned.md && cat *.md > book.md

## Replace long URLs with reference-style links

In [3]:
import re
import os

def shorten_long_urls_in_markdown(input_file, output_file, url_length_threshold=80):
    """
    Converts long inline Markdown links to reference-style links.

    Every inline link ``[text](url)`` whose URL is longer than
    ``url_length_threshold`` characters is rewritten as ``[text][refN]`` and a
    matching ``[refN]: url`` definition is appended to the end of the document.
    Identical URLs share one reference. Links whose target starts with ``#``
    and links at or below the threshold are left exactly as they were.

    Args:
        input_file (str): Path to the source Markdown file.
        output_file (str): Path to save the cleaned Markdown file.
        url_length_threshold (int): The minimum length for a URL to be considered "long".

    Returns:
        None. If ``input_file`` does not exist an error message is printed and
        no output file is written.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")
        return

    # Regex to find all inline markdown links: [text](url)
    # - link text may contain one level of nested brackets
    # - the URL may contain one level of balanced parentheses, e.g.
    #   https://en.wikipedia.org/wiki/Shor_(algorithm); a plain `[^)]+` would
    #   cut such a URL at the first ')' and corrupt the link.
    inline_link_regex = re.compile(
        r'\[((?:[^\[\]]|\[[^\]]*\])*)\]'
        r'\(((?:[^()]|\([^()]*\))+)\)'
    )

    # Maps url -> reference id; insertion order is the numbering order.
    references = {}

    # This function will be called for each match found by re.sub
    def replace_link(match):
        link_text = match.group(1)
        url = match.group(2)

        # Leave short URLs and in-document anchors (#section) untouched
        if len(url) <= url_length_threshold or url.startswith('#'):
            return match.group(0)

        # Reuse the reference if this URL was already seen
        ref_id = references.get(url)
        if ref_id is None:
            ref_id = f"ref{len(references) + 1}"
            references[url] = ref_id

        # Return the new reference-style link
        return f"[{link_text}][{ref_id}]"

    # Perform the replacement across the entire file content
    new_content = inline_link_regex.sub(replace_link, content)

    # Build the reference list to append at the end
    if references:
        definitions = "\n".join(f"[{ref_id}]: {url}" for url, ref_id in references.items())
        # The definitions must be preceded by a blank line: a link reference
        # definition cannot interrupt a paragraph, so placing them directly
        # under a text line (such as a ruler of '=' characters) would turn them
        # into plain text and leave every [refN] link unresolved.
        new_content += "\n\n<!-- Link References -->\n\n" + definitions + "\n"

    # Write the cleaned content to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(new_content)

    print("Processing complete!")
    print(f"Found and converted {len(references)} long URLs.")
    print(f"Cleaned file saved as '{output_file}'.")


# --- Script entry point ---
if __name__ == "__main__":
    # Source document and destination of the link-shortened copy
    INPUT_FILENAME = "book.md"
    OUTPUT_FILENAME = "book_cleaned.md"

    # URLs longer than this many characters are moved to the reference list
    URL_THRESHOLD = 80

    shorten_long_urls_in_markdown(
        input_file=INPUT_FILENAME,
        output_file=OUTPUT_FILENAME,
        url_length_threshold=URL_THRESHOLD,
    )


Processing complete!
Found and converted 127 long URLs.
Cleaned file saved as 'book_cleaned.md'.


# References

In [7]:
import re
import csv

def _year_from_arxiv_field(arxiv_text, date_pattern):
    """Return the best-guess year (as a string) for an arXiv field, or '' if none."""
    # New-style identifier YYMM.NNNN(N) and old-style identifier archive/YYMMNNN.
    new_id = re.compile(r'(?<!\d)(\d{2})(?:0[1-9]|1[0-2])\.\d{4,5}(?:v\d+)?(?!\d)')
    old_id = re.compile(r'[A-Za-z\-]+(?:\.[A-Z]{2})?/(\d{2})(?:0[1-9]|1[0-2])\d{3}(?:v\d+)?(?!\d)')

    # An explicitly written year wins, but the identifiers must be masked first:
    # otherwise "1905.12345" (May 2019) would be read as the year 1905.
    remainder = old_id.sub(' ', new_id.sub(' ', arxiv_text))
    explicit_years = date_pattern.findall(remainder)
    if explicit_years:
        return explicit_years[-1]

    # Otherwise derive the year from the identifier's YY prefix.
    match = new_id.search(arxiv_text)
    if match:
        return str(2000 + int(match.group(1)))
    match = old_id.search(arxiv_text)
    if match:
        yy = int(match.group(1))
        # Old-style identifiers run from 1991 to early 2007.
        return str(1900 + yy if yy >= 91 else 2000 + yy)
    return ''


def parse_references_with_date(raw_text, output_filename="references_extracted.csv"):
    """
    Parse a plain-text reference list and write it to a CSV file.

    The text is split into non-empty, stripped lines. A line consisting only of
    digits starts a new reference; the lines up to the next such line belong to
    it and are interpreted heuristically:

    * 1st line  -> Authors
    * 2nd line  -> Title
    * remaining -> arXiv (lines containing "arXiv", "quant-ph" or "cond-mat",
      with square brackets removed, joined by "; ") or Publication (everything
      else, joined by a space)

    The Date column is the last 19xx/20xx year found in the Publication text;
    failing that, a year from the arXiv field (an explicit year, else the year
    encoded in the arXiv identifier); failing that, a year in the Title.

    Args:
        raw_text (str): The raw reference list.
        output_filename (str): Path of the CSV file to write.

    Returns:
        None. Writes the CSV (columns ID, Date, Authors, Title, Publication,
        arXiv) and prints a one-line summary.
    """
    # Split text by lines and remove empty ones
    lines = [line.strip() for line in raw_text.split('\n') if line.strip()]

    # Pattern to find the Reference ID (e.g., "1", "105")
    id_pattern = re.compile(r'^\d+$')

    # Pattern to find a Year (1900-2099)
    date_pattern = re.compile(r'\b(19\d{2}|20\d{2})\b')

    data = []
    i = 0
    while i < len(lines):
        if not id_pattern.match(lines[i]):
            # Text before the first ID line does not belong to any reference
            i += 1
            continue

        # Collect every line up to (not including) the next ID. Doing this
        # before assigning Authors/Title keeps a short entry from swallowing
        # the next reference's ID line as its author or title.
        j = i + 1
        while j < len(lines) and not id_pattern.match(lines[j]):
            j += 1
        body = lines[i + 1:j]

        pub_parts = []
        arxiv_parts = []
        for content in body[2:]:
            # Check for arXiv identifiers
            if "arXiv" in content or "quant-ph" in content or "cond-mat" in content:
                arxiv_parts.append(content.replace('[', '').replace(']', ''))
            else:
                pub_parts.append(content)

        entry = {
            'ID': lines[i],
            'Date': '',
            'Authors': body[0] if len(body) > 0 else '',
            'Title': body[1] if len(body) > 1 else '',
            'Publication': " ".join(pub_parts),
            'arXiv': "; ".join(arxiv_parts),
        }

        # --- EXTRACT DATE LOGIC ---
        # 1. Publication string; if multiple years appear, the last one is
        #    usually the publication year
        pub_years = date_pattern.findall(entry['Publication'])
        if pub_years:
            entry['Date'] = pub_years[-1]
        else:
            # 2. arXiv field (explicit year, else the year encoded in the ID)
            entry['Date'] = _year_from_arxiv_field(entry['arXiv'], date_pattern)

        # 3. Title (rare, but happens in proceedings)
        if not entry['Date']:
            title_years = date_pattern.findall(entry['Title'])
            if title_years:
                entry['Date'] = title_years[-1]
        # --------------------------

        data.append(entry)
        i = j

    # Define columns (Date is the 2nd column)
    keys = ['ID', 'Date', 'Authors', 'Title', 'Publication', 'arXiv']

    with open(output_filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

    print(f"Success: Converted {len(data)} references to {output_filename}")

# USAGE:
# 1. Save the raw reference list as 'refs.txt' in the working directory
# 2. Execute the statements below:
with open('refs.txt', 'r', encoding='utf-8') as refs_file:
    refs_text = refs_file.read()
parse_references_with_date(refs_text)

Success: Converted 550 references to references_extracted.csv


# Download all Papers

In [9]:
# `%pip` installs into the environment of the running kernel, whereas `!pip`
# runs whichever pip is first on PATH. arxiv is pinned to the version this
# notebook was last executed with.
%pip install arxiv==2.3.1 pandas tqdm

Collecting arxiv
  Downloading arxiv-2.3.1-py3-none-any.whl.metadata (5.2 kB)
Downloading arxiv-2.3.1-py3-none-any.whl (11 kB)
Installing collected packages: arxiv
Successfully installed arxiv-2.3.1


In [10]:
import os
import re
import time
import pandas as pd
import arxiv
from tqdm import tqdm

# Configuration
# CSV file whose 'arXiv' column is scanned for paper identifiers.
# NOTE(review): parse_references_with_date writes 'references_extracted.csv' by
# default — confirm 'references.csv' is the intended input file.
INPUT_CSV = 'references.csv'
# Directory the downloaded PDFs are written to (created if missing).
OUTPUT_FOLDER = 'papers'
# arXiv requires a delay to prevent blocking (approx 3s is polite)
DELAY_SECONDS = 3 

def sanitize_filename(filename):
    """
    Make a string safe to use as (part of) a file name.

    Removes the characters ``\\ / * ? : " < > |``, replaces every run of control
    characters (line breaks, tabs, NUL, ...) together with the whitespace around
    it by a single space, and truncates the result to 150 characters. Strings
    without control characters are returned exactly as before this handling
    was added.

    Args:
        filename (str): The raw name, e.g. a paper title or an arXiv ID.

    Returns:
        str: The sanitized name, at most 150 characters long.
    """
    # Control characters are as invalid in Windows file names as the
    # punctuation below, and a title wrapped over several lines would otherwise
    # put a raw newline into the file name.
    without_control = re.sub(r'\s*[\x00-\x1f\x7f]+\s*', ' ', filename)
    return re.sub(r'[\\/*?:"<>|]', "", without_control)[:150]  # Limit length

def extract_arxiv_ids(text):
    """
    Extracts arXiv IDs from a string.

    Handles formats like:
    - arXiv:quant-ph/9703054
    - quant-ph/9703054
    - math.GT/0309123   (old format with a subject class)
    - arXiv:0705.2784
    - 0705.2784

    Args:
        text: The cell content to scan. Anything that is not a ``str``
            (e.g. the NaN pandas uses for an empty cell) yields no IDs.

    Returns:
        list[str]: The unique IDs in order of first appearance in ``text``,
        so element 0 is always the first ID written in the cell.
    """
    if not isinstance(text, str):
        return []

    # Old format (archive[.SC]/7 digits) or new format (4 digits . 4-5 digits).
    # The digit look-arounds keep a match from being cut out of a longer number.
    id_regex = re.compile(
        r'[a-z\-]+(?:\.[A-Z]{2})?/\d{7}(?!\d)'
        r'|(?<!\d)\d{4}\.\d{4,5}(?!\d)'
    )

    # dict.fromkeys removes duplicates while keeping the order of appearance;
    # a set would make the order (and therefore the "first" ID) vary per run.
    return list(dict.fromkeys(match.group(0) for match in id_regex.finditer(text)))

def main():
    """
    Download the PDF of every arXiv paper referenced in INPUT_CSV.

    Reads INPUT_CSV, takes the first arXiv ID found in each row's 'arXiv'
    column, and saves each paper to OUTPUT_FOLDER as ``<id>_<title>.pdf``.
    Papers that already have a ``<id>_*.pdf`` file in OUTPUT_FOLDER are skipped
    without contacting arXiv, so the function can be re-run to resume an
    interrupted download. Failures for individual papers are reported and do
    not stop the run.

    Returns:
        None. Returns early (after printing an error) if INPUT_CSV is missing.
    """
    # 1. Create output directory
    if not os.path.exists(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER)
        print(f"Created folder: {OUTPUT_FOLDER}")

    # 2. Read CSV
    try:
        df = pd.read_csv(INPUT_CSV)
        print(f"Loaded {len(df)} rows from {INPUT_CSV}")
    except FileNotFoundError:
        print(f"Error: Could not find {INPUT_CSV}. Make sure it's in the same folder.")
        return

    # 3. Collect all IDs (first valid ID per cell, each ID only once)
    tasks = []
    seen_ids = set()
    for _, row in df.iterrows():
        ids = extract_arxiv_ids(row.get('arXiv', ''))
        if not ids:
            continue
        first_id = ids[0]
        if first_id not in seen_ids:
            seen_ids.add(first_id)
            tasks.append(first_id)

    print(f"Found {len(tasks)} papers to download.")

    # 4. Initialize arXiv Client (same politeness delay as the download loop)
    client = arxiv.Client(
        page_size=100,
        delay_seconds=DELAY_SECONDS,
        num_retries=3
    )

    # 5. Process downloads
    # Querying the API with the ID gets us the title and the correct PDF URL.
    print("Starting download...")

    # Snapshot of what is already on disk, taken once instead of per paper.
    existing_files = os.listdir(OUTPUT_FOLDER)

    # We use tqdm for a progress bar
    for paper_id in tqdm(tasks):
        safe_id = sanitize_filename(paper_id)
        file_prefix = f"{safe_id}_"

        # Skip before talking to the API: file names start with the ID, so a
        # previous download can be recognised without knowing the title.
        if any(name.startswith(file_prefix) and name.endswith(".pdf") for name in existing_files):
            continue

        try:
            # Fetch paper details from arXiv API
            search = arxiv.Search(id_list=[paper_id])
            paper = next(client.results(search))

            # Construct clean filename: ID_Title.pdf
            safe_title = sanitize_filename(paper.title)
            filename = f"{safe_id}_{safe_title}.pdf"

            # Download
            paper.download_pdf(dirpath=OUTPUT_FOLDER, filename=filename)

            # Polite delay
            time.sleep(DELAY_SECONDS)

        except StopIteration:
            # The API returned no result for this ID
            print(f"\nWarning: ID {paper_id} not found on arXiv.")
        except Exception as e:
            # Best effort: report the failure and continue with the next paper
            print(f"\nError downloading {paper_id}: {e}")

    print(f"\nDone! Check the '{OUTPUT_FOLDER}' folder.")

if __name__ == "__main__":
    main()

Loaded 550 rows from references.csv
Found 459 papers to download.
Starting download...


 15%|█▌        | 71/459 [04:34<35:44,  5.53s/it]


Error downloading quant-h/0211140: Page request resulted in HTTP 400 (https://export.arxiv.org/api/query?search_query=&id_list=quant-h%2F0211140&sortBy=relevance&sortOrder=descending&start=0&max_results=100)


100%|██████████| 459/459 [31:57<00:00,  4.18s/it]


Done! Check the 'papers' folder.



