Based on the paper column of scraped/ijhs.tsv, classify each paper into the best among the following Subject , Category Subject: 1) Astronomy 2) Math 3) Medicine 4)Agriculture 5)Culture 6)Metallurgy 7)Mind sciences Category: a) Dharmic b) Islamic c)Western d)Fareast

Use pandas to load the csv.  Assume I have ollama/llama3.2 locally installed and running. Use the llama3.2 to classify the paper into the best subject and category.  The output should be a csv file with the following columns added subject  and category.  The output file should be named ijhs_classified.tsv

In [None]:
import os
from typing import Tuple , List
from dotenv import load_dotenv
import tqdm
from time import sleep
load_dotenv()

import google.generativeai as genai

# Configure Gemini API
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))

# Create the model
generation_config = {
  "temperature": 1*0,
  "top_p": 0.95*0+.05,
  "top_k": 40*0+5,
  "max_output_tokens": 8192 ,
  "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
  model_name="gemini-2.0-flash-exp",
  generation_config=generation_config,
)

# Define system prompt for classification
system_prompt = """
You are an expert in history of science. 
Given a newline-separated list of scientific paper topics, classify each topic into:

Subject (choose exactly one):
1) Astronomy
2) Math
3) Medicine
4) Agriculture
5) Culture
6) Metallurgy
7) MindSciences
8) Biology

Category (choose exactly one):
a) Dharmic
b) Islamic
c) Western
d) Fareast

If the paper title length is too short, please prefix with chosen subject and category with "x_". 
If the paper does not fit any of the above categories, please classify it as "Other Misc".
Never return empty classifications - this is a hard constraint.
Ensure the number of lines in the response matches the number of lines in the input.

Output format: Return only three words per each topic - the topic sequence number, subject and the category, nothing else. 
Preserve the order of the subjects and categories as in the prompt.

Example:
  Input: "Lalit K. Gurjar M.Sc. \nContents \nCalculation for ‘chain‑reduction’ in the Triśatībhāṣya"
  Response:"Other Misc\nx_Other x_Misc\nMath Dharmic"

"""

def classify_with_gemini(papers) : # Tuple[Tuple[str, str]]:
    ps = '\n'.join(papers)
    prompt = f'{system_prompt}\nPapers: {ps}'
    response = model.generate_content(prompt)
    # Wait for a bit
    sleep(5)
    return ps, response
    #subject, category = response.text.strip().split()
    #return subject, category

# Load the data
import pandas as pd
ijhs_df = pd.read_csv('scraped/ijhs.tsv', sep='\t')
# where "Equinocital" is in the paper title
# ijhs_df = ijhs_df[ijhs_df['paper'].str.contains('Equinoctial')]
# ttl = len(ijhs_df)
ijhs_df

Unnamed: 0,journal,paper,author,url,size_in_kb,cum_size_in_kb
0,IJHS-1-1966-Issue-1,The Theory of Chemical Combination in Ancient ...,Priyadaranjan Ray,https://insa.nic.in//writereaddata/UpLoadedFil...,272.0,272.0
1,IJHS-1-1966-Issue-1,What was ‘The Scientific Revolution’ ?,J R Ravetz,https://insa.nic.in//writereaddata/UpLoadedFil...,133.0,405.0
2,IJHS-1-1966-Issue-1,Stellar Distances: Galileo's Method and Its Su...,Michael Hoskin,https://insa.nic.in//writereaddata/UpLoadedFil...,189.0,594.0
3,IJHS-1-1966-Issue-1,A Survival of Babylonian Arithmetic in ew Guin...,Derek J De Solla Price,https://insa.nic.in//writereaddata/UpLoadedFil...,80.0,674.0
4,IJHS-1-1966-Issue-1,The Impetus Theory of the Vaisesikas,S N Sen,https://insa.nic.in//writereaddata/UpLoadedFil...,261.0,935.0
...,...,...,...,...,...,...
1935,IJHS-59-2024-Issue-3,Understanding the various scientific theories ...,Jun‑Young Oh,https://insa.nic.in//writereaddata/UpLoadedFil...,823.0,1536356.0
1936,IJHS-59-2024-Issue-3,Historical perspectives of critical care in In...,Ujjwala Murkute,https://insa.nic.in//writereaddata/UpLoadedFil...,527.0,1536883.0
1937,IJHS-59-2024-Issue-3,"Book Review: Health, medicine and the encounte...",Kamlesh Mohan,https://insa.nic.in//writereaddata/UpLoadedFil...,356.0,1537239.0
1938,IJHS-59-2024-Issue-3,Project Report: History of linguistic science ...,Satarupa Dattamajumdar Saha,https://insa.nic.in//writereaddata/UpLoadedFil...,487.0,1537726.0


In [482]:
import re

def classify_samples_for_visual_inspection(ijhs_df, num_samples=50):   
    def print_list(*args):
        for lst in args:
            print("\n".join([ f"{i+1:03d}) {x}"  for i, x in enumerate(lst.split("\n"))]))
            print("\n")

    def gen_df(ps, resp, df=None, run=-1):
        papers =  ps.split("\n")
        classes = [re.sub("^\s*\d+\s*", "", x.strip()) for x in resp.text.split("\n") if x.strip() != ""]
        print(f"InputPapers={len(papers)}; OutputClasses={len(classes)}")
        if df is None : df = pd.DataFrame ([ papers]).T
        df[f"run_{len(df.columns)+1 if run<0 else run}"] = classes
        return df
    
    slice_df = ijhs_df.sample(num_samples) #[ijhs_df.author.str.contains('u')]
    display(f"Sampled {slice_df.shape[0]}")
    # print_list(ps , resp.text)
    df = None 
    # classify 2 times to check for consistency 
    for run in range(0,2):
        ps, resp = classify_with_gemini (slice_df.paper.to_list())
        print (f"Run{run}\n======\n" , resp.usage_metadata)
        try :  
            df=gen_df(ps,resp, df, run)
            # display(df)
        except Exception as e:
            print(e)
            print(len(ps.split("\n")) , len(resp.text.split("\n")))
        
    # df['qc'] compares last two columns for equality
    df['qc'] = df.iloc[:,-1] == df.iloc[:,-2]
    display(pd.DataFrame(df.qc.value_counts()))
    display(df)

classify_samples_for_visual_inspection(ijhs_df, num_samples=10)


'Sampled 10'

Run0
 prompt_token_count: 404
candidates_token_count: 61
total_token_count: 465

InputPapers=10; OutputClasses=10
Run1
 prompt_token_count: 404
candidates_token_count: 61
total_token_count: 465

InputPapers=10; OutputClasses=10


Unnamed: 0_level_0,count
qc,Unnamed: 1_level_1
True,10


Unnamed: 0,0,run_0,run_1,qc
0,Mathematics and Astronomy in Medieval India,Math Dharmic,Math Dharmic,True
1,Book Reviews,x_Other x_Misc,x_Other x_Misc,True
2,Seminar Report,x_Other x_Misc,x_Other x_Misc,True
3,Vangastambhanasodhanam: A Chapter on Metallurg...,Metallurgy Dharmic,Metallurgy Dharmic,True
4,News,x_Other x_Misc,x_Other x_Misc,True
5,Arthur Koestler’s Osculation with Lamarckism a...,Biology Western,Biology Western,True
6,Project Report: The Traditional Ayurveda Pract...,Medicine Dharmic,Medicine Dharmic,True
7,News,x_Other x_Misc,x_Other x_Misc,True
8,The Main Characteristics of Hindu Astronomy in...,Astronomy Dharmic,Astronomy Dharmic,True
9,Fath Raihbar - The Massive Bronze Cannon at Pe...,Metallurgy Islamic,Metallurgy Islamic,True


In [453]:
# Process jihs_df in batches of 10 papers
def batch_classify(src_df, from_idx,to_idx,step,acc,failed_batches): 
    ttl = len(src_df)   
    for i in range(from_idx,to_idx,step):
        begin = i
        end = i+step if i+step < ttl else ttl
        key = f"{begin+1:04d}:{end:04d}"
        # print(f'\n===== {key} of {ttl} papers to classify') ; continue
        if ( (key in acc) and (acc[key].shape[0] >=7) ): 
            print(f'{key} of {ttl} already classified')
            continue
        print(f'\n===== {key}/{ttl} papers to classify')
        # continue
        df = src_df[begin:end].copy()
        # drop rows with empty single word titles
        # df = df[~df.paper.str.contains('^\s*\w+\s*$')]
        # display(df)
        try:
            ps, ans = classify_with_gemini(df.paper.to_list())
        except Exception as e:
            print(f'Error: {e} for {key}; Will continue after a minute')
            sleep(60)
            continue
        
        try :
            cat_df = pd.DataFrame([ x.split(' ')[1:]  for x in ans.text.split("\n") if ans.text.strip() != ""], columns=['subject', 'category'])#.dropna()
            ans_df = df.assign(subject=cat_df.subject.to_list()[:len(df)], category=cat_df.category.to_list()[:len(df)])
            display(ans_df.tail(4))
        except ValueError as e:
            numbered_ps = "\n".join([ f'{i+1}. {x}' for i,x in enumerate(df.paper.to_list()[-10:])])
            numbered_response = "\n".join([ f'{i+1}. {x}' for i,x in enumerate(ans.text.split("\n")[-10:])]) 
            print(f'Error: {e} for {key}; Will continue after 10 seconds')
            print(f'\nPrompt:\n{numbered_ps}')
            print(f'\nResponse:\n{numbered_response}')
            # display(f'Cat_df: {cat_df}')
            # display(cat_df)
            # sleep(10)
            failed_batches.append([begin, end])
            continue

        # delta_acc[key] = ans_df
        acc[key] = ans_df
        # indicate progress using tqdm
        print(f'===== {key} of {ttl} papers classified')
    return failed_batches
        # save the response


## first attempt to classify in batches of 50
## this is a long running process .. so use with caution

# acc={}
# failed_batches = []
# batch_classify(ijhs_df, 0, len(ijhs_df), 50, acc , failed_batches)

##second attempt to classify failed batches in batches of 10
# if len(failed_batches)  != 0:
#     delta_acc ={}
#     delta_failed_batches = []
#     for fb in failed_batches:
#         print(f'Processing failed batch {fb}')
#         batch_classify(ijhs_df, fb[0], fb[1], 10, delta_acc , delta_failed_batches)
#     # TODO: handle delta_failed_batches not being empty
#     # merge the two dictionaries
#     for k,v in delta_acc.items(): acc[k] = v

Processing failed batch [1050, 1100]
1051:1060 of 1940 already classified
1061:1070 of 1940 already classified
1071:1080 of 1940 already classified
1081:1090 of 1940 already classified
1091:1100 of 1940 already classified
Processing failed batch [1850, 1900]
1851:1860 of 1940 already classified
1861:1870 of 1940 already classified
1871:1880 of 1940 already classified
1881:1890 of 1940 already classified
1891:1900 of 1940 already classified


In [484]:
def save_to_csv(acc, filename='scraped/ijhs-classifed~.tsv'):
    vs = []
    for k in sorted(acc.keys()) :
        vs.append(acc[k])

    xdf = pd.concat(vs).sort_index().drop(columns=['cum_size_in_kb'])
    xdf.index = range(1, len(xdf)+1)
    xdf.size_in_kb = xdf.size_in_kb.fillna(0).astype(int)
    xdf.to_csv(
        filename, sep='\t',
        index_label='#',
    )
    print(f'Saved to {filename}')
    return xdf

save_to_csv(acc, filename='scraped/ijhs-classifed~.tsv')

Saved to scraped/ijhs-classifed~.tsv


Unnamed: 0,journal,paper,author,url,size_in_kb,subject,category
1,IJHS-1-1966-Issue-1,The Theory of Chemical Combination in Ancient ...,Priyadaranjan Ray,https://insa.nic.in//writereaddata/UpLoadedFil...,272,Biology,Dharmic
2,IJHS-1-1966-Issue-1,What was ‘The Scientific Revolution’ ?,J R Ravetz,https://insa.nic.in//writereaddata/UpLoadedFil...,133,Culture,Western
3,IJHS-1-1966-Issue-1,Stellar Distances: Galileo's Method and Its Su...,Michael Hoskin,https://insa.nic.in//writereaddata/UpLoadedFil...,189,Astronomy,Western
4,IJHS-1-1966-Issue-1,A Survival of Babylonian Arithmetic in ew Guin...,Derek J De Solla Price,https://insa.nic.in//writereaddata/UpLoadedFil...,80,Math,Other
5,IJHS-1-1966-Issue-1,The Impetus Theory of the Vaisesikas,S N Sen,https://insa.nic.in//writereaddata/UpLoadedFil...,261,MindSciences,Dharmic
...,...,...,...,...,...,...,...
1936,IJHS-59-2024-Issue-3,Understanding the various scientific theories ...,Jun‑Young Oh,https://insa.nic.in//writereaddata/UpLoadedFil...,823,Other,Misc
1937,IJHS-59-2024-Issue-3,Historical perspectives of critical care in In...,Ujjwala Murkute,https://insa.nic.in//writereaddata/UpLoadedFil...,527,Medicine,Western
1938,IJHS-59-2024-Issue-3,"Book Review: Health, medicine and the encounte...",Kamlesh Mohan,https://insa.nic.in//writereaddata/UpLoadedFil...,356,Medicine,Western
1939,IJHS-59-2024-Issue-3,Project Report: History of linguistic science ...,Satarupa Dattamajumdar Saha,https://insa.nic.in//writereaddata/UpLoadedFil...,487,Culture,Western


In [511]:
from IPython.display import display, Markdown
def examine_a_classified_sample(filename='scraped/ijhs-classifed~.tsv', num_samples=10):
    ijhs_classified = pd.read_csv(filename, sep='\t', index_col=0)
    display(Markdown(f"Loaded **{ijhs_classified.shape[0]} classified papers** whose columns are **{ijhs_classified.columns.to_list()}**"))
    ijhs_classified = ijhs_classified.assign(
        pdf = lambda x: [ s.split('/')[-1] for s in x.url]
    )
    cols=['paper','subject', 'category', 'pdf']
    sample_df=  ijhs_classified[cols].sample(num_samples).reset_index(drop='index').style.set_properties(
        subset=['subject', 'category'], **{'width': '60px', 'text-align': 'left' , 'white-space': 'pre-wrap', 'font-size': '10pt', 'color':'blue'}
    )
    display(
        Markdown(f"### Classified Samples\n"), sample_df,
        Markdown(f"### Using this Prompt\n```{system_prompt}```"),
    )

examine_a_classified_sample(filename='scraped/ijhs-classified~.tsv', num_samples=50)

Loaded **1940 classified papers** whose columns are **['journal', 'paper', 'author', 'url', 'size_in_kb', 'subject', 'category']**

### Classified Samples


Unnamed: 0,paper,subject,category,pdf
0,Contents,x_Misc,,Contents.pdf
1,The First Indian Aeronaut,Other,Misc,Vol27_3_6_AGhosh.pdf
2,Metals and Metallurgy in the Harappan Civilization,Metallurgy,Western,Vol53_3_2018__Art04.pdf
3,Editorial,x_Other,x_Misc,Vol51_4_2016_Art01.pdf
4,Orbituary: Shabbir Ahmad Khan Ghori,x_Other,x_Misc,Vol37_4_9_Obituary.pdf
5,NEWS,x_Other,x_Misc,Vol49_3_14_NEWS.pdf
6,HISTEM and the Making of Modern India — Some Questions and Explanations,Other,Misc,Vol50_2015_4_Art05.pdf
7,Determination of Ascensional Difference in the Lagnaprakarana,Astronomy,Dharmic,Vol53_3_2018__Art06.pdf
8,Contents,x_Other,x_Misc,Vol53_4_2018__Contents.pdf
9,Epoch of Ramakasiddhanta,Astronomy,Dharmic,Vol41_3_2_KCHari.pdf


### Using this Prompt
```
You are an expert in history of science. 
Given a newline-separated list of scientific paper topics, classify each topic into:

Subject (choose exactly one):
1) Astronomy
2) Math
3) Medicine
4) Agriculture
5) Culture
6) Metallurgy
7) MindSciences
8) Biology

Category (choose exactly one):
a) Dharmic
b) Islamic
c) Western
d) Fareast

If the paper title length is too short, please prefix with chosen subject and category with "x_". 
If the paper does not fit any of the above categories, please classify it as "Other Misc".
Never return empty classifications - this is a hard constraint.
Ensure the number of lines in the response matches the number of lines in the input.

Output format: Return only three words per each topic - the topic sequence number, subject and the category, nothing else. 
Preserve the order of the subjects and categories as in the prompt.

Example:
  Input: "Lalit K. Gurjar M.Sc. 
Contents 
Calculation for ‘chain‑reduction’ in the Triśatībhāṣya"
  Response:"Other Misc
x_Other x_Misc
Math Dharmic"

```

In [512]:
pd.read_csv('scraped/ijhs-classified~.tsv', sep='\t').columns.to_list()

['#', 'journal', 'paper', 'author', 'url', 'size_in_kb', 'subject', 'category']

Using ijhs_classified~.tsv whose columns are ['#', 'journal', 'paper', 'author', 'url', 'size_in_kb', 'subject', 'category'] , generate a markdown file that displays the above columns in a table.  The table should be sorted by subject and category.  The table should have the following columns: 

'#', 'journal', 'subject', 'category' 'paper', 'author', 'size_in_kb'

The paper column should be a link to the url column.  The author column should be a list of authors separated by commas.  The size_in_kb column should be rounded to the nearest integer.  The markdown file should be named ijhs_classified.md

Wrap the code in a function named md_the_classification that takes the input tsv file.
The output makrdown file should be named using the input file name with the extension changed to .md.
The function should return the markdown file name.

In [514]:
def md_the_classification(tsv_file):
    # Read the TSV file
    df = pd.read_csv(tsv_file, sep='\t')
    
    # # Sort by subject and category
    # df = df.sort_values(['subject', 'category'])

    df.sort_values(by=['#'], inplace=True)
    
    # Round size_in_kb to nearest integer
    df['size_in_kb'] = df['size_in_kb'].round().astype(int)
    
    # Create markdown table
    md = "# Indian Journal of History of Science - Classified Papers\n\n"
    md += "| # | Journal | Subject | Category | Paper | Author | Size (KB) |\n"
    md += "|---|---------|---------|----------|-------|--------|------------|\n"
    
    for _, row in df.iterrows():
        md += f"| {row['#']} | {row['journal']} | {row['subject']} | {row['category']} | [{row['paper']}]({row['url']}) | {row['author']} | {row['size_in_kb']} |\n"
    
    # Generate output filename
    out_file = tsv_file.rsplit('.', 1)[0] + '.md'
    
    # Write markdown file
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(md)

    print(f"Markdown file written to {out_file}")
        
    return out_file

md_the_classification('scraped/ijhs-classified~.tsv')


Markdown file written to scraped/ijhs-classified~.md


'scraped/ijhs-classified~.md'

Using tsv files whose columns are ['#', 'journal', 'paper', 'author', 'url', 'size_in_kb', 'subject', 'category'] , generate a markdown file that displays the above columns in a table.  The table should be sorted by subject and category.  The table should have the following columns: 

'#', 'journal', 'subject', 'category' 'paper', 'author', 'size_in_kb'

The paper column should be a link to the url column.  The author column should be a list of authors separated by commas.  The size_in_kb column should be rounded to the nearest integer. The table should be sorted by the # column.

The md page must display a text box on top the table.  The table should get filtered based on the text box. Only those rows that contain the text in the text box should be displayed.  The text box should be treated as a case insensitive regular expression. The selected text should be highlighted in the table. 

The text box should be cleared when the page is loaded. 

The markdown file should be named using the input file name with the extension changed to .md

Wrap the code in a function named md_the_classification_with_search that takes the input tsv file.

The function should return the markdown file name.

In [518]:
def md_the_classification_with_search(tsv_file):
    # Read the TSV file
    df = pd.read_csv(tsv_file, sep='\t')
    
    # Sort by index (#)
    df.sort_values(by=['#'], inplace=True)
    
    # Round size_in_kb to nearest integer
    df['size_in_kb'] = df['size_in_kb'].round().astype(int)

    # Create markdown with search functionality
    md = """# Indian Journal of History of Science - Classified Papers

<script>

function filterTable() {
  var input = document.getElementById("searchInput");
  var filter = input.value.toLowerCase();
  var table = document.getElementById("papers");
  var tr = table.getElementsByTagName("tr");

  // First clear all previous markers
  var marks = table.getElementsByTagName("mark");
  while(marks.length > 0) {
    var parent = marks[0].parentNode;
    parent.innerHTML = parent.textContent;
  }

  try {
    var regex = new RegExp(filter, 'i');
    for (var i = 1; i < tr.length; i++) {
      var td = tr[i].getElementsByTagName("td");
      var show = false;
      for (var j = 0; j < td.length; j++) {
        var cell = td[j];
        if (cell) {
          var text = cell.textContent || cell.innerText;

          // Check if text matches regex
          if (text.match(regex)) {
            show = true;
            
            // Highlight matching text
            cell.innerHTML = text.replace(regex, function(match) {
              var ans = ("<mark>" + match + "</mark>");
              return ans;
              // yield ans;
            });
            
          }
        }
      }
      tr[i].style.display = show ? "" : "none";
    }
  } catch(e) {
    // If invalid regex, treat as plain text
    for (var i = 1; i < tr.length; i++) {
      var td = tr[i].getElementsByTagName("td");
      var show = false;
      for (var j = 0; j < td.length; j++) {
        var cell = td[j];
        if (cell) {
          var text = cell.textContent || cell.innerText;
          if (text.toLowerCase().indexOf(filter) > -1) {
            show = true;
          }
        }
      }
      tr[i].style.display = show ? "" : "none";
    }
  }
}

// Debounce helper function
function debounce(func, wait) {
    let timeout;
    return function(...args) {
        clearTimeout(timeout);
        timeout = setTimeout(() => func.apply(this, args), wait);
    };
}

// Debounced filter function
const debouncedFilter = debounce(filterTable, 500 );

// Clear search box on page load
window.onload = function() {
    document.getElementById("searchInput").value = "";
    filterTable();
}

let lastCallTime = Date.now();
function debouncedFilter1() {
    const now = Date.now();
    const timeSinceLastCall = now - lastCallTime;
    console.log(timeSinceLastCall);
    if (timeSinceLastCall < 300) {
        lastCallTime = now;
        return;
    }
    lastCallTime = now;
    filterTable();
}
</script>

<input type="text" id="searchInput" placeholder="Search..." onkeyup="debouncedFilter()" style="width: 100%; padding: 12px 20px; margin: 8px 0; box-sizing: border-box;">


<table id="papers">
<tr>
<th>#</th>
<th>Journal</th>
<th>Subject</th>
<th>Category</th>
<th>Paper</th>
<th>Author</th>
<th>Size (KB)</th>
</tr>
"""
    
    # Add table rows
    for _, row in df.iterrows():
        md += f"<tr>\n"
        md += f"<td>{row['#']}</td>\n"
        md += f"<td>{row['journal']}</td>\n"
        md += f"<td>{row['subject']}</td>\n"
        md += f"<td>{row['category']}</td>\n"
        md += f"<td><a href='{row['url']}'>{row['paper']}</a></td>\n"
        md += f"<td>{row['author']}</td>\n"
        md += f"<td>{row['size_in_kb']}</td>\n"
        md += f"</tr>\n"
    
    md += "</table>"
    
    # Generate output filename
    out_file = tsv_file.rsplit('.', 1)[0] + '-search.md'
    
    # Write markdown file
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(md)
        
    print(f"Markdown file written to {out_file}")
    return out_file

md_the_classification_with_search('scraped/ijhs-classified~.tsv')

Markdown file written to scraped/ijhs-classified~-search.md


'scraped/ijhs-classified~-search.md'

In [521]:
def md_the_classification_with_search(tsv_file):
    # Read the TSV file
    df = pd.read_csv(tsv_file, sep='\t')
    
    # Sort by index (#)
    df.sort_values(by=['#'], inplace=True)
    
    # Round size_in_kb to nearest integer
    df['size_in_kb'] = df['size_in_kb'].round().astype(int)
    
    # Create markdown with search functionality
    md = """<h2> Indian Journal of History of Science - Classified Papers</h2>

<script>
function highlightText(text, filter) {
    if (!filter) return text;
    const regex = new RegExp(`(${filter})`, 'gi');
    return text.replace(regex, '<mark>$1</mark>');
}

function filterTable() {
    const input = document.getElementById("searchInput");
    const filter = input.value.toLowerCase();
    const table = document.getElementById("papers");
    const rows = table.getElementsByTagName("tr");

    for (let i = 1; i < rows.length; i++) {
        const cells = rows[i].getElementsByTagName("td");
        let show = false;
        let rowHtml = '';
        
        for (let j = 0; j < cells.length; j++) {
            const cell = cells[j];
            const text = cell.textContent || cell.innerText;
            
            if (text.toLowerCase().includes(filter)) {
                show = true;
            }
            
            // For cells with links, preserve the link while highlighting
            if (cell.getElementsByTagName("a").length > 0) {
                const link = cell.getElementsByTagName("a")[0];
                const href = link.getAttribute("href");
                rowHtml += `<td><a href="${href}">${highlightText(text, filter)}</a></td>`;
            } else {
                rowHtml += `<td>${highlightText(text, filter)}</td>`;
            }
        }
        
        if (show) {
            rows[i].style.display = "";
            rows[i].innerHTML = rowHtml;
        } else {
            rows[i].style.display = "none";
        }
    }
}

// Debounce helper function
function debounce(func, wait) {
    let timeout;
    return function(...args) {
        clearTimeout(timeout);
        timeout = setTimeout(() => func.apply(this, args), wait);
    };
}

// Debounced filter function
const debouncedFilter = debounce(filterTable, 500 );

// Clear search box on page load
window.onload = function() {
    document.getElementById("searchInput").value = "";
    filterTable();
}
</script>

<input type="text" id="searchInput" placeholder="Search..." style="width: 100%; padding: 12px 20px; margin: 8px 0; box-sizing: border-box;" onkeyup="debouncedFilter()">


<table id="papers">
<tr>
<th>#</th>
<th>Journal</th>
<th>Subject</th>
<th>Category</th>
<th>Paper</th>
<th>Author</th>
<th>Size (KB)</th>
</tr>
"""
    
    # Add table rows
    for _, row in df.iterrows():
        md += f"<tr>\n"
        md += f"<td>{row['#']}</td>\n"
        md += f"<td>{row['journal']}</td>\n"
        md += f"<td>{row['subject']}</td>\n"
        md += f"<td>{row['category']}</td>\n"
        md += f"<td><a href='{row['url']}'>{row['paper']}</a></td>\n"
        md += f"<td>{row['author']}</td>\n"
        md += f"<td>{row['size_in_kb']}</td>\n"
        md += f"</tr>\n"
    
    md += "</table>"
    
    # Generate output filename
    out_file = tsv_file.rsplit('.', 1)[0] + '-search.md'
  
    # Write markdown file
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(md)
    
    print(f"Markdown file written to {out_file}")
    return out_file

md_the_classification_with_search('scraped/ijhs-classified~.tsv')

Markdown file written to scraped/ijhs-classified~-search.md


'scraped/ijhs-classified~-search.md'