In [21]:
import os
from io import StringIO

import bs4
import pandas as pd
import requests

In [10]:
# Address of the ICCV 2023 main-conference program page; the next cell downloads and parses it.
url = "https://iccv2023.thecvf.com/main.conference.program-107.php"

In [None]:
# Download the program page and export the rows of one poster session to CSV.
SESSION_NAME = "Human pose/shape estimation"
TITLE_COLUMN = 'Paper Title Author·s Name·s'
OUTPUT_CSV = "data/Human Pose.csv"

response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, "html.parser")

session_frames = []
for table in soup.find_all("table", {'class': 'Posters'}):
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(table)))[0]
    titles = df[['Session', TITLE_COLUMN]]
    matching = titles[titles['Session'] == SESSION_NAME]
    if not matching.empty:
        session_frames.append(matching)

# Write once, after every table has been scanned, so rows found in a later
# table do not overwrite the rows exported from an earlier one.
if session_frames:
    os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
    pd.concat(session_frames).to_csv(OUTPUT_CSV)

### Extract Papers and create JSON

In [None]:
import json
import requests
from bs4 import BeautifulSoup

def save_file(link):
    """Download the PDF that ``link`` points to and store it under ``papers/``.

    Args:
        link: Anchor element (or any mapping) whose ``'href'`` entry holds the
            PDF location, either site-relative or absolute.

    Returns:
        The local path of the saved file, ``"papers/<file name>"``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import os
    from urllib.parse import urljoin

    href = link['href'].strip()
    local_file_name = "papers/" + href.split("/")[-1]
    # urljoin copes with hrefs that start with "/" (no doubled slash) and
    # leaves already-absolute URLs untouched.
    uri = urljoin("https://openaccess.thecvf.com/", href)
    # A timeout keeps one stalled connection from hanging the whole scrape.
    r = requests.get(uri, timeout=60)
    # Do not store an HTML error page under a .pdf name.
    r.raise_for_status()
    # The target directory may not exist on a fresh checkout.
    os.makedirs("papers", exist_ok=True)
    with open(local_file_name, "wb") as code:
        code.write(r.content)
    return local_file_name

if __name__ == '__main__':
    # Scrape the open-access listing of one conference: for every paper title
    # collect the author names, download the PDF through save_file, and dump
    # the resulting mapping (title -> {"authors", "url"}) to a JSON file.
    paper = {}
    conference = "ICCV2023"
    res = requests.get("http://openaccess.thecvf.com/"+conference+"?day=all")
    if res.status_code != 200:
        # Stop here rather than overwrite an existing JSON file with "{}".
        raise RuntimeError(
            f"Listing request for {conference} failed with HTTP status {res.status_code}"
        )

    soup = BeautifulSoup(res.content, "html.parser")
    for title_tag in soup.find_all("dt", class_="ptitle"):
        title = title_tag.text
        print(title)
        # "url" stays None when no PDF link is found or the download fails,
        # so every entry carries the same keys.
        entry = {"authors": [], "url": None}
        paper[title] = entry

        # The first <dd> after the title holds the author links.
        authors = title_tag.find_next_sibling("dd")
        if authors is None:
            continue
        entry["authors"] = [auth.text for auth in authors.find_all("a")]

        # The following <dd> holds the resource links (pdf, ...).
        links = authors.find_next_sibling("dd")
        if links is None:
            continue
        for link in links.find_all("a", string="pdf", href=True):
            try:
                entry["url"] = save_file(link)
            except (requests.RequestException, OSError) as exc:
                # One failed download must not discard the metadata that
                # has already been collected for all other papers.
                print(f"Could not download {link['href']}: {exc}")

    with open("cvf_data_w_pdf.json", "w") as fw:
        json.dump(paper, fw, ensure_ascii=False, indent=4)

### Read the above JSON file and generate embeddings for the paper titles

In [75]:
import json

# Load the scraped metadata; its keys (the paper titles) are the sentences to embed.
with open("cvf_data_w_pdf.json", "r") as fr:
    data = json.load(fr)

sentences = list(data)

In [76]:
from sentence_transformers import SentenceTransformer

# Instantiate the SentenceTransformer model named all-MiniLM-L6-v2.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Encode every paper title; the index-building cell reads `embeddings`.
embeddings = model.encode(sentences)

In [77]:
# Sanity check: display (number of encoded titles, embedding dimension).
embeddings.shape

(2161, 384)

In [78]:
### Build a nearest-neighbour index over the embeddings and save it to a file
import hnswlib
import numpy as np
import pickle

dim = embeddings.shape[1]
num_elements = len(embeddings)

# One integer label per embedding row; a returned label is later used as a position in `sentences`
ids = np.arange(num_elements)

# Declaring index
p = hnswlib.Index(space = 'l2', dim = dim) # possible options are l2, cosine or ip

# Initializing index - the maximum number of elements should be known beforehand
p.init_index(max_elements = num_elements, ef_construction = 200, M = 16)

# Element insertion (can be called several times):
p.add_items(embeddings, ids)

# Controlling the recall by setting ef:
p.set_ef(50) # ef should always be > k

# Persist the populated index object with pickle.
with open("embed_model.pkl", "wb") as fw:
    pickle.dump(p, fw)

In [79]:
### Creating a query engine

def get_best_matches(query, k=1):
    """Return the indexed paper title that lies closest to ``query``.

    The query is embedded with the global ``model`` and looked up in the
    global index ``p``. ``k`` is forwarded to the index search, but only the
    first label of the first result row is turned into a title, so the return
    value is always a single string.

    Args:
        query: Free text to match against the indexed titles.
        k: Number of neighbours requested from the index.

    Returns:
        The matching entry of ``sentences``, or ``None`` when the search
        yields no result row.
    """
    query_vector = model.encode([query])[0]
    labels, dists = p.knn_query(query_vector, k=k)
    # Only the first (label row, distance row) pair is ever consulted.
    top_hit = next(zip(labels, dists), None)
    if top_hit is None:
        return None
    best_labels, _ = top_hit
    return sentences[best_labels[0]]

In [80]:
# Smoke test: the query carries the title plus trailing author names and a paper ID; the lookup should return the bare title.
get_best_matches("Few-Shot Common Action Localization via Cross-Attentional Fusion of Context and Temporal Dynamics Juntae Lee, Mihir Jain, Sungrack Yun Paper ID:9014 9")

'Few-Shot Common Action Localization via Cross-Attentional Fusion of Context and Temporal Dynamics'

In [83]:
### Updating this file with closest match
import json

# Reload the scraped metadata so a matched title can be resolved to its authors and PDF path.
with open("cvf_data_w_pdf.json", "r") as fr:
    paper_to_obj = json.loads(fr.read())
   

In [93]:
import glob
import csv
import os
from collections import defaultdict

# Maps a category name to a list of {"link", "title", "authors"} records.
meta_json = defaultdict(list)

for fname in glob.glob("data/*.csv"):
    # The category is the file name without its extension; os.path copes with
    # both "/" and "\" separators, unlike splitting on "/".
    category = os.path.splitext(os.path.basename(fname))[0]
    # newline="" is what the csv module expects, so quoted fields that contain
    # line breaks are parsed correctly.
    with open(fname, newline="") as f:
        for row in csv.DictReader(f, delimiter=","):
            paper_name = row["Paper Title Author·s Name·s"]
            if not paper_name:
                # Nothing to look up for an empty cell.
                continue

            # Find the closest scraped title for this program-table entry.
            match = get_best_matches(paper_name)
            if not match:
                continue

            details = paper_to_obj[match]
            meta_json[category].append({
                # .get(): the scrape stores no "url" for a paper without a pdf link.
                "link": details.get("url"),
                "title": match,
                "authors": details["authors"]
            })
            


### Persisting meta json

In [96]:
# Serialize the category -> papers mapping, keeping non-ASCII characters unescaped.
with open("iccv_data_w_cat.json", "w") as fw:
    fw.write(json.dumps(meta_json, ensure_ascii=False))

In [None]:
# Print each category with its paper count, followed by one title per line.
for cat in meta_json:
    entries = meta_json[cat]
    print(cat, len(entries))
    for entry in entries:
        print(entry['title'])

In [None]:
!pip install markdown


In [106]:
import json
import os
from py_markdown_table.markdown_table import markdown_table

### Saving as markdown
# Render the categorised paper list as one markdown table per category.
README = "README.md"
ICCV_DATA = "iccv_data_w_cat.json"

with open(ICCV_DATA) as fd, open(README, "w") as md:
    iccv_data = json.load(fd)

    md.write("# ICCV 2023 \n")
    md.write("Papers from ICCV 2023 with categories")

    for paper_cat, papers in iccv_data.items():

        md.write(f"\n### {paper_cat} \n")

        # Build the table records directly; no DataFrame round-trip is needed
        # to obtain a list of {"Paper Title", "Link"} dictionaries.
        rows = []
        for paper in papers:
            # A literal "|" would be read as a column separator in a markdown table.
            paper_title = paper['title'].replace("|", "\\|")
            paper_link = paper.get('link')
            # Entries without a stored PDF path get a placeholder instead of a dead link.
            md_link = f"[Paper](<{paper_link}>)" if paper_link else "N/A"
            rows.append({"Paper Title": paper_title, "Link": md_link})

        if not rows:
            # Nothing to tabulate; presumably markdown_table cannot render an empty list - TODO confirm.
            continue

        content = markdown_table(rows).setParams(row_sep = 'markdown', quote = False, padding_weight='centerright').getMarkdown()
        md.write(content + "\n\n")