In [17]:
# Environment setup kept for reference (commented out): installs Selenium and
# the Chromium driver through apt/pip and copies the driver into /usr/bin.
# !apt-get update
# !pip install selenium
# !apt install chromium-chromedriver
# !cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
# Prepend the chromedriver location to Python's module search path.
# NOTE(review): sys.path only influences `import` resolution; the driver binary
# that is actually launched later is the one passed to Service(root). This line
# looks like a leftover from the apt-based setup above -- confirm before
# relying on it.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [18]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import time

%matplotlib inline

In [19]:
# Crawl configuration.
# Other CVF open-access listings that can be used as `url`:
#   wacv  https://openaccess.thecvf.com/WACV2022
#   cvpr  https://openaccess.thecvf.com/CVPR2022?day=all
#   iccv  https://openaccess.thecvf.com/ICCV2021?day=all
url = "https://openaccess.thecvf.com/CVPR2022?day=all"
conference, year = "cvpr", 2022
# The output CSV is named "<conference>_<year>.csv".
file_name = '{}_{}.csv'.format(conference, year)
# Path of the chromedriver executable handed to Selenium (machine-specific).
root = r'D:\miniconda\Lib\site-packages\chromedriver.exe'



In [20]:
 
# Collect the per-paper HTML page URLs from the open-access listing at `url`.
# The listing is loaded in a headless Chrome session driven by Selenium.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# Configure Chrome to run without a visible window.
chrome_options = webdriver.ChromeOptions()
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)

# Start the browser using the chromedriver executable located at `root`.
serv = Service(root)
wd = webdriver.Chrome(service=serv, options=chrome_options)

# Open the listing page.
wd.get(url) #FIXME

meta_list = []
wait_time = 1
max_try = 1000

# Every anchor whose link text contains "pdf" is taken as a paper link.
paper_link = wd.find_elements(By.PARTIAL_LINK_TEXT, "pdf")

# Derive each paper's HTML page URL from its PDF URL:
#   '/papers/' -> '/html/'   and   '.pdf' -> '.html'
# When the PDF URL and the HTML page URL differ in some other way, an extra
# manual replacement is needed, for example:
#   .replace('content_ICCV_2017', 'content_iccv_2017')
html_link = []
for link in paper_link:
    pdf_link = link.get_attribute("href").replace('/papers/', '/html/').replace('.pdf', '.html')
    html_link.append(pdf_link)

print("The number of total accepted paper htmls : ", len(html_link))


The number of total accepted paper htmls :  2077


In [24]:
# Visit every paper page in html_link and extract its title, authors,
# page/proceedings info and abstract.
#
# title_arr / author_arr / page_arr / abstract_arr are kept index-aligned with
# html_link: a page that reports "404 Not Found" or lacks one of the expected
# elements contributes a row of None placeholders instead of being skipped, so
# each list always ends up with exactly `lens` entries and can be combined
# with html_link column-wise.
from selenium.common.exceptions import NoSuchElementException

lens = len(html_link)
title_arr = []
author_arr = []
abstract_arr = []
page_arr = []

for paper_url in html_link:
    # (title, author, page, abstract) used when the page cannot be parsed.
    row = (None, None, None, None)

    wd.get(paper_url)
    if wd.title != "404 Not Found":
        try:
            title_text = wd.find_element(By.ID, "papertitle").text
            authors_text = wd.find_element(By.ID, "authors").text
            abstract_text = wd.find_element(By.ID, "abstract").text
        except NoSuchElementException:
            # Unexpected page layout: keep the placeholder row and carry on
            # rather than aborting the whole crawl.
            print("Skipping page with missing elements : ", paper_url)
        else:
            # The "authors" text is expected to look like
            # "<authors>; <page info>". partition() leaves the page part
            # empty when there is no ';' instead of raising IndexError.
            author, _, page = authors_text.partition(';')
            row = (title_text, author.strip(), page.strip(), abstract_text)

    title_arr.append(row[0])
    author_arr.append(row[1])
    page_arr.append(row[2])
    abstract_arr.append(row[3])

KeyboardInterrupt: 

In [None]:
# Count how often each keyword appears across the scraped paper titles.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter

print(stopwords.words('english'))

stopwords_deep_learning = ['learning', 'network', 'neural', 'networks', 'deep', 'via', 'using', 'convolutional', 'single']

# Build the combined stop-word set once, outside the loops, so the stop-word
# list is not re-created for every single word and membership tests are O(1).
ignored_words = set(stopwords.words('english')) | set(stopwords_deep_learning)

keyword_list = []

# Iterate over the scraped title strings (title_arr). The name `title` is not
# a list of titles, so indexing it per paper cannot work.
for i, paper_title in enumerate(title_arr):
    if not paper_title:
        # No title was scraped for this entry.
        continue

    print(i, "th paper's title : ", paper_title)

    # Lowercase first and then de-duplicate, so a word counts at most once per
    # title regardless of capitalisation. dict.fromkeys keeps first-seen order,
    # which makes the result deterministic; empty tokens produced by repeated
    # spaces are dropped.
    words = dict.fromkeys(w.lower() for w in paper_title.split(" ") if w)
    keyword_list.extend(w for w in words if w not in ignored_words)

keyword_counter = Counter(keyword_list)
print(keyword_counter)

print('{} different keywords before merging'.format(len(keyword_counter)))

# Merge duplicates: CNNs and CNN
# A keyword k absorbs the count of k + 's' when both are present. The list of
# candidates is collected first because the counter is modified while merging.
duplicates = [k for k in keyword_counter if k + 's' in keyword_counter]
for k in duplicates:
    keyword_counter[k] += keyword_counter[k + 's']
    del keyword_counter[k + 's']
print('{} different keywords after merging'.format(len(keyword_counter)))
print(keyword_counter)

print("")

In [None]:
# Show the N most common keywords and their frequencies as a horizontal bar
# chart with a logarithmic frequency axis.
num_keyowrd = 75 #FIXME
keywords_counter_vis = keyword_counter.most_common(num_keyowrd)

plt.rcdefaults()
fig, ax = plt.subplots(figsize=(8, 18))

# most_common() yields (keyword, count) pairs, most frequent first.
key = [pair[0] for pair in keywords_counter_vis]
value = [pair[1] for pair in keywords_counter_vis]
y_pos = np.arange(len(key))

ax.barh(y_pos, value, align='center', color='green', ecolor='black', log=True)
ax.set_yticks(y_pos)
ax.set_yticklabels(key, rotation=0, fontsize=10)
ax.invert_yaxis()  # most frequent keyword at the top

# Write each count next to its bar.
for i, v in enumerate(value):
    ax.text(v + 3, i + .25, str(v), color='black', fontsize=10)

ax.set_xlabel('Frequency')
# Title follows the configured conference and year instead of a hard-coded
# "CVPR 2018", which did not match the data being crawled.
ax.set_title('{} {} Submission Top {} Keywords'.format(conference.upper(), year, num_keyowrd))

plt.show()

In [None]:
# Render the collected keywords as a word cloud image.
from wordcloud import WordCloud

# The cloud is generated from all keywords joined into one space-separated text.
keyword_text = ' '.join(keyword_list)
wordcloud = WordCloud(
    background_color="black",
    width=1280,
    height=640,
    max_words=160,
    max_font_size=64,
).generate(keyword_text)

# Display the cloud without axes.
cloud_fig, cloud_ax = plt.subplots(figsize=(16, 8))
cloud_ax.imshow(wordcloud, interpolation="bilinear")
cloud_ax.axis("off")
plt.show()

In [None]:
# Assemble the scraped metadata into a DataFrame with one row per entry of
# html_link. Year and Conference are constant columns repeated `lens` times.
# pandas requires every column to have the same length.
year_arr = [year] * lens
conference_arr = [conference] * lens

data = dict(
    Title=title_arr,
    Author=author_arr,
    Page=page_arr,
    Abstract=abstract_arr,
    Link=html_link,
    Year=year_arr,
    Conference=conference_arr,
)

df = pd.DataFrame(data)

In [None]:
# Save the table to `file_name` as comma-separated text with a header row and
# without the index column (',' separator and header are the pandas defaults).
df.to_csv(file_name, index=False)