In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [20]:
def setup_webdriver():
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    return driver

In [21]:
def fetch_page_source(driver, url):
    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'List-results-items'))
        )
        return driver.page_source
    except TimeoutException:
        print(f"Timeout while waiting for page to load: {url}")
        return None



In [26]:
def parse_articles(soup):
    articles_data = {}
    content = soup.find('xpl-results-list')
    articles = content.find_all('div', class_="List-results-items")
    author_names = []
    for article in articles:
        title_element = article.find('a')
        title = title_element.text
        link = title_element.attrs['href']
        
        year_span = article.find('span', string=lambda t: t and 'Year:' in t)
        year = year_span.text.split(': ')[1] if year_span else 'Year not found'
        
        conference_title = article.find('div', class_='description text-base-md-lh').a.string.strip()
        
        conference_paper_span = article.find('span', string=lambda text: text and 'Conference Paper' in text)
        journal_article_span = article.find('span', string=lambda text: text and 'Journal Article' in text)
        paper_type = conference_paper_span.string if conference_paper_span else (journal_article_span.string if journal_article_span else 'Unknown')
        
        publisher_span = article.find('span', string=lambda text: text and 'Publisher:' in text)
        publisher_span_text = publisher_span.find_next_sibling('span') 
        publisher = publisher_span_text.text.strip() if publisher_span_text else 'Publisher not found'

        author_names = [author.find('span').text.strip() for author in article.find_all('a', target="_self")]
        authors = article.find_all('a', target="_self")
        if authors == []:
            authors = article.find_all('button', {'xplhighlight' : True})
            year = article.find('span', string=lambda t: t and 'Year:' in t).text.split(': ')[1]
        for author in authors:
            author_names.append(author.find('span').text.strip())
        articles_data[title] = [
            "https://ieeexplore.ieee.org" + link,
            list(set(author_names)),
            year,
            conference_title,
            paper_type,
            publisher
        ]
        author_names = []
    return articles_data



In [27]:
def getTitles(end_page):
    driver = setup_webdriver()
    result = {}
    
    for i in range(1, end_page):
        url = f"https://ieeexplore.ieee.org/search/searchresult.jsp?action=search&highlight=true&returnType=SEARCH&matchPubs=true&rowsPerPage=10&refinements=ContentType:Conferences&refinements=ContentType:Journals&returnFacets=ALL&pageNumber={i}"
        page_source = fetch_page_source(driver, url)
        
        if page_source:
            soup = BeautifulSoup(page_source, 'html.parser')
            articles_data = parse_articles(soup)
            result.update(articles_data)
    
    driver.quit()
    return result

In [34]:
titles_and_info = getTitles(2)

In [35]:
titles_and_info

{'Deep Residual Learning for Image Recognition': ['https://ieeexplore.ieee.org/document/7780459/',
  ['Shaoqing Ren', 'Xiangyu Zhang', 'Kaiming He', 'Jian Sun'],
  '2016',
  '2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)',
  'Conference Paper',
  'IEEE'],
 'A mathematical theory of communication': ['https://ieeexplore.ieee.org/document/6773024/',
  ['C. E. Shannon'],
  '1948',
  'The Bell System Technical Journal',
  'Journal Article',
  'Nokia Bell Labs'],
 'A new look at the statistical model identification': ['https://ieeexplore.ieee.org/document/1100705/',
  ['H. Akaike'],
  '1974',
  'IEEE Transactions on Automatic Control',
  'Journal Article',
  'IEEE'],
 'Image quality assessment: from error visibility to structural similarity': ['https://ieeexplore.ieee.org/document/1284395/',
  ['E.P. Simoncelli', 'Zhou Wang', 'H.R. Sheikh', 'A.C. Bovik'],
  '2004',
  'IEEE Transactions on Image Processing',
  'Journal Article',
  'IEEE'],
 'Gradient-based learning ap

In [13]:
def getInfo(paper_dict):
    options = webdriver.ChromeOptions()
    # Launch Chrome:
    driver = webdriver.Chrome(options=options)
    result = {}
    
    for paper_title, paper_info in paper_dict.items():
        url = paper_info[0]  # Extract the URL from the dictionary
        driver.get(url)
        
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'u-mb-1')  # Adjust this if necessary for the correct class
                )
            )
        except TimeoutException:
            print(f"Timeout while waiting for abstract to load on {url}")
            continue
        
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        dynamic_content = soup.find('div', {'xplmathjax': True})
        
        if dynamic_content:
            result[paper_title] = dynamic_content.get_text(strip=True)
        else:
            result[paper_title] = "No abstract found"
    
    driver.quit()
    return result


In [30]:
def load_page(driver, url):
    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'u-mb-1'))
        )
        return driver.page_source
    except TimeoutException:
        print(f"Timeout while waiting for abstract to load on {url}")
        return None

In [31]:
def extract_abstract(soup):
    dynamic_content = soup.find('div', {'xplmathjax': True})
    if dynamic_content:
        return dynamic_content.get_text(strip=True)
    else:
        return "No abstract found"

In [36]:
def getInfo(paper_dict):
    driver = setup_webdriver()
    result = {}
    
    for paper_title, paper_info in paper_dict.items():
        url = paper_info[0]  # Extract the URL from the dictionary
        page_source = load_page(driver, url)
        
        if page_source:
            soup = BeautifulSoup(page_source, 'html.parser')
            abstract = extract_abstract(soup)
            result[paper_title] = abstract
    
    driver.quit()
    return result

In [37]:
abstracts_test = getInfo(titles_and_info)

In [38]:
abstracts_test

{'Deep Residual Learning for Image Recognition': 'Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers - 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Sol

In [39]:
def dict_to_csv(publications_dict, abstract_dict):
    rows = []
    for title, details in publications_dict.items():
        rows.append({
            'Title': title,
            'Author': ', '.join(details[1]),
            'Abstract': abstract_dict.get(title, ''),  # Use the get method to avoid KeyError
            'Year': details[2],
            'Journal/Conference Name': details[3],
            'Conference or Journal': details[4],
            'Publisher': details[5]
    })
    df = pd.DataFrame(rows)
    return df

In [40]:
test_df = dict_to_csv(titles_and_info, abstracts_test)

In [41]:
test_df

Unnamed: 0,Title,Author,Abstract,Year,Journal/Conference Name,Conference or Journal,Publisher
0,Deep Residual Learning for Image Recognition,"Shaoqing Ren, Xiangyu Zhang, Kaiming He, Jian Sun",Deeper neural networks are more difficult to t...,2016,2016 IEEE Conference on Computer Vision and Pa...,Conference Paper,IEEE
1,A mathematical theory of communication,C. E. Shannon,The recent development of various methods of m...,1948,The Bell System Technical Journal,Journal Article,Nokia Bell Labs
2,A new look at the statistical model identifica...,H. Akaike,The history of the development of statistical ...,1974,IEEE Transactions on Automatic Control,Journal Article,IEEE
3,Image quality assessment: from error visibilit...,"E.P. Simoncelli, Zhou Wang, H.R. Sheikh, A.C. ...",Objective methods for assessing perceptual ima...,2004,IEEE Transactions on Image Processing,Journal Article,IEEE
4,Gradient-based learning applied to document re...,"L. Bottou, Y. Bengio, P. Haffner, Y. Lecun",Multilayer neural networks trained with the ba...,1998,Proceedings of the IEEE,Journal Article,IEEE
5,ImageNet: A large-scale hierarchical image dat...,"Wei Dong, Kai Li, Richard Socher, Li-Jia Li, L...",The explosion of image data on the Internet ha...,2009,2009 IEEE Conference on Computer Vision and Pa...,Conference Paper,IEEE
6,A fast and elitist multiobjective genetic algo...,"K. Deb, T. Meyarivan, A. Pratap, S. Agarwal",Multi-objective evolutionary algorithms (MOEAs...,2002,IEEE Transactions on Evolutionary Computation,Journal Article,IEEE
7,Particle swarm optimization,"J. Kennedy, R. Eberhart",A concept for the optimization of nonlinear fu...,1995,Proceedings of ICNN'95 - International Confere...,Conference Paper,IEEE
8,Going deeper with convolutions,"Dumitru Erhan, Yangqing Jia, Scott Reed, Andre...",We propose a deep convolutional neural network...,2015,2015 IEEE Conference on Computer Vision and Pa...,Conference Paper,IEEE
9,"You Only Look Once: Unified, Real-Time Object ...","Joseph Redmon, Ross Girshick, Santosh Divvala,...","We present YOLO, a new approach to object dete...",2016,2016 IEEE Conference on Computer Vision and Pa...,Conference Paper,IEEE


In [None]:
# df.to_csv('publications_with_abstracts.csv', index=False)