In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [20]:
def setup_webdriver():
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    return driver

In [21]:
def fetch_page_source(driver, url):
    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'List-results-items'))
        )
        return driver.page_source
    except TimeoutException:
        print(f"Timeout while waiting for page to load: {url}")
        return None



In [26]:
def parse_articles(soup):
    articles_data = {}
    content = soup.find('xpl-results-list')
    articles = content.find_all('div', class_="List-results-items")
    author_names = []
    for article in articles:
        title_element = article.find('a')
        title = title_element.text
        link = title_element.attrs['href']
        
        year_span = article.find('span', string=lambda t: t and 'Year:' in t)
        year = year_span.text.split(': ')[1] if year_span else 'Year not found'
        
        conference_title = article.find('div', class_='description text-base-md-lh').a.string.strip()
        
        conference_paper_span = article.find('span', string=lambda text: text and 'Conference Paper' in text)
        journal_article_span = article.find('span', string=lambda text: text and 'Journal Article' in text)
        paper_type = conference_paper_span.string if conference_paper_span else (journal_article_span.string if journal_article_span else 'Unknown')
        
        publisher_span = article.find('span', string=lambda text: text and 'Publisher:' in text)
        publisher_span_text = publisher_span.find_next_sibling('span') 
        publisher = publisher_span_text.text.strip() if publisher_span_text else 'Publisher not found'

        author_names = [author.find('span').text.strip() for author in article.find_all('a', target="_self")]
        authors = article.find_all('a', target="_self")
        if authors == []:
            authors = article.find_all('button', {'xplhighlight' : True})
            year = article.find('span', string=lambda t: t and 'Year:' in t).text.split(': ')[1]
        for author in authors:
            author_names.append(author.find('span').text.strip())
        articles_data[title] = [
            "https://ieeexplore.ieee.org" + link,
            list(set(author_names)),
            year,
            conference_title,
            paper_type,
            publisher
        ]
        author_names = []
    return articles_data



In [44]:
def getTitles(end_page):
    driver = setup_webdriver()
    result = {}
    
    for i in range(1, end_page):
        url = f"https://ieeexplore.ieee.org/search/searchresult.jsp?action=search&highlight=true&returnType=SEARCH&matchPubs=true&rowsPerPage=100&refinements=ContentType:Conferences&refinements=ContentType:Journals&returnFacets=ALL&pageNumber={i}"
        page_source = fetch_page_source(driver, url)
        
        if page_source:
            soup = BeautifulSoup(page_source, 'html.parser')
            articles_data = parse_articles(soup)
            result.update(articles_data)
    
    driver.quit()
    return result

In [45]:
titles_and_info = getTitles(52)

In [46]:
titles_and_info

{'Deep Residual Learning for Image Recognition': ['https://ieeexplore.ieee.org/document/7780459/',
  ['Shaoqing Ren', 'Xiangyu Zhang', 'Kaiming He', 'Jian Sun'],
  '2016',
  '2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)',
  'Conference Paper',
  'IEEE'],
 'A mathematical theory of communication': ['https://ieeexplore.ieee.org/document/6773067/',
  ['C. E. Shannon'],
  '1948',
  'The Bell System Technical Journal',
  'Journal Article',
  'Nokia Bell Labs'],
 'A new look at the statistical model identification': ['https://ieeexplore.ieee.org/document/1100705/',
  ['H. Akaike'],
  '1974',
  'IEEE Transactions on Automatic Control',
  'Journal Article',
  'IEEE'],
 'Image quality assessment: from error visibility to structural similarity': ['https://ieeexplore.ieee.org/document/1284395/',
  ['E.P. Simoncelli', 'Zhou Wang', 'H.R. Sheikh', 'A.C. Bovik'],
  '2004',
  'IEEE Transactions on Image Processing',
  'Journal Article',
  'IEEE'],
 'Gradient-based learning ap

In [30]:
def load_page(driver, url):
    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'u-mb-1'))
        )
        return driver.page_source
    except TimeoutException:
        print(f"Timeout while waiting for abstract to load on {url}")
        return None

In [31]:
def extract_abstract(soup):
    dynamic_content = soup.find('div', {'xplmathjax': True})
    if dynamic_content:
        return dynamic_content.get_text(strip=True)
    else:
        return "No abstract found"

In [47]:
def getInfo(paper_dict):
    driver = setup_webdriver()
    result = {}
    
    for paper_title, paper_info in paper_dict.items():
        url = paper_info[0]  # Extract the URL from the dictionary
        page_source = load_page(driver, url)
        
        if page_source:
            soup = BeautifulSoup(page_source, 'html.parser')
            abstract = extract_abstract(soup)
            result[paper_title] = abstract
    
    driver.quit()
    return result

In [51]:
abstracts_test = getInfo(titles_and_info)

In [53]:
len(abstracts_test)

5094

In [55]:
def dict_to_csv(publications_dict, abstract_dict):
    rows = []
    for title, details in publications_dict.items():
        rows.append({
            'Title': title,
            'Author': ', '.join(details[1]),
            'Abstract': abstract_dict.get(title, ''),  # Use the get method to avoid KeyError
            'Year': details[2],
            'Journal/Conference Name': details[3],
            'Conference or Journal': details[4],
            'Publisher': details[5],
            'Link' : details[0]
        })
    df = pd.DataFrame(rows)
    return df

In [56]:
test_df = dict_to_csv(titles_and_info, abstracts_test)

In [57]:
test_df

Unnamed: 0,Title,Author,Abstract,Year,Journal/Conference Name,Conference or Journal,Publisher,Link
0,Deep Residual Learning for Image Recognition,"Shaoqing Ren, Xiangyu Zhang, Kaiming He, Jian Sun",Deeper neural networks are more difficult to t...,2016,2016 IEEE Conference on Computer Vision and Pa...,Conference Paper,IEEE,https://ieeexplore.ieee.org/document/7780459/
1,A mathematical theory of communication,C. E. Shannon,In this final installment of the paper we cons...,1948,The Bell System Technical Journal,Journal Article,Nokia Bell Labs,https://ieeexplore.ieee.org/document/6773067/
2,A new look at the statistical model identifica...,H. Akaike,The history of the development of statistical ...,1974,IEEE Transactions on Automatic Control,Journal Article,IEEE,https://ieeexplore.ieee.org/document/1100705/
3,Image quality assessment: from error visibilit...,"E.P. Simoncelli, Zhou Wang, H.R. Sheikh, A.C. ...",Objective methods for assessing perceptual ima...,2004,IEEE Transactions on Image Processing,Journal Article,IEEE,https://ieeexplore.ieee.org/document/1284395/
4,Gradient-based learning applied to document re...,"L. Bottou, Y. Bengio, P. Haffner, Y. Lecun",Multilayer neural networks trained with the ba...,1998,Proceedings of the IEEE,Journal Article,IEEE,https://ieeexplore.ieee.org/document/726791/
...,...,...,...,...,...,...,...,...
5089,Silicon-based optoelectronics,R.A. Soref,The decade of the 1990's is an opportune time ...,1993,Proceedings of the IEEE,Journal Article,IEEE,https://ieeexplore.ieee.org/document/248958/
5090,Remote Sensing Image Scene Classification Meet...,"Xingxing Xie, Junwei Han, Lei Guo, Gong Cheng,...","Remote sensing image scene classification, whi...",2020,IEEE Journal of Selected Topics in Applied Ear...,Journal Article,IEEE,https://ieeexplore.ieee.org/document/9127795/
5091,Sparse Channel Estimation for Multicarrier Und...,"Christian R. Berger, Shengli Zhou, James C. Pr...","In this paper, we investigate various channel ...",2010,IEEE Transactions on Signal Processing,Journal Article,IEEE,https://ieeexplore.ieee.org/document/5352256/
5092,Sum capacity of Gaussian vector broadcast chan...,"Wei Yu, J.M. Cioffi",This paper characterizes the sum capacity of a...,2004,IEEE Transactions on Information Theory,Journal Article,IEEE,https://ieeexplore.ieee.org/document/1327794/


In [58]:
test_df.to_csv('publications.csv', index=False)