# Web Scraping
This file reads the URLs listed in `input.txt`, scrapes the data from each URL, and saves the results to `output.csv`.

In [151]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.ie.options import Options as IEOptions
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
from pandas.errors import EmptyDataError
import os
from icecream import ic

In [152]:
# Start a Chrome session with default options. `driver` is the shared,
# module-level browser handle that the scraping helpers in this file read from.
options = ChromeOptions()
driver = webdriver.Chrome(options=options)

In [153]:
# configs
# LOG: when truthy, logger() prints its messages; set to 0 to silence them.
LOG = 1
# Text file that get_input_links() reads the input links from.
INPUT_FILE = 'input.txt'
# CSV file that the scraped rows are written to.
OUTPUT_FILE = 'output.csv'

# URL prefixes keyed by site. get_data() prepends base['ieee'] to any input
# that does not already start with 'http'.
base = {
    'ieee': 'https://ieeexplore.ieee.org/abstract/document/'
}

In [154]:
def logger(s):
    """Print ``s`` when the module-level ``LOG`` flag is truthy; otherwise do nothing."""
    if not LOG:
        return
    print(s)

In [155]:
def get_authors():
    """Return the text of every ``authors-info`` element on the current page.

    Semicolons are removed from each element's text and the pieces are joined
    with ``", "``. Returns ``None`` if anything goes wrong while querying the
    page.
    """
    try:
        names = []
        for element in driver.find_elements(By.CLASS_NAME, "authors-info"):
            names.append(element.text.replace(';', ''))
        return ", ".join(names)
    except Exception:
        # Best-effort scrape: any failure means "no value" for this field.
        return None

In [156]:
def get_titles():
    """Split the browser page title on ``" | "`` into its first two parts.

    Returns a ``(first_part, second_part)`` tuple, which the caller treats as
    (paper title, journal name). Returns ``(None, None)`` when the title has
    fewer than two parts or cannot be read.
    """
    try:
        parts = driver.title.split(" | ")
        paper_title = parts[0]
        journal = parts[1]
    except Exception:
        # Covers a missing second part (IndexError) as well as driver errors.
        return None, None
    return paper_title, journal

In [157]:
def get_volume():
    """Placeholder for the volume field: no lookup is performed, always ``None``."""
    return None

In [158]:
def get_citations():
    """Return the text of the first ``document-banner-metric-count`` element.

    Returns ``None`` when no such element exists on the current page or the
    lookup fails for any other reason.
    """
    try:
        counters = driver.find_elements(By.CLASS_NAME, "document-banner-metric-count")
        first_counter = counters[0]  # IndexError here means no metric was found
        return first_counter.text
    except Exception:
        return None

In [159]:
def get_doi():
    """Return the link text inside the DOI block of the current page.

    The link is located with an XPath matching the
    ``u-pb-1 stats-document-abstract-doi`` div. Returns ``None`` when the
    element is missing or the lookup fails.
    """
    doi_xpath = "//div[@class='u-pb-1 stats-document-abstract-doi']/a"
    try:
        doi_link = driver.find_element(By.XPATH, doi_xpath)
        return doi_link.text
    except Exception:
        return None

In [160]:
def get_page_no():
    """Return the value after the first colon on the first line of the ``col-6`` div.

    Only the first matching ``div`` is inspected. Returns ``None`` when the
    element is missing, the line has no colon, or the lookup fails.
    """
    try:
        container = driver.find_element(By.XPATH, "//div[@class='col-6']")
        first_line = container.text.split('\n')[0]
        value = first_line.split(':')[1]  # text right after the label
        return value.strip()
    except Exception:
        return None

In [161]:
def get_date():
    """Return ``(month, year)`` parsed from the date block of the current page.

    The "date added" div is tried first and the "publication date" div is the
    fallback. The element text is expected to look like
    ``"<label>: <day> <month> <year>"``; the second and third space-separated
    tokens after the label are returned.

    Returns ``(None, None)`` when neither element can be located or the text
    does not have the expected shape. Like the other getters in this file, it
    never raises, so one bad page cannot abort a whole scraping run.
    """
    date_xpaths = (
        "//div[@class='u-pb-1 doc-abstract-dateadded']",
        "//div[@class='u-pb-1 doc-abstract-pubdate']",
    )

    date_element = None
    for xpath in date_xpaths:
        try:
            date_element = driver.find_element(By.XPATH, xpath)
            break
        except Exception:
            # Missing element or any other driver error: try the next locator.
            continue

    # Compare against None explicitly instead of relying on element truthiness.
    if date_element is None:
        return None, None

    try:
        tokens = date_element.text.split(": ")[1].strip().split(" ")
        return tokens[1], tokens[2]
    except Exception:
        # Unexpected text layout, or the element went stale while being read.
        return None, None

In [162]:
def get_issn():
    """Return the value after the first colon in the third ``u-pb-1`` div.

    The lookup is positional (index 2 of all exact-class ``u-pb-1`` divs).
    Returns ``None`` when there are fewer than three such divs, the text has
    no colon, or the lookup fails.
    """
    try:
        blocks = driver.find_elements(By.XPATH, "//div[@class='u-pb-1']")
        third_block_text = blocks[2].text
        return third_block_text.split(':')[1].strip()
    except Exception:
        return None

In [163]:
def get_data(url):
    '''
    Load one article page and collect its fields into a dict.

    @param url: url of the article; anything whose string form does not start
                with 'http' is appended to base['ieee'] first
    @return: dict with one entry per scraped field plus the url that was loaded
    '''
    target = str(url)
    if not target.startswith('http'):
        url = base['ieee'] + target

    driver.get(url)

    # Fields are filled in the same order as the output columns.
    record = {'authors': get_authors()}
    record['title of paper'], record['journal name'] = get_titles()
    record['volume'] = get_volume()
    record['citations'] = get_citations()
    record['doi'] = get_doi()
    record['page no'] = get_page_no()
    record['month of publication'], record['year of publication'] = get_date()
    record['ISSN'] = get_issn()
    record['url'] = url
    return record

In [164]:
def get_input_links(path=None):
    '''
    Get the input links from the input file

    @param path: file to read, one link per line; defaults to INPUT_FILE
    @return: list of links with surrounding whitespace (including the trailing
             newline) removed and blank lines skipped
    '''
    if path is None:
        path = INPUT_FILE
    with open(path, 'r') as links:
        # Strip each line so a trailing '\n' never ends up inside a URL.
        stripped = (line.strip() for line in links)
        return [link for link in stripped if link]

In [165]:
# Scrape every target page, then merge the new rows into OUTPUT_FILE.
logger('Getting input links...')
# urls = get_input_links()
# Hard-coded range of document ids; get_data() turns each id into a full URL.
urls = list(range(8248650, 8248800))
data = []
logger('Getting data...')
try:
    for url in urls:
        data.append(get_data(url))
finally:
    # Always close the browser, even if a page load raises mid-run.
    driver.quit()

logger('Writing data to file...')
# Load rows from a previous run, if any; a missing or empty file means none.
try:
    df = pd.read_csv(OUTPUT_FILE)
except (FileNotFoundError, EmptyDataError):
    df = pd.DataFrame()

new_df = pd.DataFrame(data)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df = new_df if df.empty else pd.concat([df, new_df], ignore_index=True)

# Remove the old file before rewriting it. A file that does not exist yet
# (first run) is not an error, so only real deletion failures are reported.
try:
    os.remove(OUTPUT_FILE)
except FileNotFoundError:
    pass
except OSError:
    print('OUTPUT_FILE cannot be deleted')
# Skip the write if the old file is still there (it could not be deleted).
if not os.path.exists(OUTPUT_FILE):
    df.to_csv(OUTPUT_FILE, index=False)
    logger('Data saved to file!')

Getting input links...
Getting data...
Writing data to file...
Data saved to file!
