# Selenium

In [1]:
import time
from datetime import datetime, timedelta

import pandas as pd
import requests as r
import selenium
from bs4 import BeautifulSoup as bsp
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Listing page that the Selenium session opens and pages through
url = "https://www.giiresearch.com/material_report.shtml"

In [3]:
# Creating browser instance. Using Brave Browser in this case.
# Both paths below are specific to one machine and must be adjusted when
# the notebook is run elsewhere.
driver_path = r'C:\Users\soumy\Documents\Python Notebooks\chromedriver.exe'
brave_path = r'C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe'
option = webdriver.ChromeOptions()
# Point the Chrome driver at the Brave executable instead of the default binary
option.binary_location = brave_path

# NOTE(review): the `executable_path` keyword is only accepted by older
# Selenium releases - confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path = driver_path, options=option)

# Implicit wait of 15 (seconds, per Selenium's implicitly_wait API) for element lookups
driver.implicitly_wait(15) 
driver.get(url)   # Opening url in browser instance

In [4]:
# Functions to convert string date to datetime format

def to_dt_format(date):
    """Parse a date string such as 'April 11, 2022' into a datetime.

    The text must match the format '%B %d, %Y'; datetime.strptime raises
    ValueError otherwise.
    """
    return datetime.strptime(date, '%B %d, %Y')
def to_dt_time(date):
    """Return the date shown inside a listing date element as a datetime.

    ``date`` is a Selenium element (the callers in this file pass the
    'plist_dateinfo' element of a listing entry). The text of its descendant
    with class 'plist_info_dd2' is parsed with ``to_dt_format``.

    Uses ``find_element(By.CLASS_NAME, ...)`` because the older
    ``find_element_by_class_name`` helper no longer exists in current
    Selenium releases; the ``By`` form works on old and new versions alike.
    """
    date_text = date.find_element(By.CLASS_NAME, 'plist_info_dd2').text
    return to_dt_format(date_text)

In [5]:
# Function to get publish_date,title,pages and urls of objects from
# given start_date and end_date

def _scrape_current_page(start_date, end_date):
    """Collect the in-range reports from the page the browser currently shows.

    Appends one ``[publish_date, title, pages, url]`` row per matching entry
    to the module-level ``data`` list.

    Returns True when the following page may still contain reports inside
    the range, False when paging can stop.
    """
    date_elements = driver.find_elements(By.CLASS_NAME, 'plist_dateinfo')
    dates_on_page = [to_dt_time(element) for element in date_elements]
    if not dates_on_page:
        # No dated entries at all: nothing to collect and nothing to base
        # the stop test on, so end the crawl here.
        return False

    # Only walk the individual entries when at least one date on the page
    # falls inside the requested range (each lookup is a browser round trip).
    if any(start_date <= page_date <= end_date for page_date in dates_on_page):
        for item in driver.find_elements(By.CLASS_NAME, 'plist_item'):
            publish_date = to_dt_time(item.find_element(By.CLASS_NAME, 'plist_dateinfo'))
            if start_date <= publish_date <= end_date:
                title = item.find_element(By.CLASS_NAME, 'list_title').text
                pages = (item.find_element(By.CLASS_NAME, 'plist_pageinfo')
                             .find_element(By.CLASS_NAME, 'plist_info_dd2').text)
                url = item.find_element(By.LINK_TEXT, title).get_attribute('href')
                data.append([publish_date, title, pages, url])

    # Assumes the listing is ordered newest first (the original stop test
    # relied on the same ordering - confirm against the site). Once the last
    # entry on the page is older than start_date, later pages cannot match.
    return not dates_on_page[-1] < start_date


def getData(start_date, end_date):
    """Collect reports published between start_date and end_date (inclusive).

    Pages through the listing open in the module-level ``driver`` by clicking
    the 'btn_next' element, appending ``[publish_date, title, pages, url]``
    rows to the module-level ``data`` list.

    Paging is done with a loop rather than one recursive call per page, so a
    long crawl cannot exhaust the recursion limit. It stops as soon as a page
    ends with a date older than ``start_date``, also when no report falls
    inside the range.

    Parameters:
        start_date: datetime, first day of the range.
        end_date: datetime, last day of the range.

    Returns:
        The module-level ``data`` list (the same object that was appended to).

    Errors raised while reading the first page propagate to the caller. A
    failure while moving to or reading a later page (for example no next
    button on the last page) is printed and ends the crawl, keeping the rows
    collected so far.
    """
    keep_going = _scrape_current_page(start_date, end_date)
    while keep_going:
        try:
            driver.find_element(By.CLASS_NAME, 'btn_next').click()
            keep_going = _scrape_current_page(start_date, end_date)
        except Exception as e:
            # Best effort: report the problem and return what was gathered
            print(e)
            print('Something happened')
            break
    return data

In [6]:
# Reporting window, inclusive at both ends (start_date <= end_date)
start_date, end_date = map(to_dt_format, ('April 11, 2022', 'April 13, 2022'))

# getData appends its rows to the module-level `data` list and returns that
# same list, so the list has to exist before the call
data = []
data = getData(start_date, end_date)

In [7]:
# One dataframe row per scraped report, columns in the order getData stores them
data_df = pd.DataFrame(
    data,
    columns=['Published Date', 'Report Title', 'No. of Pages', 'URL'],
)

# BS4

In [8]:
# Using BeautifulSoup to go through the list of urls we scraped
# and extracting description, category and table of contents info from each
contents_list, category_list = [], []
for url in data_df.URL:
    get_url = None
    # Keep retrying until the page has been fetched. Only connection failures
    # and timeouts are retried, so Ctrl-C still interrupts the loop and a
    # malformed URL raises instead of being retried forever. The timeout
    # keeps a stalled connection from blocking indefinitely.
    while get_url is None:
        try:
            get_url = r.get(url, timeout=30)
        except (r.exceptions.ConnectionError, r.exceptions.Timeout):
            print("Connection refused by the server..")
            print("Let me sleep for 5 seconds")
            print("ZZzzzz...")
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
    soup = bsp(get_url.content, 'html.parser')
    category = soup.find("a", {"href": "/material_report.shtml"})
    # attrs has to be a dict; the former set literal {"class", "cntSecContent"}
    # was treated as a list of class names and also matched class "class"
    contents = soup.find_all("div", attrs={"class": "cntSecContent"})
    contents = [content.text for content in contents]
    contents_list.append(contents)
    # A page without the category link gets an empty category instead of
    # aborting the whole run, so both lists stay aligned with data_df
    category_list.append(category.text.strip() if category is not None else '')

In [9]:
# Turn the two per-report lists into dataframes: each report's content
# sections become one row of contents_df, the categories a single named column
contents_df = pd.DataFrame(data=contents_list)
category_df = pd.DataFrame(
    category_list,
    columns=['Category/Industry'],
)

In [10]:
# Place report metadata, category and contents side by side (aligned on the
# dataframe index), then remove the URL column from the result
output_df = pd.concat(
    [data_df, category_df, contents_df],
    axis=1,
)
output_df.drop(columns=['URL'], inplace=True)

In [None]:
# Bare expression: shows the merged dataframe as the notebook cell output
output_df

In [12]:
# Write the merged dataframe to output.csv, leaving out the index column
output_df.to_csv('output.csv', index=False)