# Web Scraping Insolvency Insider

In [1]:
import os
import re
import time

import pandas as pd
import urllib3
from selenium import webdriver

In [2]:
# Create driver object. Opens browser window.
# The chromedriver location can be overridden with the CHROMEDRIVER_PATH environment
# variable, so the notebook also runs on machines where the binary lives elsewhere.
# When the variable is unset, the original hard-coded path is used.
executablePath = os.environ.get(
    "CHROMEDRIVER_PATH",
    "C:/Users/Robpr/OneDrive/Documents/chromedriver.exe",
)
driver = webdriver.Chrome(executable_path=executablePath)

In [3]:
# Point the Selenium-controlled browser at the Insolvency Insider filings listing.
driver.get("https://insolvencyinsider.ca/filing/")

In [4]:
# Look up the "load more" button once, by absolute XPath, so it can be clicked later.
loadMoreXpath = "/html/body/div[2]/div/main/div/div/div/button"
loadMore = driver.find_element_by_xpath(xpath=loadMoreXpath)

In [5]:
# Download the filings page with urllib3 (independently of the Selenium session) so that
# its source can be searched as text.
url = "https://insolvencyinsider.ca/filing/"
http = urllib3.PoolManager()
r = http.request("GET", url)
# Decode the response bytes into real text. str(r.data) would instead produce the
# "b'...'" repr of the bytes object, with escape sequences in place of the actual characters.
text = r.data.decode("utf-8", errors="replace")

In [9]:
# Find the '"total_pages":<digits>' fragment in the page source and extract the number.
# Raw strings are used so that \d is passed to the regex engine instead of being treated
# as an (invalid) Python string escape.
totalPagesObj = re.search(pattern=r'"total_pages":(\d+)', string=text)
if totalPagesObj is None:
    # Fail with a clear message rather than an AttributeError from calling .group on None.
    raise ValueError('Could not find "total_pages" in the page source')

# Full matched fragment, e.g. '"total_pages":76'.
totalPagesStr = totalPagesObj.group(0)

# The capture group holds only the digits, so no second regex search is needed.
totalPages = int(totalPagesObj.group(1))

In [11]:
# Display the raw text matched by the total_pages regex.
totalPagesStr

'"total_pages":76'

In [10]:
# Show the page count after it has been converted to an integer.
print(totalPages)

76


In [12]:
# Press the "Load more" button once for every page after the first, pausing three
# seconds after each click.
clicksNeeded = totalPages - 1
for clickNumber in range(clicksNeeded):
    loadMore.click()
    time.sleep(3)

In [13]:
# Collect three lists of page elements: filing names and filing dates (looked up by
# class name) and the <a> elements under each filing's h3 (looked up by XPath).
filingHrefXpath = "//*[@id='content']/div[2]/div/div[1]/h3/a"
filingNamesElements = driver.find_elements_by_class_name("filing-name")
filingDateElements = driver.find_elements_by_class_name("filing-date")
filingHrefElements = driver.find_elements_by_xpath(filingHrefXpath)

In [15]:
# Collect the text of each filing's meta block, visiting the filing containers by position.
filingMetas = []
# XPath positions are 1-based, so div[0] can never match anything. Starting at 1 skips
# that wasted WebDriver query while still covering positions 1..len(filingNamesElements).
for i in range(1, len(filingNamesElements) + 1):
    filingMetai = driver.find_elements_by_xpath(
        "//*[@id='content']/div[2]/div[%d]/div[2]/div[1]" % i
    )
    filingMetas.extend(element.text for element in filingMetai)

In [16]:
# Parse every filing's meta text (lines of the form "Key: value") into three column lists.
# Each filing contributes exactly one entry to every list, "NA" when the key is absent,
# so the lists always stay aligned with filingMetas and with each other.
metaKeys = ("Filing Type", "Industry", "Province")
metaDict = {key: [] for key in metaKeys}

for filing in filingMetas:
    # Start from "NA" for every key; any number of missing fields is then handled, not
    # just the first missing one.
    record = dict.fromkeys(metaKeys, "NA")

    for item in filing.split("\n"):
        # partition splits on the first ": " only: values that themselves contain ": "
        # are kept whole, and a line without the separator cannot raise IndexError.
        label, separator, value = item.partition(": ")
        if separator and label in record:
            # If a key is repeated within one filing, the last occurrence wins.
            record[label] = value

    for key in metaKeys:
        metaDict[key].append(record[key])

In [17]:
# Print how many values were collected for each meta field.
for values in metaDict.values():
    print(len(values))

751
751
751


In [14]:
# # Creates a list of all p elements.
# filingTypeElements = driver.find_elements_by_xpath("//p")

# # Initiates a dictionary to store filing meta data.
# metaDict = {"Filing Type": [], "Trustee": [], "Industry": [], "Province": []}

In [15]:
# filingMetaElements = driver.find_elements_by_class_name("filing-meta")

# for element in filingMetaElements:
#     print(element.text)

In [16]:
# # For each element in the list of all p elements, splits the element on ": " and if the split element has length greater than
# # zero (to avoid index out of range error), then allocate the element text to the appropriate key in the meta data dictionary. 
# for element in filingTypeElements:
    
#     splitElement = element.text.split(": ")
    
#     if len(splitElement) > 0:
#         if splitElement[0] == "Filing Type":
#             metaDict["Filing Type"].append(splitElement[1])
            
#         elif splitElement[0] == "Trustee":
#             metaDict["Trustee"].append(splitElement[1])
        
#         elif splitElement[0] == "Applicant":
#             metaDict["Applicant"].append(splitElement[1])
        
#         elif splitElement[0] == "Applicant Counsel":
#             metaDict["Applicant Counsel"].append(splitElement[1])
        
#         elif splitElement[0] == "Industry":
#             metaDict["Industry"].append(splitElement[1])
        
#         elif splitElement[0] == "Province":
#             metaDict["Province"].append(splitElement[1])

# metaDict

In [18]:
# Build plain-Python lists of filing names, filing dates and filing links from the
# element lists collected earlier.

# Visible text of every filing name element.
filingName = [element.text for element in filingNamesElements]

# Visible text of every filing date element.
filingDate = [element.text for element in filingDateElements]

# href of every link element. The attribute is read once per element (each read is a
# WebDriver call) and links with an empty or missing href are skipped.
# NOTE(review): skipping a link makes filingLink shorter than filingName/filingDate;
# confirm every filing really has an href before relying on positional alignment.
filingLink = []
for link in filingHrefElements:
    href = link.get_attribute("href")
    if href:
        filingLink.append(href)

In [27]:
%timeit filingName = [element.text for element in filingNamesElements]
filingDate = [element.text for element in filingDateElements]
filingLink = [element.text for element in filingHrefElements]

7.49 s ± 130 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
# Number of links collected; it has to equal the lengths of the other column lists for
# the DataFrame construction later on to succeed.
len(filingLink)

743

In [20]:
# Assemble the column dictionary for the data frame: names, dates, the three parsed
# meta fields, and links, in that order.
fullDict = {
    "Filing Name": filingName,
    "Filing Date": filingDate,
    **{column: metaDict[column] for column in ("Filing Type", "Industry", "Province")},
    "Link": filingLink,
}

In [21]:
# Print the length of every column list to check that they are all the same.
for columnValues in fullDict.values():
    print(len(columnValues))

751
751
751
751
751
751


In [23]:
# Build the data frame from the column dictionary, then replace the "Filing Date"
# strings with parsed datetimes.
df = pd.DataFrame(data=fullDict)
parsedFilingDates = pd.to_datetime(df["Filing Date"], infer_datetime_format=True)
df["Filing Date"] = parsedFilingDates

df

Unnamed: 0,Filing Name,Filing Date,Filing Type,Industry,Province,Link
0,Sunrise Acquisitions (Hwy 7) Inc,2021-06-28,Receivership,Real Estate,Ontario,https://insolvencyinsider.ca/filing/sunrise-ac...
1,Spartan Bioscience Inc.,2021-06-21,CCAA,Healthcare,Ontario,https://insolvencyinsider.ca/filing/spartan-bi...
2,Alaska – Alberta Railway Development Corporati...,2021-06-18,NOI,Transportation,Alberta,https://insolvencyinsider.ca/filing/alaska-alb...
3,Nautilus Plus Inc.,2021-06-14,NOI,Other,Quebec,https://insolvencyinsider.ca/filing/nautilus-p...
4,The Estate of Paul Zigomanis (the “Estate”),2021-06-10,Bankruptcy,Other,Ontario,https://insolvencyinsider.ca/filing/the-estate...
...,...,...,...,...,...,...
746,ReidBuilt,2017-11-02,Receivership,Real Estate,Alberta,https://insolvencyinsider.ca/filing/reidbuilt/
747,Spareparts,2017-10-31,CCAA,Retail,Alberta,https://insolvencyinsider.ca/filing/spareparts/
748,BuildDirect,2017-10-31,CCAA,Technology,British Columbia,https://insolvencyinsider.ca/filing/builddirect/
749,1735549 Ontario,2017-10-27,Receivership,Real Estate,Ontario,https://insolvencyinsider.ca/filing/1735549-on...


In [34]:
# Print the "Filing Date" column to check its values and dtype after the datetime conversion.
print(df['Filing Date'])

0     2021-06-08
1     2021-06-07
2     2021-05-31
3     2021-05-27
4     2021-05-26
         ...    
738   2017-11-02
739   2017-10-31
740   2017-10-31
741   2017-10-27
742   2017-06-22
Name: Filing Date, Length: 743, dtype: datetime64[ns]
