## Introduction

The aim is to extract every table from structured and unstructured SEC filings without using RegEx, and to aggregate all of those tables, whether or not they contain CDS information. Once that is complete, a filtering method should be put in place to filter out the tables that do not contain Credit Default Swap (CDS) information.

## Declaring libraries required to run our implementation

In [1]:
from bs4 import BeautifulSoup
from collections import namedtuple

import pprint
import csv
import urllib
import re
import sys

## Defining the Page class, which extracts the tables from a page and saves them

In [2]:
class Page:
    """
    Holds the raw HTML of one document and extracts its <table> elements.
    """

    # Any run of characters outside this set is replaced by a single space
    # when a cell is cleaned. The set is: ASCII letters, digits, whitespace
    # and ASCII punctuation except the backslash. It is compiled once here
    # instead of once per cell, and written as a raw string so that the
    # pattern contains no invalid string escapes.
    _DISALLOWED_CHARS = re.compile(
        r"""[^-_a-zA-Z0-9!@#%&=,/'";:~`$^*()+\[\].{}|?<>\s]+"""
    )

    def __init__(self, data):
        """
        Stores the HTML markup of the page in self.link.

        data: the HTML source of the page, as text.
        """

        self.link = data

    @staticmethod
    def _clean_cell(text):
        """
        Returns the text of one cell with every run of disallowed
        characters (see _DISALLOWED_CHARS) replaced by a single space.
        """
        # The utf-8 round trip drops code points that cannot be encoded
        # (lone surrogates) before the remaining text is filtered.
        text = text.encode("utf-8", "ignore").decode("utf-8")
        return Page._DISALLOWED_CHARS.sub(" ", text)

    def get_tables(self, data=None):
        """
        Extracts each table on the page and places it in a dictionary
        that maps a row index (0, 1, 2, ...) to the list of cleaned cell
        texts of that row. Converts each dictionary to a Table object.
        Returns a list of the respective Table object(s), one per <table>
        element, in document order.

        data: HTML source to parse. When omitted, the markup given to the
              constructor (self.link) is parsed instead.

        Parsing uses BeautifulSoup with the "lxml" parser, so the lxml
        package has to be installed.
        """
        if data is None:
            data = self.link

        print("enter get table")
        soup = BeautifulSoup(data, "lxml")
        print("done1")

        table_list = []
        for table in soup.find_all("table"):
            cleaned_rows = (
                [self._clean_cell(cell.text) for cell in row.find_all("td")]
                for row in table.find_all("tr")
            )
            # Rows without any <td> cell (for example rows made only of
            # <th> cells) are skipped; numbering after the filter keeps the
            # keys consecutive.
            table_dict = dict(enumerate(row for row in cleaned_rows if row))
            table_list.append(Table(table_dict))

        return table_list

    def save_tables(self, tables, ignore_small=False):
        """
        Takes as input a list of table objects and saves each table to
        csv format as table1.csv, table2.csv, ... in the current working
        directory. If ignore_small is True, we ignore any tables with
        5 entries or fewer.

        The number in the file name counts saved tables only, so skipped
        tables leave no gap in the numbering.
        """

        counter = 1
        for table in tables:
            if ignore_small and table.get_metadata().num_entries <= 5:
                continue
            table.save_table("table" + str(counter))
            counter += 1

## Defining the Table class, which holds the data of one table and saves it to CSV

In [None]:
Metadata = namedtuple("Metadata", "num_cols num_entries")


class Table:
    """
    Holds the data of a single table as a dictionary of lists.
    """

    def __init__(self, data):
        """
        Stores a given table as a dictionary whose values are lists of
        cell values.

        The keys are treated as the headings of the table. Note that
        Page.get_tables builds the dictionary with row indices as keys,
        so for those tables each value is one row.
        """
        self.table_data = data

    def get_metadata(self):
        """
        Returns a Metadata object that contains the number of keys in
        the table (num_cols) and the total number of entries over all
        of its lists (num_entries).
        """
        return Metadata(
            num_cols=len(self.table_data),
            num_entries=sum(len(cells) for cells in self.table_data.values()),
        )

    def show_table(self):
        """
        Prints a formatted table to the command line using pprint
        """
        pprint.pprint(self.table_data, width=1)

    def save_table(self, name):
        """
        Saves a table to csv format under the given file name.
        File name should omit the extension.

        Each list stored in the table is written as one line of the
        file; the dictionary keys are not written.
        """
        fname = name + ".csv"

        # The csv module writes text, so the file is opened in text mode;
        # newline="" lets the writer control the line endings itself.
        with open(fname, "w", newline="", encoding="utf-8") as outf:
            w = csv.writer(outf, dialect="excel")
            w.writerows(self.table_data.values())


## Driver Function

In [None]:
# Read the filing we want to process. The context manager closes the file
# even if reading it fails.
with open("0001193125-17-056504 2 copy.html", 'r') as f:
    data = f.read()

# convert to a page object
page = Page(data)
print("converted page object.....")
# get the tables
tables = page.get_tables(data)
print("got the tables.......")

# save the tables as table1.csv, table2.csv, ...
page.save_tables(tables)
print("tables saved.......")
# you can change the name that it saves the table to by calling save_table on the table object itself:
# don't include the extension in the file name
# (guarded so that a filing without any table does not raise an IndexError)
if tables:
    tables[0].save_table("customName")
    print("saved the table........")
# Sometimes, websites use HTML tables to format their page, not represent data
# (https://coinmarketcap.com/exchanges/volume/24-hour/ is a good example of
# such table fragments). Passing ignore_small=True to save_tables skips the
# tables with 5 entries or fewer; it is left at False here, so every table
# is written.

# Note that the below code overwrites the table1.csv, table2.csv, ... files
# that were created by the save_tables call above.
page = Page(data)
# get_tables needs the markup to parse, so it is passed explicitly
tables = page.get_tables(data)
page.save_tables(tables, ignore_small=False)

converted page object.....
enter get table
