# webscrape.ipynb
## pre-postprocessing, pre-model-training

In [None]:
def url_match(data, link) -> str:
    """Resolve an anchor's ``href`` into a crawl target, or ``''`` if unusable.

    Args:
        data: Indexable company record; only ``data[DOMAIN]`` is read.
        link: Tag-like object exposing ``attrs['href']``.

    Returns:
        * the ``href`` itself when it matches the ``DIRECT_SUB_DOMAINS`` regex;
        * ``<base>/<href>`` when the ``href`` contains no ``.`` (treated as a
          relative path);
        * ``''`` for an empty ``href`` or any other ``href`` containing a ``.``.
    """
    href = link.attrs['href']
    if len(href) == 0:
        return ''
    if re.search(DIRECT_SUB_DOMAINS, href):
        return href
    if "." in href:
        # Not a whitelisted absolute link and not a dot-free relative path.
        return ''

    # Relative path: make sure exactly one '/' joins base and href.
    sep = '' if href[0] == '/' else '/'
    if is_entrypoint(href):
        print("RELATIVE MATCH (SITEMAP): ", href)
        base = data[DOMAIN]
    elif is_entrypoint(data[DOMAIN]):
        print("RELATIVE MATCH (INTRA-SITEMAP): ", href)
        # The current page is itself an entry point, so resolve against the
        # part of data[DOMAIN] that precedes its first '/'.
        base_match = re.search(r"^([^/]+)/", data[DOMAIN])
        # Fix: the original left `base` unbound (UnboundLocalError) when
        # data[DOMAIN] contained no '/'; fall back to the whole value.
        base = base_match.group(1) if base_match is not None else data[DOMAIN]
    else:
        print("RELATIVE MATCH (NON-SITEMAP): ", href)
        base = data[DOMAIN]
    return base + sep + href

In [None]:
def find_about_pages(links, data):
    """Record a page's outgoing links and fetch (a sample of) them in parallel.

    Links are kept when they carry an ``href`` that contains none of
    ``# ( ) @`` and no ``.pdf``. The kept hrefs are stored as one document in
    the ``data.sublinks`` Mongo collection. If more than ``MAX_SUBPAGES``
    links were kept, ``MAX_SUBPAGES`` of them are drawn at random and only
    those whose resolved URL matches ``DIRECT_SUB_DOMAINS`` are fetched;
    otherwise every kept link is fetched.

    Args:
        links: Iterable of tag-like objects exposing ``attrs``; must support
            ``len()``.
        data: Indexable company record (``data[ID]`` and ``data[DOMAIN]`` are
            read here; the record is passed on to ``fetch_html``).

    Returns:
        None. Results are persisted by ``fetch_html``.
    """
    href_list = []
    filtered_links = []
    for link in links:
        if 'href' not in link.attrs:
            continue
        href = link.attrs['href']
        if set(href).isdisjoint({'#', '(', ')', '@'}) and '.pdf' not in href:
            href_list.append(href)
            filtered_links.append(link)

    entry = {
        'id': data[ID],
        'origin': complete_url(data[DOMAIN]),
        'all_link_count': len(links),
        'links': str(href_list),
    }
    client = MongoClient(DB_CONNECTION)
    try:
        client.data.sublinks.insert_one(entry)
    finally:
        # Close the connection even if the insert fails.
        client.close()

    # Resolve each link exactly once; the original resolved sampled links a
    # second time on submit, which duplicated the url_match log lines.
    if len(filtered_links) > MAX_SUBPAGES:
        target_urls = []
        indices = np.random.choice(len(filtered_links), MAX_SUBPAGES, replace=False)
        for index in indices:
            res = url_match(data, filtered_links[index])
            if res != "" and re.search(DIRECT_SUB_DOMAINS, complete_url(res)):
                target_urls.append(res)
    else:
        target_urls = [url_match(data, link) for link in filtered_links]

    # Leaving the `with` block waits for all submitted fetches to finish.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for url in target_urls:
            if url != '':  # fetch_html('') is a no-op returning None
                executor.submit(fetch_html, data, url)

In [None]:
def extract_text(tags, len_cutoff=None):
    """Join the inner text of HTML tags into one space-separated string.

    Each tag is stringified and searched for a run of allowed characters
    enclosed by markup; double quotes in the captured text are replaced by
    single quotes. Tags that do not match contribute nothing.

    Args:
        tags: Iterable of objects whose ``str()`` is an HTML fragment.
        len_cutoff: Minimum length of the joined text. Defaults to the
            module-level ``LEN_CUTOFF`` when omitted.

    Returns:
        The joined text, or ``None`` when it is shorter than the cutoff.
    """
    if len_cutoff is None:
        len_cutoff = LEN_CUTOFF

    # Raw string: the original literal relied on the invalid escape "\w".
    # NOTE(review): inside the class, ` -,` is a *range* (space through
    # comma), so a literal hyphen is NOT accepted while # $ % & ( ) * + are.
    # Kept as-is to avoid changing the scraped corpus; confirm the intent.
    pattern = r"<.*>([\w -,.!?\"\']+)<.*>"

    texts = []
    for t in tags:
        inner_match = re.search(pattern, str(t))
        if inner_match:
            texts.append(inner_match.group(1).replace('"', "'"))
    text = ' '.join(texts)
    if len(text) < len_cutoff:
        return None
    return text

In [None]:
def complete_url(base_url) -> str:
    """Return ``base_url`` as an absolute URL, adding ``https://www.`` if needed.

    Args:
        base_url: A bare domain/path (``example.com/about``), a ``www.``-led
            host, or an already absolute ``http(s)://`` URL.

    Returns:
        * the input unchanged when it already starts with ``http://`` or
          ``https://`` (the original prefixed these a second time unless they
          contained ``https://www.``, yielding an invalid URL);
        * ``https://`` + input when it starts with ``www.``;
        * ``https://www.`` + input otherwise.
    """
    lowered = base_url.lower()
    if lowered.startswith(('http://', 'https://')):
        return base_url
    if lowered.startswith('www.'):
        return 'https://' + base_url
    return 'https://www.' + base_url

def is_entrypoint(href) -> bool:
    """Return True if ``href`` contains any of the ``ENTRYPOINT_POSITIVES``.

    Args:
        href: String to test (a link target or a domain/path).

    Returns:
        ``True`` when at least one element of ``ENTRYPOINT_POSITIVES`` is a
        substring of ``href``; ``False`` otherwise (the original fell off the
        end and returned ``None`` despite the ``bool`` annotation).
    """
    return any(elem in href for elem in ENTRYPOINT_POSITIVES)

In [None]:
def find_site_map(links, data) -> list:
    """Collect resolved targets for every link that looks like an entry point.

    Args:
        links: Iterable of tag-like objects exposing ``attrs``.
        data: Company record forwarded to ``url_match``.

    Returns:
        List of non-empty URLs returned by ``url_match`` for links that have
        an ``href`` accepted by ``is_entrypoint``, in document order.
    """
    # Lazy generator: each link is checked and resolved in turn, one at a time.
    resolved = (
        url_match(data, anchor)
        for anchor in links
        if 'href' in anchor.attrs.keys() and is_entrypoint(anchor.attrs['href'])
    )
    return [target for target in resolved if target != ""]

In [None]:
def fetch_html(data, base_url):
    """Download a page, persist its extracted text, and return the parsed HTML.

    On HTTP 200 a document (id, endpoint, status, time, text) is inserted
    into ``data.documents`` and, if no company with ``data[ID]`` exists yet,
    the company record is inserted into ``data.companies``. On a request
    failure a record is inserted into ``data.failures`` instead.

    Args:
        data: Indexable company record (read via ``ID``, ``NAME``, ``DOMAIN``,
            ``YEAR_FOUNDED``, ``INDUSTRY``, ``SIZE_RANGE``, ``LOCALITY``,
            ``COUNTRY``, ``LINKEDIN_URL`` and ``data[-1]``).
        base_url: Target URL or bare domain; ``''`` is ignored.

    Returns:
        The ``BeautifulSoup`` tree for any HTTP response (including non-200
        ones, for which nothing is stored), or ``None`` when ``base_url`` is
        empty or the request failed.
    """
    if base_url == '':
        return None
    full_url = complete_url(base_url)

    # Only the network call can raise the requests exceptions handled below,
    # so keep the try body to that single statement.
    result = None
    error = ''
    try:
        result = requests.get(full_url, timeout=TIMEOUT, headers=np.random.choice(HEADERS))
    except requests.exceptions.Timeout:
        error = 'timeout'
    except requests.exceptions.SSLError:
        error = 'too many retries'
    except requests.exceptions.TooManyRedirects:
        error = 'too many redirects'
    except requests.exceptions.ConnectionError:
        error = 'refused connection'
    except requests.exceptions.RequestException as exc:
        # Any other requests failure (e.g. invalid URL) used to abort the
        # whole work unit; record it like the other failures instead.
        error = 'request error: ' + type(exc).__name__

    if result is None:
        failure = {
            'id': data[ID],
            'endpoint': full_url,
            'time': datetime.datetime.now(),
            'error': error
        }
        client = MongoClient(DB_CONNECTION)
        try:
            client.data.failures.insert_one(failure)
        finally:
            # The original never closed this connection.
            client.close()
        return None

    status = result.status_code
    print("URL: ", full_url)
    print("STATUS: ", status)
    soup = BeautifulSoup(result.content, 'html.parser')

    if status == 200:
        html_tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        document = {
            'id': data[ID],
            'endpoint': full_url,
            'status': status,
            'time': datetime.datetime.now(),
            'text': extract_text(html_tags)
        }
        client = MongoClient(DB_CONNECTION)
        try:
            # Insert the company only once, on its first successful fetch.
            if not client.data.companies.find_one({'id': data[ID]}):
                company = {
                    'id': data[ID],
                    'name': data[NAME],
                    'domain': data[DOMAIN],
                    'year_founded': data[YEAR_FOUNDED],
                    'industry': data[INDUSTRY],
                    'size_range': data[SIZE_RANGE],
                    'locality': data[LOCALITY],
                    'country': data[COUNTRY],
                    'linked_in_url': data[LINKEDIN_URL],
                    'relevant': data[-1]
                }
                client.data.companies.insert_one(company)
            client.data.documents.insert_one(document)
        finally:
            client.close()
    return soup

In [None]:
def work_unit(data, is_site_map):
    """Scrape one company record, following entry-point pages when present.

    Args:
        data: Mutable, indexable company record. ``data[DOMAIN]`` is
            overwritten in place with each entry-point target before the
            recursive call.
        is_site_map: When true, the page at ``data[DOMAIN]`` is treated as an
            entry point and its links go straight to ``find_about_pages``.
            When false, the page is first searched for entry-point links.

    Returns:
        None.
    """
    soup = fetch_html(data, data[DOMAIN])
    if soup is None:
        return

    anchors = soup.find_all("a")
    if is_site_map:
        find_about_pages(anchors, data)
        return

    site_map_targets = find_site_map(anchors, data)
    if not site_map_targets:
        # No entry-point links found: sample this page's own links.
        find_about_pages(anchors, data)
        return

    print(site_map_targets)
    for target in site_map_targets:
        data[DOMAIN] = target
        work_unit(data, True)

In [None]:
def execute_work(data):
    """Run ``work_unit(row, False)`` for every row on a pool of 8 processes.

    Args:
        data: Iterable of company records; each one is submitted as its own
            task.

    Returns:
        None. Blocks until every task has finished. A task that raised is
        reported on stdout rather than silently dropped (the original never
        inspected its futures, so worker exceptions disappeared).
    """
    with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
        future_to_row = {
            executor.submit(work_unit, obj, False): obj for obj in data
        }
        for future in concurrent.futures.as_completed(future_to_row):
            exc = future.exception()
            if exc is not None:
                print("WORK UNIT FAILED: ", future_to_row[future], repr(exc))

In [None]:
def load_sam_entities_data(num_enqueues, src_file):
    """Build work rows from a JSON dump of entities.

    Entries under ``entities['domain_agent']`` whose ``domain_agent_url``
    matches ``TARGET_URLS`` are numbered 1, 2, ... in file order; that number
    (as a string) becomes the row id. Numbers already present in
    ``PREVIOUS_SCRAPES`` are skipped but still counted.

    Args:
        num_enqueues: Maximum number of matching entries to consider.
        src_file: Path to the JSON file.

    Returns:
        List of 11-element rows: id, name, url, '', ``attribute_agent``,
        followed by six empty strings.
    """
    data = []
    counter = 0
    with open(src_file) as jsonfile:
        entities = json.load(jsonfile)

    for entry in entities['domain_agent']:
        # Fix: stop *before* taking entry number num_enqueues + 1. The
        # original appended that extra row and only then broke out.
        if counter >= num_enqueues:
            break
        if not re.search(TARGET_URLS, entry['domain_agent_url']):
            continue
        counter += 1
        if str(counter) in PREVIOUS_SCRAPES:
            continue
        data.append([
            str(counter), entry['domain_agent_name'],
            entry['domain_agent_url'], '', entry['attribute_agent'],
            '', '', '', '', '', '',
        ])
    return data

In [None]:
# USE STRICTLY W/ PEOPLE DATA LABS DATASET
def enqueue_work_units(is_binary, class_rerouting, src_file, max_rows=200000):
    """Read company rows from a CSV and group the accepted ones by label.

    A row is accepted when its ``DOMAIN`` column matches ``TARGET_URLS``, its
    ``ID`` is not in ``PREVIOUS_SCRAPES`` and its ``EMPLOYEE_ESTIMATE``
    exceeds ``MIN_EMPLOYEES``. The first CSV row is treated as a header.

    Args:
        is_binary: When true, each accepted row gets an int label appended
            (1 if its industry is in ``RELEVANT_CLASSES``, else 0) and is
            stored under the key ``'1'`` or ``'0'``.
        class_rerouting: Industry mapping for multiclass mode; currently
            unused (see the commented-out line below).
        src_file: Path to the CSV file.
        max_rows: Stop after this many accepted rows (default 200000, the
            previously hard-coded cap).

    Returns:
        ``defaultdict(list)`` mapping label string -> list of rows.
        NOTE(review): with ``is_binary=False`` nothing is stored and the
        result is empty -- confirm whether multiclass grouping is intended.

    Raises:
        ValueError: if an ``EMPLOYEE_ESTIMATE`` cell is not an integer.
    """
    class_data = defaultdict(list)
    accepted = 0
    with open(src_file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(reader, None)  # skip csv header row
        for row in reader:
            current_url = row[DOMAIN]
            if not (re.search(TARGET_URLS, current_url) and
                    row[ID] not in PREVIOUS_SCRAPES and
                    int(row[EMPLOYEE_ESTIMATE]) > MIN_EMPLOYEES):
                continue
            accepted += 1
            custom_row = row[0:4]
            # UNCOMMENT BELOW IF MULTICLASS
            #custom_row.append(class_rerouting[row[INDUSTRY]])
            custom_row.append(row[INDUSTRY])
            custom_row.extend(row[5:])
            # BINARIZE LABELS - row size is larger in binary mode
            if is_binary:
                label = 1 if row[INDUSTRY] in RELEVANT_CLASSES else 0
                custom_row.append(label)
                class_data[str(label)].append(custom_row)
            if accepted >= max_rows:
                break
    return class_data

In [None]:
def randomly_sample_classes(class_data, sample_size=None):
    """Draw an equal-sized random sample from every class and shuffle them.

    Args:
        class_data: Mapping of class label -> list of rows.
        sample_size: Rows to draw per class, without replacement. Defaults to
            the module-level ``CLASS_BALANCE_THRESH`` when omitted.

    Returns:
        A single shuffled list containing ``sample_size`` rows per class.

    Raises:
        ValueError: raised by ``np.random.choice`` when a class holds fewer
            than ``sample_size`` rows.
    """
    if sample_size is None:
        sample_size = CLASS_BALANCE_THRESH
    data = []
    for samples in class_data.values():
        indices = np.random.choice(len(samples), sample_size, replace=False)
        data.extend(samples[index] for index in indices)
    np.random.shuffle(data)
    return data

In [3]:
def enqueue_training_data(src_file):
    """Load saved training rows from a CSV and group them by class.

    The first column of each row is the class label. Rows whose first cell
    is the literal ``'class'`` (the header) are skipped, as are blank rows
    (which previously raised ``IndexError``).

    Args:
        src_file: Path to the CSV file.

    Returns:
        ``defaultdict(list)`` mapping label -> list of rows, where each row
        is the original row with the label moved from the front to the end.
    """
    class_data = defaultdict(list)
    with open(src_file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        for row in reader:
            if not row or row[0] == 'class':
                continue
            label, *features = row
            class_data[label].append(features + [label])
    return class_data

def write_training_metadata(data, filename):
    """Append sampled rows to a CSV with the label moved to the first column.

    Each output row is ``[sample[-1]] + sample[:-2]``: the last element
    (the label) first, followed by every element except the last two.
    NOTE(review): the second-to-last element is dropped as well -- confirm
    this is intended rather than ``sample[:-1]``.

    Args:
        data: Iterable of list rows.
        filename: Path of the CSV file; opened in append mode.

    Returns:
        None.
    """
    # newline='' is what the csv module requires for writer targets; without
    # it, platforms that translate '\n' emit an extra blank line per row.
    with open(filename, 'a', newline='') as f:
        writer = csv.writer(f)
        for sample in data:
            writer.writerow([sample[-1]] + sample[:-2])

In [2]:
def load_relevant_classes(src_file):
    """Read class names from a manifest, one ``name: value`` entry per line.

    The name is the text before the first ``': '`` on a line (the whole line
    when there is no ``': '``). Blank lines are skipped.

    The original read lines through ``csv.reader(..., delimiter='\\n')``. A
    newline never acts as a field delimiter there, recent CPython releases
    reject it outright, and blank lines raised ``IndexError``. Reading lines
    directly avoids all three.
    NOTE(review): CSV-style double quotes are no longer interpreted; names
    are taken verbatim. Confirm the manifest contains no quoted names.

    Args:
        src_file: Path to the manifest file.

    Returns:
        Set of class-name strings.
    """
    relevant_classes = set()
    with open(src_file) as manifest:
        for line in manifest:
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            relevant_classes.add(line.split(': ')[0])
    return relevant_classes

def populate_class_mapping(src_file):
    """Build an alias -> target-class mapping from a CSV manifest.

    Every row must contain a ``'*'`` cell. The cell immediately before the
    ``'*'`` is the target class, and every cell before the target is mapped
    to it. The target itself and anything after the ``'*'`` are not added.

    Args:
        src_file: Path to the CSV file.

    Returns:
        Dict mapping each alias to its target class.

    Raises:
        ValueError: if a row has no ``'*'`` cell.
    """
    mapping = {}
    with open(src_file) as handle:
        for fields in csv.reader(handle, delimiter=',', quotechar='\"'):
            marker = fields.index('*')
            # Rows with fewer than two cells before '*' contribute nothing.
            aliases = fields[:marker - 1] if marker > 1 else []
            for alias in aliases:
                mapping[alias] = fields[marker - 1]
    return mapping

In [1]:
def webscrape(is_train):
    """Assemble the list of work rows and scrape them.

    Args:
        is_train: When true, rows come from the People Data Labs CSV,
            labelled in binary mode and class-balanced by random sampling.
            When false, rows come from the SAM.gov entities JSON dump.

    Returns:
        None. Nothing is scraped when no rows were produced.
    """
    data = None
    if is_train:
        # NOTE(review): rel_classes is loaded but never used below;
        # enqueue_work_units reads the module-level RELEVANT_CLASSES instead.
        rel_classes = load_relevant_classes(src_file='./manifests/pointed.csv')
        # Fix: the directory was misspelled './maniefests/', unlike the
        # './manifests/' used on the line above.
        custom_class_map = populate_class_mapping(src_file='./manifests/collapsed.csv')
        # NOTE(review): this path is absolute ('/inputs/...') while the other
        # inputs are relative ('./inputs/...') -- confirm which is intended.
        class_data = enqueue_work_units(
            is_binary=True,
            class_rerouting=custom_class_map,
            src_file='/inputs/companies_sorted.csv' # ENQUEUE ENTIRE PEOPLE DATA LABS DATASET (PDL)
        )
        data = randomly_sample_classes(class_data)
    else:
        data = load_sam_entities_data(
            num_enqueues=100,
            src_file='./inputs/sam_entities.json' # SAM.GOV ENTITIES JSON DUMP
        )
    if data: execute_work(data)

    # Alternative flow, kept for reference (not executed):
    #
    # class_data = enqueue_training_data(
    #     src_file='inputs/company_samples_10000_hq.csv' # ENQUEUE PDL SUBSET FROM METADATA
    # )
    # write_training_metadata(data, './company_samples_XXXX_Cq.csv') # SAVE PDL COMPANY METADATA FOR REPLICATION PURPOSES

In [None]:
# Notebook entry point: run the scrape in training mode (rows are read from
# the People Data Labs CSV rather than the SAM.gov JSON dump).
webscrape(is_train=True)