In [2]:
import pandas as pd
import http.client
import json
import requests
from bs4 import BeautifulSoup
import time
import concurrent.futures
import re

def find_linkedin_url(company_name):
    conn = http.client.HTTPSConnection("google.serper.dev")
    payload = json.dumps({
        "q": f"{company_name} site:linkedin.com"
    })
    headers = {
        'X-API-KEY': '46e6377865b21659da0a212efadbadf2129740f5',
        'Content-Type': 'application/json'
    }
    conn.request("POST", "/search", payload, headers)
    res = conn.getresponse()
    data = res.read().decode("utf-8")
    results = json.loads(data)
    if not results:
        return None
    if not results['organic']:
        return None
    website = results['organic'][0]['link']
    return website

def extract_linkedin_company_id(linkedin_url):
    print(f"Fetching {linkedin_url}")
    try:
        zenrows_apikey = '2fb712f035250fa0feba32543c584318e4894544'
        params = {
            'url': linkedin_url,
            'apikey': zenrows_apikey,
            'js_render': 'true',
            'custom_headers': 'true',
            'premium_proxy': 'true',
            'wait_for': '[data-entity-id]'
        }
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        }

        retries = 4
        backoff_factor = 2
        initial_wait = 2  # initial wait time in seconds

        for attempt in range(retries):
            response = requests.get('https://api.zenrows.com/v1/', params=params, headers=headers)

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')

                # Use regex to find the number in the format urn:li:organization:<number>
                match = re.search(r'urn:li:organization:(\d+)', str(soup))

                if match:
                    data_entity_id = match.group(1)
                    print(data_entity_id)
                    return data_entity_id

            print(f"Error crawling {linkedin_url}: {response.text}, attempt {attempt + 1} of {retries}")
            wait_time = initial_wait * (backoff_factor ** attempt)
            time.sleep(wait_time)  # Exponential backoff
    except Exception:
        return None
    return None

def get_linkedin_id(company_name):
    linkedin_url = find_linkedin_url(company_name)
    if not linkedin_url:
        return None
    linkedin_id = extract_linkedin_company_id(linkedin_url)
    return linkedin_id

def process_company(company_name):
    linkedin_id = get_linkedin_id(company_name)
    return company_name, linkedin_id

df = pd.read_csv('domains.csv')

# Using ThreadPoolExecutor to run 10 concurrent rows at a time
with concurrent.futures.ThreadPoolExecutor(max_workers=7) as executor:
    results = list(executor.map(process_company, df['domain']))

# Convert the results into a DataFrame
results_df = pd.DataFrame(results, columns=['domain', 'linkedin_id'])

# Merge the new DataFrame with the original one
df = df.merge(results_df, on='domain')

print(df)


KeyError: 'Company Domain Name'

In [2]:
df.to_csv('domains_enriched.csv')

In [8]:
import requests
import time
from bs4 import BeautifulSoup
import urllib.parse
import re
from requests.auth import HTTPBasicAuth
import urllib
import http.client
import json

def get_domain_from_name(name):
    URL = f"https://company.clearbit.com/v1/domains/find?name={name}"

    CLEARBIT_KEY = 'sk_0afe7e300fe9fd1777f5e11811dc530f'

    response = requests.get(URL, auth=HTTPBasicAuth(CLEARBIT_KEY, ""))

    if response.status_code == 200:
        response_json = response.json()
        return response_json["domain"]

    elif response.status_code == 404:
        return None

    elif response.status_code == 422:
        print("Weird name:", name)
        return None

    else:
        print("Status code:", response.status_code)
        print("Body: ", response.json())
        raise Exception("Weird scenario")    
    
def serper_name_to_domain(company_name):
    SERPER_KEY = '46e6377865b21659da0a212efadbadf2129740f5'
    conn = http.client.HTTPSConnection("google.serper.dev")
    payload = json.dumps({
    "q": f"{company_name} company website"
    })
    headers = {
    'X-API-KEY': SERPER_KEY,
    'Content-Type': 'application/json'
    }
    conn.request("POST", "/search", payload, headers)
    res = conn.getresponse()
    data = res.read().decode("utf-8")
    results = json.loads(data)
    website = results['organic'][0]['link']

    if 'linkedin' in website:
        return None

    return results['organic'][0]['link']


def find_company_domain(company_name):
    try:
        clearbit_domain =  get_domain_from_name(company_name)
        if clearbit_domain:
           return clearbit_domain
        else:
           return serper_name_to_domain(company_name)
    except Exception as e:
        return None

# Parsing functions
def get_parsed_html(html: str) -> BeautifulSoup:
    return BeautifulSoup(html, "html.parser")

def get_body_from_html(html: str) -> str:
    """
    Gets the body of the page
    """
    if not html:
        return ""
    soup = get_parsed_html(html)
    # Extract the body content if exists, otherwise use the whole page
    body = soup.body if soup.body else soup
    # Get the text content and remove non-ASCII characters
    body_text = body.get_text(" ", strip=True)
    return body_text.encode("ascii", "ignore").decode()

# Main function
def fetch_parsed_response(url):
    max_retries = 4
    base_delay = 2  # Initial delay in seconds

    try:
        if not url:
            raise ValueError("Missing or invalid 'website' key in input_data.")

        apikey = '2fb712f035250fa0feba32543c584318e4894544'  # Our Zenrows API Key
        params = {
            'url': 'capchase.com',
            'apikey': apikey,
            'js_render': 'true',
            'premium_proxy': 'true'
        }

        attempt = 0
        while attempt < max_retries:
            try:
                # Make the GET request
                response = requests.get('https://api.zenrows.com/v1/', params=params)
                
                # Raise an HTTPError for bad responses (e.g., 4xx or 5xx)
                response.raise_for_status()

                # Parse the HTML response
                parsed_body = get_body_from_html(response.text)

                # Return parsed body and empty metadata
                return {
                    'body': parsed_body,
                    'error': None
                }

            except requests.RequestException as e:
                print(e)
                attempt += 1
                if attempt >= max_retries:
                    raise e  # Re-raise the exception if max retries reached
                print(f"Attempt {attempt} failed. Retrying in {base_delay ** attempt} seconds...")
                time.sleep(base_delay ** attempt)  # Exponential backoff

    except (KeyError, ValueError, requests.RequestException) as e:
        # Handle KeyError, ValueError, and HTTP/connection-related exceptions
        error_message = f"Error occurred: {e}"
        print(error_message)  # Optionally log the error

        return {
            'body': None,
            'error': error_message
        }

    except Exception as e:
        # Catch any other unexpected exceptions
        error_message = f"An unexpected error occurred: {e}"
        print(error_message)  # Optionally log the error

        return {
            'body': None,
            'error': error_message
        }

# Example return statement
res = fetch_parsed_response('madisonk12.us')

400 Client Error: Bad Request for url: https://api.zenrows.com/v1/?url=capchase.com&apikey=2fb712f035250fa0feba32543c584318e4894544&js_render=true&premium_proxy=true
Attempt 1 failed. Retrying in 2 seconds...
400 Client Error: Bad Request for url: https://api.zenrows.com/v1/?url=capchase.com&apikey=2fb712f035250fa0feba32543c584318e4894544&js_render=true&premium_proxy=true
Attempt 2 failed. Retrying in 4 seconds...


KeyboardInterrupt: 

In [9]:
print("http" + "url")


httpurl
