In [None]:
import http
import http.client
import json
import os
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode


# Function to convert product names to slugs
def product_name_to_slug(product_name):
    """Convert a product name into a lowercase, hyphen-separated ASCII slug.

    The name is transliterated to ASCII first and lowercased afterwards:
    ``unidecode`` can emit capital letters for non-Latin input (for example
    CJK text), and lowercasing before transliteration would let the final
    ``[^a-z0-9-]`` filter delete those letters instead of keeping them.
    Surrounding whitespace (including whitespace produced by the
    transliteration) is trimmed so it does not turn into stray hyphens.

    Args:
        product_name: Human-readable product name.

    Returns:
        The slug, containing only ``a-z``, ``0-9`` and ``-``.
    """
    slug = unidecode(product_name).strip().lower()
    # Each run of whitespace becomes a single hyphen ...
    slug = re.sub(r'\s+', '-', slug)
    # ... and everything that is not slug-safe is dropped.
    return re.sub(r'[^a-z0-9-]', '', slug)

def serper_name_to_g2(company_name):
    """Look up a company's G2 "alternatives & competitors" page through Serper.

    Sends a search restricted to g2.com to the Serper API and returns the link
    of the first organic result. The API key is read from the
    ``SERPER_API_KEY`` environment variable, the same variable
    ``serper_name_to_domain`` uses, instead of being hard-coded.

    Args:
        company_name: Company name (or domain) to search for.

    Returns:
        The link of the first organic result, or ``None`` when the response
        carries no organic results.

    Raises:
        RuntimeError: If ``SERPER_API_KEY`` is not set, or Serper answers with
            a non-200 status.
    """
    api_key = os.getenv("SERPER_API_KEY")
    if not api_key:
        raise RuntimeError("SERPER_API_KEY environment variable is not set")

    payload = json.dumps({
        "q": f"{company_name} alternatives & competitors site:g2.com"
    })
    headers = {
        'X-API-KEY': api_key,
        'Content-Type': 'application/json'
    }

    conn = http.client.HTTPSConnection("google.serper.dev")
    try:
        conn.request("POST", "/search", payload, headers)
        res = conn.getresponse()
        status = res.status
        data = res.read().decode("utf-8")
    finally:
        # Always release the socket, even when the request itself fails.
        conn.close()

    if status != 200:
        # Fail loudly: treating an API error as "no G2 page" would silently
        # skip every company in the calling loop.
        raise RuntimeError(f"Serper search failed with status {status}: {data[:200]}")

    organic_results = json.loads(data).get('organic') or []
    return organic_results[0]['link'] if organic_results else None

def scrape_g2_alternatives(g2_url, apikey=None, timeout=180):
    """Scrape the competitor names listed on a G2 alternatives page.

    The page is fetched through the ZenRows API (JS rendering, US premium
    proxy, waiting for the ``.grid-x`` selector) and the text of every ``<li>``
    inside the ``<ul class="mb-0 list--chevron">`` element is returned.

    Args:
        g2_url: URL of the G2 alternatives page.
        apikey: ZenRows API key. Defaults to the ``ZENROWS_API_KEY``
            environment variable so the key is not stored in the notebook.
        timeout: Seconds to wait for the ZenRows response before giving up,
            so a stalled request cannot hang the run indefinitely.

    Returns:
        List of list-item texts with surrounding whitespace removed.

    Raises:
        RuntimeError: If no API key is available.
        requests.HTTPError: If ZenRows answers with an error status.
        ValueError: If the expected list element is not present in the page.
    """
    apikey = apikey or os.getenv("ZENROWS_API_KEY")
    if not apikey:
        raise RuntimeError("ZENROWS_API_KEY environment variable is not set")

    params = {
        'url': g2_url,
        'apikey': apikey,
        'js_render': 'true',
        'wait_for': '.grid-x',
        'premium_proxy': 'true',
        'proxy_country': 'us',
        'custom_headers': 'true',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    }
    response = requests.get(
        'https://api.zenrows.com/v1/',
        params=params,
        headers=headers,
        timeout=timeout,
    )
    # Surface API errors directly instead of trying to parse an error body as HTML.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    product_list = soup.find('ul', class_='mb-0 list--chevron')
    if product_list is None:
        # Explicit error instead of an AttributeError on None.
        raise ValueError(f"Competitor list not found in page for {g2_url}")

    return [li.text.strip() for li in product_list.find_all('li')]

# Module-level accumulators shared by the cells below.
# product_competitors: value of the 'domain' CSV column -> list of scraped competitor names.
product_competitors = dict()
# g2_urls: declared here; no code visible in this notebook appends to it.
g2_urls = list()

def run(domains_csv='./data/domains.csv', max_attempts=4):
    """Scrape G2 competitors for every company listed in the domains CSV.

    For each value of the ``domain`` column, the G2 alternatives page is looked
    up with ``serper_name_to_g2`` and scraped with ``scrape_g2_alternatives``.
    Successful results are stored in the module-level ``product_competitors``
    dict, keyed by the CSV value. Companies without a
    ``competitors/alternatives`` URL are skipped.

    Scraping is retried with exponential backoff plus jitter. The function
    does not sleep after the last failed attempt, and the final message
    reports the actual number of attempts made.

    Args:
        domains_csv: Path to a CSV file with a ``domain`` column.
        max_attempts: Maximum number of scrape attempts per company.
    """
    company_df = pd.read_csv(domains_csv)
    company_names = company_df['domain'].to_list()

    for company_name in company_names:
        print(f"Scraping company {company_name}")

        g2_url = serper_name_to_g2(company_name)
        if not g2_url or 'competitors/alternatives' not in g2_url:
            continue

        print(f"Found G2 URL: {g2_url}")

        success = False
        for attempt in range(1, max_attempts + 1):
            try:
                products = scrape_g2_alternatives(g2_url)
            except Exception as e:
                if attempt < max_attempts:
                    wait_time = 2 ** attempt + random.random()  # Exponential backoff with jitter
                    print(f"Attempt {attempt} failed: {e}. Retrying in {wait_time:.2f} seconds...")
                    time.sleep(wait_time)
                else:
                    # Nothing left to retry, so do not waste time backing off.
                    print(f"Attempt {attempt} failed: {e}.")
            else:
                product_competitors[company_name] = products
                success = True
                print(f'Company: {company_name} competitors: {products}')
                break

        if not success:
            print(f"Failed to scrape {company_name} after {max_attempts} attempts. Skipping to next company.")
    

# Run the scrape with its defaults; results accumulate in the module-level
# product_competitors dict that the later cells read.
run()

In [None]:
import urllib.parse
import re
import requests
import pandas as pd
import maquinillo.settings
import os
from requests.auth import HTTPBasicAuth
import time
import random
import urllib
import http.client
import json
import tldextract

def get_base_domain(url):
    """Return the registrable domain (``domain.suffix``) of a URL.

    Args:
        url: URL or host name to analyse with ``tldextract``.

    Returns:
        ``"<domain>.<suffix>"``. When ``tldextract`` reports an empty part
        (for example no public suffix for ``localhost`` or an IP address),
        only the non-empty part is returned instead of a string with a
        dangling dot.
    """
    extracted = tldextract.extract(url)
    # Skip empty components so "localhost" does not become "localhost.".
    return ".".join(part for part in (extracted.domain, extracted.suffix) if part)

def sleep_random_time():
    """Pause for a random duration drawn uniformly from [0, 2) seconds."""
    delay_seconds = random.random() * 2
    time.sleep(delay_seconds)

def get_domain_from_name(name):
    """Resolve a company name to a domain with Clearbit's name-to-domain API.

    The name is passed through ``params`` so that ``requests`` URL-encodes it.
    Interpolating it into the query string breaks names containing ``&``,
    ``#`` or ``+`` (``AT&T`` would be sent as ``name=AT``).

    Args:
        name: Company name to look up.

    Returns:
        The ``domain`` field of the response on HTTP 200, or ``None`` when
        Clearbit answers 404 (not found) or 422 (name rejected).

    Raises:
        Exception: For any other status code, after printing the status and
            the raw response body.
    """
    URL = "https://company.clearbit.com/v1/domains/find"

    CLEARBIT_KEY = os.getenv("CLEARBIT_API_KEY")

    response = requests.get(
        URL,
        params={"name": name},
        auth=HTTPBasicAuth(CLEARBIT_KEY, ""),
        timeout=30,  # do not let a stalled connection hang the whole notebook
    )

    if response.status_code == 200:
        response_json = response.json()
        return response_json["domain"]

    elif response.status_code == 404:
        return None

    elif response.status_code == 422:
        print("Weird name:", name)
        return None

    else:
        print("Status code:", response.status_code)
        # Print the raw text: error bodies are not guaranteed to be JSON, and a
        # decoding failure here would mask the exception raised below.
        print("Body: ", response.text)
        raise Exception("Weird scenario")
    
def serper_name_to_domain(company_name):
    """Guess a company's domain from the first Serper (Google) search result.

    Searches for ``"<company_name> company website"`` and reduces the first
    organic link to its base domain with ``get_base_domain``.

    Args:
        company_name: Company name to search for.

    Returns:
        The base domain of the first organic result, or ``None`` when that
        result is hosted on a LinkedIn domain.

    Raises:
        KeyError, IndexError: If the response contains no organic results
            (unchanged from the previous behaviour; ``name_to_domain``
            handles it).
    """
    SERPER_KEY = os.getenv("SERPER_API_KEY")
    payload = json.dumps({
        "q": f"{company_name} company website"
    })
    headers = {
        'X-API-KEY': SERPER_KEY,
        'Content-Type': 'application/json'
    }

    conn = http.client.HTTPSConnection("google.serper.dev")
    try:
        conn.request("POST", "/search", payload, headers)
        data = conn.getresponse().read().decode("utf-8")
    finally:
        # Always release the socket, even when the request itself fails.
        conn.close()

    results = json.loads(data)
    website = results['organic'][0]['link']

    # Inspect only the host: a path or query that merely mentions "linkedin"
    # (e.g. a blog post about it) must not disqualify the company's own site.
    host = urllib.parse.urlparse(website).hostname or ''
    if 'linkedin' in host:
        return None

    return get_base_domain(website)


def name_to_domain(name):
    """Resolve a company name to a domain, falling back to the name itself.

    Clearbit (``get_domain_from_name``) is tried first; when it yields nothing
    the first Serper search result (``serper_name_to_domain``) is used.

    Args:
        name: Company name to resolve.

    Returns:
        The resolved domain, ``None`` when neither lookup produced one, or
        ``name`` unchanged when a lookup raised.
    """
    print(f"Processing {name}")

    try:
        domain = get_domain_from_name(name) or serper_name_to_domain(name)
    except Exception as e:
        # Best-effort lookup: keep the original name, but report why so that a
        # systematic failure (e.g. a missing API key) does not go unnoticed.
        print(f"Could not resolve {name}: {e!r}. Keeping the name as-is.")
        return name
    return domain

In [None]:
print(product_competitors)

# Replace each scraped competitor name with the result of name_to_domain,
# one company at a time. Only existing keys are reassigned, so the dict can be
# updated while it is being iterated.
for company in product_competitors:
    product_competitors[company] = list(map(name_to_domain, product_competitors[company]))

# orient='index' turns the dict keys into the row index of the exported table.
product_competitors_df = pd.DataFrame.from_dict(product_competitors, orient='index')
product_competitors_df.to_csv('./data/annual_only_competitors.csv')


In [None]:
# Flatten the per-company lists into one list, keeping order and duplicates.
all_competitors = [
    competitor
    for competitors in product_competitors.values()
    for competitor in competitors
]

In [None]:
# Show the flattened competitor list built in the previous cell.
print(all_competitors)