In [None]:
import urllib.parse
import re
import requests
import pandas as pd
import maquinillo.settings
import os
from requests.auth import HTTPBasicAuth
import time
import random
import urllib
import http.client
import json
import tldextract
import concurrent.futures


def get_base_domain(url):
    """Return the domain and public suffix of *url* joined by a dot.

    Args:
        url: URL or hostname string, parsed with ``tldextract.extract``.

    Returns:
        ``"<domain>.<suffix>"`` built from the extraction result. When the
        suffix is empty (tldextract reports no public suffix, e.g. for an IP
        address or ``localhost``) only the domain part is returned, so the
        result never ends with a dangling dot.
    """
    extracted = tldextract.extract(url)
    # Skip empty components: a plain f-string would yield "localhost." here.
    return ".".join(part for part in (extracted.domain, extracted.suffix) if part)

def sleep_random_time():
    """Pause the calling thread for a random duration below two seconds."""
    # random.random() yields a float in [0.0, 1.0); doubling it caps the
    # pause just under 2 seconds.
    delay_seconds = random.random() * 2
    time.sleep(delay_seconds)

def get_domain_from_name(name):
    """Look up a company's domain by name via Clearbit's domain-find endpoint.

    The API key is read from the ``CLEARBIT_API_KEY`` environment variable and
    sent as the HTTP basic-auth username with an empty password.

    Args:
        name: Company name to search for.

    Returns:
        The ``"domain"`` field of the JSON response on HTTP 200, or ``None``
        when the API answers 404 or 422.

    Raises:
        Exception: For any other HTTP status code.
        requests.RequestException: On network failure or timeout.
    """
    endpoint = "https://company.clearbit.com/v1/domains/find"
    api_key = os.getenv("CLEARBIT_API_KEY")

    # Pass the name through ``params`` so requests URL-encodes it; names
    # containing "&", "#" or spaces would otherwise corrupt the query string.
    # The timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(
        endpoint,
        params={"name": name},
        auth=HTTPBasicAuth(api_key, ""),
        timeout=30,
    )

    if response.status_code == 200:
        return response.json()["domain"]

    if response.status_code == 404:
        return None

    if response.status_code == 422:
        print("Weird name:", name)
        return None

    print("Status code:", response.status_code)
    # Print the raw text: error bodies are not guaranteed to be JSON, and a
    # decode failure here would mask the real problem.
    print("Body: ", response.text)
    raise Exception("Weird scenario")
    
def serper_name_to_domain(company_name):
    """Guess a company's domain from the first organic result of a web search.

    Sends a POST to ``google.serper.dev/search`` with the query
    ``"<company_name> company website"`` and reduces the first organic
    result's link to its base domain via ``get_base_domain``.

    Args:
        company_name: Company name to search for.

    Returns:
        The base domain of the first organic result, or ``None`` when there
        are no organic results or the first link contains ``"linkedin"``.

    Raises:
        Exception: If the search API answers with a non-200 status.
        OSError: On connection failure or timeout.
    """
    payload = json.dumps({"q": f"{company_name} company website"})
    headers = {
        # NOTE(review): '<API_KEY>' looks like a redacted placeholder; a real
        # key presumably has to be supplied before this can succeed.
        'X-API-KEY': '<API_KEY>',
        'Content-Type': 'application/json',
    }

    conn = http.client.HTTPSConnection("google.serper.dev", timeout=30)
    try:
        conn.request("POST", "/search", payload, headers)
        res = conn.getresponse()
        status = res.status
        data = res.read().decode("utf-8")
    finally:
        # Always release the socket, including when the request fails.
        conn.close()

    if status != 200:
        # Fail loudly so a rejected request is not mistaken for "no results".
        raise Exception(f"Serper search failed with status {status}")

    results = json.loads(data)
    organic = results.get('organic') or []
    if not organic:
        # No organic hits: report "not found" rather than raising IndexError.
        return None

    website = organic[0]['link']
    if 'linkedin' in website:
        return None

    return get_base_domain(website)


def enrich_company(row):
    """Add a ``'domain_found'`` entry to *row* based on its ``'Company Name'``.

    ``get_domain_from_name`` is tried first; when it returns a falsy value,
    ``serper_name_to_domain`` is used as a fallback.

    Args:
        row: Mapping-like record (e.g. a pandas Series) with a
            ``'Company Name'`` entry. It is modified in place.

    Returns:
        The same *row*, with ``'domain_found'`` set to the domain that was
        found, or to ``None`` when nothing was found or a lookup raised.
    """
    company_name = row['Company Name']
    print(company_name)
    try:
        domain = get_domain_from_name(company_name)
        if not domain:
            domain = serper_name_to_domain(company_name)
    except Exception as e:
        # Best effort: report the failure and keep the record, so the caller
        # always receives a row rather than a placeholder string.
        print(f"Lookup failed for {company_name}: {e}")
        domain = None
    row['domain_found'] = domain
    return row


In [None]:
# Source data: one record per company to enrich.
df = pd.read_csv('subtitles.csv')

# Resume from a previous run's checkpoint when one exists; otherwise start
# with an empty results frame that shares the input's columns.
checkpoint_file = 'checkpoint.csv'
if not os.path.exists(checkpoint_file):
    df_enriched = pd.DataFrame(columns=df.columns)
    processed_rows = 0
else:
    df_enriched = pd.read_csv(checkpoint_file)
    # Every row already in the checkpoint counts as processed.
    processed_rows = len(df_enriched)
    print(f"Resuming from row {processed_rows}...")

# Rate limiting wrapper
def rate_limited_enrich(row, requests_per_second):
    """Enrich one row, then pause for ``1 / requests_per_second`` seconds.

    Args:
        row: Mapping-like record (e.g. a pandas Series) passed to
            ``enrich_company``.
        requests_per_second: Call rate to stay under; must be non-zero.

    Returns:
        The value returned by ``enrich_company``, or the original *row* when
        it raises.
    """
    try:
        enriched_row = enrich_company(row)  # Call your actual function
    except Exception as e:
        # Use the same 'Company Name' key as the rest of the pipeline; .get
        # keeps the error report itself from raising on an unexpected row.
        print(f"Error processing row {row.get('Company Name')}: {e}")
        enriched_row = row  # Fall back to the unmodified row
    # Pause after failures too, so errors do not speed up the request rate.
    time.sleep(1 / requests_per_second)
    return enriched_row

# Function to parallelize the processing with rate limiting and checkpointing
def parallel_apply_with_checkpoint(df, func, requests_per_minute=600, num_workers=4, checkpoint_interval=500):
    """Apply *func* to the unprocessed rows of *df* in a thread pool.

    Results are appended to the module-level ``df_enriched`` frame, which is
    written to ``checkpoint_file`` every ``checkpoint_interval`` rows and once
    more after the last row.

    Args:
        df: Input DataFrame. Rows at positions below ``len(df_enriched)`` are
            treated as already processed and skipped.
        func: Callable that takes one row and returns the enriched row.
        requests_per_minute: Upper bound on calls to *func* across all
            workers combined; must be non-zero.
        num_workers: Number of worker threads.
        checkpoint_interval: Number of processed rows between checkpoints.

    Returns:
        The updated module-level ``df_enriched`` DataFrame.
    """
    global df_enriched  # Use the global enriched DataFrame to store results

    requests_per_second = requests_per_minute / 60
    # Every worker pauses after each call, so the pause is scaled by the pool
    # size to keep the combined rate of all workers within the budget.
    per_worker_delay = num_workers / requests_per_second

    def _throttled(row):
        """Run func on one row, falling back to the row itself on failure."""
        try:
            result = func(row)
        except Exception as e:
            print(f"Error processing row {row.get('Company Name')}: {e}")
            result = row
        # Guard against callbacks that return a non-row sentinel (such as an
        # empty string or None) on failure; one such value would corrupt the
        # batch DataFrame built at the next checkpoint.
        if not isinstance(result, (pd.Series, dict)):
            result = row
        time.sleep(per_worker_delay)
        return result

    # Derive the resume point from the results gathered so far, so that a
    # repeated call does not reprocess rows that are already present.
    df_to_process = df.iloc[len(df_enriched):]
    total = len(df_to_process)

    # Buffer rows between checkpoints: concatenating once per batch avoids
    # copying the whole results frame for every single row.
    pending = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        rows = (row for _, row in df_to_process.iterrows())
        for count, enriched_row in enumerate(executor.map(_throttled, rows), start=1):
            pending.append(enriched_row)

            # Checkpoint: save progress every 'checkpoint_interval' rows and
            # after the final row.
            if count % checkpoint_interval == 0 or count == total:
                df_enriched = pd.concat([df_enriched, pd.DataFrame(pending)], ignore_index=True)
                pending.clear()
                df_enriched.to_csv(checkpoint_file, index=False)
                print(f"Checkpoint saved: {len(df_enriched)} rows processed.")

    return df_enriched

# Apply the function in parallel while respecting the rate limit and checkpointing
df_enriched = parallel_apply_with_checkpoint(df, enrich_company, requests_per_minute=600)

# Final save to the output CSV file
output_file = 'buyers_enriched.csv'
df_enriched.to_csv(output_file, index=False)

# Build the message from the path actually written so the two cannot drift
# apart (the message previously named a different file).
print(f"Processing complete and saved to {output_file}")