In [None]:
import pandas as pd
import requests
import http
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Function to get RepVue URL using serper
def serper_name_to_repvue_slug(company_name):
    """Look up a company's RepVue slug through the Serper search API.

    Posts the query ``"<company_name> site:repvue.com"`` to
    ``google.serper.dev/search`` and uses the last path segment of the first
    organic result's link as the slug.

    Args:
        company_name: Company name to search for. Non-string or blank values
            (``None``, the NaN float pandas uses for an empty cell, ``""``)
            are rejected without issuing a request, so they are never
            searched literally as "None" or "nan".

    Returns:
        The slug string, or ``None`` when the name is unusable, the search
        yields no organic result with a link, or the request/parse fails.
    """
    # Imported here because a bare ``import http`` does not load the
    # ``http.client`` submodule; it is only reachable when some other module
    # happens to have imported it already.
    import http.client

    if not isinstance(company_name, str) or not company_name.strip():
        return None

    conn = None
    try:
        # The timeout keeps a stalled connection from blocking the caller
        # indefinitely.
        conn = http.client.HTTPSConnection("google.serper.dev", timeout=30)
        payload = json.dumps({
            "q": f"{company_name} site:repvue.com"
        })
        headers = {
            'X-API-KEY': '<API_KEY>',
            'Content-Type': 'application/json'
        }
        conn.request("POST", "/search", payload, headers)
        res = conn.getresponse()
        results = json.loads(res.read().decode("utf-8"))

        # Error responses may carry no 'organic' key at all; treat that the
        # same as an empty result list.
        organic_results = results.get('organic') or []
        if not organic_results:
            return None

        repvue_url = organic_results[0].get('link')
        if not repvue_url:
            return None

        return repvue_url.rstrip('/').split('/')[-1]
    except Exception:
        # Deliberately best-effort and silent: a failed lookup for one
        # company is reported to the caller as "no slug" rather than raised.
        return None
    finally:
        if conn is not None:
            conn.close()

# Function to scrape RepVue data
def scrape_repvue_url(company_slug, build_id='t2sT-IylGuJcU6tKv07hE', timeout=30):
    """Fetch quota-attainment figures for one company from RepVue.

    Requests the ``/_next/data/<build_id>/en/companies/<slug>.json`` document
    and reads ``pageProps.company.quota_attainment`` and
    ``pageProps.overview.industry_average_quota_attainment`` from it.

    Args:
        company_slug: RepVue company slug. A falsy slug returns
            ``(None, None)`` without issuing a request.
        build_id: Path segment that follows ``/_next/data/``. NOTE(review):
            this looks like a per-deployment build identifier that will go
            stale when the site is redeployed -- confirm and pass a fresh
            value when requests start returning non-200 responses.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        A ``(quota_attainment, industry_average_quota_attainment)`` tuple.
        Either element is ``None`` when absent; both are ``None`` on a
        non-200 response or on any error (which is printed).
    """
    if not company_slug:
        return None, None

    url = f'https://www.repvue.com/_next/data/{build_id}/en/companies/{company_slug}.json?slug={company_slug}'

    try:
        headers = {
            'accept': '*/*',
            'accept-language': 'en-US,en;q=0.9',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
            'Cookie': '_ga=GA1.1.304363711.1731690100; _tt_enable_cookie=1; _ttp=Jbs1r-orh6Y1Ghh3ZO3Ut6lYaxW.tt.1; repvue_consent=true; _ga_20BBWGRF3X=GS1.1.1732109815.4.1.1732110132.0.0.0; mp_dc14bdafdff673e2452bc261c4fdc3ae_mixpanel=%7B%22distinct_id%22%3A%20%22%24device%3A19330c5f36a4fe-07440e6edcb0b7-1f525636-201b88-19330c5f36a4fe%22%2C%22%24device_id%22%3A%20%2219330c5f36a4fe-07440e6edcb0b7-1f525636-201b88-19330c5f36a4fe%22%2C%22%24initial_referrer%22%3A%20%22%24direct%22%2C%22%24initial_referring_domain%22%3A%20%22%24direct%22%2C%22%24search_engine%22%3A%20%22google%22%7D'
        }
        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            return None, None

        # ``or {}`` also covers keys that are present but JSON ``null``,
        # which ``dict.get(key, {})`` would pass through as ``None``.
        page_props = response.json().get('pageProps') or {}
        company = page_props.get('company') or {}
        overview = page_props.get('overview') or {}
        return (
            company.get('quota_attainment'),
            overview.get('industry_average_quota_attainment'),
        )
    except Exception as e:
        # Best-effort boundary: one failed company is reported and skipped
        # instead of raising into the caller.
        print(f"Error in scrape_repvue_url for {url}: {e}")
        return None, None

# Combined function to process each row
def process_row(row):
    """Resolve one input row to its RepVue slug and quota figures.

    Reads ``row['company_name']``, looks up the slug, and, when a slug is
    found, scrapes the quota numbers for it.

    Returns:
        ``(row['index'], quota_attainment, industry_average_quota_attainment,
        repvue_slug)``. The last three elements are ``None`` when no slug was
        found or when an error occurred (the error is printed).
    """
    try:
        name = row['company_name']
        slug = serper_name_to_repvue_slug(name)

        # No search hit: report the row with empty results.
        if not slug:
            return row['index'], None, None, None

        attainment, industry_avg = scrape_repvue_url(slug)
        return row['index'], attainment, industry_avg, slug
    except Exception as e:
        print(f"Error in process_row for {row['company_name']}: {e}")
        return row['index'], None, None, None

# Load the input companies. reset_index() turns the row labels into an
# 'index' column, which process_row echoes back so each result can be
# written to the row it came from.
df = pd.read_csv('./vendr_fixed.csv').reset_index()

# Create the output columns up front, empty for every row.
for new_column in ('quota_attainment', 'industry_average_quota_attainment', 'repvue_slug'):
    df[new_column] = None

# Fan the rows out over a thread pool and gather results as they finish.
results = []
with ThreadPoolExecutor(max_workers=10) as executor:
    # One task per row, keyed by its future.
    futures = {}
    for _, row in df.iterrows():
        futures[executor.submit(process_row, row)] = row

    # tqdm shows a progress bar over the completed futures.
    for finished in tqdm(as_completed(futures), total=len(futures), desc="Processing Rows"):
        try:
            outcome = finished.result()
        except Exception as e:
            print(f"Error while processing a future: {e}")
        else:
            results.append(outcome)

# Write each result back to its row.
for row_index, attainment, industry_avg, slug in results:
    print(attainment, industry_avg, slug)
    df.at[row_index, 'quota_attainment'] = attainment
    df.at[row_index, 'industry_average_quota_attainment'] = industry_avg
    df.at[row_index, 'repvue_slug'] = slug



In [None]:
# Write the current dataframe to CSV. With pandas' default index=True the
# row index is also written, as an extra unnamed first column.
df.to_csv('./repvue_quota_attainment.csv')

In [None]:

import ast


def _parse_company_stats(value):
    """Decode one ``company_stats`` cell into a Python object.

    Missing cells (``None``, NaN, or any other falsy value) become ``None``.
    NaN has to be checked explicitly: it is truthy, so a plain truthiness
    test lets it through and ``json.loads("nan")`` then raises.

    Text is tried as strict JSON first, then as a Python literal (handles
    single-quoted dict reprs, including values that contain apostrophes),
    and finally with the legacy quote-swap for inputs such as
    ``{'a': true}`` that are neither.

    Raises:
        ValueError: If the text cannot be decoded by any of the three
            strategies (``json.JSONDecodeError`` is a ``ValueError``).
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    if not value:
        return None
    if isinstance(value, dict):
        return value

    text = str(value)
    try:
        return json.loads(text)
    except ValueError:
        pass
    try:
        # literal_eval only evaluates literals, so it is safe on file data.
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        pass
    # Legacy fallback: single-quoted keys mixed with JSON-style literals.
    return json.loads(text.replace("'", '"'))


# Keep only rows that have both quota figures and a savings percentage.
df.dropna(subset=['quota_attainment', 'industry_average_quota_attainment', 'averageSavingsPercent'], inplace=True)
df['company_stats'] = df['company_stats'].apply(_parse_company_stats)

# Extract 'negotiationsHandled' into a new column
df['negotiationsHandled'] = df['company_stats'].apply(
    lambda x: x.get('negotiationsHandled') if isinstance(x, dict) else None
)
df.to_csv('./repvue_quota_attainment_clean.csv')