In [None]:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time

# Endpoint returning the top-level categories (queried without custom headers)
initial_endpoint = (
    "https://www.vendr.com/categories"
    "?_data=routes%2Fcategories._index"
)

# Headers sent with every sub-category page request; long values are split
# across lines for readability and joined back into the exact same strings.
second_endpoint_headers = {
    'accept': '*/*',
    'accept-language': 'en-US,en;q=0.9',
    'baggage': ','.join((
        'sentry-environment=production',
        'sentry-release=1dab7cf756903936b083ffa2e91f29776517b253',
        'sentry-public_key=5883a39b5675f77d625461a1260c675e',
        'sentry-trace_id=bf38b4b84dbe41afb454802a31203551',
        'sentry-sample_rate=1',
        'sentry-transaction=routes%2Fcategories.%24categorySlug.%24subCategorySlug._index',
        'sentry-sampled=true',
    )),
    # Placeholder: must be replaced with real cookie data before running
    'cookie': '<REPLACE_WITH_YOUR_COOKIES>',
    'priority': 'u=1, i',
    'referer': (
        'https://www.vendr.com/categories/vertical-industries/agriculture'
        '?verified=false&page=1'
    ),
    'sec-ch-ua': '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/130.0.0.0 Safari/537.36'
    ),
}

# Function to fetch data from a given URL with retries and optional headers
def fetch_data(url, retries=3, backoff_factor=0.3, headers=None):
    """GET *url* and return its decoded JSON body, or None if every attempt fails.

    Args:
        url: Address to request.
        retries: Maximum number of attempts. Values below 1 make no request
            at all and return None.
        backoff_factor: Base delay in seconds; the pause after failed attempt
            ``k`` (0-based) is ``backoff_factor * 2 ** k``.
        headers: Optional mapping of HTTP headers passed to ``requests.get``.

    Returns:
        The parsed JSON payload, or None when no attempt succeeded.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.RequestException, ValueError):
            # ValueError: a body that is not valid JSON can surface as a plain
            # ValueError (older requests releases) rather than a
            # RequestException; treat it like any other failed attempt.
            if attempt < retries - 1:
                time.sleep(backoff_factor * (2 ** attempt))
    # Every attempt failed (or retries < 1).
    return None

# Function to fetch data for a category and its pages
def fetch_category_data(category, child, page):
    """Fetch one listing page of a sub-category.

    The ``slug`` entries of ``category`` and ``child`` (read with ``.get``)
    and the ``page`` number are interpolated into the listing URL, which is
    then requested through ``fetch_data`` with ``second_endpoint_headers``.
    Returns whatever ``fetch_data`` returns.
    """
    parent_slug, sub_slug = category.get('slug'), child.get('slug')
    page_url = (
        f"https://www.vendr.com/categories/{parent_slug}/{sub_slug}"
        f"?verified=false&page={page}"
        "&_data=routes%2Fcategories.%24categorySlug.%24subCategorySlug._index"
    )
    return fetch_data(page_url, headers=second_endpoint_headers)

# Fetch initial categories data
categories_data = fetch_data(initial_endpoint)

if categories_data is None:
    # Nothing below can run without the category list, so actually stop here
    # instead of failing later on None['categories'].
    raise SystemExit("Failed to retrieve initial categories data. Exiting.")

print("Initial categories data fetched successfully.")


def _company_rows(category, child, page_data):
    """Flatten one listing page into row dicts tagged with its category and sub-category."""
    return [
        {
            'parent_category': category.get('name'),
            'parent_slug': category.get('slug'),
            'child_category': child.get('name'),
            'child_slug': child.get('slug'),
            'company_id': company.get('id'),
            'company_slug': company.get('slug'),
            'company_name': company.get('name'),
            'company_legal_name': company.get('legalName'),
            'company_icon': company.get('icon'),
            'company_description': company.get('description'),
            'is_vendr_verified': company.get('isVendrVerified'),
            'company_stats': company.get('stats')
        }
        for company in (page_data.get('companies') or [])
    ]


# Rows gathered from every page of every sub-category
all_companies = []
# Maps each submitted page request to the (category, child) it was made for.
# Futures complete in arbitrary order, so the loop variables of the submission
# loops cannot be used to label their results.
tasks = {}

with ThreadPoolExecutor(max_workers=40) as executor:
    for category in categories_data['categories']:
        for child in category.get('children', []):
            # Fetch the first page synchronously to learn how many pages exist
            first_page_data = fetch_category_data(category, child, 1)
            if not first_page_data:
                continue
            all_companies.extend(_company_rows(category, child, first_page_data))
            total_pages = (first_page_data.get('pagination') or {}).get('totalPages') or 1

            # Queue the remaining pages for parallel download
            for page in range(2, total_pages + 1):
                future = executor.submit(fetch_category_data, category, child, page)
                tasks[future] = (category, child)

    # Collect results from parallel tasks
    for future in tqdm(as_completed(tasks), total=len(tasks), desc="Fetching pages"):
        data = future.result()
        if data:
            page_category, page_child = tasks[future]
            all_companies.extend(_company_rows(page_category, page_child, data))

# Convert the list of companies to a pandas DataFrame
df_companies = pd.DataFrame(all_companies)

# Save the DataFrame to a CSV file
df_companies.to_csv('vendr_all_companies.csv', index=False)
print("All company data saved to vendr_all_companies.csv")


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import pandas as pd
from tqdm import tqdm
import time

# Function to fetch details for a company slug
# Keys of every record returned by fetch_company_details_with_slug, in column order.
_COMPANY_DETAIL_FIELDS = (
    'raw_data',
    'company_slug',
    'domain',
    'description',
    'competitors_data',
    'competitor_names',
    'communityInsights',
    'endOfQuarterSignatureNewPurchase',
    'endOfQuarterSignatureRenewal',
    'expectedGrowthEconomiesOfScaleNewPurchase',
    'expectedGrowthEconomiesOfScaleRenewal',
    'multiYearNewPurchase',
    'multiYearRenewal',
    'hasBigSavings',
    'quickSalesProcessSignatureNewPurchase',
    'quickSalesProcessSignatureRenewal',
    'averageContractValue',
    'averageSavingsPercent',
    'lowest_discount_competitor',
    'lowest_discount',
    'second_lowest_competitor',
    'second_lowest_discount',
)

# Record keys that are copied verbatim from the company's `discountLevers` mapping.
_DISCOUNT_LEVER_FIELDS = (
    'endOfQuarterSignatureNewPurchase',
    'endOfQuarterSignatureRenewal',
    'expectedGrowthEconomiesOfScaleNewPurchase',
    'expectedGrowthEconomiesOfScaleRenewal',
    'multiYearNewPurchase',
    'multiYearRenewal',
    'quickSalesProcessSignatureNewPurchase',
    'quickSalesProcessSignatureRenewal',
)


def _empty_company_details(slug):
    """Return a record with every detail field set to None except the slug."""
    details = dict.fromkeys(_COMPANY_DETAIL_FIELDS)
    details['company_slug'] = slug
    return details


def _savings_percent(competitor):
    """Return the competitor's `averageSavingsPercent` as a float, or None if missing or not numeric."""
    stats = competitor.get('stats') or {}
    try:
        return float(stats.get('averageSavingsPercent'))
    except (TypeError, ValueError):
        return None


def fetch_company_details_with_slug(slug, headers):
    """Fetch the marketplace payload for one company and flatten it into a record.

    Args:
        slug: Company slug interpolated into the marketplace URL.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A dict with the keys listed in ``_COMPANY_DETAIL_FIELDS``. When the
        request fails (or the body is not JSON) every field except
        ``company_slug`` is None. Fields absent or null in the payload are
        returned as None instead of raising, so a single odd record cannot
        abort a whole batch run.
    """
    url = f"https://www.vendr.com/marketplace/{slug}?_data=routes%2Fmarketplace.%24companySlug._index"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Request failed for company slug '{slug}': {e}")
        # Return default values in case of failure
        return _empty_company_details(slug)

    company = (data.get('company') if isinstance(data, dict) else None) or {}
    competitors = company.get('competitors') or []
    levers = company.get('discountLevers') or {}
    stats = company.get('stats') or {}

    # Rank competitors by discount, lowest first. A competitor without a
    # usable value sorts last and keeps a discount of None (not 0), so an
    # unknown discount is never mistaken for "no discount".
    ranked = [(competitor, _savings_percent(competitor)) for competitor in competitors]
    ranked.sort(key=lambda pair: float('inf') if pair[1] is None else pair[1])
    (lowest, lowest_discount), (second_lowest, second_lowest_discount) = (
        ranked + [(None, None), (None, None)]
    )[:2]

    details = _empty_company_details(slug)
    details.update({
        'raw_data': data,
        'domain': company.get('domain'),
        'description': company.get('description'),
        'competitors_data': competitors,
        'competitor_names': ', '.join(comp.get('name') or '' for comp in competitors),
        'communityInsights': company.get('communityInsights', []),
        'hasBigSavings': company.get('hasBigSavings'),
        'averageContractValue': stats.get('averageContractValue'),
        'averageSavingsPercent': stats.get('averageSavingsPercent'),
        'lowest_discount_competitor': lowest.get('name') if lowest else None,
        'lowest_discount': lowest_discount,
        'second_lowest_competitor': second_lowest.get('name') if second_lowest else None,
        'second_lowest_discount': second_lowest_discount,
    })
    for field in _DISCOUNT_LEVER_FIELDS:
        details[field] = levers.get(field)
    return details

# Load company slugs
df_companies = pd.read_csv('vendr_all_companies.csv')
# The listing CSV holds one row per (sub-category, company) pair, so a slug can
# appear several times. Request each slug once: duplicates would repeat the
# HTTP calls and multiply rows in the merge below (k listing rows x k results).
company_slugs = df_companies['company_slug'].dropna().unique().tolist()

# Headers for the API requests
marketplace_endpoint_headers = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
}

# Fetch details using ThreadPoolExecutor
results = []
max_workers = 10

print("Fetching detailed company information...")

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_slug = {executor.submit(fetch_company_details_with_slug, slug, marketplace_endpoint_headers): slug for slug in company_slugs}
    
    for future in tqdm(as_completed(future_to_slug), total=len(future_to_slug), desc="Fetching companies"):
        results.append(future.result())

# Convert the results into a DataFrame (one row per unique slug)
df_results = pd.DataFrame(results)

# Merge the fetched data back into the original DataFrame.
# validate='many_to_one' makes pandas raise if a slug ever has more than one
# result row, instead of silently duplicating listing rows.
df_enriched = pd.merge(df_companies, df_results, on='company_slug', how='left', validate='many_to_one')

# Save the enriched DataFrame
df_enriched.to_csv("enriched_companies_with_discounts.csv", index=False)

print("Data fetching and enrichment complete. Saved to 'enriched_companies_with_discounts.csv'.")


In [None]:
# Replace empty lists in `communityInsights` with None; every other value is kept as-is
df_enriched['communityInsights'] = df_enriched['communityInsights'].apply(
    lambda insights: insights if not (isinstance(insights, list) and not insights) else None
)



In [None]:
# Keep only the companies whose own average savings exceed their lowest competitor discount

# Convert both sides of the comparison to numeric. Values that cannot be parsed
# become NaN instead of aborting the cell.
df_enriched['averageSavingsPercent'] = pd.to_numeric(df_enriched['averageSavingsPercent'], errors='coerce')
lowest_competitor_discount = pd.to_numeric(df_enriched['lowest_discount'], errors='coerce')

# Filter rows where averageSavingsPercent > lowest_discount.
# NaN never compares greater, so rows missing either value are dropped.
df_filtered = df_enriched[df_enriched['averageSavingsPercent'] > lowest_competitor_discount]

# index=False matches the other exports and avoids an "Unnamed: 0" column when
# the file is read back with pd.read_csv.
df_filtered.to_csv('vendr_target.csv', index=False)




In [None]:
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from pydantic import BaseModel


# Load the rows stored in vendr_target.csv
df = pd.read_csv("vendr_target.csv")


# Structured-output schema passed as `response_format` to the OpenAI call below:
# a single field holding the list of extracted quote strings.
# NOTE(review): documented with comments rather than a docstring because pydantic
# appears to copy a model docstring into the generated JSON schema — confirm
# before converting this into a docstring.
class Reviews(BaseModel):
    quotes: list[str]

# Replace missing `communityInsights` values with "" and cast every value to str,
# so each row can be sliced for logging and interpolated into the prompt as text
df['communityInsights'] = df['communityInsights'].fillna("").astype(str)

# Define a function to call OpenAI
def get_multi_year_quotes(community_insights):
    """Extract quotes about discounts granted for long-term commitments.

    Args:
        community_insights: Text (the stringified community insights of one
            company) that is embedded into the prompt.

    Returns:
        The list of quotes parsed from the model response, an empty list when
        there is no input text to analyse, or None if the API call or the
        parsing of its response failed.
    """
    if not isinstance(community_insights, str) or not community_insights.strip():
        # Nothing to analyse: skip the API call instead of prompting the model
        # with an empty quote list.
        return []

    print(f"Processing: {community_insights[:50]}...")  # Log the first 50 characters of the input
    try:
        client = OpenAI()

        # Call OpenAI API with chat model
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert reviewing saas reviews regarding pricing."},
                {
                    "role": "user",
                    "content": f"Your goal is to extract customer quotes mentioning having been given discounts by agreeing to commit to long term contracts (e.g annual or multi-year). Do not keep any quote mentioning discounts for other reasons (e.g mentioning competitors, acquiring more licences, etc.). This is the json with all quotes: {community_insights}"
                }
            ],
            response_format=Reviews,
        )
        # `parsed` can be None (AttributeError here), which is handled below
        # like any other failed call.
        quotes = completion.choices[0].message.parsed.quotes
    except Exception as e:
        # Best effort per row: report the failure and let the caller continue
        print(f"Error for input: {community_insights[:50]}... - {e}")
        return None

    print(quotes)
    return quotes

# Add a new column for multi_year_quotes
print("Fetching quotes from OpenAI...")
# Must run before progress_apply is used on the next line
tqdm.pandas()  # Enable tqdm progress bar for Pandas
# One get_multi_year_quotes call per row; its return value is stored as-is in the new column
df['multi_year_quotes'] = df['communityInsights'].progress_apply(get_multi_year_quotes)

# Save the updated DataFrame (without the row index)
df.to_csv("enriched_with_quotes.csv", index=False)
print("Updated DataFrame saved to 'enriched_with_quotes.csv'.")


In [None]:
from tqdm import tqdm
from openai import OpenAI
from pydantic import BaseModel
import pandas as pd


from typing import Optional

# Load the rows (including the `multi_year_quotes` column) saved by the extraction step
df = pd.read_csv("enriched_with_quotes.csv")


# Structured-output schema for the quote selection: up to two quotes.
# Both fields are nullable because the prompt explicitly allows the model to
# return a null value when no quote qualifies; with plain `str` fields the
# schema would force it to produce text anyway.
# (Comments rather than a docstring, so the generated JSON schema is unaffected.)
class Reviews(BaseModel):
    quote_1: Optional[str]
    quote_2: Optional[str]

# Define a function to call OpenAI
def get_multi_year_quotes(quotes):
    """Select up to two quotes that mention annual or multi-year commitments.

    Args:
        quotes: Text holding the candidate quotes for one row. Values that are
            not strings (for example NaN for an empty CSV cell) and blank or
            empty-list text ("", "[]") are treated as "no candidates".

    Returns:
        A ``(quote_1, quote_2)`` tuple. It is always a pair, so the result can
        be expanded into two columns: ``(None, None)`` is returned when there
        is nothing to select from or when the API call fails.
    """
    no_selection = (None, None)
    if not isinstance(quotes, str) or quotes.strip() in ("", "[]"):
        # Nothing to choose from: skip the API call entirely
        return no_selection

    print(f"Processing: {quotes[:50]}...")  # Log the first 50 characters of the input
    try:
        client = OpenAI()

        # Call OpenAI API with chat model
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert reviewing saas reviews regarding pricing."},
                {
                    "role": "user",
                    "content": f"""Your goal is to select up to two quotes from the provided list with the following criteria: 
                    1. Quotes must explicitly mention annual or multi year commitments
                    2. Chosen quotes should be prioritized in order of higher to lower discount mentioned
                    3. If no annual or multi year commitment is mentioned, it's ok to not select any quote, just return null value
                    List of quotes: {quotes}"""
                }
            ],
            response_format=Reviews,
        )

        # Extract and return the response content
        selection = completion.choices[0].message.parsed
        return selection.quote_1, selection.quote_2
    except Exception as e:
        # Best effort per row: report the failure and keep the two-column shape
        print(f"Error for input: {quotes[:50]}... - {e}")
        return no_selection

# Add the quote_1 / quote_2 columns holding the selected quotes
print("Fetching quotes from OpenAI...")
tqdm.pandas()  # Enable tqdm progress bar for Pandas
# Empty CSV cells are read back as NaN; pass "" instead so the selector always receives text
selected_pairs = df['multi_year_quotes'].fillna("").progress_apply(get_multi_year_quotes)
# A failed call may yield None instead of a pair; normalise it so every row expands to two cells
quote_pairs = [pair if pair is not None else (None, None) for pair in selected_pairs]
df[['quote_1', 'quote_2']] = pd.DataFrame(
    quote_pairs,
    index=df.index,
    columns=['quote_1', 'quote_2'],  # explicit names keep the shape valid even for zero rows
)
# index=False: this file is also the input read above, so writing the index
# would add another "Unnamed: 0" column on every re-run
df.to_csv('enriched_with_quotes.csv', index=False)