In [None]:
import requests
from urllib.parse import urlparse, parse_qs, urlencode, quote
import time
import json
import pandas as pd
from tqdm import tqdm


In [None]:
# Function to securely input sensitive information
def secure_input(prompt):
    """Read a sensitive value from the user through ``getpass``.

    Args:
        prompt: Text shown to the user before the value is read.

    Returns:
        The string returned by ``getpass.getpass`` for this prompt.
    """
    # Kept as a function-local import, as in the original cell.
    from getpass import getpass

    secret = getpass(prompt)
    return secret

# User inputs. The cookie and CSRF token are read through secure_input; the
# search URL is read with a plain prompt.
# Every value is stripped: copy/pasted text often carries stray leading or
# trailing whitespace, which would otherwise be used verbatim in the request
# headers and in the parsed search URL.
linkedin_cookie = secure_input("Enter your LinkedIn Cookie (e.g., 'li_at=...; ...'): ").strip()
csrf_token = secure_input("Enter your CSRF Token (e.g., 'ajax:1234567890'): ").strip()
sales_navigator_url = input("Enter the LinkedIn Sales Navigator URL: ").strip()


In [None]:
def parse_sales_navigator_url(url):
    """
    Returns the query-string parameters of a Sales Navigator URL as a flat dict.

    Each parameter name maps to a single string; when a name occurs more than
    once in the URL, only its first value is kept.
    """
    raw_query = urlparse(url).query
    flattened = {}
    for name, values in parse_qs(raw_query).items():
        # parse_qs maps every name to a list of values; keep the first one.
        flattened[name] = values[0]
    return flattened

def build_api_url(base_api_url, query_params, start=0, count=25):
    """
    Constructs the API URL for one page of results.

    Args:
        base_api_url: Endpoint URL without a query string.
        query_params: Dict of query parameters. It is copied, not modified.
        start: Offset of the first result of the page.
        count: Number of results requested for the page.

    Returns:
        ``base_api_url`` followed by ``?`` and the encoded query string. The
        ``start``, ``count``, ``q`` and ``decorationId`` parameters are always
        set here and override same-named entries of ``query_params``.
    """
    params = query_params.copy()
    params['start'] = str(start)
    params['count'] = str(count)
    params['q'] = 'searchQuery'
    params['decorationId'] = 'com.linkedin.sales.deco.desktop.searchv2.LeadSearchResult-14'

    if 'query' in params:
        # Encode special characters except for '(', ')', ':', ',' and '%'.
        # Keeping '%' safe leaves existing escapes such as '%20' untouched
        # instead of encoding them a second time.
        params['query'] = quote(params['query'], safe='(),:%')
        # Debug output. It lives inside the guard because looking up 'query'
        # unconditionally raised KeyError for inputs without that parameter.
        print(params['query'])

    # Same safe set as above, so the pre-encoded 'query' value passes through
    # urlencode unchanged.
    query_string = urlencode(params, safe='(),:%')
    return f"{base_api_url}?{query_string}"


In [None]:
# Search parameters taken from the Sales Navigator URL entered by the user
query_params = parse_sales_navigator_url(sales_navigator_url)

# Endpoint that every page request is sent to
base_api_url = 'https://www.linkedin.com/sales-api/salesApiLeadSearch'

# Headers sent with every request; the cookie and CSRF token are the values
# entered by the user. Insertion order matches the original dict literal.
headers = dict([
    ('Accept', '*/*'),
    ('Accept-Language', 'en-US,en;q=0.9'),
    ('cookie', linkedin_cookie),
    ('csrf-token', csrf_token),
    ('x-restli-protocol-version', '2.0.0'),
    ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'),
])

# Pagination settings: first offset, page size, and total desired results
START, COUNT = 0, 25
MAX_RESULTS = 2000


In [None]:
# Raw lead elements collected from every fetched page
all_leads = []

# Seconds to wait for a response before giving up. Without a timeout,
# requests.get can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Number of pages to request, capped at 80 (80 pages of 25 = 2000 results)
iterations = min(MAX_RESULTS // COUNT, 80)

print(f"Starting scraping of {iterations * COUNT} records...")

for start in tqdm(range(START, iterations * COUNT, COUNT), desc="Scraping"):
    # Build the API URL for the current page
    api_url = build_api_url(base_api_url, query_params, start=start, count=COUNT)

    try:
        response = requests.get(api_url, headers=headers, timeout=REQUEST_TIMEOUT)

        # Any non-200 answer ends the run; the body is printed to help diagnose it
        if response.status_code != 200:
            print(f"Failed to fetch data at start={start}. Status Code: {response.status_code}")
            print(f"Response: {response.text}")
            break

        data = response.json()
        leads = data.get('elements', [])

        # An empty page is treated as the end of the result set, so the
        # remaining offsets are not requested for nothing.
        if not leads:
            print(f"No leads returned at start={start}. Stopping.")
            break

        all_leads.extend(leads)
        print(f"Fetched {len(leads)} leads at start={start}. Total fetched: {len(all_leads)}")

        # Pause for 1 second between requests to respect rate limits
        time.sleep(1)

    except Exception as e:
        # Best effort: report the problem (network error, timeout, invalid
        # JSON, ...) and keep whatever was collected so far.
        print(f"An error occurred at start={start}: {e}")
        break

print(f"Scraping completed. Total leads fetched: {len(all_leads)}")


In [None]:
import re
# Function to extract relevant fields from each lead element
def extract_lead_data(lead):
    """
    Flattens one raw lead element into a dict of the fields kept for export.

    Args:
        lead: Dict describing one lead.

    Returns:
        A dict with the keys ``first_name``, ``last_name``, ``full_name``,
        ``geo_region``, ``summary``, ``company_name``, ``company_id``,
        ``current_job_title``, ``entityUrn``, ``linkedin_id`` and
        ``objectUrn``. Values missing from ``lead`` are ``None``. The company
        and title fields come from the first entry of ``currentPositions`` and
        are ``None`` when the lead has no such entry.
        Returns ``None`` (after printing the error) if ``lead`` cannot be
        processed at all, e.g. because it is not a dict.
    """
    try:
        entity_urn = lead.get('entityUrn')
        if isinstance(entity_urn, str) and ':' in entity_urn:
            # Last ':'-separated segment, cut at the first ',', minus its first
            # character (presumably an opening parenthesis; confirm against
            # real URNs).
            linkedin_id = entity_urn.split(':')[-1].split(',')[0][1:]
        else:
            linkedin_id = None

        # Leads without a current position are kept, with empty company
        # fields, rather than being discarded through the except branch.
        positions = lead.get('currentPositions') or []
        current_position = positions[0] if positions else {}

        return {
            'first_name': lead.get('firstName'),
            'last_name': lead.get('lastName'),
            'full_name': lead.get('fullName'),
            'geo_region': lead.get('geoRegion'),
            'summary': lead.get('summary'),
            'company_name': current_position.get('companyName'),
            'company_id': current_position.get('companyUrn'),
            'current_job_title': current_position.get('title'),
            'entityUrn': entity_urn,
            'linkedin_id': linkedin_id,
            'objectUrn': lead.get('objectUrn'),
        }
    except Exception as e:
        print(f"Error extracting data for a lead: {e}")
        return None

# Flatten every raw lead; leads whose extraction failed come back as None
# and are left out.
processed_leads = [
    row for row in map(extract_lead_data, all_leads) if row is not None
]

# Tabulate the extracted rows
df = pd.DataFrame(processed_leads)


In [None]:
# Write the leads to a CSV file at this relative path; the row index is
# written as the first column (pandas' default, spelled out here).
df.to_csv('linkedin_scraped_people.csv', index=True)