## StaffSpy

In [33]:
import os
import requests
import json

def get_li_slugs(name:str, timeout:float=60):
    """Find the LinkedIn profile slug for a person via a ScrapingBee Google search.

    Searches Google for ``"<name> linkedin"`` and returns the slug (the path
    segment right after ``/in/``) of the first organic result that points to
    a LinkedIn profile page. Any ``linkedin.com`` host is accepted, so
    country subdomains such as ``uk.linkedin.com`` match as well.

    Args:
        name: Full name of the person to look up.
        timeout: Seconds to wait for ScrapingBee before giving up.

    Returns:
        The slug exactly as it appears in the result URL (it can still be
        percent-encoded, e.g. ``dave-l%C3%A9gitime``).

    Raises:
        requests.HTTPError: ScrapingBee answered with a 4xx/5xx status.
        IndexError: No organic result is a LinkedIn profile URL.
    """
    from urllib.parse import urlsplit  # local import: only this lookup needs it

    response = requests.get(
        url="https://app.scrapingbee.com/api/v1/store/google",
        params={
            "api_key": os.environ.get('SCRAPINGBEE_API_KEY'),
            "search": f"{name} linkedin",
            "nb_results": 10
        },
        timeout=timeout,  # requests waits forever unless a timeout is given
    )
    # Surface HTTP failures (bad key, quota, ...) as such instead of as a
    # confusing parsing error further down.
    response.raise_for_status()
    response_obj = json.loads(response.content.decode('utf-8'))

    for result in response_obj.get('organic_results') or []:
        try:
            parts = urlsplit(result.get('url') or '')
        except ValueError:
            # Malformed URL in the search results: just not a match.
            continue
        host = (parts.hostname or '').lower()
        on_linkedin = host == 'linkedin.com' or host.endswith('.linkedin.com')
        if on_linkedin and parts.path.startswith('/in/'):
            # Only the first path segment is the slug; this drops trailing
            # slashes and sub-pages, and urlsplit already removed ?query/#fragment.
            slug = parts.path[len('/in/'):].split('/', 1)[0]
            if slug:
                return slug

    # Needs more complex logic to handle multiple names
    raise IndexError(f"No LinkedIn profile found in search results for {name!r}")

In [34]:
# Resolve the LinkedIn slug for a single person; the bare `slug` on the last
# line displays the result as the cell output.
slug = get_li_slugs('Sybille Legitime')
slug

'slegitime'

In [3]:
from pathlib import Path
from staffspy import LinkedInAccount

# Login cookies are cached in this file so the LinkedIn login only has to
# happen once (the session lasts about a week).
session_file = "session.pkl"

account = LinkedInAccount(session_file=session_file, log_level=1)

# Scrape the one profile whose slug was looked up earlier in the notebook.
users = account.scrape_users(user_ids=[slug])

2025-04-24 21:12:31,511 - StaffSpy - INFO - Testing if logged in by checking arbitrary LinkedIn company page
2025-04-24 21:12:32,230 - StaffSpy - INFO - Account successfully logged in - res code 200
2025-04-24 21:12:32,639 - StaffSpy - INFO - Fetching data for account ACoAABunNswBt5Cz2Fi0-qYi3EgzbHoCsFYgsg8    1 / 1 - https://www.linkedin.com/in/slegitime
2025-04-24 21:12:33,715 - StaffSpy - INFO - Scraped 1 users


In [4]:
# List the columns available on the scraped profile table.
users.columns

Index(['search_term', 'id', 'urn', 'profile_link', 'profile_id', 'name',
       'first_name', 'last_name', 'location', 'headline', 'estimated_age',
       'followers', 'connections', 'mutuals', 'is_connection', 'premium',
       'creator', 'influencer', 'open_to_work', 'is_hiring',
       'current_position', 'current_company', 'past_company_1',
       'past_company_2', 'school_1', 'school_2', 'top_skill_1', 'top_skill_2',
       'top_skill_3', 'bio', 'experiences', 'schools', 'skills',
       'certifications', 'languages', 'emails_in_bio', 'potential_emails',
       'profile_photo', 'banner_photo', 'connection_created_at',
       'connection_email', 'connection_phone_numbers', 'connection_websites',
       'connection_street_address', 'connection_birthday'],
      dtype='object')

In [5]:
# The column of interest: the current employer of each scraped profile.
users['current_company']

0    Prudential Financial
Name: current_company, dtype: object

In [6]:
import concurrent.futures
import requests

MAX_RETRIES = 5 # Total number of request attempts scrape() makes per name (its loop runs range(MAX_RETRIES)).
# Worker count handed to the ThreadPoolExecutor below.
MAX_THREADS = 4
# People to look up on LinkedIn.
names = ["Sybille Legitime", "Dave Legitime"]
# Collects the slug returned by scrape() for each name.
slugs = []

def extract_slug(response):
    """Pull the LinkedIn profile slug out of a ScrapingBee Google-search response.

    Walks the organic results in rank order and returns the slug (the path
    segment right after ``/in/``) of the first one that is a LinkedIn profile
    URL, instead of assuming the top hit is a profile. Any ``linkedin.com``
    host is accepted, including country subdomains.

    Args:
        response: Object whose ``content`` attribute holds the UTF-8 encoded
            JSON body of the search response.

    Returns:
        The slug exactly as it appears in the URL (it can still be
        percent-encoded, e.g. ``dave-l%C3%A9gitime``).

    Raises:
        IndexError: No organic result is a LinkedIn profile URL.
    """
    from urllib.parse import urlsplit  # local import: only this parser needs it

    response_str = response.content.decode('utf-8')
    response_obj = json.loads(response_str)

    for result in response_obj.get('organic_results') or []:
        try:
            parts = urlsplit(result.get('url') or '')
        except ValueError:
            # Malformed URL in the search results: just not a match.
            continue
        host = (parts.hostname or '').lower()
        on_linkedin = host == 'linkedin.com' or host.endswith('.linkedin.com')
        if on_linkedin and parts.path.startswith('/in/'):
            # Only the first path segment is the slug; this drops trailing
            # slashes and sub-pages, and urlsplit already removed ?query/#fragment.
            slug = parts.path[len('/in/'):].split('/', 1)[0]
            if slug:
                return slug

    raise IndexError("No LinkedIn profile URL found in the search results")

def scrape(name, timeout=60):
    """Search Google (via ScrapingBee) for a person's LinkedIn profile slug.

    Makes up to ``MAX_RETRIES`` attempts. An attempt fails when the request
    raises a ``requests`` exception (connection error, timeout, ...) or the
    response status is not OK; the next attempt is then made immediately.

    Args:
        name: Full name of the person to look up.
        timeout: Seconds to wait for each request before counting it as failed.

    Returns:
        The slug produced by ``extract_slug`` for the first successful
        response, or ``None`` when every attempt failed.
    """
    for _ in range(MAX_RETRIES):
        try:
            response = requests.get(
                url="https://app.scrapingbee.com/api/v1/store/google",
                params={
                    "api_key": os.environ.get('SCRAPINGBEE_API_KEY'),
                    "search": f"{name} linkedin",
                    "nb_results": 10
                },
                timeout=timeout,  # requests waits forever unless a timeout is given
            )
        except requests.RequestException:
            # Network-level failures are exactly what the retry loop is for;
            # without this a single dropped connection aborted the lookup.
            continue

        if response.ok: # If we get a successful request
            return extract_slug(response)

    # All attempts failed: same result as before, now stated explicitly.
    return None

# Look up all names concurrently.
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    # executor.map returns results in the order of `names`, so slugs[i] always
    # belongs to names[i]; as_completed yields in completion order, which made
    # the list order depend on network timing.
    slugs = list(executor.map(scrape, names))

In [7]:
# Show the slugs gathered by the threaded lookup.
slugs

['slegitime', 'dave-l%C3%A9gitime']

In [8]:
# Scrape every collected slug in one call; this rebinds `users` from the
# earlier single-profile result.
users = account.scrape_users(user_ids=slugs)

2025-04-24 21:35:00,122 - StaffSpy - INFO - Fetching data for account ACoAABunNswBt5Cz2Fi0-qYi3EgzbHoCsFYgsg8    1 / 2 - https://www.linkedin.com/in/slegitime
2025-04-24 21:35:01,293 - StaffSpy - INFO - Fetching data for account ACoAACp_A38BycdjIxPDDgCz9dUCLqdhHXTzF38    2 / 2 - https://www.linkedin.com/in/dave-l%C3%A9gitime
2025-04-24 21:35:02,275 - StaffSpy - INFO - Scraped 2 users


In [17]:
import pandas as pd

# Keep just the identifying columns and the employer. The .copy() detaches the
# result from `users`, so later edits never touch the scraped table.
name_to_company = users.loc[:, ['name', 'profile_id', 'current_company']].copy()
name_to_company

Unnamed: 0,name,profile_id,current_company
0,Sybille Légitime,slegitime,Prudential Financial
1,Dave Légitime,dave-légitime,College of Our Lady of the Elms


In [18]:
# Save without the DataFrame index column. The path is relative to the
# notebook's working directory.
name_to_company.to_csv("../test/name_to_company.csv", index=False)


## Proxycurl

In [None]:
import requests
import os

# Fail fast with a clear KeyError when the key is not configured; with
# os.environ.get() a missing key only showed up as a TypeError on the string
# concatenation below.
api_key = os.environ['PROXYCURL_API_KEY']
headers = {'Authorization': 'Bearer ' + api_key}
api_endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
# NOTE(review): several of the optional fields below request personal contact
# data (phone number, email) — confirm this is permitted before sharing outputs.
params = {
    'linkedin_profile_url': 'https://linkedin.com/in/slegitime/',
    'extra': 'include',
    'github_profile_id': 'include',
    'facebook_profile_id': 'include',
    'twitter_profile_id': 'include',
    'personal_contact_number': 'include',
    'personal_email': 'include',
    'inferred_salary': 'include',
    'skills': 'include',
    'use_cache': 'if-present',
    'fallback_to_cache': 'on-error',
}
response = requests.get(api_endpoint,
                        params=params,
                        headers=headers,
                        timeout=60)  # requests waits forever unless a timeout is given
# Stop here on a 4xx/5xx answer so later cells never try to parse an error body
# as a profile.
response.raise_for_status()

In [12]:
import pandas as pd
import json

# Decode the JSON body of the Proxycurl response.
res = json.loads(response.content.decode('utf-8'))

# A profile may list no experience at all; use None for the company in that
# case instead of raising IndexError on experiences[0].
experiences = res.get('experiences') or []
# NOTE(review): assumes the first experience entry is the current position —
# confirm the ordering of the API response.
current_company = experiences[0]['company'] if experiences else None

# One-row table with the same columns as the StaffSpy result, so the two
# outputs can be compared directly.
proxycurl_res_dict = {
    'name': [res['full_name']],
    'profile_id': [res['public_identifier']],
    'current_company': [current_company]
}

proxycurl_res_df = pd.DataFrame(proxycurl_res_dict)

In [13]:
# Display the one-row Proxycurl result.
proxycurl_res_df

Unnamed: 0,name,profile_id,current_company
0,Sybille Légitime,slegitime,Prudential Financial


In [14]:
# Save without the DataFrame index column. The path is relative to the
# notebook's working directory.
proxycurl_res_df.to_csv("../test/name_to_company_proxycurl.csv", index=False)