# In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time

# File paths for checkpoint and output
PROGRESS_FILE = 'progress_checkpoint.csv'
OUTPUT_FILE = 'output_with_job_check.csv'

# Load the CSV into a DataFrame or resume from the last checkpoint
if os.path.exists(PROGRESS_FILE):
    df = pd.read_csv(PROGRESS_FILE)
    # A checkpoint without the result column is treated as "nothing processed yet".
    if 'current_job_at_company' not in df.columns:
        df['current_job_at_company'] = None
    # read_csv infers the column dtype from its contents (float64 when every
    # cell is empty, bool when it only holds True/False). The column later
    # receives a mix of True, False and "Not Found", so keep it as object to
    # avoid incompatible-dtype assignments.
    df['current_job_at_company'] = df['current_job_at_company'].astype(object)
    print("Resuming from the last checkpoint.")
else:
    df = pd.read_csv('linkedin_people.csv')
    df['current_job_at_company'] = None  # Initialize the new column (object dtype)
    print("Starting from the beginning.")

# Define the Zenrows API key and endpoint.
# The key is taken from the ZENROWS_API_KEY environment variable when it is set,
# so the secret does not have to live in the source; the literal is the fallback.
ZENROWS_API_KEY = os.environ.get('ZENROWS_API_KEY') or '<API_KEY>'
ZENROWS_ENDPOINT = 'https://api.zenrows.com/v1/'

# Helpers and entry point to call Zenrows and parse the job positions with
# exponential retry
def _extract_job_positions(html):
    """Return a list of ``(company, date_range)`` tuples parsed from *html*.

    Two layouts are scanned: grouped experiences (several positions under one
    company header) and standalone experience items. Entries without a
    date-range tag (or, for standalone items, without a company tag) are
    skipped. The CSS class strings are kept exactly as the scraper was
    written against the rendered profile page.
    """
    date_range_class = 'date-range text-color-text-secondary font-sans text-md leading-open font-regular'
    standalone_class = 'profile-section-card relative flex w-full list-none py-1.5 pr-2 pl-1 experience-item'

    soup = BeautifulSoup(html, 'html.parser')
    job_positions = []

    # Grouped job experiences: one company header, many positions
    for group in soup.find_all('li', class_='experience-group'):
        header_tag = group.find('h4', class_='experience-group-header__company')
        if header_tag is None:
            continue
        group_company = header_tag.text.strip()
        for job in group.find_all('li', class_='experience-group-position'):
            date_tag = job.find('span', class_=date_range_class)
            if date_tag is not None:
                job_positions.append((group_company, date_tag.text.strip()))

    # Standalone job positions outside of groups
    for job in soup.find_all('li', class_=standalone_class):
        company_tag = job.find('span', class_='experience-item__subtitle')
        date_tag = job.find('span', class_=date_range_class)
        if company_tag is not None and date_tag is not None:
            job_positions.append((company_tag.text.strip(), date_tag.text.strip()))

    return job_positions


def _has_current_position_at(job_positions, company_name):
    """Return True if any position is ongoing ("Present") at *company_name*.

    Company names match case-insensitively when either one contains the
    other. A missing/non-string *company_name* (e.g. NaN from an empty CSV
    cell) or an empty scraped company never matches; an empty string would
    otherwise be a substring of everything and produce false positives.
    """
    if not isinstance(company_name, str):
        return False
    target = company_name.strip().lower()
    if not target:
        return False

    for company, time_range in job_positions:
        scraped = company.lower()
        if not scraped or 'Present' not in time_range:
            continue
        if scraped in target or target in scraped:
            return True
    return False


def get_job_positions(index, linkedin_url, company_name):
    """Check whether a LinkedIn profile shows a current job at a company.

    The profile page is fetched through the Zenrows API (JS rendering and
    premium proxy enabled) and retried up to three times with exponential
    backoff on request errors or non-200 responses.

    Args:
        index: Row identifier, passed back unchanged so the caller can map
            the result to its DataFrame row.
        linkedin_url: URL of the profile to fetch.
        company_name: Company to look for among the profile's positions.

    Returns:
        A tuple ``(index, result)`` where ``result`` is ``True`` if a
        position at the company has "Present" in its date range,
        ``"Not Found"`` if no positions could be parsed from the page,
        ``False`` if positions were found but none matched, and ``None``
        if every attempt failed.
    """
    max_retries = 3
    # Without a timeout a stalled connection would block this worker forever.
    request_timeout = 180  # seconds

    params = {
        'apikey': ZENROWS_API_KEY,
        'url': linkedin_url,
        'js_render': 'true',
        'premium_proxy': 'true',
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(ZENROWS_ENDPOINT, params=params, timeout=request_timeout)
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed for {linkedin_url} with error: {e}. Retrying...")
        else:
            if response.status_code == 200:
                job_positions = _extract_job_positions(response.text)

                # Check if any position matches the given company and contains "Present"
                if _has_current_position_at(job_positions, company_name):
                    return index, True

                if not job_positions:
                    return index, "Not Found"

                return index, False

            print(f"Attempt {attempt + 1} failed for {linkedin_url} with status code {response.status_code}. Retrying...")

        # Exponential backoff: wait 2^attempt seconds, but only if another
        # attempt actually follows.
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)

    # If all retries fail, log the failure and return None
    print(f"Failed to retrieve data for {linkedin_url} after {max_retries} attempts.")
    return index, None

# Process, in parallel, every row that has a LinkedIn URL and no recorded result yet
CHECKPOINT_EVERY = 20  # completed rows between two checkpoint writes


def _write_csv_atomically(frame, path):
    """Write *frame* to *path* through a temporary file.

    The data is written next to the target and then moved over it with
    os.replace, so an interruption during the write cannot leave a
    truncated file at *path*.
    """
    tmp_path = f"{path}.tmp"
    frame.to_csv(tmp_path, index=False)
    os.replace(tmp_path, path)


try:
    with ThreadPoolExecutor(max_workers=40) as executor:
        # Map each future to its row so a failing task can still be attributed.
        future_to_index = {}

        for index, row in df.iterrows():
            # Only rows without a stored result are submitted. A stored False is
            # a valid answer ("not currently at the company"), not a missing one,
            # so it must not be fetched again on resume.
            if pd.notna(row['linkedin_url']) and pd.isna(row['current_job_at_company']):
                future = executor.submit(get_job_positions, index, row['linkedin_url'], row['company'])
                future_to_index[future] = index

        # Use tqdm to show progress as tasks are completed
        completed = 0
        for future in tqdm(as_completed(future_to_index), total=len(future_to_index), desc="Processing LinkedIn data"):
            row_index = future_to_index[future]
            try:
                _, result = future.result()
            except Exception as e:
                # One bad row must not discard the results of all the others;
                # it stays empty and is retried on the next run.
                print(f"Row {row_index} failed with an unexpected error: {e}")
                result = None

            if result is not None:
                df.at[row_index, 'current_job_at_company'] = result

            # Checkpoint periodically instead of rewriting the whole CSV per row
            completed += 1
            if completed % CHECKPOINT_EVERY == 0:
                _write_csv_atomically(df, PROGRESS_FILE)
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Persist whatever has been collected, both as checkpoint and as final output
    try:
        _write_csv_atomically(df, PROGRESS_FILE)
    finally:
        df.to_csv(OUTPUT_FILE, index=False)
        print("Progress saved to the output file.")
