# ENSIAS Alumni LinkedIn Data Scraper

This notebook scrapes LinkedIn profiles of ENSIAS alumni to collect professional data.

## 1. Setup Environment and Dependencies

In [1]:
# Install required packages
!pip install selenium
!pip install webdriver-manager
!pip install beautifulsoup4
!pip install lxml



In [2]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import pandas as pd
import numpy as np
import time
import os

## 2. Configure LinkedIn Login

In [3]:
# Initialize Chrome WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Navigate to LinkedIn login page
driver.get("https://www.linkedin.com/login")
time.sleep(3)  # Give page time to fully load

# Set LinkedIn credentials
user_name = 'rachidben460@gmail.com'  # LinkedIn email
password = 'mee3sessa1996LEPRO'  # LinkedIn password

# Enter credentials and login
username_field = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "username"))
)
username_field.clear()
username_field.send_keys(user_name)

password_field = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "password"))
)
password_field.clear()
password_field.send_keys(password)

# Use the correct XPath for the sign-in button
login_button = driver.find_element('xpath', '//*[@id="organic-div"]/form/div[4]/button')
login_button.click()

# Wait for login to complete
print("Waiting for login to complete...")
time.sleep(8)  # Wait time to ensure page loads completely

Waiting for login to complete...


## 3. Retrieve Alumni Profiles

In [4]:
# Navigate to ENSIAS alumni page (filtered for years 2010-2019)
driver.get("https://www.linkedin.com/school/ecole-nationale-superieure-d-informatique-et-d-analyse-des-systemes/people/?educationEndYear=2019&educationStartYear=2010")

# Wait for page to load
time.sleep(5)

In [5]:
# Scroll and click "Show more results" to load more profiles
target_profiles = 30  # Default target: collect 20 profiles
profiles_loaded = 0
max_attempts = 15  # Safety limit for maximum attempts

# Initial scroll to load first set of profiles
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
time.sleep(3)

for attempt in range(max_attempts):
    try:
        # Find and click "Show more results" button
        show_more_button = driver.find_element('xpath', '//span[text()="Show more results"]/parent::button')
        
        # Scroll to button to ensure it's visible
        driver.execute_script("arguments[0].scrollIntoView();", show_more_button)
        time.sleep(1)
        
        # Click the button
        show_more_button.click()
        print(f"Clicked 'Show more results' ({attempt+1}/{max_attempts})")
        
        # Wait for new content to load
        time.sleep(3)
        
        # Estimate number of profiles loaded (each click adds ~12 profiles)
        profiles_loaded += 12
        
        # Stop if we've loaded enough profiles
        if profiles_loaded >= target_profiles:
            print(f"Target of {target_profiles} profiles reached")
            break
            
    except Exception as e:
        print(f"Could not find or click 'Show more results' button: {str(e)}")
        # Try scrolling a bit more and retry
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(2)
        
        # If we've made multiple attempts without success, we're probably at the end
        if attempt > 2:
            print("No more 'Show more results' button found. All profiles loaded.")
            break

print(f"Finished loading approximately {profiles_loaded} profiles")

Clicked 'Show more results' (1/15)
Clicked 'Show more results' (2/15)
Clicked 'Show more results' (3/15)
Target of 30 profiles reached
Finished loading approximately 36 profiles


## 4. Extract Profile Links

In [6]:
# Get page source and parse with BeautifulSoup
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')

# Find all profile cards based on the structure you provided
profile_links = soup.find_all('a', class_=lambda c: c and "pVgHQJscSHWSyOoSAQIMWsVcRbnRZrJxuo" in c)

# If the above doesn't work, try this alternative approach using partial ID match
if not profile_links:
    profile_links = soup.find_all('a', id=lambda i: i and i.startswith('org-people-profile-card__profile-image-'))

# Create a set to store unique profile URLs
unique_profile_links = set()

# Extract and filter profile links
for link in profile_links:
    href = link.get('href')
    if href:
        # Skip school/organization profiles (URLs containing '/school/' or '/company/')
        if '/school/' in href or '/company/' in href:
            continue
            
        # Extract the main profile URL by removing query parameters
        if '?' in href:
            href = href.split('?')[0]
        
        # Make sure it's a profile URL (should contain '/in/')
        if '/in/' in href:
            unique_profile_links.add(href)

# Convert set to list
unique_profile_links_list = list(unique_profile_links)

# Display the number of unique profiles found
print(f"Found {len(unique_profile_links_list)} unique LinkedIn profiles")

# Display first 5 profile links as a sample
print("\nSample of profile links:")
for link in unique_profile_links_list[:5]:
    print(link)

# Create full_profile_urls for the scraping section
full_profile_urls = unique_profile_links_list

Found 48 unique LinkedIn profiles

Sample of profile links:
https://www.linkedin.com/in/mouad-bazzi
https://www.linkedin.com/in/anas-al-kouraichi-695bb7178
https://www.linkedin.com/in/sqrt-negativeone
https://www.linkedin.com/in/zakariyaa-amekhroub-54a896198
https://www.linkedin.com/in/ismael-sbihi-061121129


## 5. Clean and Process Links

In [7]:
# Limit to the first 200 profiles (to reduce processing time)
profile_links_limited = unique_profile_links_list[:200]
print(f"Limited to {len(profile_links_limited)} profiles for processing")

# Normalize LinkedIn profile URLs
normalized_profile_urls = []

for url in profile_links_limited:
    # Parse URL
    parsed_url = urlparse(url)
    # Extract domain and path
    profile_url = parsed_url.netloc + parsed_url.path + '/'
    # Add to list
    normalized_profile_urls.append(profile_url)

# Add 'https://' prefix to all URLs
full_profile_urls = []
for url in normalized_profile_urls:
    full_url = "https://" + url
    full_profile_urls.append(full_url)

# Display sample of processed URLs
print("\nSample of processed URLs:")
for url in full_profile_urls[:5]:
    print(url)

Limited to 48 profiles for processing

Sample of processed URLs:
https://www.linkedin.com/in/mouad-bazzi/
https://www.linkedin.com/in/anas-al-kouraichi-695bb7178/
https://www.linkedin.com/in/sqrt-negativeone/
https://www.linkedin.com/in/zakariyaa-amekhroub-54a896198/
https://www.linkedin.com/in/ismael-sbihi-061121129/


## 6. Scrape Individual Profiles

In [8]:
# Set the number of profiles to scrape (limited to 3 for testing)
profiles_to_scrape = 20  # Testing with just 3 profiles

# Initialize lists to store extracted data
names = []
locations = []
companies = []
positions = []
graduation_years = []  # Will now be populated

# Counter for progress tracking
profile_counter = 0
# Limit the profiles to scrape to the specified number
profiles_to_process = full_profile_urls[:profiles_to_scrape]
total_profiles = len(profiles_to_process)

print(f"Starting to scrape {total_profiles} profiles...")

# Process each profile
for profile_url in profiles_to_process:
    profile_counter += 1
    print(f"Processing profile {profile_counter}/{total_profiles}: {profile_url}")
    
    try:
        # Visit profile page
        driver.get(profile_url)
        time.sleep(8)  # Increased wait time for page to fully load
        
        # Extract name from title
        name = ""
        try:
            title_element = driver.find_element('xpath', '/html/head/title')
            if title_element:
                full_title = title_element.get_attribute('textContent')
                # Extract name from title format "(32) NAME | LinkedIn"
                if "|" in full_title:
                    name_part = full_title.split("|")[0].strip()
                    # Remove any leading numbers/parentheses
                    if ")" in name_part:
                        name = name_part.split(")")[1].strip()
                    else:
                        name = name_part
                names.append(name)
                print(f"  - Found name: {name}")
            else:
                # Try alternative method
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'lxml')
                name_element = soup.find('h1', {'class': 'text-heading-xlarge'})
                if name_element:
                    name = name_element.get_text().strip()
                    names.append(name)
                    print(f"  - Found name: {name}")
                else:
                    names.append("")
                    print("  - Name not found")
        except Exception as e:
            names.append("")
            print(f"  - Error extracting name: {str(e)}")
        
        # Extract location
        location = ""
        try:
            # Get page source and parse with BeautifulSoup
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'lxml')
            
            # Try multiple location selectors
            location_element = soup.find('span', {'class': 'text-body-small inline t-black--light break-words'})
            if not location_element:
                location_xpath = '//*[@id="profile-content"]/div/div[2]/div/div/main/section[1]/div[2]/div[2]/div[2]/span[1]'
                location_element = driver.find_element('xpath', location_xpath)
                if location_element:
                    location = location_element.text.strip()
            else:
                location = location_element.get_text().strip()
                
            if location:
                locations.append(location)
                print(f"  - Found location: {location}")
            else:
                locations.append("")
                print("  - Location not found")
        except Exception as e:
            locations.append("")
            print(f"  - Error extracting location: {str(e)}")
        
        # Extract graduation year from education section
        graduation_year = ""
        try:
            # Navigate to education section directly
            education_url = profile_url + "details/education/"
            driver.get(education_url)
            time.sleep(5)  # Wait for page to load
            
            # Get page source and parse with BeautifulSoup
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'lxml')
            
            # Try to find ENSIAS in the education list
            ensias_found = False
            
            # Look for education institutions
            education_elements = soup.find_all('div', class_=lambda c: c and 't-bold' in c)
            
            for i, edu_element in enumerate(education_elements):
                edu_span = edu_element.find('span', {'aria-hidden': 'true'})
                if edu_span:
                    edu_text = edu_span.get_text().strip()
                    # Check if this is ENSIAS
                    if "ENSIAS" in edu_text or "Ecole Nationale Supérieure d'Informatique" in edu_text:
                        ensias_found = True
                        # Look for the date span associated with this education entry
                        # It's typically in a t-black--light class span near this element
                        
                        # First, try to find the parent li element
                        parent_li = edu_element
                        while parent_li and parent_li.name != 'li':
                            parent_li = parent_li.parent
                        
                        if parent_li:
                            # Find the date span within this li
                            date_span = parent_li.find('span', class_=lambda c: c and 't-black--light' in c)
                            if date_span:
                                date_inner_span = date_span.find('span', {'aria-hidden': 'true'})
                                if date_inner_span:
                                    date_text = date_inner_span.get_text().strip()
                                    # Extract the graduation year from the date range
                                    # Format could be "2018 - 2021" or similar
                                    if "-" in date_text:
                                        graduation_year = date_text.split("-")[1].strip()
                                        # Clean up any non-numeric characters
                                        import re
                                        graduation_year = re.sub(r'[^0-9]', '', graduation_year)
                                        # Take just the first 4 digits (the year)
                                        if len(graduation_year) >= 4:
                                            graduation_year = graduation_year[:4]
                                    else:
                                        # If there's no range, just try to extract any 4-digit year
                                        import re
                                        years = re.findall(r'\b(20\d{2})\b', date_text)
                                        if years:
                                            graduation_year = years[-1]  # Take the last year found
            
            # If we didn't find ENSIAS specifically, but found some education entries,
            # take the most recent graduation year as a fallback
            if not ensias_found and education_elements:
                date_spans = soup.find_all('span', class_=lambda c: c and 't-black--light' in c)
                for date_span in date_spans:
                    inner_span = date_span.find('span', {'aria-hidden': 'true'})
                    if inner_span:
                        date_text = inner_span.get_text().strip()
                        if "-" in date_text:
                            year_part = date_text.split("-")[1].strip()
                            import re
                            year_digits = re.sub(r'[^0-9]', '', year_part)
                            if len(year_digits) >= 4:
                                graduation_year = year_digits[:4]
                                break
                        else:
                            import re
                            years = re.findall(r'\b(20\d{2})\b', date_text)
                            if years:
                                graduation_year = years[-1]
                                break
            
            if graduation_year:
                graduation_years.append(graduation_year)
                print(f"  - Found graduation year: {graduation_year}")
            else:
                graduation_years.append("")
                print("  - Graduation year not found")
        except Exception as e:
            graduation_years.append("")
            print(f"  - Error extracting graduation year: {str(e)}")
        
        # Extract current position and company - try experience section directly
        position = ""
        company = ""
        try:
            # Navigate to experience section directly
            experience_url = profile_url + "details/experience/"
            driver.get(experience_url)
            time.sleep(5)  # Wait for page to load
            
            # Get page source and parse with BeautifulSoup
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'lxml')
            
            # Find position using the class from your working code
            position_element = soup.find('div', class_=lambda c: c and 't-bold' in c)
            if position_element:
                position_span = position_element.find('span', {'aria-hidden': 'true'})
                if position_span:
                    position = position_span.get_text().strip()
            
            # Find company using the class from your working code
            company_element = soup.find('span', class_='t-14 t-normal')
            if company_element:
                company_span = company_element.find('span', {'aria-hidden': 'true'})
                if company_span:
                    company_text = company_span.get_text().strip()
                    # Split by "·" as in your working code
                    company_parts = company_text.split("·")
                    if len(company_parts) > 0:
                        company = company_parts[0].strip()
            
            if not position or not company:
                # If not found, go back to main profile and try the original method
                driver.get(profile_url)
                time.sleep(5)
                
                # Scroll down to ensure experience section loads
                driver.execute_script("window.scrollBy(0, 500);")
                time.sleep(2)
                
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'lxml')
                
                # Find the experience section container
                experience_container = soup.find('div', class_='pvs-list__container')
                
                if experience_container:
                    # Find the first (most recent) experience item
                    first_experience = experience_container.find('li', class_='pvs-list__paged-list-item')
                    
                    if first_experience:
                        # Extract position
                        if not position:
                            position_div = first_experience.find('div', class_=lambda c: c and 't-bold' in c)
                            if position_div:
                                position_span = position_div.find('span', {'aria-hidden': 'true'})
                                if position_span:
                                    position = position_span.get_text(strip=True)
                        
                        # Extract company
                        if not company:
                            company_spans = first_experience.find_all('span', class_=lambda c: c and 't-14' in c and 't-normal' in c)
                            for span in company_spans:
                                if 't-black--light' not in span.get('class', []):
                                    company_span = span.find('span', {'aria-hidden': 'true'})
                                    if company_span:
                                        company_text = company_span.get_text(strip=True)
                                        if "·" in company_text:
                                            company = company_text.split("·")[0].strip()
                                        else:
                                            company = company_text
                                        break
            
            positions.append(position)
            companies.append(company)
            
            if position:
                print(f"  - Found position: {position}")
            else:
                print("  - Position not found")
                
            if company:
                print(f"  - Found company: {company}")
            else:
                print("  - Company not found")
                
        except Exception as e:
            positions.append("")
            companies.append("")
            print(f"  - Error extracting position/company: {str(e)}")
            
    except Exception as e:
        print(f"  - Error processing profile: {str(e)}")
        # Append empty values for failed profiles
        if len(names) < profile_counter:
            names.append("")
        if len(locations) < profile_counter:
            locations.append("")
        if len(companies) < profile_counter:
            companies.append("")
        if len(positions) < profile_counter:
            positions.append("")
        if len(graduation_years) < profile_counter:
            graduation_years.append("")
    
    # Print progress for each profile
    print(f"Completed profile {profile_counter}/{total_profiles}")

print("Profile scraping complete")

Starting to scrape 20 profiles...
Processing profile 1/20: https://www.linkedin.com/in/mouad-bazzi/
  - Found name: Mouad Bazzi
  - Found location: Prefecture of Casablanca, Casablanca-Settat, Morocco
  - Found graduation year: 2022
  - Found position: Software Engineer
  - Found company: Attijariwafa bank
Completed profile 1/20
Processing profile 2/20: https://www.linkedin.com/in/anas-al-kouraichi-695bb7178/
  - Found name: anas al-kouraichi
  - Found location: Rabat-Salé-Kénitra, Morocco
  - Found graduation year: 2022
  - Found position: FY COMPUTING
  - Found company: 3 yrs 8 mos
Completed profile 2/20
Processing profile 3/20: https://www.linkedin.com/in/sqrt-negativeone/
  - Found name: Fakhri Mouad
  - Found location: Rabat, Rabat-Salé-Kénitra, Morocco
  - Found graduation year: 2023
  - Found position: C Developer
  - Found company: Journee
Completed profile 3/20
Processing profile 4/20: https://www.linkedin.com/in/zakariyaa-amekhroub-54a896198/
  - Found name: Zakariyaa Amekhro

In [9]:
# Display counts for collected data
print("Data collection summary:")
print(f"Names: {len(names)}")
print(f"Locations: {len(locations)}")
print(f"Graduation years: {len(graduation_years)}")
print(f"Positions: {len(positions)}")
print(f"Companies: {len(companies)}")

Data collection summary:
Names: 20
Locations: 20
Graduation years: 20
Positions: 20
Companies: 20


In [10]:
# Save the collected data to a CSV file
import pandas as pd
import os

# Create a DataFrame from the collected data
data = {
    'Name': names,
    'Location': locations,
    'Graduation_Year': graduation_years,
    'Position': positions,
    'Company': companies
}

df = pd.DataFrame(data)

# Save to CSV in the specified directory
output_file = os.path.join("D:\\1 Projects\\Ensias\\Data", "ensias_alumni_profiles.csv")
df.to_csv(output_file, index=False, encoding='utf-8-sig')  # utf-8-sig preserves special characters

print(f"Data saved successfully to {output_file}")
print(f"DataFrame shape: {df.shape}")

# Display the first few rows to verify the content
print("\nPreview of saved data:")
display(df.head())

Data saved successfully to D:\1 Projects\Ensias\Data\ensias_alumni_profiles.csv
DataFrame shape: (20, 5)

Preview of saved data:


Unnamed: 0,Name,Location,Graduation_Year,Position,Company
0,Mouad Bazzi,"Prefecture of Casablanca, Casablanca-Settat, M...",2022,Software Engineer,Attijariwafa bank
1,anas al-kouraichi,"Rabat-Salé-Kénitra, Morocco",2022,FY COMPUTING,3 yrs 8 mos
2,Fakhri Mouad,"Rabat, Rabat-Salé-Kénitra, Morocco",2023,C Developer,Journee
3,Zakariyaa Amekhroub,"Rabat, Rabat-Salé-Kénitra, Morocco",2022,Cyber Security Analyst,Orange Cyberdefense
4,Ismael Sbihi,"Rabat-Salé-Kénitra, Morocco",2021,Assistant projet,"DevTech Systems, Inc."
