In [13]:
# importing libraries
import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException

In [None]:
# Start a Chrome WebDriver session
driver = webdriver.Chrome()
# Load the LinkedIn company search results in that session
search_url = "https://www.linkedin.com/search/results/companies/?keywords=data%20analytics%20company&origin=CLUSTER_EXPANSION&sid=RN%40"
driver.get(search_url)

In [None]:
# Column-oriented store for the scraped data: one list per field,
# one entry per scraped company
Skeleton = {
    field: []
    for field in (
        "Company Name",
        "Description",
        "Type",
        "Followers",
        "Location",
        "Employees",
    )
}

In [11]:
# Scrape every company listed on each page of the search results.
# For each result: open its detail page, read the top-card fields, store
# exactly one row in Skeleton, then return to the results list.

# Fields in the order they are read and printed for each company
FIELDS = ("Company Name", "Description", "Employees", "Type", "Location", "Followers")


def first_text(by, selector):
    """Return the text of the first element matching the locator.

    `by` and `selector` are passed straight to `driver.find_element`.
    Returns None when no element matches (NoSuchElementException).
    """
    try:
        return driver.find_element(by, selector).text
    except NoSuchElementException:
        return None


# Loop through each page of search results
for k in range(1, 50):
    url = f"https://www.linkedin.com/search/results/companies/?keywords=data%20analytics%20company&origin=CLUSTER_EXPANSION&page={k}&sid=wjp"
    driver.get(url)
    time.sleep(np.random.randint(7, 11))  # Random delay between page loads for anti-bot

    index = 0
    while True:
        # Re-locate the containers on every pass: element references obtained
        # before visiting a detail page may be stale once we come back
        data = driver.find_elements(By.CLASS_NAME, "reusable-search__result-container")

        # If index exceeds the number of elements, this page is done
        if index >= len(data):
            break

        # Get the current company element
        i = data[index]

        # Every field starts as None and the row is appended exactly once
        # below, so all Skeleton lists keep the same length even when a field
        # is missing or an error interrupts the extraction half-way
        row = dict.fromkeys(FIELDS)

        # Remember the results URL so we only go back if the click navigated
        results_url = driver.current_url

        try:
            # Click on the company to open the detail page
            i.click()

            # Wait for the detail page to load specific content
            wait = WebDriverWait(driver, np.random.randint(6, 10))
            wait.until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'org-top-card-summary')]")))

            # Extract information on the detail page
            row["Company Name"] = first_text(By.TAG_NAME, "h1")
            row["Description"] = first_text(By.CLASS_NAME, "org-top-card-summary__tagline")
            row["Employees"] = first_text(
                By.CSS_SELECTOR,
                ".t-normal.t-black--light.link-without-visited-state.link-without-hover-state",
            )

            # Summary info list: items are read positionally as type, location, followers.
            # zip stops at the shorter sequence, so missing items simply stay None
            inner_info = driver.find_elements(By.CLASS_NAME, "org-top-card-summary-info-list__info-item")
            for field, element in zip(("Type", "Location", "Followers"), inner_info):
                row[field] = element.text

        except (TimeoutException, NoSuchElementException, StaleElementReferenceException) as e:
            # Keep whatever was read before the failure; the rest stays None
            print("Error navigating or extracting detail page data:", e)

        finally:
            # Go back only if we actually left the results page; an
            # unconditional back() after a failed click would navigate away
            # from the results list
            if driver.current_url != results_url:
                driver.back()
            time.sleep(np.random.randint(2, 5))  # Random delay to mimic human behavior

        # Report and store the row: one append per column, always
        for field in FIELDS:
            value = row[field]
            if value is not None:
                print(f"{field}:", value)
            Skeleton[field].append(value)

        index += 1
        print("_________________________")



Company Name: Data Analytics company
Description: Connecting the world's professionals to make them more productive and successful.
Employees: 2-10 employees
Type: Business Intelligence Platforms
Location: New Delhi, Delhi
Followers: 173 followers
_________________________
Company Name: Data Science Central
Description: Industry's leading online resource and community for data practitioners, covering Machine Learning, AI, Data Science.
Employees: 2-10 employees
Type: Book and Periodical Publishing
Location: Issaquah, WA
Followers: 274K followers
_________________________
Company Name: data.world
Description: The Enterprise Data Catalog Platform
Employees: 51-200 employees
Type: Software Development
Location: Austin, Texas
Followers: 20K followers
_________________________
Company Name: Data BI LLC
Description: From building a custom dashboard to setting up data analytics in your organization from scratch. we handle your data.
Employees: 2-10 employees
Type: Data Infrastructure and Anal

In [None]:
# build a DataFrame with one column per Skeleton key
df = pd.DataFrame(data=Skeleton)

In [None]:
# inspect column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489 entries, 0 to 488
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company Name  489 non-null    object
 1   Description   415 non-null    object
 2   Employees     486 non-null    object
 3   Type          489 non-null    object
 4   Location      489 non-null    object
 5   Followers     489 non-null    object
dtypes: object(6)
memory usage: 23.1+ KB


In [None]:
# drop fully duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates()

In [None]:
# preview the first five rows
df.head(n=5)

Unnamed: 0,Company Name,Description,Employees,Type,Location,Followers
0,Data Analytics company,Connecting the world's professionals to make t...,2-10,Business Intelligence Platforms,"New Delhi, Delhi",173
1,Data Science Central,Industry's leading online resource and communi...,2-10,Book and Periodical Publishing,"Issaquah, WA",274K
2,data.world,The Enterprise Data Catalog Platform,51-200,Software Development,"Austin, Texas",20K
3,Data BI LLC,From building a custom dashboard to setting up...,2-10,Data Infrastructure and Analytics,"Sheridan, Wyoming",7K
4,Data Dynamics,Embedding Trust & Democracy in enterprise data...,51-200,Software Development,"Upper Saddle River, New Jersey",7K


In [None]:
# Keep only the follower count, e.g. "274K followers" -> "274K".
# The pattern also accepts digit groups with "," / "." and an "M" suffix, so a
# value such as "1,234" or "1.2M" is kept whole instead of being cut down to
# its leading digits. Rows with no match become NaN.
# expand=False returns a Series (single capture group) rather than a
# one-column DataFrame.
df["Followers"] = df["Followers"].str.extract(r'(\d+(?:[.,]\d+)*[KM]?)', expand=False)

In [None]:
# Keep only the employee-count bucket, e.g. "1K-5K employees" -> "1K-5K".
# Besides "low-high" ranges, an open-ended bucket written with a trailing "+"
# (e.g. "10K+") is also captured; requiring a hyphen would turn it into NaN.
# Rows with no match become NaN.
# expand=False returns a Series (single capture group) rather than a
# one-column DataFrame.
df["Employees"] = df["Employees"].str.extract(r'(\d+K?(?:-\d+K?|\+))', expand=False)

In [39]:
# display the cleaned frame
df

Unnamed: 0,Company Name,Description,Employees,Type,Location,Followers
0,Data Analytics company,Connecting the world's professionals to make t...,2-10,Business Intelligence Platforms,"New Delhi, Delhi",173
1,Data Science Central,Industry's leading online resource and communi...,2-10,Book and Periodical Publishing,"Issaquah, WA",274K
2,data.world,The Enterprise Data Catalog Platform,51-200,Software Development,"Austin, Texas",20K
3,Data BI LLC,From building a custom dashboard to setting up...,2-10,Data Infrastructure and Analytics,"Sheridan, Wyoming",7K
4,Data Dynamics,Embedding Trust & Democracy in enterprise data...,51-200,Software Development,"Upper Saddle River, New Jersey",7K
...,...,...,...,...,...,...
484,innoVet Health (SDVOSB),"Delivering healthcare IT, data analytics, inte...",11-50,IT Services and IT Consulting,"Dallas, Texas",17K
485,Copper Digital,Accelerating Growth & Innovation | Driving you...,51-200,IT Services and IT Consulting,"Dallas, Texas",16K
486,RomAnalytics,Making talent acquisition smarter with the rig...,2-10,Staffing and Recruiting,"Collegeville, PA",17K
487,GyanSys Inc.,,1K-5K,IT Services and IT Consulting,"Carmel, Indiana",185K


In [None]:
# write the cleaned frame to disk; the row index is written as the first column
df.to_csv(path_or_buf="Linkedin.csv", index=True)