In [1]:
from bs4 import BeautifulSoup
import re
import os
import pandas as pd

In [2]:
def read_html_file(file_path):
    """Return the entire content of the file at ``file_path`` as text.

    The file is opened in text mode and decoded as UTF-8; it is closed
    again before the function returns.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()


def extract_universal_name(html_text):
    """Return the digits that follow the first ``universalName:`` marker.

    The result is the matched digit run as a string, or ``None`` when the
    marker (immediately followed by at least one digit) does not occur in
    ``html_text``.
    """
    found = re.search(r"universalName:(\d+)", html_text)
    if not found:
        return None
    return found.group(1)


def extract_company_details(html_text):
    """Parse the top-card summary fields out of a company page's HTML.

    Args:
        html_text: The page HTML as a string.

    Returns:
        dict with the keys ``name``, ``industry``, ``location``,
        ``followers`` and ``employees``. Each value is the stripped text of
        the matching element, or ``None`` when no such element is found.
    """
    soup = BeautifulSoup(html_text, 'html.parser')

    # Company name: the top-card title heading.
    name_tag = soup.find('h1', class_='org-top-card-summary__title')
    name = name_tag.get_text(strip=True) if name_tag is not None else None

    info_texts = [
        tag.get_text(strip=True)
        for tag in soup.find_all('div', class_='org-top-card-summary-info-list__info-item')
    ]

    # The first info item is taken as the industry.
    industry = info_texts[0] if info_texts else None

    location, followers = None, None
    for text in info_texts[1:]:
        # Match on 'follower' so the singular form ("1 follower") is also
        # recognised instead of being stored as the location.
        if 'follower' in text:
            followers = text
        else:
            location = text

    # Employee range: take the first span with this exact class string whose
    # text mentions 'employees'. Looking at every candidate (not only the
    # first one) keeps the field populated when an earlier span with the same
    # classes holds unrelated text.
    employees = None
    candidate_spans = soup.find_all(
        'span',
        class_='t-normal t-black--light link-without-visited-state link-without-hover-state',
    )
    for span in candidate_spans:
        if 'employees' in span.get_text():
            employees = span.get_text(strip=True)
            break

    return {
        'name': name,
        'industry': industry,
        'location': location,
        'followers': followers,
        'employees': employees
    }


In [3]:
# Folder containing the downloaded company pages (*.html).
path = r'D:\github\garage\02_linkedin\download\\'

# Explicit column order: the frame keeps these columns even when no page
# qualifies, so later cells that index df['company_id'] etc. still work.
COMPANY_COLUMNS = ['name', 'industry', 'location', 'followers', 'employees', 'company_id']

data = []

# os.listdir() returns entries in arbitrary order; sort case-insensitively so
# the row order (and the exported file) is reproducible across runs/machines.
for file in sorted(os.listdir(path), key=str.lower):
    if not file.endswith('.html'):
        continue
    print(file)
    # os.path.join is correct whether or not `path` ends with a separator.
    file_path = os.path.join(path, file)

    html_text = read_html_file(file_path)

    company_id = extract_universal_name(html_text)
    company_details = extract_company_details(html_text)
    company_details['company_id'] = company_id

    # Keep only pages where both the company name and the id were found.
    if company_details['name'] is not None and company_id is not None:
        data.append(company_details)

df = pd.DataFrame(data, columns=COMPANY_COLUMNS)
df.head()

3P Services_ Overview _ LinkedIn.html
4th-IR_ Overview _ LinkedIn.html
5CA_ Overview _ LinkedIn.html
A Data Pro_ Overview _ LinkedIn.html
A1 Srbija d.o.o._ Overview _ LinkedIn.html
A1 Telekom Austria Group_ Overview _ LinkedIn.html
Abbott_ Overview _ LinkedIn.html
Able_ Overview _ LinkedIn.html
Accel Club_ Overview _ LinkedIn.html
Accenture_ Overview _ LinkedIn.html
Acolin_ Overview _ LinkedIn.html
Acronis_ Overview _ LinkedIn.html
Activity Stream_ Overview _ LinkedIn.html
Acumatica_ Overview _ LinkedIn.html
Addiko Bank Srbija_ Overview _ LinkedIn.html
Adient_ Overview _ LinkedIn.html
AdTech Holding_ Overview _ LinkedIn.html
AGIMA_ Overview _ LinkedIn.html
Agremo _ Overview _ LinkedIn.html
Agribusiness intelligence _ IHS Markit_ Overview _ LinkedIn.html
Ahold Delhaize_ Overview _ LinkedIn.html
AIESEC in Serbia_ Overview _ LinkedIn.html
AIK Banka_ Overview _ LinkedIn.html
Airpink_ Overview _ LinkedIn.html
Aitu-DALA_ Overview _ LinkedIn.html
Aiviq_ Overview _ LinkedIn.html
Akvelon, Inc._

FIS_ Overview _ LinkedIn.html
ForLoops_ Overview _ LinkedIn.html
Fortrade Ltd._ Overview _ LinkedIn.html
Foursquare_ Overview _ LinkedIn.html
freelance_ Overview _ LinkedIn.html
Frikom DOO_ Overview _ LinkedIn.html
Full Color® Games_ Overview _ LinkedIn.html
Fundraise Up_ Overview _ LinkedIn.html
FUNL Studio_ Overview _ LinkedIn.html
gategroup_ Overview _ LinkedIn.html
GDC (Fujitsu preferred supplier of Services)_ Overview _ LinkedIn.html
GDC Services_ Overview _ LinkedIn.html
Gemini Softwares_ About _ LinkedIn.html
Generali_ Overview _ LinkedIn.html
GENIAL Science_ Overview _ LinkedIn.html
GEODIS_ Overview _ LinkedIn.html
Geoprovider_ Overview _ LinkedIn.html
Geotaur Australia_ Overview _ LinkedIn.html
Glera Games_ Overview _ LinkedIn.html
Global Engineering Technologies_ Overview _ LinkedIn.html
Glovo_ Overview _ LinkedIn.html
Gomex Trgovina_ Overview _ LinkedIn.html
GoodsForecast_ Overview _ LinkedIn.html
Gooten_ Overview _ LinkedIn.html
Gorki List_ Overview _ LinkedIn.html
goUrban_

Play Media _ Overview _ LinkedIn.html
Playkot_ Overview _ LinkedIn.html
Playrix_ Overview _ LinkedIn.html
PLENUS Deutschland_ Overview _ LinkedIn.html
Porobic Group_ Overview _ LinkedIn.html
Positive Technologies, Inc._ Overview _ LinkedIn.html
Practicum USA_ Overview _ LinkedIn.html
PravoTech_ Overview _ LinkedIn.html
Primary Schools_ About _ LinkedIn.html
PRIMERO Rent a Car DOO Beograd_ Overview _ LinkedIn.html
Promsvyazbank_ Overview _ LinkedIn.html
PropellerAds_ Overview _ LinkedIn.html
Proton System_ Overview _ LinkedIn.html
Providenca Marketing_ Overview _ LinkedIn.html
Pruten Organic_ Overview _ LinkedIn.html
Public Policy Research Center_ Overview _ LinkedIn.html
PUJO Architects & Designers_ Overview _ LinkedIn.html
Puppy Dogs & Ice Cream_ Overview _ LinkedIn.html
Puratos_ Overview _ LinkedIn.html
PwC Serbia_ Overview _ LinkedIn.html
Qiwi_ Overview _ LinkedIn.html
QuantLabs _ Overview _ LinkedIn.html
Quantum Business Solutions_ Overview _ LinkedIn.html
R-Ladies Global_ Overview

Unnamed: 0,name,industry,location,followers,employees,company_id
0,3P Services,Public Safety,"Lohne, Germany",2K followers,51-200 employees,4027709
1,4th-IR,IT Services and IT Consulting,"Lucerne, CH",1K followers,2-10 employees,22344581
2,5CA,Outsourcing and Offshoring Consulting,"Utrecht, Utrecht",31K followers,1K-5K employees,111005
3,A Data Pro,Information Services,Sofia,9K followers,201-500 employees,45781
4,A1 Srbija d.o.o.,Telecommunications,Belgrade,26K followers,1K-5K employees,72659835


In [4]:
# Sanity check: rows whose company_id repeats that of an earlier row.
df.loc[df['company_id'].duplicated()]

Unnamed: 0,name,industry,location,followers,employees,company_id


In [5]:
# Sanity check: rows with no company name.
df.loc[df['name'].isna()]

Unnamed: 0,name,industry,location,followers,employees,company_id


In [6]:
# Sanity check: rows with no company id.
df.loc[df['company_id'].isna()]

Unnamed: 0,name,industry,location,followers,employees,company_id


In [7]:
# Sanity check: rows with no industry.
df.loc[df['industry'].isna()]

Unnamed: 0,name,industry,location,followers,employees,company_id


In [8]:
# Convert the ids (extracted as digit strings) to integers. The width is
# pinned to 64 bits: a bare `int` resolves to NumPy's platform default
# integer, which is 32-bit on Windows with NumPy < 2.
df['company_id'] = df['company_id'].astype('int64')

# Write the table to an Excel workbook, leaving out the positional index.
df.to_excel('companies.xlsx', index=False)