<a href="https://colab.research.google.com/github/sj442/linkedin-visualizations/blob/main/Linkedin_Connections.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import pandas as pd
import numpy as np
import csv

In [3]:
!pip install pyvis

Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jedi>=0.16 (from ipython>=5.3.0->pyvis)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, pyvis
Successfully installed jedi-0.19.2 pyvis-0.3.2


In [4]:
from pyvis.network import Network
import webbrowser
from collections import Counter

import pickle
from google.colab import files

from transformers import pipeline
import torch

## Privacy disclaimer

**Note**: I have intentionally removed all print functions that identified individuals in my network with their name, role and company.

In the visualizations, I will only display either an individual's role or the comapny they are affiliated with, to avoid giving out information that could be traced back to them.

# Dataframe Import


In [5]:
df = pd.read_csv('/content/sample_data/Connections.csv', skiprows=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   First Name     601 non-null    object
 1   Last Name      601 non-null    object
 2   URL            601 non-null    object
 3   Email Address  11 non-null     object
 4   Company        598 non-null    object
 5   Position       598 non-null    object
 6   Connected On   601 non-null    object
dtypes: object(7)
memory usage: 33.0+ KB


In [6]:
df = df.dropna(subset=['Position'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 598 entries, 0 to 600
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   First Name     598 non-null    object
 1   Last Name      598 non-null    object
 2   URL            598 non-null    object
 3   Email Address  11 non-null     object
 4   Company        598 non-null    object
 5   Position       598 non-null    object
 6   Connected On   598 non-null    object
dtypes: object(7)
memory usage: 37.4+ KB


In [11]:
print(df.columns)

Index(['First Name', 'Last Name', 'URL', 'Email Address', 'Company',
       'Position', 'Connected On'],
      dtype='object')


In [7]:
# Dropping emails column since it is not needed in analysis and is missing for most connections
df = df.drop('Email Address', axis=1)

# Network of roles

In [10]:
# Number of unique positions held by individuals
df.Position.nunique()

457

In [12]:
# Apply a function to consolidate the same roles which are worded differently. Eg: Founder|CEO, Founder & CEO
df['Position'] = df['Position'].apply(consolidate_positions)

In [12]:
# Applying consolidation for just Founder and Engineer roles reduces the number of unique roles
# by 14%
df['Position'].nunique()

393

## Model generated grouping

We will use a huggingface model and perform zero shot classfiication to classify the roles into a predefined list of broad role categories or industris.



In [None]:
# Check if GPU is available and use to speed up the classification
device = 0 if torch.cuda.is_available() else -1

classifier = pipeline("zero-shot-classification",
                      model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
                      device=device,
                      truncation=True,
                      max_length=64)

labels = ['Academia', 'HR', 'Internship', 'Data', 'AI/ML', 'QA', 'Engineering', 'Sales', 'Marketing', 'UX', 'Finance',
          'Operations', 'Product', 'Founder', 'Research', 'Management', 'Medicine', 'Legal', 'Architecture',
          'Consulting', 'Creative', 'Government', 'Nonprofit', 'Retired']

def classify_batch(positions, labels, batch_size=8):
    results = []
    for i in range(0, len(positions), batch_size):
        batch = positions[i:i + batch_size]
        batch = list(map(lambda x: x.lower(), batch))
        batch_results = classifier(batch, labels, multi_label=True)
        if isinstance(batch_results, dict):  # happens when only 1 item is present
            batch_results = [batch_results]
        results.extend([res['labels'][0] for res in batch_results])
    return results

def classify_position(position):
    position = position.lower()
    result = classifier(position, labels)
    return result['labels'][0]

In [None]:
# We will save the position categories locally to avoid running the above computation repeatedly

df['Position_Category'] = classify_batch(df['Position'].tolist(), labels)

# Save to Colab local
with open('classified.pkl', 'wb') as f:
    pickle.dump(df['Position_Category'].tolist(), f)

# Download to local machine
files.download('classified.pkl')

In [None]:
# Upload the file from local machine
uploaded = files.upload()

In [15]:
# Load the position categories into the dataframe
with open('classified.pkl', 'rb') as f:
    df['Position_Category'] = pickle.load(f)

In [17]:
df['Position_Category'].value_counts()

Unnamed: 0_level_0,count
Position_Category,Unnamed: 1_level_1
Management,249
Engineering,85
Founder,54
Academia,28
Consulting,27
Data,22
Creative,16
Research,15
Government,15
AI/ML,15


Too many roles have been categorized as Management and Engineering. We will attempt to refine the categorization before generating visualizations on it

In [18]:
df["Position"] = df["Position"].astype(str)
df["Position_Category"] = df.apply(lambda row: categorize_engineering(row.Position) if row.Position == 'Engineering' else row.Position_Category, axis = 1)
df["Position_Category"] = df.apply(lambda row: categorize_management(row.Position) if row.Position_Category == 'Management' else row.Position_Category, axis = 1)

In [19]:
df['Position_Category'].value_counts()

Unnamed: 0_level_0,count
Position_Category,Unnamed: 1_level_1
Leadership,122
Management,96
Founder,85
Engineering,85
Academia,28
Consulting,27
Data,22
Creative,16
Research,15
Government,15


## Create a graph using NetworkX

In [17]:
frequent_roles = Counter(df['Position']).most_common(15)
print(frequent_roles)

frequent_roles = Counter(df['Position']).most_common(15)
res = list(map(lambda x: x[0], frequent_roles))
print(res)

[('Founder', 48), ('Software Engineer', 39), ('Founder & Ceo', 22), ('Director', 12), ('Senior Software Engineer', 10), ('Ceo', 8), ('Partner', 8), ('Vice President', 6), ('Product Manager', 5), ('Head Of Product', 5), ('Data Scientist', 4), ('Senior Product Manager', 4), ('Manager', 4), ('Director Of Product Management', 4), ('Managing Director', 4)]
['Founder', 'Software Engineer', 'Founder & Ceo', 'Director', 'Senior Software Engineer', 'Ceo', 'Partner', 'Vice President', 'Product Manager', 'Head Of Product', 'Data Scientist', 'Senior Product Manager', 'Manager', 'Director Of Product Management', 'Managing Director']


In [23]:
def shorten_label(label, max_len=20):
    return label if len(label) <= max_len else label[:max_len] + "..."

# Build graph
net = Network(height='750px', width='100%', notebook=False)

net.set_options("""
var options = {
  "physics": {
    "solver": "barnesHut",
    "repulsion": {
      "nodeDistance": 120,
      "springLength": 100,
      "springConstant": 0.04
    },
    "stabilization": {
      "iterations": 200
    }
  }
}
""")

# Count how many people have each role
role_counts = Counter(df['Position_Category'])

# Add central category nodes
for role, count in role_counts.items():

    net.add_node(
        role,
        label=role,
        size=15,  # base size + scaled size
        color=category_to_color(role),
        font={"color": "#000000", "size": 16}
    )

common_roles_df = df[df["Position"].isin(res)]

# Add nodes for frequent roles and edges from central nodes to frequent role nodes
for _, row in common_roles_df.iterrows():
  position = row['Position']
  category = row['Position_Category']
  net.add_node(position,
               label=shorten_label(position),
               title=position,
               shape='ellipse',
               size=12,
               color='orange',  # node background color
               font={
                   "face": "verdana",
                   "color": "#1f77b4",
                   "size": 10,
                   "strokeWidth": 0
                   },
               physics=False)
  net.add_edge(category, position, length=100)
  net.cdn_resources = 'remote'


filtered_df = df[~df["Position"].isin(res)]

# Add connection nodes and edges
for _, row in filtered_df.iterrows():
    position = row['Position']
    category = row['Position_Category']
    net.add_node(position,
                 label=shorten_label(position),
                 title = position,
                 shape='dot',
                 color=category_to_color(row['Position_Category']),
                 size=12)
    net.add_edge(category, position)
    net.cdn_resources = 'remote'

net.save_graph("linkedin_role_network.html")
files.download("linkedin_role_network.html")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Network of companies

In [8]:
companies = df.Company.value_counts().reset_index().Company.to_list()
print(len(companies))

486


In [5]:
# Number of unique companies associated with my connections
df.Company.nunique()

486

In [5]:
df.Company.value_counts().head(10)

Unnamed: 0_level_0,count
Company,Unnamed: 1_level_1
Audible,17
Amazon,17
Meta,12
Google,6
Apple,5
"Audible, Inc.",4
Boston Consulting Group (BCG),4
Microsoft,4
Amazon Web Services (AWS),4
McKinsey & Company,4


In [9]:
company_industry_df = df['Company'].value_counts().reset_index()
company_industry_df.columns = ['Company', 'Count']
company_industry_df.sort_values(by='Count', ascending=False)
company_industry_df = company_industry_df.head(100)

companies = company_industry_df['Company'].to_list()
print(len(companies))

100


In [6]:
# Used ChatGPT to generate a company - industry mapping

company_industry_mapping = {
  "Audible": "Audiobooks & Digital Media",
  "Amazon": "E‑commerce & Cloud Computing",
  "Meta": "Social Media & Technology",
  "Google": "Internet Services & Technology",
  "Apple": "Consumer Electronics & Software",
  "Audible, Inc.": "Audiobooks & Digital Media",
  "Boston Consulting Group (BCG)": "Management Consulting",
  "Microsoft": "Software & Cloud Computing",
  "Amazon Web Services (AWS)": "Cloud Computing",
  "McKinsey & Company": "Management Consulting",
  "Cornell University": "Higher Education",
  "ZS": "Management Consulting (Healthcare Analytics)",
  "Merck": "Pharmaceuticals & Healthcare",
  "LinkedIn": "Professional Networking & Technology",
  "PayPal": "Fintech & Online Payments",
  "Genpact": "Business Process Outsourcing",
  "Snap Inc.": "Social Media & Technology",
  "DoorDash": "Food Delivery & Logistics",
  "Tower Research Capital": "Financial Services (High-frequency Trading)",
  "Zepto": "E‑commerce & Quick-Commerce",
  "Kearney": "Management Consulting",
  "Study Hall Educational Foundation (SHEF)": "Education & Nonprofit",
  "Deutsche Bank": "Banking & Financial Services",
  "Philip Morris International": "Tobacco & Consumer Goods",
  "ExxonMobil": "Oil & Gas",
  "Stealth AI Startup": "Artificial Intelligence (Startup)",
  "Syensqo": "Healthcare Analytics & Digital Lab",
  "The Akanksha Foundation": "Nonprofit / Education",
  "Bain & Company": "Management Consulting",
  "IDFC FIRST Bank": "Banking & Financial Services",
  "Accenture": "Consulting & Technology Services",
  "Shell": "Oil & Gas / Energy",
  "IBM": "Technology & Consulting",
  "Bloomberg": "Financial Data & Media",
  "Novartis": "Pharmaceuticals & Healthcare",
  "Autodesk": "Design Software",
  "JPMorgan Chase & Co.": "Banking & Financial Services",
  "Verizon": "Telecommunications",
  "American Express": "Financial Services (Payments)",
  "PowerToFly": "Online Job Marketplace & Diversity",
  "FitBudd": "Health & Fitness Technology",
  "Indian Institute of Technology, Kanpur": "Higher Education",
  "Kredivo Group": "Fintech & Credit Platform",
  "Freelance": "Independent Contractor",
  "UBS": "Banking & Wealth Management",
  "Commonwealth Bank": "Banking & Financial Services",
  "OnMyCanvas": "Custom Gifting & E‑commerce",
  "SUEZ": "Environmental Services & Water Management",
  "Government of India": "Public Sector",
  "FusionEngage": "CRM & Customer Engagement Software",
  "AUR - Architecture Urbanism Research": "Architecture & Urban Design",
  "Yaak": "AI & Communication Technology",
  "Inxeption": "E‑commerce Software & Marketplace",
  "Brasserie BFM SA": "Food & Brewery",
  "Meesho": "E‑commerce & Social Commerce",
  "NVIDIA": "Semiconductors & AI Hardware",
  "LSEG (London Stock Exchange Group)": "Financial Data & Exchange Services",
  "Consultancy Services": "Management Consulting",
  "Aravali Asset Management": "Asset Management",
  "Lincoln Electric India": "Industrial Manufacturing & Welding",
  "FanDuel": "Online Gaming & Sports Betting",
  "HDFC Life": "Insurance",
  "State Street Global Advisors": "Asset Management",
  "Dexter Capital Advisors": "Financial Advisory",
  "Kenvue": "Consumer Health Products",
  "Northern Trust Corporation": "Wealth Management & Financial Services",
  "Bumble": "Social Networking & Dating",
  "LazyGardener": "Gardening Services",
  "Starbucks": "Food & Beverage (Coffeehouse)",
  "Medikabazaar": "Healthcare Supply Chain & B2B",
  "Innolution, LLC": "Consulting Services",
  "AKS University, Satna (M.P.)": "Higher Education",
  "Smaart Classes": "Education & Tutoring",
  "DEWI Foundation": "Nonprofit",
  "Executive Placement Network": "Recruitment & Professional Services",
  "Optum": "Health Services & Technology",
  "AINE AI": "Artificial Intelligence",
  "SecurityScorecard": "Cybersecurity & Risk Management",
  "UnShaadi": "Social Networking / Matrimony Tech",
  "Cisco": "Networking & Telecommunications",
  "Stamford AI Consulting": "AI Consulting & Services",
  "Marvell Technology": "Semiconductors",
  "shikhar stone crushers/": "Construction Materials",
  "RedSeer": "Management Consulting & Analytics",
  "Chubb": "Insurance",
  "PPG": "Industrial Coatings & Materials",
  "Rayonsoft Technologies Inc": "IT Services",
  "TopSkill Managment Consulting LLP": "Management Consulting",
  "Capabilities Limited": "Consulting Services",
  "HiThrive": "Sleep / Health Technology",
  "Infoparks Kerala": "Cyber Park & IT Services",
  "Island Boys Recruiting": "Recruitment Services",
  "Alleyoop": "Marketing Technology",
  "VMware": "Virtualization & Cloud Computing",
  "Spectrum Equity": "Private Equity",
  "Rippling": "HR & IT Management Software",
  "Dexcom": "Medical Devices & Diabetes Care",
  "YoloHealth": "Health Insurance & Fintech",
  "Graviton Research Capital LLP": "Financial Services & Research",
  "TechSur Solutions": "Technology Consulting"
}

In [11]:
# Extract values
values = company_industry_mapping.values()

# Convert to a set to get unique values
unique_values_set = set(values)

# Get the count of unique values
unique_count = len(unique_values_set)
print(unique_count)

84


In [15]:
# The companies were being categorized into 84 industries. So updated my prompt
# to specify the number of industries to categorize into

company_industry_mapping_v1 = {
  "Audible": "Technology",
  "Amazon": "E-commerce",
  "Meta": "Technology",
  "Google": "Technology",
  "Apple": "Consumer Goods",
  "Audible, Inc.": "Technology",
  "Boston Consulting Group (BCG)": "Management Consulting",
  "Microsoft": "Technology",
  "Amazon Web Services (AWS)": "Technology",
  "McKinsey & Company": "Management Consulting",
  "Cornell University": "Education & Research",
  "ZS": "Management Consulting",
  "Merck": "Pharmaceuticals & Healthcare",
  "LinkedIn": "Technology",
  "PayPal": "Technology",
  "Genpact": "Technology",
  "Snap Inc.": "Media & Entertainment",
  "DoorDash": "Technology",
  "Tower Research Capital": "Financial Services",
  "Zepto": "E-commerce",
  "Kearney": "Management Consulting",
  "Study Hall Educational Foundation (SHEF)": "Education & Research",
  "Deutsche Bank": "Financial Services",
  "Philip Morris International": "Consumer Goods",
  "ExxonMobil": "Energy & Industrial",
  "Stealth AI Startup": "Other / Freelance / Startups",
  "Syensqo": "Pharmaceuticals & Healthcare",
  "The Akanksha Foundation": "Nonprofit & Foundations",
  "Bain & Company": "Management Consulting",
  "IDFC FIRST Bank": "Financial Services",
  "Accenture": "Management Consulting",
  "Shell": "Energy & Industrial",
  "IBM": "Technology",
  "Bloomberg": "Media & Entertainment",
  "Novartis": "Pharmaceuticals & Healthcare",
  "Autodesk": "Technology",
  "JPMorgan Chase & Co.": "Financial Services",
  "Verizon": "Telecommunications",
  "American Express": "Financial Services",
  "PowerToFly": "Technology",
  "FitBudd": "Technology",
  "Indian Institute of Technology, Kanpur": "Education & Research",
  "Kredivo Group": "Financial Services",
  "Freelance": "Other / Freelance / Startups",
  "UBS": "Financial Services",
  "Commonwealth Bank": "Financial Services",
  "OnMyCanvas": "E-commerce",
  "SUEZ": "Energy & Industrial",
  "Government of India": "Government & Public Sector",
  "FusionEngage": "Technology",
  "AUR - Architecture Urbanism Research": "Architecture & Urban Design",
  "Yaak": "Technology",
  "Inxeption": "E-commerce",
  "Brasserie BFM SA": "Other / Freelance / Startups",
  "Meesho": "E-commerce",
  "NVIDIA": "Technology",
  "LSEG (London Stock Exchange Group)": "Financial Services",
  "Consultancy Services": "Management Consulting",
  "Aravali Asset Management": "Financial Services",
  "Lincoln Electric India": "Energy & Industrial",
  "FanDuel": "Media & Entertainment",
  "HDFC Life": "Insurance",
  "State Street Global Advisors": "Financial Services",
  "Dexter Capital Advisors": "Financial Services",
  "Kenvue": "Pharmaceuticals & Healthcare",
  "Northern Trust Corporation": "Financial Services",
  "Bumble": "Technology",
  "LazyGardener": "Consumer Goods",
  "Starbucks": "Consumer Goods",
  "Medikabazaar": "Pharmaceuticals & Healthcare",
  "Innolution, LLC": "Management Consulting",
  "AKS University, Satna (M.P.)": "Education & Research",
  "Smaart Classes": "Education & Research",
  "DEWI Foundation": "Nonprofit & Foundations",
  "Executive Placement Network": "Recruitment & Professional Services",
  "Optum": "Pharmaceuticals & Healthcare",
  "AINE AI": "Technology",
  "SecurityScorecard": "Technology",
  "UnShaadi": "Other / Freelance / Startups",
  "Cisco": "Technology",
  "Stamford AI Consulting": "Technology",
  "Marvell Technology": "Technology",
  "shikhar stone crushers/": "Energy & Industrial",
  "RedSeer": "Management Consulting",
  "Chubb": "Insurance",
  "PPG": "Energy & Industrial",
  "Rayonsoft Technologies Inc": "Technology",
  "TopSkill Managment Consulting LLP": "Management Consulting",
  "Capabilities Limited": "Management Consulting",
  "HiThrive": "Other / Freelance / Startups",
  "Infoparks Kerala": "Government & Public Sector",
  "Island Boys Recruiting": "Recruitment & Professional Services",
  "Alleyoop": "Technology",
  "VMware": "Technology",
  "Spectrum Equity": "Financial Services",
  "Rippling": "Technology",
  "Dexcom": "Pharmaceuticals & Healthcare",
  "YoloHealth": "Technology",
  "Graviton Research Capital LLP": "Financial Services",
  "TechSur Solutions": "Technology"
}


In [16]:
company_industry_df = df['Company'].value_counts().reset_index()
company_industry_df.columns = ['Company', 'Count']
company_industry_df['Industry'] = company_industry_df['Company'].map(company_industry_mapping_v1)
company_industry_df.sort_values(by='Count', ascending=False)
company_industry_df = company_industry_df.dropna(subset=['Industry'])
company_industry_df.shape[0]

99

In [17]:
# Sum counts for both rows
audible_total = company_industry_df.loc[company_industry_df['Company'].isin(['Audible', 'Audible, Inc.']), 'Count'].sum()

print(audible_total)

industry = company_industry_df.loc[company_industry_df['Company'].isin(['Audible', 'Audible, Inc.']), 'Industry'].iloc[0]

# Drop both rows
company_industry_df = company_industry_df[~company_industry_df['Company'].isin(['Audible', 'Audible, Inc.'])]

# Add a new combined row
company_industry_df.loc[len(company_industry_df)] = ['Audible', audible_total, industry]

company_industry_df.sort_values(by='Count', ascending=False, inplace=True)
company_industry_df.head(10)

21


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  company_industry_df.sort_values(by='Count', ascending=False, inplace=True)


Unnamed: 0,Company,Count,Industry
97,Audible,21,Technology
1,Amazon,17,E-commerce
2,Meta,12,Technology
3,Google,6,Technology
4,Apple,5,Consumer Goods
7,Microsoft,4,Technology
8,Amazon Web Services (AWS),4,Technology
9,McKinsey & Company,4,Management Consulting
6,Boston Consulting Group (BCG),4,Management Consulting
11,ZS,3,Management Consulting


In [18]:
# Extract values
values = set(company_industry_mapping_v1.values())
print(values)
print(len(values))

{'Technology', 'Telecommunications', 'Consumer Goods', 'E-commerce', 'Other / Freelance / Startups', 'Insurance', 'Government & Public Sector', 'Pharmaceuticals & Healthcare', 'Financial Services', 'Education & Research', 'Energy & Industrial', 'Nonprofit & Foundations', 'Recruitment & Professional Services', 'Management Consulting', 'Architecture & Urban Design', 'Media & Entertainment'}
16


In [20]:
from pyvis.network import Network
import math

# list of industries
industries = company_industry_df['Industry'].unique()

# Setup
net = Network(height="750px", width="100%", bgcolor="white", font_color="black")

# Network options
net.set_options("""
var options = {
  "physics": {
    "solver": "forceAtlas2Based",
    "forceAtlas2Based": {
      "gravitationalConstant": -50,
      "centralGravity": 0.01,
      "springLength": 150,
      "springConstant": 0.08
    },
    "stabilization": { "iterations": 150 }
  }
}
""")

# Position 16 industry nodes in a circle
radius = 400
angle_step = 2 * math.pi / len(industries)

industry_positions = {}
for i, industry in enumerate(industries):
    angle = i * angle_step
    x = radius * math.cos(angle)
    y = radius * math.sin(angle)
    industry_positions[industry] = (x, y)

    net.add_node(industry,
                 label=industry,
                 title=industry,
                 x=x,
                 y=y,
                 fixed=True,
                 color=category_to_color(industry, industries),
                 shape='dot',
                 size=40,
                 )


for _, row in company_industry_df.iterrows():
  company = row['Company']
  industry = row['Industry']
  weight = row['Count']

  net.add_node(company,
               label=company,
               title=company,
               shape='dot',
               size=20,
               fixed=False,
               color=category_to_color(row['Industry'], industries),
)

  net.add_edge(industry, company, value=weight)

net.save_graph("linkedin_company_network.html")
files.download("linkedin_company_network.html")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Helper functions

In [13]:
import matplotlib.pyplot as plt

def get_25_colors():
    # Combine tab10 and tab20 to get 30 unique colors
    cmap1 = plt.get_cmap('tab10')  # 10 colors
    cmap2 = plt.get_cmap('tab20')  # 20 colors

    colors = []
    for i in range(10):
        colors.append(cmap1(i))
    for i in range(15):  # just take 15 from tab20 to make 25 total
        colors.append(cmap2(i))

    # Convert RGBA to hex
    hex_colors = ['#%02x%02x%02x' % tuple(int(255 * c) for c in color[:3]) for color in colors]
    return hex_colors

def category_to_color(category, categories):
    color_list = get_25_colors()
    category_map = {cat: color_list[i % len(color_list)] for i, cat in enumerate(categories)}
    return category_map.get(category, '#999999')  # fallback gray

In [7]:
import string

def consolidate_positions(position):
  position = position.lower()
  position = consolidate_ceo_roles(position)
  position = consolidate_dev_roles(position)
  return string.capwords(position)

def consolidate_ceo_roles(position):
  if ("founder" in position) and ("ceo" in position):
    return "Founder & CEO"
  elif "founder" in position:
    return "Founder"
  elif "ceo" in position or "chief executive officer" in position:
    return "CEO"
  elif "co-founder" in position and "ceo" in position:
    return "Co-Founder & CEO"
  elif "co-founder" in position:
    return "Co-Founder"
  else:
    return position


def consolidate_dev_roles(position):
  if ("software development engineer" in position) or ("software development engineer i" in position):
    return "Software Engineer"
  elif "software development engineer ii" in position or "senior software engineer" in position:
    return "Senior Software Engineer"
  elif "developer" in position or "mobile" in position or "ios" in position or "android" in position:
    return "Software Engineer"
  else:
    return position

In [8]:
def string_contains_any(text, elements):
    """
    Checks if a string contains any of the elements from a list.

    Args:
        text: The string to search in.
        elements: A list of strings to search for.

    Returns:
        True if the string contains at least one element from the list, False otherwise.
    """
    return any(element in text for element in elements)

In [9]:
def categorize_position(position):
  position = position.lower()
  if string_contains_any(position, ['educat', 'lecturer', 'faculty', "postdoctoral", "phd", "bachelor", 'bs', 'ms', 'masters', 'professor', 'teach', 'graduate']):
    return "Academia"
  elif string_contains_any(position, ["hr", "people", "recruit", 'talent']):
    return "HR"
  elif string_contains_any(position, ["intern", "internship", "interim"]):
    return "Internship"
  elif string_contains_any(position, ["data", "ml", "machine learning", 'analyst']):
    return "Data"
  elif string_contains_any(position, ["qa", "quality", "assurance", "test"]):
    return "QA"
  elif string_contains_any(position, ["engineer", "software", 'ios', 'android', 'mobile developer']):
    return "Engineering"
  elif "sales" in position:
    return "Sales"
  elif string_contains_any(position, ['brand', 'marketing']):
    return "Marketing"
  elif string_contains_any(position, ['ux', 'ui', 'design']):
    return "UX"
  elif string_contains_any(position, ['fincance', 'trade', 'insurance', 'invest', 'credit', 'actuarial']):
    return "Finance"
  elif "operations" in position:
    return "Operations"
  elif string_contains_any(position, ['pm', 'product']):
    return "Product"
  elif string_contains_any(position, ['founder', 'ceo', 'owner', 'chief executive officer']):
    return "Founder"
  elif string_contains_any(position, ["research", "scientist", 'r&d']):
    return "Research"
  elif string_contains_any(position, ['md', 'executive', 'lead', 'business', 'advisor', 'cfo', 'cto', 'cmo', 'partner', 'head', 'manager', 'leader', 'president', 'director', 'vp', 'consultant', 'principal', 'chief']):
    return "Management"
  elif string_contains_any(position, ['doctor', 'chemist', 'dentist', 'surgeon', 'physician']):
    return "Medicine"
  elif string_contains_any(position, ['legal', 'law', 'attorney']):
    return "Legal"
  else:
    return "Other"


In [10]:
def categorize_engineering(position):
  position = position.lower()
  if string_contains_any(position, ['ios', 'mobile', 'android']):
    return "Mobile"
  elif string_contains_any(position, ["stack", "java", "python", "sofware"]):
    return "Backend"
  elif string_contains_any(position, ["architect", "aws", "azure", "hardware"]):
    return "Solutions Architect"
  elif string_contains_any(position, ["data", "ml", "machine learning", 'analyst']):
    return "Data Engineer"
  elif string_contains_any(position, ["qa", "quality assurance", "test"]):
    return "QA"
  elif string_contains_any(position, ['ux', 'ui', 'design']):
    return "UX"
  else:
    return "Engineering"

In [11]:
def categorize_management(position):
  position = position.lower()
  if string_contains_any(position, ['founder', 'ceo', 'owner', 'chief executive officer', "entrepreneur"]):
    return "Founder"
  elif string_contains_any(position, ["md", 'chief', "executive", 'partner', 'head', 'lead', 'president', 'principal', 'director']):
    return "Leadership"
  else:
    return "Management"


In [None]:
# Use the OpenAI API instead for categorizing the companies into industries.

import openai
import json

openai.api_key = "your api key"

# Construct the prompt
companies_string = "\n".join(f"- {name}" for name in companies)
prompt = (
    "Here is a list of 100 companies:\n"
    f"{companies_string}\n\n"
    "Please classify these companies into 15 to 20 industry categories based on your understanding. "
    "Return the result as a JSON object where each key is a company name and the value is the inferred industry. "
    "Ensure consistency in industry names across companies. Example format:\n"
    "{\n  \"Meta\": \"Technology\",\n  \"Pfizer\": \"Healthcare\",\n  ...\n}"
)

# Create chat completion
response = openai.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": prompt}],
    temperature=0.2,
    max_tokens=2000
)

# Get and parse the response
response_text = response.choices[0].message.content.strip()

try:
    industry_mapping = json.loads(response_text)
except json.JSONDecodeError:
    print("JSON decoding failed. Here is the raw output:\n")
    print(response_text)
    industry_mapping = {}

# Save to file
with open("company_industries_bulk.json", "w") as f:
    json.dump(industry_mapping, f, indent=2)

print("Done! Industry mapping saved to 'company_industries_bulk.json'")

