<a href="https://colab.research.google.com/github/sj442/linkedin-visualizations/blob/main/Linkedin_Connections.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import pandas as pd
import numpy as np
import csv

## Privacy disclaimer

**Note**: I have intentionally removed all print functions that identified individuals in my network with their name, role and company.

In the visualizations, I will only display either an individual's role or the comapny they are affiliated with, to avoid giving out information that could be traced back to them.

# Dataframe Import


In [2]:
df = pd.read_csv('/content/sample_data/Connections.csv', skiprows=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   First Name     601 non-null    object
 1   Last Name      601 non-null    object
 2   URL            601 non-null    object
 3   Email Address  11 non-null     object
 4   Company        598 non-null    object
 5   Position       598 non-null    object
 6   Connected On   601 non-null    object
dtypes: object(7)
memory usage: 33.0+ KB


In [3]:
df = df.dropna(subset=['Position'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 598 entries, 0 to 600
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   First Name     598 non-null    object
 1   Last Name      598 non-null    object
 2   URL            598 non-null    object
 3   Email Address  11 non-null     object
 4   Company        598 non-null    object
 5   Position       598 non-null    object
 6   Connected On   598 non-null    object
dtypes: object(7)
memory usage: 37.4+ KB


In [11]:
print(df.columns)

Index(['First Name', 'Last Name', 'URL', 'Email Address', 'Company',
       'Position', 'Connected On'],
      dtype='object')


In [4]:
# Dropping emails column since it is not needed in analysis and is missing for most connections
df = df.drop('Email Address', axis=1)

In [10]:
# Number of unique positions held by individuals
df.Position.nunique()

457

In [12]:
# Apply a function to consolidate the same roles which are worded differently. Eg: Founder|CEO, Founder & CEO
df['Position'] = df['Position'].apply(consolidate_positions)

In [12]:
# Applying consolidation for just Founder and Engineer roles reduces the number of unique roles
# by 14%
df['Position'].nunique()

393

In [13]:
# Number of unique companies associated with my connections
df.Company.nunique()

486

# Model generated grouping

We will use a huggingface model and perform zero shot classfiication to classify the roles into a predefined list of broad role categories or industris.



In [None]:
from transformers import pipeline
import torch

# Check if GPU is available and use to speed up the classification
device = 0 if torch.cuda.is_available() else -1

classifier = pipeline("zero-shot-classification",
                      model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
                      device=device,
                      truncation=True,
                      max_length=64)

labels = ['Academia', 'HR', 'Internship', 'Data', 'AI/ML', 'QA', 'Engineering', 'Sales', 'Marketing', 'UX', 'Finance',
          'Operations', 'Product', 'Founder', 'Research', 'Management', 'Medicine', 'Legal', 'Architecture',
          'Consulting', 'Creative', 'Government', 'Nonprofit', 'Retired']

def classify_batch(positions, labels, batch_size=8):
    results = []
    for i in range(0, len(positions), batch_size):
        batch = positions[i:i + batch_size]
        batch = list(map(lambda x: x.lower(), batch))
        batch_results = classifier(batch, labels, multi_label=True)
        if isinstance(batch_results, dict):  # happens when only 1 item is present
            batch_results = [batch_results]
        results.extend([res['labels'][0] for res in batch_results])
    return results

def classify_position(position):
    position = position.lower()
    result = classifier(position, labels)
    return result['labels'][0]

In [None]:
import pickle
from google.colab import files
# We will save the position categories locally to avoid running the above computation repeatedly

df['Position_Category'] = classify_batch(df['Position'].tolist(), labels)

# Save to Colab local
with open('classified.pkl', 'wb') as f:
    pickle.dump(df['Position_Category'].tolist(), f)

# Download to local machine
files.download('classified.pkl')

In [None]:
from google.colab import files

# Upload the file from local machine
uploaded = files.upload()

In [15]:
import pickle
from google.colab import files

# Load the position categories into the dataframe
with open('classified.pkl', 'rb') as f:
    df['Position_Category'] = pickle.load(f)

In [17]:
df['Position_Category'].value_counts()

Unnamed: 0_level_0,count
Position_Category,Unnamed: 1_level_1
Management,249
Engineering,85
Founder,54
Academia,28
Consulting,27
Data,22
Creative,16
Research,15
Government,15
AI/ML,15


Too many roles have been categorized as Management and Engineering. We will attempt to refine the categorization before generating visualizations on it

In [18]:
df["Position"] = df["Position"].astype(str)
df["Position_Category"] = df.apply(lambda row: categorize_engineering(row.Position) if row.Position == 'Engineering' else row.Position_Category, axis = 1)
df["Position_Category"] = df.apply(lambda row: categorize_management(row.Position) if row.Position_Category == 'Management' else row.Position_Category, axis = 1)

In [19]:
df['Position_Category'].value_counts()

Unnamed: 0_level_0,count
Position_Category,Unnamed: 1_level_1
Leadership,122
Management,96
Founder,85
Engineering,85
Academia,28
Consulting,27
Data,22
Creative,16
Research,15
Government,15


# Create a graph using NetworkX

In [None]:
!pip install pyvis

In [17]:
from collections import Counter

frequent_roles = Counter(df['Position']).most_common(15)
print(frequent_roles)

frequent_roles = Counter(df['Position']).most_common(15)
res = list(map(lambda x: x[0], frequent_roles))
print(res)

[('Founder', 48), ('Software Engineer', 39), ('Founder & Ceo', 22), ('Director', 12), ('Senior Software Engineer', 10), ('Ceo', 8), ('Partner', 8), ('Vice President', 6), ('Product Manager', 5), ('Head Of Product', 5), ('Data Scientist', 4), ('Senior Product Manager', 4), ('Manager', 4), ('Director Of Product Management', 4), ('Managing Director', 4)]
['Founder', 'Software Engineer', 'Founder & Ceo', 'Director', 'Senior Software Engineer', 'Ceo', 'Partner', 'Vice President', 'Product Manager', 'Head Of Product', 'Data Scientist', 'Senior Product Manager', 'Manager', 'Director Of Product Management', 'Managing Director']


In [23]:
from pyvis.network import Network
import webbrowser
from collections import Counter

def shorten_label(label, max_len=20):
    return label if len(label) <= max_len else label[:max_len] + "..."

# Build graph
net = Network(height='750px', width='100%', notebook=False)

net.set_options("""
var options = {
  "physics": {
    "solver": "barnesHut",
    "repulsion": {
      "nodeDistance": 120,
      "springLength": 100,
      "springConstant": 0.04
    },
    "stabilization": {
      "iterations": 200
    }
  }
}
""")

# Count how many people have each role
role_counts = Counter(df['Position_Category'])

# Add central category nodes
for role, count in role_counts.items():

    net.add_node(
        role,
        label=role,
        size=15,  # base size + scaled size
        color=category_to_color(role),
        font={"color": "#000000", "size": 16}
    )

common_roles_df = df[df["Position"].isin(res)]

# Add nodes for frequent roles and edges from central nodes to frequent role nodes
for _, row in common_roles_df.iterrows():
  position = row['Position']
  category = row['Position_Category']
  net.add_node(position,
               label=shorten_label(position),
               title=position,
               shape='ellipse',
               size=12,
               color='orange',  # node background color
               font={
                   "face": "verdana",
                   "color": "#1f77b4",
                   "size": 10,
                   "strokeWidth": 0
                   },
               physics=False)
  net.add_edge(category, position, length=100)
  net.cdn_resources = 'remote'


filtered_df = df[~df["Position"].isin(res)]

# Add connection nodes and edges
for _, row in filtered_df.iterrows():
    position = row['Position']
    category = row['Position_Category']
    net.add_node(position,
                 label=shorten_label(position),
                 title = position,
                 shape='dot',
                 color=category_to_color(row['Position_Category']),
                 size=12)
    net.add_edge(category, position)
    net.cdn_resources = 'remote'

net.save_graph("linkedin_role_network.html")
files.download("linkedin_role_network.html")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Helper functions

In [19]:
import matplotlib.pyplot as plt

def get_25_colors():
    # Combine tab10 and tab20 to get 30 unique colors
    cmap1 = plt.get_cmap('tab10')  # 10 colors
    cmap2 = plt.get_cmap('tab20')  # 20 colors

    colors = []
    for i in range(10):
        colors.append(cmap1(i))
    for i in range(15):  # just take 15 from tab20 to make 25 total
        colors.append(cmap2(i))

    # Convert RGBA to hex
    hex_colors = ['#%02x%02x%02x' % tuple(int(255 * c) for c in color[:3]) for color in colors]
    return hex_colors

categories = df['Position_Category'].unique()
color_list = get_25_colors()
category_map = {cat: color_list[i % len(color_list)] for i, cat in enumerate(categories)}

def category_to_color(category):
    return category_map.get(category, '#999999')  # fallback gray

In [7]:
import string

def consolidate_positions(position):
  position = position.lower()
  position = consolidate_ceo_roles(position)
  position = consolidate_dev_roles(position)
  return string.capwords(position)

def consolidate_ceo_roles(position):
  if ("founder" in position) and ("ceo" in position):
    return "Founder & CEO"
  elif "founder" in position:
    return "Founder"
  elif "ceo" in position or "chief executive officer" in position:
    return "CEO"
  elif "co-founder" in position and "ceo" in position:
    return "Co-Founder & CEO"
  elif "co-founder" in position:
    return "Co-Founder"
  else:
    return position


def consolidate_dev_roles(position):
  if ("software development engineer" in position) or ("software development engineer i" in position):
    return "Software Engineer"
  elif "software development engineer ii" in position or "senior software engineer" in position:
    return "Senior Software Engineer"
  elif "developer" in position or "mobile" in position or "ios" in position or "android" in position:
    return "Software Engineer"
  else:
    return position

In [8]:
def string_contains_any(text, elements):
    """
    Checks if a string contains any of the elements from a list.

    Args:
        text: The string to search in.
        elements: A list of strings to search for.

    Returns:
        True if the string contains at least one element from the list, False otherwise.
    """
    return any(element in text for element in elements)

In [9]:
def categorize_position(position):
  position = position.lower()
  if string_contains_any(position, ['educat', 'lecturer', 'faculty', "postdoctoral", "phd", "bachelor", 'bs', 'ms', 'masters', 'professor', 'teach', 'graduate']):
    return "Academia"
  elif string_contains_any(position, ["hr", "people", "recruit", 'talent']):
    return "HR"
  elif string_contains_any(position, ["intern", "internship", "interim"]):
    return "Internship"
  elif string_contains_any(position, ["data", "ml", "machine learning", 'analyst']):
    return "Data"
  elif string_contains_any(position, ["qa", "quality", "assurance", "test"]):
    return "QA"
  elif string_contains_any(position, ["engineer", "software", 'ios', 'android', 'mobile developer']):
    return "Engineering"
  elif "sales" in position:
    return "Sales"
  elif string_contains_any(position, ['brand', 'marketing']):
    return "Marketing"
  elif string_contains_any(position, ['ux', 'ui', 'design']):
    return "UX"
  elif string_contains_any(position, ['fincance', 'trade', 'insurance', 'invest', 'credit', 'actuarial']):
    return "Finance"
  elif "operations" in position:
    return "Operations"
  elif string_contains_any(position, ['pm', 'product']):
    return "Product"
  elif string_contains_any(position, ['founder', 'ceo', 'owner', 'chief executive officer']):
    return "Founder"
  elif string_contains_any(position, ["research", "scientist", 'r&d']):
    return "Research"
  elif string_contains_any(position, ['md', 'executive', 'lead', 'business', 'advisor', 'cfo', 'cto', 'cmo', 'partner', 'head', 'manager', 'leader', 'president', 'director', 'vp', 'consultant', 'principal', 'chief']):
    return "Management"
  elif string_contains_any(position, ['doctor', 'chemist', 'dentist', 'surgeon', 'physician']):
    return "Medicine"
  elif string_contains_any(position, ['legal', 'law', 'attorney']):
    return "Legal"
  else:
    return "Other"


In [10]:
def categorize_engineering(position):
  position = position.lower()
  if string_contains_any(position, ['ios', 'mobile', 'android']):
    return "Mobile"
  elif string_contains_any(position, ["stack", "java", "python", "sofware"]):
    return "Backend"
  elif string_contains_any(position, ["architect", "aws", "azure", "hardware"]):
    return "Solutions Architect"
  elif string_contains_any(position, ["data", "ml", "machine learning", 'analyst']):
    return "Data Engineer"
  elif string_contains_any(position, ["qa", "quality assurance", "test"]):
    return "QA"
  elif string_contains_any(position, ['ux', 'ui', 'design']):
    return "UX"
  else:
    return "Engineering"

In [11]:
def categorize_management(position):
  position = position.lower()
  if string_contains_any(position, ['founder', 'ceo', 'owner', 'chief executive officer', "entrepreneur"]):
    return "Founder"
  elif string_contains_any(position, ["md", 'chief', "executive", 'partner', 'head', 'lead', 'president', 'principal', 'director']):
    return "Leadership"
  else:
    return "Management"
