In [None]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import DBSCAN

# Company Directorships Analysis
This notebook analyzes board directorships, demographics, and compensation.

In [None]:
# Load the directorships table and turn the 't'/'f' text flag into real booleans.
company_directorships = pd.read_csv('company_directorships.csv')

# Missing flags count as 'f'; the comparison is case-insensitive.
software_flag = company_directorships['software_background'].fillna('f').str.lower()
company_directorships['software_background'] = software_flag.eq('t')

# Random peek at five rows (unseeded, so the rows shown differ between runs).
company_directorships.sample(5)

In [None]:
# Load per-director demographics/compensation rows and add an upper-cased
# NAME column, which the aggregation below groups on.
comp_raw = (
    pd.read_csv('director-details.csv')
      .assign(NAME=lambda details: details['name'].str.upper())
)

def most_common(series):
    """Return the most frequent value of ``series``.

    ``Series.mode()`` ignores missing values and returns the modes in sorted
    order, so ties resolve to the smallest mode.

    Returns:
        The first mode, or ``None`` when the series has no non-missing values.
    """
    modes = series.mode()
    if modes.empty:
        return None
    return modes.iloc[0]

# Collapse the raw rows to one record per upper-cased director name:
# max age, summed compensation, and most frequent gender label.
by_director = comp_raw.groupby('NAME')
comp_demo = pd.DataFrame({
    'age': by_director['age'].max(),
    'compensation': by_director['compensation'].sum(),
    'gender': by_director['gender'].agg(most_common),
})

# The +1 offset keeps zero compensation defined under log10 (it maps to 0).
comp_demo['log_compensation'] = comp_demo['compensation'].add(1).map(math.log10)
comp_demo.head()

In [None]:
# Handle missing genders
# isna() flags both None and NaN placeholders. The previous isin([None, ...])
# check only reliably matched the literal None object, so missing values
# stored as NaN could slip through the filter.
mask = comp_demo.gender.isna() | comp_demo.gender.eq('unknown')
print(f"Dropping {mask.sum()} missing/unknown genders")
comp_demo = comp_demo[~mask]
comp_demo.gender.value_counts()

In [None]:
# Directorship counts plot
# Number of distinct companies each director sits on.
directorship_counts = company_directorships.groupby('director_name').company_name.nunique().sort_values()

# For each board count, how many directors hold exactly that many seats.
# Only these y values are log10-transformed; the x axis stays linear.
directors_per_board_count = directorship_counts.value_counts().sort_index().map(math.log10)

ax = directors_per_board_count.plot()
ax.set_title("Log10 Directorship Count Distribution")
# The x axis shows raw board counts, so it must not be labelled as log10.
ax.set_xlabel("Boards per Director")
ax.set_ylabel("Number of Directors (log10)")
plt.show()

## Build network graph

In [None]:
# Each (company, director) row becomes one undirected edge, so companies and
# directors share a single node namespace.
G = nx.from_pandas_edgelist(
    company_directorships,
    source='company_name',
    target='director_name',
)

# Largest connected component
largest_cc = max(nx.connected_components(G), key=len)
print(f"Largest CC size: {len(largest_cc)}")

# .copy() gives an independent graph instead of a view onto G.
G_sub = G.subgraph(largest_cc).copy()

## Centrality Measures
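- **Eigenvector centrality**: scores a node highly when it is connected to other high-scoring nodes, i.e. influential nodes
- **Degree centrality**: the fraction of other nodes a node is directly connected to, i.e. nodes with many connections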

In [None]:
# Centrality scores for every node (company or director) in the largest component.
eigen_scores = nx.eigenvector_centrality(G_sub, max_iter=1000, tol=1e-6)
degree_scores = nx.degree_centrality(G_sub)
centrality = pd.DataFrame({
    'eigen': pd.Series(eigen_scores),
    'degree': pd.Series(degree_scores),
})

# Tag the nodes whose label appears as a director name.
director_names = company_directorships['director_name'].unique()
centrality['is_person'] = centrality.index.isin(director_names)

# Inner join on the name index: only directors that also have a demographics
# record in comp_demo survive.
# NOTE(review): comp_demo is indexed by upper-cased names, so this assumes
# director_name is already upper case - confirm against the data.
people_df = pd.merge(
    centrality[centrality['is_person']],
    comp_demo,
    left_index=True,
    right_index=True,
)
people_df.sort_values('eigen', ascending=False).head()

## Exploratory Data Analysis

In [None]:
# Age distribution
fig, ax = plt.subplots()
people_df['age'].hist(bins=20, ax=ax)
ax.set_title("Age Distribution of Directors")
ax.set_xlabel("Age")
ax.set_ylabel("Count")
plt.show()

# Gender breakdown
# This one is drawn from comp_demo (every director left after the gender
# filter), not from the people_df subset used by the other plots in this cell.
fig, ax = plt.subplots()
comp_demo['gender'].value_counts().plot.bar(ax=ax)
ax.set_title("Gender Breakdown")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
plt.show()

# Log compensation boxplot
fig, ax = plt.subplots()
people_df['log_compensation'].plot.box(ax=ax)
ax.set_title("Log10 Compensation Boxplot")
ax.set_ylabel("Log10(compensation + 1)")
plt.show()

# Correlation heatmap
corr = people_df[['age', 'log_compensation', 'degree', 'eigen']].corr()
fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()

## Clustering with DBSCAN

In [None]:
# Scale features
# Age is a per-name max and can be NaN. RobustScaler passes NaN through, but
# DBSCAN rejects NaN input, so only rows with a complete feature vector are
# clustered. With no missing values this matches the previous behaviour.
feature_cols = ['age', 'log_compensation', 'degree', 'eigen']
has_all_features = people_df[feature_cols].notna().all(axis=1).to_numpy()
n_incomplete = int((~has_all_features).sum())
if n_incomplete:
    print(f"Excluding {n_incomplete} directors with missing features from clustering")

scaler = RobustScaler()
X = scaler.fit_transform(people_df.loc[has_all_features, feature_cols])

# DBSCAN
db = DBSCAN(eps=0.4, min_samples=5)
# Rows excluded above keep -1, the label DBSCAN itself uses for noise, so
# cluster_id stays an integer column for the plot that follows.
cluster_ids = pd.Series(-1, index=people_df.index, dtype='int64')
cluster_ids[has_all_features] = db.fit_predict(X)
# Assign positionally to avoid index alignment.
people_df['cluster_id'] = cluster_ids.to_numpy()
print(people_df.cluster_id.value_counts())

In [None]:
# Scatter plot of clusters: one point per director, coloured by DBSCAN label
fig, ax = plt.subplots()
sc = ax.scatter(
    people_df['age'],
    people_df['log_compensation'],
    c=people_df['cluster_id'],
    cmap='rainbow',
    s=6,
)
cbar = fig.colorbar(sc, ax=ax)
cbar.set_label("Cluster ID")
ax.set_title("DBSCAN Clusters: Age vs Log Compensation")
ax.set_xlabel("Age")
ax.set_ylabel("Log10(compensation + 1)")
plt.show()

### Cluster Interpretation
- **Cluster 1**: Mid-career, average/high pay
- **Cluster 0**: Early/mid career, below-average pay
- **Clusters 2–7**: Outliers
- **Cluster -1**: Noise (points DBSCAN did not assign to any cluster)

## Conclusion
- The director population is dominated by a mid-career cohort
- Eigenvector centrality highlights 'super-directors'
- DBSCAN separates outliers from the main cohort and reveals new sub-populations