# Assignment 3: Company Directorships Analysis
### Tasks 1–3

## Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import networkx as nx

# Read the directorship records and convert the 't'/'f' flag column to booleans
company_directorships = pd.read_csv('company_directorships.csv')
flag_to_bool = {'t': True, 'f': False}
normalized_flags = (
    company_directorships['software_background'].str.strip().str.lower()
)
# Values that are not 't'/'f' after normalisation (including missing ones)
# have no entry in the mapping and therefore come out as NaN.
company_directorships['software_background'] = normalized_flags.map(flag_to_bool)


In [None]:
# Read the per-director demographics / compensation records
comp_raw = pd.read_csv('director-details.csv')
# Add an upper-cased copy of the name column under the key 'NAME'
upper_names = comp_raw['name'].str.upper()
comp_raw = comp_raw.assign(NAME=upper_names)

# One-pass aggregation
from statistics import mode
def most_common(series):
    """Return the most frequent value of ``series``, or ``None`` if there is none.

    The value is the first entry of ``series.mode()``; when that result is
    empty (for example an empty or all-missing series) ``None`` is returned.
    """
    candidates = series.mode()
    if candidates.empty:
        return None
    # mode() returns a freshly indexed Series, so position 0 is its first entry
    return candidates.iloc[0]

# Per-column reducers applied to each director's records
agg_funcs = {
    'age': 'max',
    'compensation': 'sum',
    'gender': most_common
}
# Collapse the raw records to one row per upper-cased director name
comp_demo = comp_raw.groupby('NAME').agg(agg_funcs)
# +1 keeps a total compensation of zero finite under the log transform
comp_demo['log_compensation'] = np.log10(comp_demo['compensation'] + 1)


In [None]:
# Normalise gender labels to lower case and treat 'unknown' as missing
gender_lower = comp_demo['gender'].str.lower()
comp_demo['gender'] = gender_lower.mask(gender_lower == 'unknown', np.nan)

# Report, then discard, every director whose gender is missing
missing_gender = comp_demo['gender'].isna()
n_dropped = missing_gender.sum()
print(f"Dropping {n_dropped} directors with missing gender")
comp_demo = comp_demo.loc[~missing_gender]


## Task 1: Centrality Measures

In [None]:
# Build the company-director graph: one undirected edge per directorship row
G = nx.Graph()
G.add_edges_from(
    zip(company_directorships['company_name'], company_directorships['director_name'])
)

# Keep only the largest connected component for the centrality analysis
components = nx.connected_components(G)
largest_cc = max(components, key=len)
G_sub = G.subgraph(largest_cc).copy()

# Centrality measures, each returned as a {node: score} dict
eigen = nx.eigenvector_centrality(G_sub, max_iter=1000, tol=1e-6)
degree = nx.degree_centrality(G_sub)
between = nx.betweenness_centrality(G_sub, normalized=True)

# One row per node (companies and directors alike), one column per measure
centrality = pd.DataFrame({
    'eigen': pd.Series(eigen),
    'degree': pd.Series(degree),
    'betweenness': pd.Series(between)
})

# Flag the nodes that are directors, then attach their demographics.
# The index-on-index merge is an inner join, so directors that are absent
# from comp_demo do not appear in people_df.
director_names = company_directorships['director_name'].unique()
centrality['is_person'] = centrality.index.isin(director_names)
directors_only = centrality.loc[centrality['is_person']]
people_df = directors_only.merge(comp_demo, left_index=True, right_index=True)

# Inspect top directors
print("Top 5 by Eigenvector Centrality:\n", people_df['eigen'].nlargest(5), "\n")
print("Top 5 by Degree Centrality:\n", people_df['degree'].nlargest(5), "\n")
print("Top 5 by Betweenness Centrality:\n", people_df['betweenness'].nlargest(5))


### Interpretation
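- **Degree centrality**: the share of other nodes a node is directly linked to (normalized degree); in this company–director graph, a director's score is proportional to the number of boards they sit on.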
- **Eigenvector centrality**: influence through connections to other well-connected nodes.
- **Betweenness centrality**: bridges between parts of the network, indicating broker roles.


## Task 2: Code Refactoring Highlights
Below are four key refactoring points with improved code snippets for clarity, performance, and robustness:

1. **software_background conversion**: vectorized `.map({...})` turning `'t'`/`'f'` into booleans; missing or unrecognised values become NaN.
2. **One-pass demographic aggregation**: single `groupby().agg()` plus `.assign()`.
3. **Gender cleaning**: explicit `'unknown'`→NaN and `dropna()`.
4. **Directorship counts log-scaling**: plot distribution of log10(#boards + 1) directly.

#### Refactored Code Snippets

In [None]:
# 1. software_background conversion (see above cell)

In [None]:
# 2. One-pass demographic aggregation (see above cell)

In [None]:
# 3. Gender cleaning (see above cell)

In [None]:
# 4. Log-scaled directorship counts
# Number of distinct companies each director is attached to
boards_by_director = company_directorships.groupby('director_name')['company_name']
counts = boards_by_director.nunique()
log_counts = np.log10(counts + 1)

# Histogram of the log-scaled counts, labelled through the returned Axes
ax = log_counts.hist(bins=20)
ax.set_title("Log10(#Boards + 1) per Director")
ax.set_xlabel("Log10(#Boards + 1)")
ax.set_ylabel("Number of Directors")
plt.show()


## Task 3: Exploratory Feature — Software Background

In [None]:
# Merge software background into people_df.
# Collapse to exactly one flag per director before merging: drop_duplicates()
# on (director, flag) pairs still leaves several rows for a director whose
# flag is missing on some rows or inconsistent across rows, and merging on
# that non-unique index would duplicate the director and inflate every
# statistic in the summary. first() keeps the first non-null flag per director.
sb = (
    company_directorships
      .groupby('director_name')['software_background']
      .first()
      .to_frame()
)
# Drop a column left behind by an earlier run of this cell; otherwise the
# merge yields software_background_x/_y and the groupby below raises KeyError.
people_df = (
    people_df
      .drop(columns='software_background', errors='ignore')
      .merge(sb, left_index=True, right_index=True)
)

# Compute and display summary by software background
# (directors whose flag is missing are excluded by groupby's default dropna).
summary = (
    people_df
      .groupby('software_background')
      .agg(
         n_directors=('age','size'),
         median_comp=('compensation','median'),
         avg_degree=('degree','mean'),
         avg_eigen=('eigen','mean')
      )
)
print(summary)
