In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import zscore
import numpy as np

In [9]:
metrics_df = pd.read_csv('metrics_df_run2.csv')

In [10]:
metrics_df

Unnamed: 0,company_linking,year,betweenness,closeness,degree,eigenvector
0,6930,1960,0.000000,1.000000,1.000000,1.000000e+00
1,24708,1960,0.000000,1.000000,1.000000,1.000000e+00
2,6930,1961,0.000000,1.000000,1.000000,1.000000e+00
3,24708,1961,0.000000,1.000000,1.000000,1.000000e+00
4,6930,1962,0.000000,1.000000,0.166667,0.000000e+00
...,...,...,...,...,...,...
312928,3903002,2025,8788.008820,0.182885,0.000313,2.724054e-12
312929,3904801,2025,12003.373911,0.162327,0.000940,1.278264e-17
312930,3905369,2025,0.000000,0.201240,0.000209,1.715523e-09
312931,3906385,2025,27476.946429,0.176810,0.000731,1.728229e-13


### Standardize Centrality Metrics Using Z-Scores
Chose z-score over min-max scaling as we are probably prone to outliers. We will have negative scores with less interpretability, but the binning in the next step should be better using this method.

In [11]:
def safe_zscore(x, ddof=1):
    # Compute mean and standard deviation
    m = x.mean()
    s = x.std(ddof=ddof)
    # If the std is nearly zero, return an array of zeros
    if np.isclose(s, 0):
        return np.zeros_like(x)
    return (x - m) / s

def normalize_betweenness(group):
    n = len(group)
    norm_factor = ((n - 1) * (n - 2) / 2) if n > 2 else 1
    group = group.copy()
    group['betweenness_normalized'] = group['betweenness'] / norm_factor
    return group

# Apply theoretical normalization for betweenness per year
metrics_df = metrics_df.groupby('year').apply(normalize_betweenness).reset_index(drop=True)

# Now apply the safe z-score transformation per year for all metrics
metrics_df[['betweenness_z', 'closeness_z', 'degree_z', 'eigenvector_z']] = (
    metrics_df.groupby('year')[['betweenness_normalized', 'closeness', 'degree', 'eigenvector']]
    .transform(lambda x: safe_zscore(x, ddof=1))
)

  metrics_df = metrics_df.groupby('year').apply(normalize_betweenness).reset_index(drop=True)


In [12]:
metrics_df

Unnamed: 0,company_linking,year,betweenness,closeness,degree,eigenvector,betweenness_normalized,betweenness_z,closeness_z,degree_z,eigenvector_z
0,6930,1960,0.000000,1.000000,1.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
1,24708,1960,0.000000,1.000000,1.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
2,6930,1961,0.000000,1.000000,1.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
3,24708,1961,0.000000,1.000000,1.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
4,6930,1962,0.000000,1.000000,0.166667,0.000000e+00,0.000000,-0.377964,0.585540,-0.377964,-0.781897
...,...,...,...,...,...,...,...,...,...,...,...
312928,3903002,2025,8788.008820,0.182885,0.000313,2.724054e-12,0.000192,-0.196724,-0.125623,-0.419176,-0.046688
312929,3904801,2025,12003.373911,0.162327,0.000940,1.278264e-17,0.000262,-0.156000,-0.267902,0.538198,-0.046688
312930,3905369,2025,0.000000,0.201240,0.000209,1.715523e-09,0.000000,-0.308029,0.001414,-0.578739,-0.046688
312931,3906385,2025,27476.946429,0.176810,0.000731,1.728229e-13,0.000600,0.039980,-0.167662,0.219073,-0.046688


In [13]:
metrics_df = metrics_df.drop(columns=['betweenness', 'closeness', 'degree', 'eigenvector', 'betweenness_normalized'])

In [14]:
# Rename columns
metrics_df = metrics_df.rename(columns={
    'company_linking': 'company_id',
    'betweenness_z': 'betweenness', 
    'closeness_z': 'closeness',
    'degree_z': 'degree',
    'eigenvector_z': 'eigenvector'
})


### Create Connectivity Variable

In [15]:
metrics_df['connectivity'] = metrics_df['degree']*0.1 + metrics_df['betweenness']*0.3 + metrics_df['closeness']*0.2 + metrics_df['eigenvector']*0.4

In [16]:
metrics_df

Unnamed: 0,company_id,year,betweenness,closeness,degree,eigenvector,connectivity
0,6930,1960,0.000000,0.000000,0.000000,0.000000,0.000000
1,24708,1960,0.000000,0.000000,0.000000,0.000000,0.000000
2,6930,1961,0.000000,0.000000,0.000000,0.000000,0.000000
3,24708,1961,0.000000,0.000000,0.000000,0.000000,0.000000
4,6930,1962,-0.377964,0.585540,-0.377964,-0.781897,-0.346837
...,...,...,...,...,...,...,...
312928,3903002,2025,-0.196724,-0.125623,-0.419176,-0.046688,-0.144735
312929,3904801,2025,-0.156000,-0.267902,0.538198,-0.046688,-0.065236
312930,3905369,2025,-0.308029,0.001414,-0.578739,-0.046688,-0.168675
312931,3906385,2025,0.039980,-0.167662,0.219073,-0.046688,-0.018306


In [18]:
metrics_df.to_csv('normalized_metrics.csv', index=False)