In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
from matplotlib import cm, colors

from ecomplexity import ecomplexity
from ecomplexity import proximity

In [2]:
# parameters
window = [2020, 2021]
export_cdf_yearly = False
export_cdf_window = True

# for ecomplexity calculcation
key_cols = {
    "time": "year",
    "loc": "iso2_code",
    "prod": "language",
    "val": "num_pushers",
}


**1 - git complexity**

In [3]:
# data IN
data = pd.read_csv("../data/languages.csv")

In [4]:
# filter functions
def drop_specifics_from_list(data, filter_list):
    """filter specific languages from list -- motivated by RM del Rio-Chanona et al 2023"""
    data = data[~data["language"].str.contains(filter_list, case=False, regex=True)]
    return data

def top_languages_filter(data, nr_languages):
    """keep top x number of languages ONLY"""
    top_languages = data.groupby(["language"])["num_pushers"].agg("sum").reset_index().sort_values(by="num_pushers", ascending=False)
    top_languages = list(top_languages["language"])[:nr_languages]
    data = data[data["language"].isin(top_languages)]
    return data
    
def drop_country_codes_from_list(data, country_list):
    data = data[~data["iso2_code"].isin(country_list)]
    return data

def dataframe_for_ecomplexity(data, focal_year, quarter_list):
    """aggregate and transform dataframe for ecomplexity functions"""
    data = data[(data["year"]==focal_year) & (data["quarter"].isin(quarter_list))]
    data = data\
        .groupby(["year", "iso2_code", "language"])["num_pushers"]\
        .agg("sum")\
        .reset_index()\
        .sort_values(by="num_pushers", ascending=False)    
    return data
    
def add_period_ids(data):
    """create missing semester ID and construct different period IDs"""
    data["semester"] = np.where(data["quarter"] <= 2, 1, 2)
    data["semester_id"] = data["year"].astype(str).str.cat(data["semester"].astype(str), sep="s")
    data["quarter_id"] = data["year"].astype(str).str.cat(data["quarter"].astype(str), sep="q")
    return data

In [5]:
# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df)
print(df.shape)

(74289, 9)


In [6]:
# loop to calculate ecomplexity for each year
years = sorted(df["year"].unique())
for y in years:
    df_prep = dataframe_for_ecomplexity(df, focal_year=y, quarter_list=[1, 2, 3, 4])
    cdf = ecomplexity(df_prep, key_cols)
    
    if export_cdf_yearly==True:
        cdf.to_csv(f"../outputs/complexity_table{y}.csv", index=False, sep=";")

2020
2021
2022
2023


In [7]:
# window information
window_data = df[df["year"].isin(window)]  
window_id = '-'.join(map(str, window))
window_data["year"] = window[0]

# ecomplexity for window
df_wprep = dataframe_for_ecomplexity(window_data, focal_year=window[0], quarter_list=[1, 2, 3, 4])
cdf_w = ecomplexity(df_wprep, key_cols)
cdf_w["window"] = window_id
if export_cdf_window==True:
    cdf_w.to_csv(f"../outputs/complexity_table{window_id}.csv", index=False, sep=";")

2020


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window_data["year"] = window[0]


**2 - language space**

In [8]:
# github space
space_df = proximity(df_wprep, key_cols)
# space_df.to_csv("../outputs/space_table.csv", sep=";", index=False)

2020


In [9]:
def edgelist_for_github_space(data, key_columns):
    """transform the ecomplexity proximity output for visualization"""
    data = data[key_columns]

    # drop zero -- non-existing edges
    data = data[data[key_columns[2]] > 0]

    # drop self loops
    data = data[data[key_columns[0]] != data[key_columns[1]]]
    return data

In [10]:
def maximum_spanning_tree(data, key_columns):
    table = data.copy()
    table["distance"] = 1.0 / table[key_columns[2]]
    G = nx.from_pandas_edgelist(table, source = key_columns[0], target = key_columns[1], edge_attr = ["distance", key_columns[2]])
    T = nx.minimum_spanning_tree(G, weight = "distance")
    table2 = nx.to_pandas_edgelist(T)
    table2 = table2[table2[key_columns[2]] > 0]
    table2.rename(columns = {"source": key_columns[0], "target": key_columns[1], key_columns[2]: "score"}, inplace = True)
    table = pd.merge(
        table,
        table2,
        on=key_columns[0:2]
    )  
    table["edge"] = table.apply(lambda x: "%s-%s" % (min(x[key_columns[0]], x[key_columns[1]]), max(x[key_columns[0]], x[key_columns[1]])), axis = 1)
    table = table.drop_duplicates(subset = ["edge"])
    table = table.drop("edge", 1)
    return table[key_columns]

In [11]:
def add_edges(mst_edges, all_edges, nr_edges_to_add):
    # drop mst edges from the full edgelist
    mst_edges["drop"] = 1
    all_edges = pd.merge(
        all_edges,
        mst_edges,
        on = ["language_1", "language_2", "proximity"],
        how="left"
    )
    all_edges = all_edges[all_edges["drop"] != 1].drop(columns="drop")

    # sort and select
    all_edges = all_edges.sort_values(by="proximity", ascending=False).iloc[:nr_edges_to_add]

    # add to mst edgelist
    software_space_el = pd.concat([mst_el, all_edges])
    return software_space_el

In [12]:
# from space table to MST w/ additional edges
space_table = edgelist_for_github_space(space_df, key_columns=["language_1", "language_2", "proximity"])
mst_el = maximum_spanning_tree(space_table, key_columns=["language_1", "language_2", "proximity"])
mst_graph = nx.from_pandas_edgelist(mst_el, source="language_1", target="language_2")
n_nodes = mst_graph.number_of_nodes()
n_edges = n_nodes * 2
software_space_el = add_edges(mst_el, space_table, nr_edges_to_add=n_edges)

  table = table.drop("edge", 1)


In [13]:
# export for Herr Wachs
software_space_el.to_csv(f"../outputs/software_space_edgelist{window_id}.csv", index=False, sep=";")