In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from itertools import product
from ecomplexity import ecomplexity
from ecomplexity import proximity
from ecomplexity import calc_density
import country_converter as coco
import itertools

import yaml
import sys, os
sys.path.append(os.path.abspath(".."))

from utils.utils import *
from utils.config_utils import load_config

In [2]:
# Notebook-wide parameters, read once from the shared project configuration.
config = load_config()
focal_year, selected_period = config["focal_year"], config["selected_period"]

**clustered languages -- theoretical version**

In [3]:
# Raw language-level input table.
data = pd.read_csv("../../data/inputs/languages.csv")

# Keep only the four years covered by the complexity calculation further down.
data = data.loc[data["year"].isin([2020, 2021, 2022, 2023])]

# Cleaning pipeline built from the helpers imported from utils.utils. Judging
# by their names and arguments they drop the entries matched by `prev_filter`,
# keep the 150 top languages, remove the "EU" country code and attach period
# ids -- confirm details in utils.utils.
prev_filter = "|".join(("yaml", "json", "text", "svg", "Markdown", "xml"))
df = (
    data
    .pipe(drop_specifics_from_list, filter_list=prev_filter)
    .pipe(top_languages_filter, nr_languages=150)
    .pipe(drop_country_codes_from_list, country_list=["EU"])
    .pipe(add_period_ids, period=selected_period)
)
print(df.shape)

(93076, 7)


In [4]:
# Language -> cluster lookup, with its columns renamed to the snake_case names
# used throughout this notebook.
cluster_map = pd.read_csv("../../data/outputs/language_to_cluster_mapping.csv").rename(
    columns={"Language": "language", "Cluster": "cluster_id", "Cluster Name": "cluster_name"}
)

# Attach a cluster to every language row, discard rows whose language has no
# cluster (this silently loses those languages -- not ideal), and average
# num_pushers within each (country, period, cluster) cell.
cl_df = (
    df.merge(cluster_map, on="language", how="left")
    .dropna(subset=["cluster_id", "cluster_name"])
    .groupby(["iso2_code", "period", "cluster_name"])["num_pushers"]
    .mean()
    .reset_index()
)

# Column roles expected by the ecomplexity package.
key_cols = dict(
    time="period",
    loc="iso2_code",
    prod="cluster_name",
    val="num_pushers",
)

# Complexity and proximity are computed separately for each period id and
# labelled with the calendar year the id stands for (1 means 2020, yearly basis).
ccdf, ppdf = [], []
year_dict = {1: 2020, 2: 2021, 3: 2022, 4: 2023}
for k, year in year_dict.items():
    period_slice = cl_df.loc[cl_df["period"] == k]

    # ecomplexity is called before proximity so the printed log keeps its order
    ccdf.append(ecomplexity(period_slice, key_cols).assign(year=year))
    ppdf.append(proximity(period_slice, key_cols).assign(year=year))
    print(year, " DONE")

1
Percentage of pairs compared that meet log-supermodularity condition: 9.78%
1
2020  DONE
2




Percentage of pairs compared that meet log-supermodularity condition: 10.50%
2
2021  DONE
3




Percentage of pairs compared that meet log-supermodularity condition: 11.00%
3
2022  DONE
4




Percentage of pairs compared that meet log-supermodularity condition: 11.77%
4
2023  DONE




In [5]:
# Stack the per-year complexity tables into one frame and write it out as a
# semicolon-separated file; the entry/exit cells below read this file back.
eci_out_path = "../../data/outputs/eci_clusters_theory_2020_2023.csv"
cluster_cdf = pd.concat(ccdf, ignore_index=True)
cluster_cdf.to_csv(eci_out_path, index=False, sep=";")

In [6]:
# Stack the per-year cluster proximity tables into one frame and write it out
# as a semicolon-separated file.
proximity_out_path = "../../data/outputs/proximity_clusters_theory_2020_2023.csv"
prox_df = pd.concat(ppdf, ignore_index=True)
prox_df.to_csv(proximity_out_path, index=False, sep=";")

In [7]:
### ENTRY -- based on clusters

# relatedness density -- as in Hidalgo et al. (2007) Science
# Read back the complexity table saved above and keep the 2020 density of every
# (country, cluster) pair; it is joined onto the entry table further down.
cdf = pd.read_csv("../../data/outputs/eci_clusters_theory_2020_2023.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_name", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../../data/inputs/languages.csv")

# Apply the same 2020-2023 window as the complexity calculation, so the RCA
# tables built from this frame cover the same years as the saved PCI/density
# values they are joined with. In the run recorded in this notebook the frame
# shape is identical with and without this filter, so results are unchanged;
# the filter only guards against the input file gaining other years.
data = data[data["year"].isin([2020, 2021, 2022, 2023])]

# The entry patterns below are written over four yearly periods, so the period
# unit is fixed here regardless of the configured value.
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

# clusters of languages
cl_df = pd.read_csv("../../data/outputs/language_to_cluster_mapping.csv")\
    .rename(columns={"Language":"language", "Cluster":"cluster_id", "Cluster Name":"cluster_name"})

# combine
df = pd.merge(
    df,
    cl_df,
    on="language",
    how="left"
)

# drop NAs... -- not so great: languages without a cluster are silently lost.
# Reassignment instead of inplace=True keeps the step chain-friendly.
df = df.dropna(subset=["cluster_id", "cluster_name"])

# aggregate by clusters: mean num_pushers per (country, period, cluster)
df = df.groupby(["iso2_code", "period", "cluster_name"])["num_pushers"].agg("mean").reset_index()


def bundle_data_clusters(data, periods):
    """Pool the selected periods into a single period of mean developer counts.

    Rows whose ``period`` is in ``periods`` are grouped by
    (``iso2_code``, ``cluster_name``) and ``num_pushers`` is averaged. The
    averaged value is cast to ``int`` (the fractional part is dropped) and the
    ``period`` column of the result is set to 1.

    Returns a new frame with columns
    ``iso2_code``, ``cluster_name``, ``num_pushers``, ``period``.
    """
    in_window = data["period"].isin(periods)
    pooled = (
        data.loc[in_window]
        .groupby(["iso2_code", "cluster_name"])["num_pushers"]
        .mean()
        .reset_index()
    )
    # `period` is new, so it is appended last; `num_pushers` keeps its position
    return pooled.assign(
        period=1,
        num_pushers=pooled["num_pushers"].astype(int),
    )



# RCA tables, one per yearly period; rca_calculation is called with threshold 1.00
ps = [1, 2, 3, 4]
period_tables = []
for p in ps:
    print(p)
    # bundle_data_clusters stamps period=1, so the real period id is set here
    period_counts = bundle_data_clusters(df, periods=[p]).assign(period=p)
    period_tables.append(
        rca_calculation(
            period_counts,
            c_column="iso2_code",
            p_column="cluster_name",
            value_column="num_pushers",
            threshold=1,
        )
    )
rca_tables = pd.concat(period_tables)


# Collect every (country, cluster) pair's rca01 values in period order and flag
# the two sequences of interest: 0,0,1,1 marks an entry, 0,0,0,0 marks a pair
# that is kept alongside the entries in the export.
entry_pattern = [0, 0, 1, 1]
consider_pattern = [0, 0, 0, 0]
ent = (
    rca_tables
    .sort_values(["period"], ascending=True)
    .groupby(["iso2_code", "cluster_name"])["rca01"]
    .agg(list)
    .reset_index()
)
ent = ent.assign(
    entry01=ent["rca01"].apply(lambda seq: seq == entry_pattern).astype(int),
    consider00=ent["rca01"].apply(lambda seq: seq == consider_pattern).astype(int),
)


# Full country x cluster grid; pairs without a flag row get 0 for both flags.
pair_keys = ["iso2_code", "cluster_name"]
all_countries = ent["iso2_code"].unique()
all_languages = ent["cluster_name"].unique()
all_combinations = list(product(all_countries, all_languages))
full_df = (
    pd.DataFrame(all_combinations, columns=pair_keys)
    .sort_values(pair_keys)
    .merge(ent[pair_keys + ["entry01", "consider00"]], on=pair_keys, how="left")
    .fillna(0)
)

# Complexity (PCI) and ubiquity from the 2020 rows of the saved ECI table.
cdf = pd.read_csv("../../data/outputs/eci_clusters_theory_2020_2023.csv", sep=";")
cdf = cdf.loc[cdf["year"] == 2020]
full_df = full_df.merge(cdf[pair_keys + ["pci", "ubiquity"]], on=pair_keys, how="left")

# RCA "from the baseline period": the value joined is the one from period 3.
# NOTE(review): PCI, ubiquity and density all come from 2020, and for the two
# exported patterns the period-3 rca01 is determined by the pattern itself --
# confirm that period 3 is the intended baseline.
rca_baseline = rca_tables.loc[rca_tables["period"] == 3, pair_keys + ["rca01"]]
full_df = full_df.merge(rca_baseline, on=pair_keys, how="left")
full_df["rca01"] = full_df["rca01"].fillna(0)

# Pairs without a complexity value cannot enter the models.
full_df = full_df.dropna(subset=["pci"])

# Relatedness density of each pair.
full_df = full_df.merge(rel_dens, on=pair_keys, how="left")


# Export for the entry models -- only the 0011 and 0000 patterns are kept.
full_df = full_df.astype({"entry01": int, "consider00": int})
is_entry = full_df["entry01"] == 1
is_considered = full_df["consider00"] == 1
export_df = full_df[is_entry | is_considered]
export_df.to_csv("../../data/outputs/data_entry_regressions_0011_clusters_theory.csv", index=False, sep=";")

(93076, 7)
1
2
3
4


In [8]:
### EXIT -- 1.00 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
# Read back the saved complexity table and keep the 2020 density of every
# (country, cluster) pair; it is joined onto the exit table further down.
cdf = pd.read_csv("../../data/outputs/eci_clusters_theory_2020_2023.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_name", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../../data/inputs/languages.csv")

# Apply the same 2020-2023 window as the complexity calculation, so the RCA
# tables built from this frame cover the same years as the saved PCI/density
# values they are joined with. In the run recorded in this notebook the frame
# shape is identical with and without this filter, so results are unchanged;
# the filter only guards against the input file gaining other years.
data = data[data["year"].isin([2020, 2021, 2022, 2023])]

# The exit patterns below are written over four yearly periods, so the period
# unit is fixed here regardless of the configured value.
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

# clusters of languages
cl_df = pd.read_csv("../../data/outputs/language_to_cluster_mapping.csv")\
    .rename(columns={"Language":"language", "Cluster":"cluster_id", "Cluster Name":"cluster_name"})

# combine
df = pd.merge(
    df,
    cl_df,
    on="language",
    how="left"
)

# drop NAs... -- not so great: languages without a cluster are silently lost.
# Reassignment instead of inplace=True keeps the step chain-friendly.
df = df.dropna(subset=["cluster_id", "cluster_name"])

# aggregate by clusters: mean num_pushers per (country, period, cluster)
df = df.groupby(["iso2_code", "period", "cluster_name"])["num_pushers"].agg("mean").reset_index()


def bundle_data_clusters(data, periods):
    """Pool the selected periods into a single period of mean developer counts.

    Rows whose ``period`` is in ``periods`` are grouped by
    (``iso2_code``, ``cluster_name``) and ``num_pushers`` is averaged. The
    averaged value is cast to ``int`` (the fractional part is dropped) and the
    ``period`` column of the result is set to 1.

    Returns a new frame with columns
    ``iso2_code``, ``cluster_name``, ``num_pushers``, ``period``.
    """
    # NOTE: same definition as in the ENTRY cell; repeated so this cell is
    # self-contained.
    in_window = data["period"].isin(periods)
    pooled = (
        data.loc[in_window]
        .groupby(["iso2_code", "cluster_name"])["num_pushers"]
        .mean()
        .reset_index()
    )
    # `period` is new, so it is appended last; `num_pushers` keeps its position
    return pooled.assign(
        period=1,
        num_pushers=pooled["num_pushers"].astype(int),
    )


# RCA tables, one per yearly period; rca_calculation is called with threshold 1.00
ps = [1, 2, 3, 4]
period_tables = []
for p in ps:
    print(p)
    # bundle_data_clusters stamps period=1, so the real period id is set here
    period_counts = bundle_data_clusters(df, periods=[p]).assign(period=p)
    period_tables.append(
        rca_calculation(
            period_counts,
            c_column="iso2_code",
            p_column="cluster_name",
            value_column="num_pushers",
            threshold=1,
        )
    )
rca_tables = pd.concat(period_tables)



# Collect every (country, cluster) pair's rca01 values in period order and flag
# the two sequences of interest: 1,1,0,0 marks an exit, 1,1,1,1 marks a pair
# that is kept alongside the exits in the export.
# The flag columns are still called entry01 / consider00 at this point;
# entry01 is renamed to exit01 just before the export.
exit_pattern = [1, 1, 0, 0]
consider_pattern = [1, 1, 1, 1]
ext = (
    rca_tables
    .sort_values(["period"], ascending=True)
    .groupby(["iso2_code", "cluster_name"])["rca01"]
    .agg(list)
    .reset_index()
)
ext = ext.assign(
    entry01=ext["rca01"].apply(lambda seq: seq == exit_pattern).astype(int),
    consider00=ext["rca01"].apply(lambda seq: seq == consider_pattern).astype(int),
)



# Full country x cluster grid; pairs without a flag row get 0 for both flags.
pair_keys = ["iso2_code", "cluster_name"]
all_countries = ext["iso2_code"].unique()
all_languages = ext["cluster_name"].unique()
all_combinations = list(product(all_countries, all_languages))
full_df = (
    pd.DataFrame(all_combinations, columns=pair_keys)
    .sort_values(pair_keys)
    .merge(ext[pair_keys + ["entry01", "consider00"]], on=pair_keys, how="left")
    .fillna(0)
)

# Complexity (PCI) and ubiquity from the 2020 rows of the saved ECI table.
cdf = pd.read_csv("../../data/outputs/eci_clusters_theory_2020_2023.csv", sep=";")
cdf = cdf.loc[cdf["year"] == 2020]
full_df = full_df.merge(cdf[pair_keys + ["pci", "ubiquity"]], on=pair_keys, how="left")

# RCA "from the baseline period": the value joined is the one from period 3.
# NOTE(review): PCI, ubiquity and density all come from 2020, and for the two
# exported patterns the period-3 rca01 is determined by the pattern itself --
# confirm that period 3 is the intended baseline.
rca_baseline = rca_tables.loc[rca_tables["period"] == 3, pair_keys + ["rca01"]]
full_df = full_df.merge(rca_baseline, on=pair_keys, how="left")
full_df["rca01"] = full_df["rca01"].fillna(0)

# Pairs without a complexity value cannot enter the models.
full_df = full_df.dropna(subset=["pci"])

# Relatedness density of each pair.
full_df = full_df.merge(rel_dens, on=pair_keys, how="left")



# Export for the exit models -- only the 1100 and 1111 patterns are kept.
full_df = (
    full_df
    .astype({"entry01": int, "consider00": int})
    .rename(columns={"entry01": "exit01"})
)
is_exit = full_df["exit01"] == 1
is_considered = full_df["consider00"] == 1
export_df = full_df[is_exit | is_considered]
export_df.to_csv("../../data/outputs/data_exit_regressions_1100_clusters_theory.csv", index=False, sep=";")

(93076, 7)
1
2
3
4
