In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from itertools import product
from ecomplexity import ecomplexity
from ecomplexity import proximity
from ecomplexity import calc_density
import country_converter as coco
import itertools

# stats
import scipy.stats as stats
from scipy.stats import linregress
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from utils import *

**clustered languages -- co-occurrence version**

In [2]:
# granularity used to construct period IDs: year / semester / quarter
selected_period = "year"

# data IN -- restricted to the 2020-2023 window
# NOTE(review): with pandas' default NA parsing an ISO code of "NA" would be
# read as NaN -- confirm whether keep_default_na=False is needed here.
data = pd.read_csv("../data/languages.csv")
data = data.loc[data["year"].isin([2020, 2021, 2022, 2023])]

# same cleaning steps as the ECI_software calculation; each helper receives
# the frame produced by the previous step as its first argument
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = (
    data
    .pipe(drop_specifics_from_list, filter_list=prev_filter)
    .pipe(top_languages_filter, nr_languages=150)
    .pipe(drop_country_codes_from_list, country_list=["EU"])
    .pipe(add_period_ids, period=selected_period)
)
print(df.shape)

(93076, 7)


In [3]:
# language -> cluster lookup; the first column of the CSV is dropped
# (presumably a saved index -- TODO confirm)
cluster_map = (
    pd.read_csv("../data/language_clusters_gh_cos_hier_ward_d1.csv")
    .rename(columns={"Language": "language", "Cluster": "cluster_id"})
    .iloc[:, 1:]
)

# attach the cluster of every language to the cleaned activity table
cl_df = pd.merge(
    df,
    cluster_map,
    on="language",
    how="left"
)

# languages without a cluster are dropped -- not so great
cl_df = cl_df.dropna(subset=["cluster_id"])

# aggregate by clusters: mean num_pushers per country / period / cluster
cl_df = cl_df.groupby(["iso2_code", "period", "cluster_id"])["num_pushers"].agg("mean").reset_index()

# column mapping for the ecomplexity calculation
key_cols = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "cluster_id",
    "val": "num_pushers",
}

# software complexity calculation -- period IDs -- 1 means 2020 on yearly basis
ccdf = []
ppdf = []
year_dict = {1 : 2020, 2 : 2021, 3 : 2022, 4 : 2023}
for k, year in year_dict.items():
    dfb = cl_df[cl_df["period"]==k]

    # complexity table of this period, labelled with its calendar year
    cdf = ecomplexity(dfb, key_cols)
    cdf["year"] = year

    # proximity table of this period, labelled the same way
    pdf = proximity(dfb, key_cols)
    pdf["year"] = year

    # collect yearly dataframes; they are concatenated in a later cell
    ccdf.append(cdf)
    ppdf.append(pdf)
    print(year, " DONE")

1
Percentage of pairs compared that meet log-supermodularity condition: 6.68%
1
2020  DONE
2




Percentage of pairs compared that meet log-supermodularity condition: 7.22%
2
2021  DONE
3




Percentage of pairs compared that meet log-supermodularity condition: 7.73%
3
2022  DONE
4




Percentage of pairs compared that meet log-supermodularity condition: 8.72%
4
2023  DONE




In [20]:
# stack the yearly complexity tables (fresh 0..n-1 index) and write them out
cluster_cdf = pd.concat(ccdf, ignore_index=True)
cluster_cdf.to_csv(
    "../outputs/eci_clusters_cooc_2020_2023.csv",
    index=False,
    sep=";",
)

In [21]:
# stack the yearly cluster-proximity tables (fresh 0..n-1 index) and write them out
prox_df = pd.concat(ppdf, ignore_index=True)
prox_df.to_csv(
    "../outputs/cluster_proximity_2020_2023.csv",
    index=False,
    sep=";",
)

In [5]:
# comparison of the 2020 country-level ECI across three specifications
cluster_cdf1 = pd.read_csv("../outputs/eci_clusters_2020_2023.csv", sep=";")
cluster_cdf = pd.read_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
eci_software = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")

# one (iso2_code, eci) row per country for 2020 from each table
eci_cols = ["iso2_code", "eci"]
software_2020 = eci_software.loc[eci_software["year"] == 2020, eci_cols].drop_duplicates()
cooc_2020 = cluster_cdf.loc[cluster_cdf["year"] == 2020, eci_cols].drop_duplicates()
theory_2020 = cluster_cdf1.loc[cluster_cdf1["year"] == 2020, eci_cols].drop_duplicates()

# language-level ECI next to the co-occurrence cluster ECI
cc_df = software_2020.merge(
    cooc_2020,
    on=["iso2_code"],
    how="left",
    suffixes=["_software", "_cluster"],
)

# add the ECI of the other cluster file and give all three columns explicit names
temp = cc_df.merge(
    theory_2020,
    on=["iso2_code"],
    how="left",
).rename(columns={"eci":"eci_cluster_theory", "eci_cluster":"eci_cluster_cooccurrence"})

# pairwise correlation of the three ECI variants
temp[["eci_software", "eci_cluster_theory", "eci_cluster_cooccurrence"]].corr()

Unnamed: 0,eci_software,eci_cluster_theory,eci_cluster_cooccurrence
eci_software,1.0,0.983028,0.970453
eci_cluster_theory,0.983028,1.0,0.973888
eci_cluster_cooccurrence,0.970453,0.973888,1.0


In [6]:
# number of distinct clusters in the co-occurrence complexity table
print(cluster_cdf["cluster_id"].nunique(), "unique clusters")
# eci_software is keyed by language, so this count is of languages, not clusters
print(eci_software["language"].nunique(), "unique languages")

57 unique clusters
150 unique clusters


In [7]:
### ENTRY -- based on clusters

# relatedness density -- as in Hidalgo et al. (2007) Science
# (2020 values, one row per country-cluster pair)
cdf = pd.read_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_id", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# same 2020-2023 window as the complexity calculation, so that the period IDs
# 1-4 used below refer to the same years
data = data[data["year"].isin([2020, 2021, 2022, 2023])]

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

# clusters of languages; the first column of the CSV is dropped
# (presumably a saved index -- TODO confirm)
cl_df = pd.read_csv("../data/language_clusters_gh_cos_hier_ward_d1.csv")\
    .rename(columns={"Language":"language", "Cluster":"cluster_id"})\
    .iloc[:,1:]

# combine
df = pd.merge(
    df,
    cl_df,
    on="language",
    how="left"
)

# drop NAs... -- not so great (languages without a cluster are lost)
df = df.dropna(subset=["cluster_id"])

# aggregate by clusters: mean num_pushers per country / period / cluster
df = df.groupby(["iso2_code", "period", "cluster_id"])["num_pushers"].agg("mean").reset_index()


def bundle_data_clusters(data, periods):
    """Aggregate the selected periods into one row per country-cluster pair.

    Rows whose ``period`` is listed in ``periods`` are grouped by
    (``iso2_code``, ``cluster_id``) and ``num_pushers`` is averaged.
    The mean is then cast to int (the fractional part is dropped) and the
    ``period`` column of the result is set to 1.
    """
    in_scope = data.loc[data["period"].isin(periods)]
    bundled = (
        in_scope
        .groupby(["iso2_code", "cluster_id"])["num_pushers"]
        .mean()
        .reset_index()
    )
    bundled["period"] = 1
    return bundled.astype({"num_pushers": int})



# binary RCA table for every period -- threshold for RCA : 1.00
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    # bundle_data_clusters labels its result with period 1; restore the real ID
    temp = bundle_data_clusters(df, periods=[p]).assign(period=p)
    rca_tables.append(
        rca_calculation(
            temp,
            c_column="iso2_code",
            p_column="cluster_id",
            value_column="num_pushers",
            threshold=1,
        )
    )
rca_tables = pd.concat(rca_tables)


# period-ordered rca01 sequence of every country-cluster pair, compared with
# the entry pattern (0,0,1,1) and the never-specialised pattern (0,0,0,0)
entry_pattern = [0,0,1,1]
consider_pattern = [0,0,0,0]
ent = (
    rca_tables
    .sort_values(["period"], ascending=True)
    .groupby(["iso2_code", "cluster_id"])["rca01"]
    .agg(list)
    .reset_index()
)
ent["entry01"] = ent["rca01"].apply(lambda seq: seq == entry_pattern).astype(int)
ent["consider00"] = ent["rca01"].apply(lambda seq: seq == consider_pattern).astype(int)


# full combination of countries and clusters, with the pattern flags joined;
# pairs without a flag get 0
all_countries = ent["iso2_code"].unique()
all_languages = ent["cluster_id"].unique()
all_combinations = list(product(all_countries, all_languages))
full_df = (
    pd.DataFrame(all_combinations, columns=["iso2_code", "cluster_id"])
    .sort_values(["iso2_code", "cluster_id"])
    .merge(
        ent[["iso2_code", "cluster_id", "entry01", "consider00"]],
        on=["iso2_code", "cluster_id"],
        how="left",
    )
    .fillna(0)
)

# join complexity (2020 values)
cdf = pd.read_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = full_df.merge(
    cdf[["iso2_code", "cluster_id", "pci", "ubiquity"]],
    on=["iso2_code", "cluster_id"],
    how="left",
)

# join RCA from the baseline period
# NOTE(review): the RCA joined here is that of period 3, while PCI and density
# are taken from 2020 -- confirm period 3 is the intended baseline.
rca_period3 = rca_tables.loc[rca_tables["period"] == 3, ["iso2_code", "cluster_id", "rca01"]]
full_df = full_df.merge(
    rca_period3,
    on=["iso2_code", "cluster_id"],
    how="left",
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop country-cluster pairs with no complexity value
full_df = full_df.dropna(subset=["pci"])


# join relatedness density
full_df = full_df.merge(
    rel_dens,
    on=["iso2_code", "cluster_id"],
    how="left",
)


# export for entry models -- only rows matching the 0011 or the 0000 pattern
full_df = full_df.astype({"entry01": int, "consider00": int})
keep_rows = full_df["entry01"].eq(1) | full_df["consider00"].eq(1)
export_df = full_df[keep_rows]
export_df.to_csv("../outputs/data_entry_regressions_0011_clusters_cooc.csv", index=False, sep=";")

(93076, 7)
1
2
3
4


In [8]:
### EXIT -- 1.00 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
# (2020 values, one row per country-cluster pair)
cdf = pd.read_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_id", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# same 2020-2023 window as the complexity calculation, so that the period IDs
# 1-4 used below refer to the same years
data = data[data["year"].isin([2020, 2021, 2022, 2023])]

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

# clusters of languages; the first column of the CSV is dropped
# (presumably a saved index -- TODO confirm)
cl_df = pd.read_csv("../data/language_clusters_gh_cos_hier_ward_d1.csv")\
    .rename(columns={"Language":"language", "Cluster":"cluster_id"})\
    .iloc[:,1:]

# combine
df = pd.merge(
    df,
    cl_df,
    on="language",
    how="left"
)

# drop NAs... -- not so great (languages without a cluster are lost)
df = df.dropna(subset=["cluster_id"])

# aggregate by clusters: mean num_pushers per country / period / cluster
df = df.groupby(["iso2_code", "period", "cluster_id"])["num_pushers"].agg("mean").reset_index()


def bundle_data_clusters(data, periods):
    """Collapse the chosen periods to one row per (iso2_code, cluster_id).

    Only rows whose ``period`` is in ``periods`` are used. ``num_pushers`` is
    averaged within each country-cluster pair and cast to int, which drops the
    fractional part. The returned frame carries ``period`` = 1.
    """
    selected = data.loc[data["period"].isin(periods)]
    grouped = selected.groupby(["iso2_code", "cluster_id"])["num_pushers"]
    bundled = grouped.mean().reset_index()
    bundled["period"] = 1
    return bundled.astype({"num_pushers": int})


# binary RCA table for every period -- threshold for RCA : 1.00
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    # bundle_data_clusters labels its result with period 1; restore the real ID
    temp = bundle_data_clusters(df, periods=[p]).assign(period=p)
    rca_tables.append(
        rca_calculation(
            temp,
            c_column="iso2_code",
            p_column="cluster_id",
            value_column="num_pushers",
            threshold=1,
        )
    )
rca_tables = pd.concat(rca_tables)



# period-ordered rca01 sequence of every country-cluster pair, compared with
# the exit pattern (1,1,0,0) and the always-specialised pattern (1,1,1,1).
# The flag columns keep the names used by the entry cell ("entry01",
# "consider00"); "entry01" is renamed to "exit01" before the export.
exit_pattern = [1,1,0,0]
consider_pattern = [1,1,1,1]
ext = (
    rca_tables
    .sort_values(["period"], ascending=True)
    .groupby(["iso2_code", "cluster_id"])["rca01"]
    .agg(list)
    .reset_index()
)
ext["entry01"] = ext["rca01"].apply(lambda seq: seq == exit_pattern).astype(int)
ext["consider00"] = ext["rca01"].apply(lambda seq: seq == consider_pattern).astype(int)



# full combination of countries and clusters, with the pattern flags joined;
# pairs without a flag get 0
all_countries = ext["iso2_code"].unique()
all_languages = ext["cluster_id"].unique()
all_combinations = list(product(all_countries, all_languages))
full_df = (
    pd.DataFrame(all_combinations, columns=["iso2_code", "cluster_id"])
    .sort_values(["iso2_code", "cluster_id"])
    .merge(
        ext[["iso2_code", "cluster_id", "entry01", "consider00"]],
        on=["iso2_code", "cluster_id"],
        how="left",
    )
    .fillna(0)
)

# join complexity (2020 values)
cdf = pd.read_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = full_df.merge(
    cdf[["iso2_code", "cluster_id", "pci", "ubiquity"]],
    on=["iso2_code", "cluster_id"],
    how="left",
)

# join RCA from the baseline period
# NOTE(review): the RCA joined here is that of period 3, while PCI and density
# are taken from 2020 -- confirm period 3 is the intended baseline.
rca_period3 = rca_tables.loc[rca_tables["period"] == 3, ["iso2_code", "cluster_id", "rca01"]]
full_df = full_df.merge(
    rca_period3,
    on=["iso2_code", "cluster_id"],
    how="left",
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop country-cluster pairs with no complexity value
full_df = full_df.dropna(subset=["pci"])



# join relatedness density
full_df = full_df.merge(
    rel_dens,
    on=["iso2_code", "cluster_id"],
    how="left",
)



# export for exit models -- only rows matching the 1100 or the 1111 pattern
full_df["entry01"] = full_df["entry01"].astype(int)
full_df = full_df.rename(columns={"entry01":"exit01"})
full_df["consider00"] = full_df["consider00"].astype(int)
keep_rows = full_df["exit01"].eq(1) | full_df["consider00"].eq(1)
export_df = full_df[keep_rows]
export_df.to_csv("../outputs/data_exit_regressions_1100_clusters_cooc.csv", index=False, sep=";")

(93076, 7)
1
2
3
4


**IV for co-occurrence clusters**

In [4]:
# complexity table of the co-occurrence clusters (one row per country,
# cluster and year); the language-level ECI_software table is the
# commented-out alternative below
#cdf = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")
cdf = pd.read_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";")

# neighboring countries from https://github.com/geodatasource/country-borders
# (columns country_code / country_border_code are used in the joins below)
nc = pd.read_csv("../data/geodatasource_country_borders.csv")

In [5]:
# every ordered pair of countries present in the complexity table.
# The codes are sorted so the row order is the same on every run (set order
# can differ between kernels); key=str keeps the sort well-defined should a
# code have been parsed as NaN.
locations = sorted(cdf["iso2_code"].unique(), key=str)
full_prod_countries = pd.DataFrame(itertools.product(locations, repeat=2), columns=["iso2_code1", "iso2_code2"])

In [6]:
# flag country pairs that share a border: a pair is a neighbour when it has
# a matching row in the border table
full_prod_countries = (
    full_prod_countries
    .merge(
        nc,
        left_on=["iso2_code1", "iso2_code2"],
        right_on=["country_code", "country_border_code"],
        how="left",
    )
    .assign(neighbor01=lambda pairs: pairs["country_border_code"].notna().astype(int))
    [["iso2_code1", "iso2_code2", "neighbor01"]]
)

In [7]:
def _spec_similarity(row):
    """Similarity of two countries' mcp vectors for one country pair.

    Computes sum(a * b) / max(sum(a), sum(b)) for the two mcp arrays of the
    row, rounded to 3 decimals.
    """
    a, b = row["mcp_array1"], row["mcp_array2"]
    return round(sum(a * b) / max(sum(a), sum(b)), 3)


# For every year: attach to each country the average ECI of its three most
# similar non-neighbouring countries (column avg_eci_similar_spec).
year_list = [2020, 2021, 2022, 2023]
cdf2 = []
for y in year_list:
    print(y)

    tcdf = cdf[cdf["year"] == y]

    # generate full product dataframe.
    # The codes are sorted so the pair order -- and with it the tie-breaking of
    # nlargest below -- is the same on every run; key=str keeps the sort
    # well-defined should a code have been parsed as NaN.
    locations = sorted(tcdf["iso2_code"].unique(), key=str)
    full_prod_countries = pd.DataFrame(itertools.product(locations, repeat=2), columns=["iso2_code1", "iso2_code2"])
    full_prod_countries = pd.merge(
        full_prod_countries,
        nc,
        left_on=["iso2_code1", "iso2_code2"],
        right_on=["country_code", "country_border_code"],
        how="left"
    )
    full_prod_countries["neighbor01"] = full_prod_countries["country_border_code"].notna().astype(int)
    full_prod_countries = full_prod_countries[["iso2_code1", "iso2_code2", "neighbor01"]]

    # add location - mcp array to location pairs
    # (assumes every country lists the same clusters in the same order, so the
    # two arrays of a pair are aligned -- TODO confirm)
    mcp_temp = tcdf.groupby("iso2_code")["mcp"].apply(np.array).reset_index()
    full_prod_countries = pd.merge(
        full_prod_countries,
        mcp_temp,
        left_on="iso2_code1",
        right_on="iso2_code",
        how="left"
    )
    full_prod_countries = pd.merge(
        full_prod_countries,
        mcp_temp,
        left_on="iso2_code2",
        right_on="iso2_code",
        how="left"
    )
    full_prod_countries = full_prod_countries\
        .drop(columns=["iso2_code_x", "iso2_code_y"])\
        .rename(columns={"mcp_x":"mcp_array1", "mcp_y":"mcp_array2"})

    # minimum conditional probability -- to measure similarity between locations
    full_prod_countries["spec_similarity"] = full_prod_countries.apply(_spec_similarity, axis=1)

    # drop iso2_code1 == iso2_code2 cases and neighbors
    sim_spec_df = full_prod_countries[(full_prod_countries["iso2_code1"] != full_prod_countries["iso2_code2"]) & (full_prod_countries["neighbor01"] == 0)]

    # keep the top3 most similar countries; level_1 of the result is the row
    # index of the pair in full_prod_countries
    sim_spec_df = sim_spec_df.groupby(["iso2_code1"])["spec_similarity"]\
        .nlargest(3)\
        .reset_index()\
        .rename(columns={"level_1":"iso2_code2_index"})

    # merge similar location names by index
    sim_spec_df = pd.merge(
        sim_spec_df,
        full_prod_countries[["iso2_code2"]].reset_index(),
        left_on="iso2_code2_index",
        right_on="index",
        how="left"
    )

    # merge ECI values by location name
    sim_spec_df = pd.merge(
        sim_spec_df,
        tcdf[["iso2_code", "eci"]].drop_duplicates(),
        left_on="iso2_code2",
        right_on="iso2_code",
        how="left"
    )

    # re-attach the pair-level columns (neighbour flag, mcp arrays, similarity);
    # none of them is used by the aggregation below
    sim_spec_df = pd.merge(
        sim_spec_df,
        full_prod_countries,
        on=["iso2_code1", "iso2_code2"],
        how="left"
    )

    # average ECI of the top 3 most similar location
    # ("mean" as a string: passing np.mean triggers a pandas FutureWarning)
    avg_comp_sim_spec = sim_spec_df.groupby(["iso2_code1"])\
        .agg(
            avg_eci_similar_spec = pd.NamedAgg("eci", "mean"))\
        .reset_index()\
        .rename(columns={"iso2_code1" : "iso2_code"})

    # join to full comb table
    tcdf = pd.merge(
        tcdf,
        avg_comp_sim_spec,
        on="iso2_code",
        how="left"
    )
    cdf2.append(tcdf)

2020


  .agg(


2021


  .agg(


2022


  .agg(


2023


  .agg(


In [9]:
# inspect the first yearly table (2020, the first entry of year_list) with the
# avg_eci_similar_spec column attached; cdf2 must still be the list built by
# the loop, i.e. this runs before the list is concatenated
cdf2[0]

Unnamed: 0,iso2_code,cluster_id,num_pushers,period,diversity,ubiquity,mcp,eci,pci,density,coi,cog,rca,year,avg_eci_similar_spec
0,AE,1.0,2414.333333,1,8,85,1,-0.604691,-1.531008,0.551220,-0.486011,-0.000000,1.033574,2020,-0.49600
1,AE,2.0,292.250000,1,8,36,0,-0.604691,1.048489,0.076690,-0.486011,0.724783,0.887635,2020,-0.49600
2,AE,3.0,246.500000,1,8,38,0,-0.604691,1.410831,0.066669,-0.486011,0.842606,0.764871,2020,-0.49600
3,AE,4.0,0.000000,1,8,32,0,-0.604691,1.548653,0.061788,-0.486011,0.812965,0.000000,2020,-0.49600
4,AE,5.0,189.785714,1,8,50,1,-0.604691,-0.420012,0.388770,-0.486011,0.000000,1.274215,2020,-0.49600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8374,ZW,54.0,0.000000,1,6,42,0,-0.753676,1.028947,0.065524,-0.678235,0.445385,0.000000,2020,-0.73992
8375,ZW,55.0,0.000000,1,6,16,0,-0.753676,2.183614,0.021669,-0.678235,1.070497,0.000000,2020,-0.73992
8376,ZW,56.0,0.000000,1,6,14,0,-0.753676,2.169926,0.019789,-0.678235,0.999220,0.000000,2020,-0.73992
8377,ZW,57.0,0.000000,1,6,22,0,-0.753676,1.449351,0.031942,-0.678235,0.792187,0.000000,2020,-0.73992


In [13]:
# join and save.
# cdf2 starts as the list of yearly tables and is rebound to the combined
# frame; the isinstance guard lets the cell be re-run without pd.concat
# failing on an already-combined DataFrame.
if isinstance(cdf2, list):
    cdf2 = pd.concat(cdf2)
cdf2.to_csv("../outputs/si_eci_clusters_cooc_2020_2023_ivreg.csv", index=False, sep=";")