In [2]:
import numpy as np
import pandas as pd
import networkx as nx
from itertools import product
from ecomplexity import ecomplexity
from ecomplexity import proximity
from ecomplexity import calc_density
import country_converter as coco
import itertools

# spatial autocorrelation
import geopandas as gpd
from pysal.lib import weights
from libpysal.io import open as psopen
from splot.esda import (
    moran_scatterplot, lisa_cluster, plot_local_autocorrelation, plot_moran
)
from splot.libpysal import plot_spatial_weights
import esda

# stats
import scipy.stats as stats
from scipy.stats import linregress
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from utils import *

**different RCA thresholds**

In [2]:
# saved in data prep
eci_software = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")

In [3]:
# year focus
eci_df = eci_software[eci_software["year"]==2020]
eci_df["mcp075"] = np.where(eci_df["rca"]>=0.75, 1, 0)
eci_df["mcp125"] = np.where(eci_df["rca"]>=1.25, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eci_df["mcp075"] = np.where(eci_df["rca"]>=0.75, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eci_df["mcp125"] = np.where(eci_df["rca"]>=1.25, 1, 0)


In [4]:
def mcp_crosstable(df, columns):
    mcp_crosstable = df[columns].value_counts().reset_index().sort_values(by=columns).rename(columns={"count":"obs"})
    mcp_crosstable["obs_share"] = round(mcp_crosstable["obs"] / mcp_crosstable["obs"].sum(), 2)
    return mcp_crosstable

In [5]:
mcp_crosstable(eci_df, columns=["mcp", "mcp075"])

Unnamed: 0,mcp,mcp075,obs,obs_share
0,0,0,15766,0.77
2,0,1,1074,0.05
1,1,1,3593,0.18


In [6]:
mcp_crosstable(eci_df, columns=["mcp", "mcp125"])

Unnamed: 0,mcp,mcp125,obs,obs_share
0,0,0,16840,0.82
2,1,0,1191,0.06
1,1,1,2402,0.12


In [13]:
# data IN -- for threshold 0.75 AND 1.25
data = pd.read_csv("../data/languages.csv")

# parameter to choose year / semester / quarter to construct period IDs
selected_period = "year"

# for ecomplexity calculcation
key_cols = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "language",
    "val": "num_pushers",
}

# use data_prep_functions to clean the dataframe of ECI_software calculation
data = data[data["year"].isin([2020, 2021, 2022, 2023])]
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

(93076, 7)


In [14]:
# software complexity calculation -- period IDs -- 1 means 2020 on yearly basis
ccdf = []
ppdf = []
year_dict = {1 : 2020, 2 : 2021, 3 : 2022, 4 : 2023}
for k in year_dict.keys():
    dfb = bundle_data(df, periods=[k])
    #cdf = ecomplexity(dfb, key_cols, rca_mcp_threshold=0.75)
    cdf = ecomplexity(dfb, key_cols, rca_mcp_threshold=1.25)
    cdf["year"] = year_dict[k]

    #pdf = proximity(dfb, key_cols, rca_mcp_threshold=0.75)
    pdf = proximity(dfb, key_cols, rca_mcp_threshold=1.25)
    pdf["year"] = year_dict[k]

    # combine yearly dataframes
    ccdf.append(cdf)
    ppdf.append(pdf)
    print(year_dict[k], " DONE")

1
Percentage of pairs compared that meet log-supermodularity condition: 4.90%
1
2020  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 4.28%
1
2021  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 4.40%
1
2022  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 5.11%
1
2023  DONE




In [15]:
# combine and save -- complexity
cdf = pd.concat(ccdf, axis=0, ignore_index=True)
#cdf.to_csv("../outputs/eci_software_2020_2023_threshold_075.csv", sep=";", index=False)
cdf.to_csv("../outputs/eci_software_2020_2023_threshold_125.csv", sep=";", index=False)

In [16]:
# combine and save -- language proximity
prox_df = pd.concat(ppdf, axis=0, ignore_index=True)
#prox_df.to_csv("../outputs/proximity_2020_2023_threshold_075.csv", sep=";", index=False)
prox_df.to_csv("../outputs/proximity_2020_2023_threshold_125.csv", sep=";", index=False)

In [37]:
# correlation matrix
cdf100 = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")
cdf100 = cdf100[cdf100["year"]==2020]
cdf075 = pd.read_csv("../outputs/eci_software_2020_2023_threshold_075.csv", sep=";")
cdf075 = cdf075[cdf075["year"]==2020]
cdf125 = pd.read_csv("../outputs/eci_software_2020_2023_threshold_125.csv", sep=";")
cdf125 = cdf125[cdf125["year"]==2020].rename(columns={"eci":"eci125"})

full_cdf = pd.merge(
    cdf100[["iso2_code", "eci"]].drop_duplicates(),
    cdf075[["iso2_code", "eci"]].drop_duplicates(),
    on=["iso2_code"],
    how="left",
    suffixes=["100", "075"]
)
full_cdf = pd.merge(
    full_cdf,
    cdf125,
    on=["iso2_code"],
    how="left"
)

In [3]:
### ENTRY -- 0.75 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../outputs/eci_software_2020_2023_threshold_075.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "language", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)



# threshold for RCA : 0.75
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(df, periods=[p])
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="language", value_column="num_pushers", threshold=0.75))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)



# identify entry following the given patterns
entry_pattern = [0,0,1,1]
consider_pattern = [0,0,0,0]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == entry_pattern).astype(int)
ent["consider00"] = ent["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ent["iso2_code"].unique()
all_languages = ent["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "language", "entry01", "consider00"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../outputs/eci_software_2020_2023_threshold_075.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci", "ubiquity"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "language"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["entry01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_entry_regressions_0011_threshold_075.csv", index=False, sep=";")
#export_df.to_csv("../outputs/data_entry_regressions_0011_threshold05.csv", index=False, sep=";")

(93076, 7)
1
2
3
4


In [6]:
### ENTRY -- 1.25 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../outputs/eci_software_2020_2023_threshold_125.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "language", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)



# threshold for RCA : 1.25
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(df, periods=[p])
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="language", value_column="num_pushers", threshold=1.25))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)



# identify entry following the given patterns
entry_pattern = [0,0,1,1]
consider_pattern = [0,0,0,0]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == entry_pattern).astype(int)
ent["consider00"] = ent["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ent["iso2_code"].unique()
all_languages = ent["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "language", "entry01", "consider00"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../outputs/eci_software_2020_2023_threshold_125.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci", "ubiquity"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "language"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["entry01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_entry_regressions_0011_threshold_125.csv", index=False, sep=";")
#export_df.to_csv("../outputs/data_entry_regressions_0011_threshold05.csv", index=False, sep=";")

(93076, 7)
1
2
3
4


In [10]:
### EXIT -- 0.75 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../outputs/eci_software_2020_2023_threshold_075.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "language", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)



# threshold for RCA : 0.75
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(df, periods=[p])
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="language", value_column="num_pushers", threshold=0.75))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)



# identify entry following the given patterns
exit_pattern = [1,1,0,0]
consider_pattern = [1,1,1,1]
ext = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ext["entry01"] = ext["rca01"].apply(lambda x: x == exit_pattern).astype(int)
ext["consider00"] = ext["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ext["iso2_code"].unique()
all_languages = ext["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ext[["iso2_code", "language", "entry01", "consider00"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../outputs/eci_software_2020_2023_threshold_075.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci", "ubiquity"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "language"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df.rename(columns={"entry01":"exit01"}, inplace=True)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["exit01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_exit_regressions_1100_threshold_075.csv", index=False, sep=";")
#export_df.to_csv("../outputs/data_entry_regressions_0011_threshold05.csv", index=False, sep=";")

(93076, 7)
1
2
3
4


In [11]:
### EXIT -- 1.25 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../outputs/eci_software_2020_2023_threshold_125.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "language", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)



# threshold for RCA : 1.25
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(df, periods=[p])
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="language", value_column="num_pushers", threshold=1.25))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)



# identify entry following the given patterns
exit_pattern = [1,1,0,0]
consider_pattern = [1,1,1,1]
ext = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ext["entry01"] = ext["rca01"].apply(lambda x: x == exit_pattern).astype(int)
ext["consider00"] = ext["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ext["iso2_code"].unique()
all_languages = ext["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ext[["iso2_code", "language", "entry01", "consider00"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../outputs/eci_software_2020_2023_threshold_125.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci", "ubiquity"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "language"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df.rename(columns={"entry01":"exit01"}, inplace=True)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["exit01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_exit_regressions_1100_threshold_125.csv", index=False, sep=";")
#export_df.to_csv("../outputs/data_entry_regressions_0011_threshold05.csv", index=False, sep=";")

(93076, 7)
1
2
3
4


**spatial autocorrelation**

In [4]:
# data -- ECI_software
cdf = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
cdf = cdf[["iso2_code", "eci"]].drop_duplicates()

# data -- world map
cmap = gpd.read_file("../data/world-administrative-boundaries.geojson")
cmap = cmap[["iso3", "iso_3166_1_alpha_2_codes", "name", "geometry"]].rename(columns={"iso_3166_1_alpha_2_codes" : "iso2"})

cmap = pd.merge(
    cmap,
    cdf,
    left_on="iso2",
    right_on="iso2_code",
    how="left"
)

# clean up
cmap = cmap.drop_duplicates(subset=["iso2"])
#cmap.dropna(subset="iso2", inplace=True)
cmap.dropna(subset="eci", inplace=True)

In [5]:
### spatial autocorrelation

# index setting
cmap = cmap.set_index("iso2", drop=False)

# create the spatial weights matrix
w = weights.Queen.from_dataframe(cmap, idVariable="iso2")

# drop island
cmap = cmap.drop(w.islands)

# create the spatial weights matrix
w = weights.Queen.from_dataframe(cmap, idVariable="iso2")

# row standardize the matrix
w.transform = "R"

# spatial lag
cmap["w_eci"] = weights.lag_spatial(w, cmap["eci"])

# z score
cmap["eci_std"] = (cmap["eci"] - cmap["eci"].mean()) / cmap["eci"].std()
cmap["w_eci_std"] = weights.lag_spatial(w, cmap["eci_std"])

# Moran I
mi = esda.Moran(cmap["eci"], w)
print(mi.I, "Moran's I")
print(mi.p_sim, "significance")

  w = weights.Queen.from_dataframe(cmap, idVariable="iso2")
 There are 27 disconnected components.
 There are 22 islands with ids: CY, JM, LK, AU, PH, MG, KR, MU, JP, MT, SG, BH, BB, PR, SN, MV, IS, CU, RE, TW, NZ, TT.
  W.__init__(self, neighbors, ids=ids, **kw)
  w = weights.Queen.from_dataframe(cmap, idVariable="iso2")


0.5849299169848287 Moran's I
0.001 significance


 There are 5 disconnected components.
  W.__init__(self, neighbors, ids=ids, **kw)


**topics -- instead of languages**

In [43]:
# parameter to choose year / semester / quarter to construct period IDs
selected_period = "year"

# for ecomplexity calculcation
key_cols = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "topic",
    "val": "num_pushers",
}

# data IN
data = pd.read_csv("../data/topics.csv")

# use data_prep_functions to clean the dataframe of ECI_software calculation
print(data.shape)
data = data[data["year"].isin([2020, 2021, 2022, 2023])]
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter, key_column="topic")
df = top_languages_filter(df, nr_languages=150, key_column="topic")
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

(61857, 5)
(22328, 6)


In [44]:
df["topic"].nunique()

150

In [45]:
# software complexity calculation -- period IDs -- 1 means 2020 on yearly basis
ccdf = []
ppdf = []
year_dict = {1 : 2020, 2 : 2021, 3 : 2022, 4 : 2023}
for k in year_dict.keys():
    dfb = bundle_data(df, periods=[k], key_column="topic")
    cdf = ecomplexity(dfb, key_cols)
    cdf["year"] = year_dict[k]

    pdf = proximity(dfb, key_cols)
    pdf["year"] = year_dict[k]

    # combine yearly dataframes
    ccdf.append(cdf)
    ppdf.append(pdf)
    print(year_dict[k], " DONE")

cdf = pd.concat(ccdf, axis=0, ignore_index=True)

1
Percentage of pairs compared that meet log-supermodularity condition: 1.33%
1
2020  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 0.21%
1
2021  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 0.55%
1
2022  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 0.41%
1
2023  DONE




In [46]:
cdf[cdf["year"]==2020].sort_values(by="pci", ascending=False)["topic"].unique()

array(['distributed-systems', 'big-data', 'hpc', 'aws-lambda', 'robotics',
       'science', 'streaming', 'config', 'containers', 'microsoft', 'gpu',
       'azure', 'advent-of-code', 'tailwindcss', 'pandas', 'numpy',
       'portfolio-website', 'jupyter-notebook', 'readme', 'simulation',
       'cloud', 'terraform', 'spark', 'visualization', 'raspberry-pi',
       'minecraft', 'r', 'ai', 'data-structures', 'symfony', 'jest',
       'dotnet-core', 'open-source', 'flask', 'github',
       'html-css-javascript', 'styled-components', 'mongoose',
       'material-ui', 'tensorflow', 'discord-bot', 'graphql',
       'automation', 'testing', 'portfolio', 'hactoberfest2020',
       'competitive-programming', 'webdevelopment', 'hactoberfest',
       'hactoberfest-accepted', 'dsa', 'monitoring', 'postgresql',
       'security', 'node', 'sql', 'nextjs', 'react-native', 'macos',
       'windows', 'bootstrap', 'database', 'git', 'data-science',
       'express', 'rust', 'bot', 'ruby', 'pytorch', 'f

In [42]:
cdf[cdf["year"]==2020].sort_values(by="pci", ascending=False)["topic"].unique()

array(['actions', 'accessibility', 'a11y', 'julia', 'gis', 'jupyter',
       'circleci', 'jsx', 'jdbc', 'javascript-library', 'javafx',
       'jamstack', 'cicd', 'gov', 'real-time', 'realtime', 'google-cloud',
       'google-cloud-platform', 'google-maps-api', 'reactive',
       'government', 'irc', 'gpu', 'ios-app', 'grafana', 'interpreter',
       'graphics', 'interactive', 'integration', 'civic-tech',
       'cloud-computing', 'geospatial', 'game-engine', 'frc-robot',
       'labspt11', 'labs20', 'collaboration', 'fsharp', 'full-stack',
       'python-library', 'qt', 'quantum-computing', 'r-package',
       'functional-programming', 'kibana', 'kernel', 'code-for-america',
       'cocoapods', 'genomics', 'radiuss', 'gamedev', 'games', 'gatsbyjs',
       'gcp', 'cncf', 'clustering', 'cluster', 'cloudformation',
       'cloud-native', 'infrastructure-as-code', 'clojure', 'climate',
       'k8s', 'chrome', 'icons', 'reddit', 'chess', 'robotics',
       'high-performance-computing', 'ho

In [16]:
df["topic"].nunique()

150

In [25]:
df[df["topic"].str.contains("gh")]["topic"].nunique()

3

**clustered languages -- theoretical version**

In [6]:
# parameter to choose year / semester / quarter to construct period IDs
selected_period = "year"

# data IN
data = pd.read_csv("../data/languages.csv")

# use data_prep_functions to clean the dataframe of ECI_software calculation
data = data[data["year"].isin([2020, 2021, 2022, 2023])]
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

(93076, 7)


In [8]:
# clusters of languages
cl_df = pd.read_csv("../data/language_to_cluster_mapping.csv")\
    .rename(columns={"Language":"language", "Cluster":"cluster_id", "Cluster Name":"cluster_name"})

# combine
cl_df = pd.merge(
    df,
    cl_df,
    on="language",
    how="left"
)

# drop NAs... -- not so great
cl_df.dropna(subset=["cluster_id", "cluster_name"], inplace=True)

# aggregate by clusters
cl_df = cl_df.groupby(["iso2_code", "period", "cluster_name"])["num_pushers"].agg("mean").reset_index()

# for ecomplexity calculcation
key_cols = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "cluster_name",
    "val": "num_pushers",
}

# software complexity calculation -- period IDs -- 1 means 2020 on yearly basis
ccdf = []
ppdf = []
year_dict = {1 : 2020, 2 : 2021, 3 : 2022, 4 : 2023}
for k in year_dict.keys():
    dfb = cl_df[cl_df["period"]==k]
    cdf = ecomplexity(dfb, key_cols)
    cdf["year"] = year_dict[k]

    pdf = proximity(dfb, key_cols)
    pdf["year"] = year_dict[k]

    # combine yearly dataframes
    ccdf.append(cdf)
    ppdf.append(pdf)
    print(year_dict[k], " DONE")

1
Percentage of pairs compared that meet log-supermodularity condition: 9.78%
1
2020  DONE
2




Percentage of pairs compared that meet log-supermodularity condition: 10.50%
2
2021  DONE
3




Percentage of pairs compared that meet log-supermodularity condition: 11.00%
3
2022  DONE
4




Percentage of pairs compared that meet log-supermodularity condition: 11.77%
4
2023  DONE




In [9]:
# combine and save -- complexity
cluster_cdf = pd.concat(ccdf, axis=0, ignore_index=True)
cluster_cdf.to_csv("../outputs/eci_clusters_2020_2023.csv", sep=";", index=False)

In [20]:
# comparison
cluster_cdf = pd.read_csv("../outputs/eci_clusters_2020_2023.csv", sep=";")
eci_software = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")

cc_df = pd.merge(
    eci_software[eci_software["year"]==2020][["iso2_code", "eci"]].drop_duplicates(),
    cluster_cdf[cluster_cdf["year"]==2020][["iso2_code", "eci"]].drop_duplicates(),
    on=["iso2_code"],
    how="left",
    suffixes=["_software", "_cluster"]
)

cc_df[["eci_software", "eci_cluster"]].corr()

Unnamed: 0,eci_software,eci_cluster
eci_software,1.0,0.983028
eci_cluster,0.983028,1.0


In [18]:
print(cluster_cdf["cluster_name"].nunique(), "unique clusters")
print(eci_software["language"].nunique(), "unique clusters")

38 unique clusters
150 unique clusters


In [10]:
### ENTRY -- based on clusters

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../outputs/eci_clusters_2020_2023.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_name", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

# clusters of languages
cl_df = pd.read_csv("../data/language_to_cluster_mapping.csv")\
    .rename(columns={"Language":"language", "Cluster":"cluster_id", "Cluster Name":"cluster_name"})

# combine
df = pd.merge(
    df,
    cl_df,
    on="language",
    how="left"
)

# drop NAs... -- not so great
df.dropna(subset=["cluster_id", "cluster_name"], inplace=True)

# aggregate by clusters
df = df.groupby(["iso2_code", "period", "cluster_name"])["num_pushers"].agg("mean").reset_index()


def bundle_data_clusters(data, periods):
    """aggreagte data for period by taking the mean number active developers"""
    data = (
        data[data["period"].isin(periods)]
        .groupby(["iso2_code", "cluster_name"])["num_pushers"]
        .agg("mean")
        .reset_index()
    )
    data["period"] = 1
    data["num_pushers"] = data["num_pushers"].astype(int)
    return data



# threshold for RCA : 1.00
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data_clusters(df, periods=[p])
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="cluster_name", value_column="num_pushers", threshold=1))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)


# identify entry following the given patterns
entry_pattern = [0,0,1,1]
consider_pattern = [0,0,0,0]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","cluster_name"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == entry_pattern).astype(int)
ent["consider00"] = ent["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ent["iso2_code"].unique()
all_languages = ent["cluster_name"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "cluster_name"])\
    .sort_values(["iso2_code", "cluster_name"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "cluster_name", "entry01", "consider00"]],
    on=["iso2_code", "cluster_name"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../outputs/eci_clusters_2020_2023.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "cluster_name", "pci", "ubiquity"]],
    on=["iso2_code", "cluster_name"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "cluster_name", "rca01"]],
    on=["iso2_code", "cluster_name"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "cluster_name"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["entry01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_entry_regressions_0011_clusters.csv", index=False, sep=";")
#export_df.to_csv("../outputs/data_entry_regressions_0011_threshold05.csv", index=False, sep=";")

(93076, 7)
1
2
3
4


In [12]:
### EXIT -- 1.00 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../outputs/eci_clusters_2020_2023.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_name", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

# clusters of languages
cl_df = pd.read_csv("../data/language_to_cluster_mapping.csv")\
    .rename(columns={"Language":"language", "Cluster":"cluster_id", "Cluster Name":"cluster_name"})

# combine
df = pd.merge(
    df,
    cl_df,
    on="language",
    how="left"
)

# drop NAs... -- not so great
df.dropna(subset=["cluster_id", "cluster_name"], inplace=True)

# aggregate by clusters
df = df.groupby(["iso2_code", "period", "cluster_name"])["num_pushers"].agg("mean").reset_index()


def bundle_data_clusters(data, periods):
    """aggreagte data for period by taking the mean number active developers"""
    data = (
        data[data["period"].isin(periods)]
        .groupby(["iso2_code", "cluster_name"])["num_pushers"]
        .agg("mean")
        .reset_index()
    )
    data["period"] = 1
    data["num_pushers"] = data["num_pushers"].astype(int)
    return data


# threshold for RCA : 1.00
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data_clusters(df, periods=[p])
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="cluster_name", value_column="num_pushers", threshold=1))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)



# identify entry following the given patterns
exit_pattern = [1,1,0,0]
consider_pattern = [1,1,1,1]
ext = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","cluster_name"])["rca01"].agg(list).reset_index()
ext["entry01"] = ext["rca01"].apply(lambda x: x == exit_pattern).astype(int)
ext["consider00"] = ext["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ext["iso2_code"].unique()
all_languages = ext["cluster_name"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "cluster_name"])\
    .sort_values(["iso2_code", "cluster_name"])

# join entries
full_df = pd.merge(
    full_df,
    ext[["iso2_code", "cluster_name", "entry01", "consider00"]],
    on=["iso2_code", "cluster_name"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../outputs/eci_clusters_2020_2023.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "cluster_name", "pci", "ubiquity"]],
    on=["iso2_code", "cluster_name"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "cluster_name", "rca01"]],
    on=["iso2_code", "cluster_name"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "cluster_name"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df.rename(columns={"entry01":"exit01"}, inplace=True)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["exit01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_exit_regressions_1100_clusters.csv", index=False, sep=";")
#export_df.to_csv("../outputs/data_entry_regressions_0011_threshold05.csv", index=False, sep=";")

(93076, 7)
1
2
3
4


**clustered languages -- co-occurrence version**

In [14]:
# parameter to choose year / semester / quarter to construct period IDs
selected_period = "year"

# data IN
data = pd.read_csv("../data/languages.csv")

# use data_prep_functions to clean the dataframe of ECI_software calculation
data = data[data["year"].isin([2020, 2021, 2022, 2023])]
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

(93076, 7)


In [15]:
# clusters of languages
cl_df = pd.read_csv("../data/language_clusters_gh_cos_hier_ward_d1.csv")\
    .rename(columns={"Language":"language", "Cluster":"cluster_id"})\
    .iloc[:,1:]

# combine
cl_df = pd.merge(
    df,
    cl_df,
    on="language",
    how="left"
)

# drop NAs... -- not so great
cl_df.dropna(subset=["cluster_id", "cluster_id"], inplace=True)

# aggregate by clusters
cl_df = cl_df.groupby(["iso2_code", "period", "cluster_id"])["num_pushers"].agg("mean").reset_index()

# for ecomplexity calculcation
key_cols = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "cluster_id",
    "val": "num_pushers",
}

# software complexity calculation -- period IDs -- 1 means 2020 on yearly basis
ccdf = []
ppdf = []
year_dict = {1 : 2020, 2 : 2021, 3 : 2022, 4 : 2023}
for k in year_dict.keys():
    dfb = cl_df[cl_df["period"]==k]
    cdf = ecomplexity(dfb, key_cols)
    cdf["year"] = year_dict[k]

    pdf = proximity(dfb, key_cols)
    pdf["year"] = year_dict[k]

    # combine yearly dataframes
    ccdf.append(cdf)
    ppdf.append(pdf)
    print(year_dict[k], " DONE")

1
Percentage of pairs compared that meet log-supermodularity condition: 6.68%
1
2020  DONE
2




Percentage of pairs compared that meet log-supermodularity condition: 7.22%
2
2021  DONE
3




Percentage of pairs compared that meet log-supermodularity condition: 7.73%
3
2022  DONE
4




Percentage of pairs compared that meet log-supermodularity condition: 8.72%
4
2023  DONE




In [16]:
# combine and save -- complexity
cluster_cdf = pd.concat(ccdf, axis=0, ignore_index=True)
cluster_cdf.to_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";", index=False)

In [21]:
# comparison
cluster_cdf1 = pd.read_csv("../outputs/eci_clusters_2020_2023.csv", sep=";")
cluster_cdf = pd.read_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
eci_software = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")

cc_df = pd.merge(
    eci_software[eci_software["year"]==2020][["iso2_code", "eci"]].drop_duplicates(),
    cluster_cdf[cluster_cdf["year"]==2020][["iso2_code", "eci"]].drop_duplicates(),
    on=["iso2_code"],
    how="left",
    suffixes=["_software", "_cluster"]
)
temp = pd.merge(
    cc_df,
    cluster_cdf1[cluster_cdf1["year"]==2020][["iso2_code", "eci"]].drop_duplicates(),
    on=["iso2_code"],
    how="left"
).rename(columns={"eci":"eci_cluster_theory", "eci_cluster":"eci_cluster_cooccurrence"})

temp[["eci_software", "eci_cluster_theory", "eci_cluster_cooccurrence"]].corr()

Unnamed: 0,eci_software,eci_cluster_theory,eci_cluster_cooccurrence
eci_software,1.0,0.983028,0.970453
eci_cluster_theory,0.983028,1.0,0.973888
eci_cluster_cooccurrence,0.970453,0.973888,1.0


In [23]:
print(cluster_cdf["cluster_id"].nunique(), "unique clusters")
print(eci_software["language"].nunique(), "unique clusters")

57 unique clusters
150 unique clusters


In [24]:
### ENTRY -- based on clusters

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_id", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

# clusters of languages
cl_df = pd.read_csv("../data/language_clusters_gh_cos_hier_ward_d1.csv")\
    .rename(columns={"Language":"language", "Cluster":"cluster_id"})\
    .iloc[:,1:]

# combine
df = pd.merge(
    df,
    cl_df,
    on="language",
    how="left"
)

# drop NAs... -- not so great
df.dropna(subset=["cluster_id"], inplace=True)

# aggregate by clusters
df = df.groupby(["iso2_code", "period", "cluster_id"])["num_pushers"].agg("mean").reset_index()


def bundle_data_clusters(data, periods):
    """aggreagte data for period by taking the mean number active developers"""
    data = (
        data[data["period"].isin(periods)]
        .groupby(["iso2_code", "cluster_id"])["num_pushers"]
        .agg("mean")
        .reset_index()
    )
    data["period"] = 1
    data["num_pushers"] = data["num_pushers"].astype(int)
    return data



# threshold for RCA : 1.00
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data_clusters(df, periods=[p])
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="cluster_id", value_column="num_pushers", threshold=1))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)


# identify entry following the given patterns
entry_pattern = [0,0,1,1]
consider_pattern = [0,0,0,0]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","cluster_id"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == entry_pattern).astype(int)
ent["consider00"] = ent["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ent["iso2_code"].unique()
all_languages = ent["cluster_id"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "cluster_id"])\
    .sort_values(["iso2_code", "cluster_id"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "cluster_id", "entry01", "consider00"]],
    on=["iso2_code", "cluster_id"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "cluster_id", "pci", "ubiquity"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "cluster_id", "rca01"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "cluster_id"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["entry01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_entry_regressions_0011_clusters_cooc.csv", index=False, sep=";")
#export_df.to_csv("../outputs/data_entry_regressions_0011_threshold05.csv", index=False, sep=";")

(93076, 7)
1
2
3
4


In [26]:
### EXIT -- 1.00 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_id", "density"]].drop_duplicates()



# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

# clusters of languages
cl_df = pd.read_csv("../data/language_clusters_gh_cos_hier_ward_d1.csv")\
    .rename(columns={"Language":"language", "Cluster":"cluster_id"})\
    .iloc[:,1:]

# combine
df = pd.merge(
    df,
    cl_df,
    on="language",
    how="left"
)

# drop NAs... -- not so great
df.dropna(subset=["cluster_id", "cluster_id"], inplace=True)

# aggregate by clusters
df = df.groupby(["iso2_code", "period", "cluster_id"])["num_pushers"].agg("mean").reset_index()


def bundle_data_clusters(data, periods):
    """aggreagte data for period by taking the mean number active developers"""
    data = (
        data[data["period"].isin(periods)]
        .groupby(["iso2_code", "cluster_id"])["num_pushers"]
        .agg("mean")
        .reset_index()
    )
    data["period"] = 1
    data["num_pushers"] = data["num_pushers"].astype(int)
    return data


# threshold for RCA : 1.00
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data_clusters(df, periods=[p])
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="cluster_id", value_column="num_pushers", threshold=1))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)



# identify entry following the given patterns
exit_pattern = [1,1,0,0]
consider_pattern = [1,1,1,1]
ext = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","cluster_id"])["rca01"].agg(list).reset_index()
ext["entry01"] = ext["rca01"].apply(lambda x: x == exit_pattern).astype(int)
ext["consider00"] = ext["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ext["iso2_code"].unique()
all_languages = ext["cluster_id"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "cluster_id"])\
    .sort_values(["iso2_code", "cluster_id"])

# join entries
full_df = pd.merge(
    full_df,
    ext[["iso2_code", "cluster_id", "entry01", "consider00"]],
    on=["iso2_code", "cluster_id"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "cluster_id", "pci", "ubiquity"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "cluster_id", "rca01"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "cluster_id"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df.rename(columns={"entry01":"exit01"}, inplace=True)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["exit01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_exit_regressions_1100_clusters_cooc.csv", index=False, sep=";")
#export_df.to_csv("../outputs/data_entry_regressions_0011_threshold05.csv", index=False, sep=";")

(93076, 7)
1
2
3
4


**Figure 1 note -- RCA R2s for 2021**

In [26]:
df = pd.read_csv("../outputs/eci_regression_table.csv", sep=";")
df = df[df["year"]==2021]
df = df[["iso2_code", "eci_software", "eci_trade", "eci_tech", "eci_research"]].drop_duplicates()

In [27]:
def normalize_column_to_range(table, column_name):
    min_val = table[column_name].min()
    max_val = table[column_name].max()
    table[column_name] = 2 * (table[column_name] - min_val) / (max_val - min_val) - 1
    return table

In [28]:
df = normalize_column_to_range(df, "eci_software")
df = normalize_column_to_range(df, "eci_trade")
df = normalize_column_to_range(df, "eci_tech")
df = normalize_column_to_range(df, "eci_research")

In [29]:
def eci_correlations(df, key_variables):
    df.dropna(subset=key_variables, inplace=True)
    slope, intercept, r_value, p_value, std_err = stats.linregress(df[key_variables[0]], df[key_variables[1]])
    r_squared = r_value ** 2
    print("R2", round(r_squared, 3), "p-value", round(p_value, 3), "   ", key_variables[0], "  ", key_variables[1])

In [30]:
eci_correlations(df, key_variables=["eci_software", "eci_trade"])

R2 0.665 p-value 0.0     eci_software    eci_trade


In [31]:
eci_correlations(df, key_variables=["eci_software", "eci_tech"])

R2 0.7 p-value 0.0     eci_software    eci_tech


In [32]:
eci_correlations(df, key_variables=["eci_software", "eci_research"])

R2 0.488 p-value 0.0     eci_software    eci_research
