In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from itertools import product
from ecomplexity import ecomplexity
from ecomplexity import proximity
from ecomplexity import calc_density
import country_converter as coco
import itertools

from data_prep_functions import *

**Part 0 - general data preparation**

In [2]:
# parameter to choose year / semester / quarter to construct period IDs
selected_period = "year"

# for ecomplexity calculcation
key_cols = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "language",
    "val": "num_pushers",
}

In [3]:
# data IN
data = pd.read_csv("../data/languages.csv")

In [4]:
# use data_prep_functions to clean the dataframe of ECI_software calculation
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

(88775, 7)


**Part 1 - from M_cl to ECI_software and language proximity - based on yearly data**

In [5]:
# software complexity calculation -- period IDs -- 1 means 2020 on yearly basis
ccdf = []
ppdf = []
year_dict = {1 : 2020, 2 : 2021, 3 : 2022, 4 : 2023}
for k in year_dict.keys():
    dfb = bundle_data(df, periods=[k])
    cdf = ecomplexity(dfb, key_cols)
    cdf["year"] = year_dict[k]

    pdf = proximity(dfb, key_cols)
    pdf["year"] = year_dict[k]

    # combine yearly dataframes
    ccdf.append(cdf)
    ppdf.append(pdf)
    print(year_dict[k], " DONE")

1
Percentage of pairs compared that meet log-supermodularity condition: 6.75%
1
2020  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 3.69%
1
2021  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 3.91%
1
2022  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 6.38%
1
2023  DONE




In [6]:
# combine and save -- complexity
cdf = pd.concat(ccdf, axis=0, ignore_index=True)
cdf.to_csv("../outputs/eci_software_2020_2023.csv", sep=";", index=False)

In [7]:
# combine and save -- language proximity
prox_df = pd.concat(ppdf, axis=0, ignore_index=True)
prox_df.to_csv("../outputs/proximity_2020_2023.csv", sep=";", index=False)

**Part 2 - comparing ECI(software, trade, technology, research)**

In [8]:
# read in developers (for filtering option)
df = pd.read_csv("../data/developers.csv")
df = df.groupby(["iso2_code", "year"])["developers"].agg("mean").reset_index()
df["developers"] = df["developers"].astype(int)
df = df[df["year"]==2020]

# add ECI_software
eci_software = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")
eci_software = eci_software[eci_software["year"]==2020]
country_to_seci_2020 = dict(zip(eci_software["iso2_code"], eci_software["eci"]))
df["software_eci_2020"] = df["iso2_code"].map(country_to_seci_2020)

In [9]:
# read in 3 other ECI measures
trade_eci = pd.read_csv("../data/eci_hs6_hs96_trade.csv")
trade_eci = trade_eci[["Country", "2020"]]
tech_eci = pd.read_csv("../data/eci_tech.csv")
tech_eci = tech_eci[["Country", "2020"]]
research_eci = pd.read_csv("../data/Data-ECI-Research.csv")
research_eci = research_eci[["Country", "2020"]]

In [10]:
# map country names to iso2_codes - using the country_converter package
c_to_iso = dict(
    zip(trade_eci.Country.unique(), coco.convert(names=trade_eci.Country.unique(), to="ISO2")))
trade_eci["Country"] = trade_eci["Country"].map(c_to_iso)
iso_to_trade_eci = dict(trade_eci.values)
df["trade_eci_2020"] = df["iso2_code"].map(iso_to_trade_eci)

tech_eci["Country"] = tech_eci["Country"].map(c_to_iso)
iso_to_tech_eci = dict(tech_eci.values)
df["tech_eci_2020"] = df["iso2_code"].map(iso_to_tech_eci)

research_eci["Country"] = research_eci["Country"].map(c_to_iso)
iso_to_research_eci = dict(research_eci.values)
df["research_eci_2020"] = df["iso2_code"].map(iso_to_research_eci)

In [11]:
# data from CEPII -- http://www.cepii.fr/CEPII/en/bdd_modele/bdd_modele_item.asp?id=8
trade_df = pd.read_csv("../data/Gravity_V202211.csv")
trade_df = trade_df[trade_df["year"] == 2020]
trade_df['country_id_o'] = trade_df['country_id_o'].map(lambda x: x.replace('.2',''))
trade_df['country_id_d'] = trade_df['country_id_d'].map(lambda x: x.replace('.2',''))

  trade_df = pd.read_csv("../data/Gravity_V202211.csv")


In [12]:
# code transformation
iso3_codes = pd.concat([trade_df["iso3_o"], trade_df["iso3_d"]]).unique().tolist()
iso2_codes = coco.convert(names=iso3_codes, to="ISO2")
codes = pd.DataFrame(iso3_codes, iso2_codes).reset_index()
codes.columns = ["iso2_code", "iso3_code"]

trade_df = pd.merge(
    trade_df,
    codes,
    left_on="iso3_o",
    right_on="iso3_code",
    how="left"
)

# keep useful country data and join to 
key_country_info = [
    "year",
    "iso3_code",
    "iso2_code",
    "pop_o",
    "gdp_o",
    "gdpcap_o",
    "gdp_ppp_o",
    "gdpcap_ppp_o"
]
country_info = trade_df[key_country_info].drop_duplicates()

ANT not found in ISO3
CSK not found in ISO3
DDR not found in ISO3
SCG not found in ISO3
SUN not found in ISO3
VDR not found in ISO3
YMD not found in ISO3
YUG not found in ISO3


In [13]:
# combine country info the ECI collector dataframe
df = pd.merge(
    df,
    country_info,
    on=["iso2_code", "year"],
    how="left"
)

In [14]:
# add GINI -- https://data.worldbank.org/indicator/SI.POV.GINI
gini_df = pd.read_excel("../data/gini_worldbank_data.xls")

# too many NAs -- take the average across 10+ years
years_list = [str(year) for year in range(2010, 2020)]
gini_df["gini_mean"] = gini_df[years_list].mean(axis=1)
gini_df = gini_df[["Country Code", "gini_mean"]]
gini_df.columns = ["iso3_code", "gini_mean"]
gini_df.dropna(subset="gini_mean", inplace=True)

# join iso2_codes -- create above
gini_df = pd.merge(
    gini_df,
    codes,
    on="iso3_code",
    how="left"
)
gini_df.dropna(subset="iso2_code", inplace=True)

In [15]:
# combine GINI with ECI collector dataframe
df = pd.merge(
    df,
    gini_df,
    on=["iso2_code", "iso3_code"],
    how="left"
)

In [16]:
# add emissions -- new data from Viktor -- ask about preparation details
emdf = pd.read_csv("../data/regressions_emissions_data.csv")
emdf = emdf[["country", "emissions", "nat_res"]].drop_duplicates()

df = pd.merge(
    df,
    emdf,
    left_on="iso3_code",
    right_on="country",
    how="left"
)

In [17]:
# export
df.to_csv("../outputs/eci_comparisons_2020.csv", sep=";", index=False)

**Part 2b - matrices based on trade/research/publications**

In [18]:
def mat_reshape(path, column_labels):
    """to reshape the matrices from Viktor Stojkoski"""
    mat = pd.read_csv(path)
    mat.set_index("Row", inplace=True)
    mat = mat.unstack().reset_index()
    mat.columns = column_labels
    return mat

In [19]:
# read and reshape matrices
trade_df = mat_reshape(path="../data/stojkoski_etal_data/trade_matrix_data_2020.csv", column_labels=["product", "iso3_code", "value"])
patent_df = mat_reshape(path="../data/stojkoski_etal_data/pct_data_2020.csv", column_labels=["class", "iso3_code", "value"])
research_df = mat_reshape(path="../data/stojkoski_etal_data/pub_matrix_data_2020.csv", column_labels=["category", "iso3_code", "value"])

# country code correction
iso3_codes = pd.concat([trade_df["iso3_code"], patent_df["iso3_code"], research_df["iso3_code"]]).unique().tolist()
iso2_codes = coco.convert(names=iso3_codes, to="ISO2")
codes2 = pd.DataFrame(iso3_codes, iso2_codes).reset_index()
codes2.columns = ["iso2_code", "iso3_code"]

trade_df = pd.merge(
    trade_df,
    codes2,
    on="iso3_code",
    how="left"
)
trade_df = trade_df[trade_df["iso2_code"] != "not found"]

patent_df = pd.merge(
    patent_df,
    codes2,
    on="iso3_code",
    how="left"
)
patent_df = patent_df[patent_df["iso2_code"] != "not found"]

research_df = pd.merge(
    research_df,
    codes2,
    on="iso3_code",
    how="left"
)
research_df = research_df[research_df["iso2_code"] != "not found"]

ANT not found in ISO3
YUG not found in ISO3
CSE not found in ISO3
DDE not found in ISO3
EPO not found in ISO3
XKO not found in ISO3
SFE not found in ISO3
SUE not found in ISO3
XTP not found in ISO3
XUB not found in ISO3
FST not found in ISO3
PIT not found in ISO3


In [20]:
# country level info from the gravity dataset
grav_df = pd.read_csv("../data/Gravity_V202211.csv")
grav_df = grav_df[grav_df["year"] == 2020]
grav_df['country_id_o'] = grav_df['country_id_o'].map(lambda x: x.replace('.2',''))
grav_df['country_id_d'] = grav_df['country_id_d'].map(lambda x: x.replace('.2',''))

# population above 1 million
countries_1m_pop = list(set(grav_df[grav_df["pop_o"]>1000]["iso3_o"].to_list()))
trade_df = trade_df[trade_df["iso3_code"].isin(countries_1m_pop)]
patent_df = patent_df[patent_df["iso3_code"].isin(countries_1m_pop)]
research_df = research_df[research_df["iso3_code"].isin(countries_1m_pop)]

# total export value of 1 billion USD
above_1b_export = trade_df.groupby(["iso2_code"])["value"].agg("sum").reset_index()
above_1b_export = list(set(above_1b_export[above_1b_export["value"]>10**9]["iso2_code"].to_list()))
trade_df = trade_df[trade_df["iso2_code"].isin(above_1b_export)]

# MIN 4 patent
min4_patents = patent_df.groupby(["iso2_code"])["value"].agg("sum").reset_index()
min4_patents = list(set(min4_patents[min4_patents["value"] > 4]["iso2_code"].to_list()))
patent_df = patent_df[patent_df["iso2_code"].isin(min4_patents)]

# countries w/ MIN 100 publications in a year - category w/ more than 30 published papers a year
min100_publications = research_df.groupby(["iso2_code"])["value"].agg("sum").reset_index()
min100_publications = list(set(min100_publications[min100_publications["value"] >= 100]["iso2_code"].to_list()))
min30_papers = research_df.groupby(["category"])["value"].agg("sum").reset_index()
min30_papers = list(set(min30_papers[min30_papers["value"] >= 30]["category"].to_list()))
research_df = research_df[(research_df["category"].isin(min30_papers))]

# replace below 3 papers per country/category to 0
research_df["value"] = np.where(research_df["value"]<3, 0, research_df["value"])

  grav_df = pd.read_csv("../data/Gravity_V202211.csv")


In [21]:
# replace below avg 100 citations per country/category to 0
years = [2017, 2018, 2019, 2020]
citations = []
for y in years:
    temp = mat_reshape(path=f"../data/stojkoski_etal_data/cit_matrix_data_{y}.csv", column_labels=["category", "iso3_code", "citations"])
    temp["year"] = y
    citations.append(temp)

citations = pd.concat(citations)
citations = citations.groupby(["category", "iso3_code"])["citations"].agg("mean").reset_index()

research_df = pd.merge(
    research_df,
    citations,
    on=["iso3_code", "category"],
    how="left"
)
research_df["value"] = np.where(research_df["citations"]<100, 0, research_df["value"])

In [22]:
# calculate complexity and mcp
key_cols_trade = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "product",
    "val": "value",
}
trade_df["period"] = 1
trade_cdf = ecomplexity(trade_df, key_cols_trade)

key_cols_patent = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "class",
    "val": "value",
}
patent_df["period"] = 1
patent_cdf = ecomplexity(patent_df, key_cols_patent)

key_cols_research = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "category",
    "val": "value",
}
research_df["period"] = 1
research_cdf = ecomplexity(research_df, key_cols_research)

1
Percentage of pairs compared that meet log-supermodularity condition: 31.28%




1
Percentage of pairs compared that meet log-supermodularity condition: 3.43%
1




Percentage of pairs compared that meet log-supermodularity condition: 14.35%




In [23]:
# save for figures
trade_cdf.to_csv("../outputs/trade_cdf_2020.csv", sep=";", index=False)
patent_cdf.to_csv("../outputs/patent_cdf_2020.csv", sep=";", index=False)
research_cdf.to_csv("../outputs/research_cdf_2020.csv", sep=";", index=False)

**Part 3 - for entry regressions**

In [24]:
# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "language", "density"]].drop_duplicates()

In [25]:
# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

(88775, 7)


In [26]:
# threshold for RCA : 1
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(df, periods=[p])
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="language", value_column="num_pushers", threshold=1))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)

1
2
3
4


In [27]:
# identify entry following the given patterns
entry_pattern = [0,0,1,1]
consider_pattern = [0,0,0,0]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == entry_pattern).astype(int)
ent["consider00"] = ent["rca01"].apply(lambda x: x == consider_pattern).astype(int)

In [28]:
# full combination
all_countries = ent["iso2_code"].unique()
all_languages = ent["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "language", "entry01", "consider00"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci", "ubiquity"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)

In [29]:
# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "language"],
    how="left"
)

In [30]:
# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["entry01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_entry_regressions_0011.csv", index=False, sep=";")
#export_df.to_csv("../outputs/data_entry_regressions_0011_threshold05.csv", index=False, sep=";")

**Part 4 - for exit regressions**

In [31]:
# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "language", "density"]].drop_duplicates()

In [32]:
# data IN
data = pd.read_csv("../data/languages.csv")
selected_period = "year"

# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

(88775, 7)


In [33]:
# threshold for RCA : 1
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(df, periods=[p])
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="language", value_column="num_pushers", threshold=1))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)

1
2
3
4


In [34]:
# identify entry following the given patterns
exit_pattern = [1,1,0,0]
consider_pattern = [1,1,1,1]
ext = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ext["entry01"] = ext["rca01"].apply(lambda x: x == exit_pattern).astype(int)
ext["consider00"] = ext["rca01"].apply(lambda x: x == consider_pattern).astype(int)

In [35]:
# full combination
all_countries = ext["iso2_code"].unique()
all_languages = ext["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ext[["iso2_code", "language", "entry01", "consider00"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci", "ubiquity"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)

In [36]:
# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "language"],
    how="left"
)

In [37]:
# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df.rename(columns={"entry01":"exit01"}, inplace=True)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["exit01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_exit_regressions_1100.csv", index=False, sep=";")
#export_df.to_csv("../outputs/data_entry_regressions_0011_threshold05.csv", index=False, sep=";")

**IV -- 3 most similar non-neighboring countries**

In [60]:
# ECI_software table
cdf = pd.read_csv("../outputs/eci_software_2020_2023.csv", sep=";")

# neighboring countries from https://github.com/geodatasource/country-borders
nc = pd.read_csv("../data/geodatasource_country_borders.csv")

In [61]:
locations = list(set(cdf["iso2_code"].to_list()))
full_prod_countries = pd.DataFrame(itertools.product(locations, repeat=2), columns=["iso2_code1", "iso2_code2"])


In [62]:
full_prod_countries = pd.merge(
    full_prod_countries,
    nc,
    left_on=["iso2_code1", "iso2_code2"],
    right_on=["country_code", "country_border_code"],
    how="left"
)
full_prod_countries["neighbor01"] = full_prod_countries["country_border_code"].notna().astype(int)
full_prod_countries = full_prod_countries[["iso2_code1", "iso2_code2", "neighbor01"]]

In [63]:
# select year
year_list = [2020, 2021, 2022, 2023]
cdf2 = []
for y in year_list:
    print(y)

    tcdf = cdf[cdf["year"] == y]
    tcdf.year.isna().sum()

    # generate full product dataframe
    locations = list(set(tcdf["iso2_code"].to_list()))
    full_prod_countries = pd.DataFrame(itertools.product(locations, repeat=2), columns=["iso2_code1", "iso2_code2"])
    full_prod_countries = pd.merge(
        full_prod_countries,
        nc,
        left_on=["iso2_code1", "iso2_code2"],
        right_on=["country_code", "country_border_code"],
        how="left"
    )
    full_prod_countries["neighbor01"] = full_prod_countries["country_border_code"].notna().astype(int)
    full_prod_countries = full_prod_countries[["iso2_code1", "iso2_code2", "neighbor01"]]

    # add location - mcp array to location pairs
    mcp_temp = tcdf.groupby("iso2_code")["mcp"].apply(np.array).reset_index()
    full_prod_countries = pd.merge(
        full_prod_countries,
        mcp_temp,
        left_on="iso2_code1",
        right_on="iso2_code",
        how="left"
    )
    full_prod_countries = pd.merge(
        full_prod_countries,
        mcp_temp,
        left_on="iso2_code2",
        right_on="iso2_code",
        how="left"
    )
    full_prod_countries = full_prod_countries\
        .drop(columns=["iso2_code_x", "iso2_code_y"])\
        .rename(columns={"mcp_x":"mcp_array1", "mcp_y":"mcp_array2"})

    # minimum conditional probability -- to measure similarity between locations
    full_prod_countries["spec_similarity"] = full_prod_countries.apply(lambda r: round(sum(r["mcp_array1"] * r["mcp_array2"]) / max(sum(r["mcp_array1"]), sum(r["mcp_array2"])), 3), axis=1)

    # drop iso2_code1 == iso2_code2 cases and neighbors
    sim_spec_df = full_prod_countries[(full_prod_countries["iso2_code1"] != full_prod_countries["iso2_code2"]) & (full_prod_countries["neighbor01"] == 0)]
    
    # keep the top3 most similar countries
    sim_spec_df = sim_spec_df.groupby(["iso2_code1"])["spec_similarity"]\
        .nlargest(3)\
        .reset_index()\
        .rename(columns={"level_1":"iso2_code2_index"})

    # merge similar location names by index
    sim_spec_df = pd.merge(
        sim_spec_df,
        full_prod_countries[["iso2_code2"]].reset_index(),
        left_on="iso2_code2_index",
        right_on="index",
        how="left"
    )

    # merge ECI values by location name
    sim_spec_df = pd.merge(
        sim_spec_df,
        tcdf[["iso2_code", "eci"]].drop_duplicates(),
        left_on="iso2_code2",
        right_on="iso2_code",
        how="left"
    )

    # merge distance values by location name
    sim_spec_df = pd.merge(
        sim_spec_df,
        full_prod_countries,
        on=["iso2_code1", "iso2_code2"],
        how="left"
    )

    # average ECI of the top 3 most similar location 
    avg_comp_sim_spec = sim_spec_df.groupby(["iso2_code1"])\
        .agg(
            avg_eci_similar_spec = pd.NamedAgg("eci", np.mean))\
        .reset_index()\
        .rename(columns={"iso2_code1" : "iso2_code"})

    # join to full comb table
    tcdf = pd.merge(
        tcdf,
        avg_comp_sim_spec,
        on="iso2_code",
        how="left"
    )
    cdf2.append(tcdf)

2020


  .agg(


2021


  .agg(


2022


  .agg(


2023


  .agg(


In [64]:
# join and save
cdf2 = pd.concat(cdf2)
cdf2.to_csv(f"../outputs/eci_software_2020_2023_iv_v2.csv", index=False, sep=";")