In [3]:
import numpy as np
import pandas as pd
import networkx as nx
from itertools import product
from ecomplexity import ecomplexity
from ecomplexity import proximity
import country_converter

**Part 0 - general data preparation**

In [4]:
# parameter to choose year / semester / quarter to construct period IDs
selected_period = "year"

# for ecomplexity calculcation
key_cols = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "language",
    "val": "num_pushers",
}

In [10]:
# data IN
data = pd.read_csv("../data/languages.csv")

In [11]:
# filter functions
def drop_specifics_from_list(data, filter_list):
    """filter specific languages from list -- motivated by RM del Rio-Chanona et al 2023"""
    data = data[~data["language"].str.contains(filter_list, case=False, regex=True)]
    return data

def top_languages_filter(data, nr_languages):
    """keep top x number of languages ONLY"""
    top_languages = data.groupby(["language"])["num_pushers"].agg("sum").reset_index().sort_values(by="num_pushers", ascending=False)
    top_languages = list(top_languages["language"])[:nr_languages]
    data = data[data["language"].isin(top_languages)]
    return data
    
def drop_country_codes_from_list(data, country_list):
    data = data[~data["iso2_code"].isin(country_list)]
    data = data.dropna(subset="iso2_code")
    return data

def add_period_ids(data, period):
    """create missing semester ID and construct different period IDs"""
    if period=="year":
        year_to_period = dict(zip(data["year"].unique(), list(range(1, len(data["year"].unique()) + 1))))
        data["period"] = data["year"].map(year_to_period)
    if period=="semester":
        data["semester"] = np.where(data["quarter"] <= 2, 1, 2)
        data["semester_id"] = data["year"].astype(str).str.cat(data["semester"].astype(str), sep="s")
        semester_to_period = dict(zip(data["semester_id"].unique(), list(range(1, len(data["semester_id"].unique()) + 1))))
        data["period"] = data["semester_id"].map(semester_to_period)
    if period=="quarter":
        data["quarter_id"] = data["year"].astype(str).str.cat(data["quarter"].astype(str), sep="q")
        quarter_to_period = dict(zip(data["quarter_id"].unique(), list(range(1, len(data["quarter_id"].unique()) + 1))))
        data["period"] = data["quarter_id"].map(quarter_to_period)
    return data

In [12]:
# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

(84934, 7)


**Part 1 - from M_cl to complexity and relatedness - based on 2020,2021 data**

In [13]:
# bundle data for M_cl
def bundle_data(data, periods):
    data = data[data["period"].isin(periods)]\
        .groupby(["iso2_code", "language"])["num_pushers"]\
        .agg("sum")\
        .reset_index()
    data["period"] = 1
    return data

# period IDs -- 1,2 means 2020, 2021 on yearly basis
dfb = bundle_data(df, periods=[1,2])

In [19]:
# software complexity calculation
cdf = ecomplexity(dfb, key_cols)
cdf.to_csv("../outputs/software_complexity_2020_2021_based.csv", index=False, sep=";")

# software relatedness calculation
rel_df = proximity(dfb, key_cols)
rel_df.to_csv("../outputs/software_relatedness_2020_2021_based.csv", sep=";", index=False)

1
1


**Part 2 - comparing ECI(software, trade, technology, research)**

In [23]:
# read in developers (for filtering option)
df = pd.read_csv("../data/developers.csv")
df = df.groupby(["iso2_code", "year"])["developers"].agg("mean").reset_index()
df["developers"] = df["developers"].astype(int)
df = df[df["year"]==2020]

# add software ECI
cdf = pd.read_csv("../outputs/software_complexity_2020_2021_based.csv", sep=";")
country_to_seci_2020 = dict(cdf.groupby(["iso2_code"])["eci"].mean())
df["software_eci_2020"] = df["iso2_code"].map(country_to_seci_2020)

In [24]:
# read in 3 other ECI measures
trade_eci = pd.read_csv("../data/eci_hs6_hs96_trade.csv")
trade_eci = trade_eci[["Country", "2020"]]
tech_eci = pd.read_csv("../data/eci_tech.csv")
tech_eci = tech_eci[["Country", "2020"]]
research_eci = pd.read_csv("../data/Data-ECI-Research.csv")
research_eci = research_eci[["Country", "2020"]]

In [25]:
# map - using the country_converter package to 
c_to_iso = dict(
    zip(trade_eci.Country.unique(), country_converter.convert(names=trade_eci.Country.unique(), to="ISO2")))
trade_eci["Country"] = trade_eci["Country"].map(c_to_iso)
iso_to_trade_eci = dict(trade_eci.values)
df["trade_eci_2020"] = df["iso2_code"].map(iso_to_trade_eci)

tech_eci["Country"] = tech_eci["Country"].map(c_to_iso)
iso_to_tech_eci = dict(tech_eci.values)
df["tech_eci_2020"] = df["iso2_code"].map(iso_to_tech_eci)

research_eci["Country"] = research_eci["Country"].map(c_to_iso)
iso_to_research_eci = dict(research_eci.values)
df["research_eci_2020"] = df["iso2_code"].map(iso_to_research_eci)

In [27]:
# export
df.to_csv("../outputs/eci_comparisons_2020.csv", sep=";", index=False)

**Part 3 - software space from relatedness**

In [18]:
def edgelist_cleaning_for_software_space(data, key_columns):
    """get software space network from raw proximity values"""
    data = data[key_columns]

    # drop zero -- non-existing edges
    data = data[data[key_columns[2]] > 0]

    # drop self loops
    data = data[data[key_columns[0]] != data[key_columns[1]]]
    return data

def maximum_spanning_tree(data, key_columns):
    """get the maximum spanning tree of the full relatedness based network"""
    table = data.copy()
    table["distance"] = 1.0 / table[key_columns[2]]
    G = nx.from_pandas_edgelist(table, source = key_columns[0], target = key_columns[1], edge_attr = ["distance", key_columns[2]])
    T = nx.minimum_spanning_tree(G, weight = "distance")
    table2 = nx.to_pandas_edgelist(T)
    table2 = table2[table2[key_columns[2]] > 0]
    table2.rename(columns = {"source": key_columns[0], "target": key_columns[1], key_columns[2]: "score"}, inplace = True)
    table = pd.merge(
        table,
        table2,
        on=key_columns[0:2]
    )  
    table["edge"] = table.apply(lambda x: "%s-%s" % (min(x[key_columns[0]], x[key_columns[1]]), max(x[key_columns[0]], x[key_columns[1]])), axis = 1)
    table = table.drop_duplicates(subset = ["edge"])
    table = table.drop("edge", 1)
    return table[key_columns]

def add_edges(mst_edges, all_edges, nr_edges_to_add):
    """add edges to the maximum spanning tree to have a 1/3 nodes/edges ratio"""
    # drop mst edges from the full edgelist
    mst_edges["drop"] = 1
    all_edges = pd.merge(
        all_edges,
        mst_edges,
        on = ["language_1", "language_2", "proximity"],
        how="left"
    )
    all_edges = all_edges[all_edges["drop"] != 1].drop(columns="drop")

    # sort and select
    all_edges = all_edges.sort_values(by="proximity", ascending=False).iloc[:nr_edges_to_add]

    # add to mst edgelist
    software_space_el = pd.concat([mst_edges, all_edges])
    return software_space_el

In [22]:
# from relatedness table to software space (MST w/ additional edges)
space_table = edgelist_cleaning_for_software_space(rel_df, key_columns=["language_1", "language_2", "proximity"])
mst_el = maximum_spanning_tree(space_table, key_columns=["language_1", "language_2", "proximity"])
mst_graph = nx.from_pandas_edgelist(mst_el, source="language_1", target="language_2")
n_nodes = mst_graph.number_of_nodes()
n_edges = n_nodes * 2
software_space_el = add_edges(mst_el, space_table, nr_edges_to_add=n_edges)

# export final software space edgelist
software_space_el.to_csv("../outputs/software_space_edgelist_2020_2021_based.csv", index=False, sep=";")

  table = table.drop("edge", 1)


**Part 4 - regression data for cross-sectional entry models (2022-2023)**

In [25]:
# bundle cleaned data for M_cl -- 3 : 2022 / 4 : 2023
dfb3 = bundle_data(df, periods=[3])
dfb4 = bundle_data(df, periods=[4])
dfb3["period"] = 3
dfb4["period"] = 4
dfbs = pd.concat([dfb3, dfb4])

In [26]:
def rca_calculation(table, c_column, p_column, value_column):
    """calculate RCA from an M_cp dataframe"""
    table["e_p"] = table.groupby(p_column)[value_column].transform("sum")
    table["e_c"] = table.groupby(c_column)[value_column].transform("sum")
    table["e"] = table[value_column].sum()

    table["rca"] = (table[value_column] / table["e_p"]) / (table["e_c"] / table["e"])
    table["rca01"] = np.where(table["rca"] >= 1, 1, 0)
    return table

In [27]:
# calculate RCA for each period
rca_tables = list()
for p in dfbs["period"].unique():
    rca_df = dfbs[dfbs["period"]==p]
    rca_tables.append(rca_calculation(rca_df, c_column="iso2_code", p_column="language", value_column="num_pushers"))
rca_tables = pd.concat(rca_tables)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_p"] = table.groupby(p_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_c"] = table.groupby(c_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e"] = table[value_column].sum()
A value is trying to be se

In [28]:
# identify entry following the given patterns
entry_pattern = [0,1]
consider_pattern = [0,0]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == entry_pattern).astype(int)
ent["consider00"] = ent["rca01"].apply(lambda x: x == consider_pattern).astype(int)

In [29]:
# full combination
all_countries = ent["iso2_code"].unique()
all_languages = ent["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "language", "entry01", "consider00"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
# cdf = pd.read_csv("../outputs/complexity_table2020.csv", sep=";")
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)

In [30]:
# calculate relatedness ddensity 
# software_space_el = pd.read_csv("../outputs/software_space_edgelist_2020_2021_based.csv", sep=";")
software_space_el["proximity"] = 1

# symmetric relatedness matrix
relatedness = pd.pivot_table(
    software_space_el,
    values="proximity",
    index=["language_1"],
    columns=["language_2"],
    aggfunc=np.sum,
    margins=False
)
relatedness = relatedness.combine_first(relatedness.T).fillna(0).astype(int)

# matrix from RCA values in the baseline period
rca_tables = rca_tables[rca_tables["language"].isin(relatedness.columns)]
mat = pd.pivot_table(
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    values="rca01",
    index=["iso2_code"],
    columns=["language"],
    aggfunc=np.sum,
    margins=False
).fillna(0).astype(int)

# relatedness density
rel = np.dot(mat, relatedness)
reltot = np.sum(relatedness, axis=0)
reltot = reltot.values.flatten()
reldens = rel / reltot
reldens_df = pd.DataFrame(reldens)
reldens_df.index = mat.index
reldens_df
reldens_df.columns = mat.columns
reldens_df = reldens_df.rename_axis("iso2_code")\
  .reset_index()\
  .melt("iso2_code", value_name="rel_density", var_name="language")\
  .reset_index(drop=True)

In [31]:
# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    reldens_df,
    on=["iso2_code", "language"],
    how="left"
)

In [32]:
# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["entry01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_entry_regressions_2020_2021_based.csv", index=False, sep=";")

**Part 5 - data for ECI related regressions**

In [6]:
# data from CEPII -- http://www.cepii.fr/CEPII/en/bdd_modele/bdd_modele_item.asp?id=8
trade_df = pd.read_csv("../data/Gravity_V202211.csv")
trade_df = trade_df[trade_df["year"] == 2020]
trade_df['country_id_o'] = trade_df['country_id_o'].map(lambda x: x.replace('.2',''))
trade_df['country_id_d'] = trade_df['country_id_d'].map(lambda x: x.replace('.2',''))

  trade_df = pd.read_csv("../data/Gravity_V202211.csv")


In [7]:
trade_df.head(2)

Unnamed: 0,year,country_id_o,country_id_d,iso3_o,iso3_d,iso3num_o,iso3num_d,country_exists_o,country_exists_d,gmt_offset_2020_o,...,entry_time_o,entry_time_d,entry_tp_o,entry_tp_d,tradeflow_comtrade_o,tradeflow_comtrade_d,tradeflow_baci,manuf_tradeflow_baci,tradeflow_imf_o,tradeflow_imf_d
72,2020,ABW,ABW,ABW,ABW,533.0,533.0,1,1,-4.0,...,,,,,,,,,,
146,2020,ABW,AFG,ABW,AFG,533.0,4.0,1,1,-4.0,...,,,,,,,,,,


In [None]:
columns_to_keep = [
    "year",
    "country_id_o",
    "country_id_d",
    "iso3_o",
    "iso3_d",
    #"tradeflow_baci",
    "gmt_offset_2020_o",
    "gmt_offset_2020_d",
    "distw_harmonic",
    "dist",
    "scaled_sci_2021",
    "pop_o",
    "pop_d",
    "gdp_o",
    "gdp_d",
    "gdpcap_o",
    "gdpcap_d",
    "gdp_ppp_o",
    "gdp_ppp_d",
    "gdpcap_ppp_o",
    "gdpcap_ppp_d"
]

In [9]:
cdf = pd.read_csv("../outputs/software_complexity_2020_2021_based.csv", sep=";")

In [10]:
cdf.head(2)

Unnamed: 0,iso2_code,language,num_pushers,period,diversity,ubiquity,mcp,eci,pci,density,coi,cog,rca
0,AE,AIDL,0,1,19,8,0,-0.17948,1.964787,0.036024,-0.177167,0.746792,0.0
1,AE,AMPL,0,1,19,7,0,-0.17948,2.611604,0.020808,-0.177167,1.201992,0.0
