In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from itertools import product
from ecomplexity import ecomplexity
from ecomplexity import proximity
from ecomplexity import calc_density
import country_converter as coco
import itertools

import yaml
import sys, os
sys.path.append(os.path.abspath(".."))

from utils.utils import *
from utils.config_utils import load_config


In [5]:
# notebook-wide settings, taken from whatever load_config() returns
config = load_config()
focal_year, selected_period = config["focal_year"], config["selected_period"]

**Topics -- used here instead of languages**

In [7]:
# column roles handed to the ecomplexity calls (time / location / product / value)
key_cols = dict(
    time="period",
    loc="iso2_code",
    prod="topic",
    val="num_pushers",
)

# raw topics table
# NOTE(review): with pandas' default NA parsing an iso2_code of "NA" (Namibia)
# is read as NaN -- confirm whether the input contains that code.
data = pd.read_csv("../../data/inputs/topics.csv")

# report the raw size, then restrict to the years 2020-2023
print(data.shape)
data = data.loc[data["year"].isin([2020, 2021, 2022, 2023])]

# regex-style alternation of names handed to drop_specifics_from_list
prev_filter = "|".join(("yaml", "json", "text", "svg", "Markdown", "xml"))

# cleaning pipeline built from the project helpers (utils.utils); each step
# receives the previous step's frame as its first argument
# (presumably: drop the listed names, keep the 200 largest topics, remove the
#  "EU" pseudo-country, attach period IDs -- confirm against utils.utils)
df = (
    data
    .pipe(drop_specifics_from_list, filter_list=prev_filter, key_column="topic")
    .pipe(top_languages_filter, nr_languages=200, key_column="topic")
    .pipe(drop_country_codes_from_list, country_list=["EU"])
    .pipe(add_period_ids, period=selected_period)
)
print(df.shape)

(61857, 5)
(25445, 6)


In [8]:
# yearly complexity and proximity tables; period ID k stands for calendar
# year year_dict[k] (1 -> 2020, ..., 4 -> 2023)
ccdf, ppdf = [], []
year_dict = dict(zip([1, 2, 3, 4], [2020, 2021, 2022, 2023]))

for k in year_dict:
    # data of the single period k, bundled by the project helper
    dfb = bundle_data(df, periods=[k], key_column="topic")

    # complexity table and proximity table, each tagged with the calendar year
    cdf = ecomplexity(dfb, key_cols).assign(year=year_dict[k])
    pdf = proximity(dfb, key_cols).assign(year=year_dict[k])

    # collect the per-year results
    ccdf.append(cdf)
    ppdf.append(pdf)
    print(year_dict[k], " DONE")

# all yearly complexity tables stacked under a fresh 0..n-1 index
cdf = pd.concat(ccdf, ignore_index=True)

1
Percentage of pairs compared that meet log-supermodularity condition: 0.91%
1
2020  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 0.32%
1
2021  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 0.12%
1
2022  DONE
1




Percentage of pairs compared that meet log-supermodularity condition: 0.73%
1
2023  DONE




In [9]:
# stack the yearly complexity tables and write them out as a ';'-separated CSV
topic_cdf = pd.concat(ccdf, ignore_index=True)
topic_cdf.to_csv(
    "../../data/outputs/eci_topics_2020_2023.csv",
    index=False,
    sep=";",
)

In [11]:
### ENTRY -- based on topics

# relatedness density for 2020, one row per country-topic pair
# (density in the sense of Hidalgo et al. (2007), Science)
cdf = pd.read_csv("../../data/outputs/eci_topics_2020_2023.csv", sep=";")
rel_dens = (
    cdf.loc[cdf["year"] == 2020, ["iso2_code", "topic", "density"]]
    .drop_duplicates()
)


# raw topics table, restricted to 2020-2023
data = pd.read_csv("../../data/inputs/topics.csv")
print(data.shape)
data = data.loc[data["year"].isin([2020, 2021, 2022, 2023])]

# same cleaning pipeline (project helpers from utils.utils) as used for the
# complexity calculation; each step receives the previous frame first
prev_filter = "|".join(("yaml", "json", "text", "svg", "Markdown", "xml"))
df = (
    data
    .pipe(drop_specifics_from_list, filter_list=prev_filter, key_column="topic")
    .pipe(top_languages_filter, nr_languages=200, key_column="topic")
    .pipe(drop_country_codes_from_list, country_list=["EU"])
    .pipe(add_period_ids, period=selected_period)
)
print(df.shape)


# RCA table per period, computed with threshold 1
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(df, periods=[p], key_column="topic")
    temp["period"] = p
    rca_tables.append(
        rca_calculation(
            temp,
            c_column="iso2_code",
            p_column="topic",
            value_column="num_pushers",
            threshold=1,
        )
    )
rca_tables = pd.concat(rca_tables)


# per country-topic pair: the period-ordered list of rca01 flags, compared
# against the entry pattern (0,0,1,1) and the never-present pattern (0,0,0,0)
entry_pattern = [0, 0, 1, 1]
consider_pattern = [0, 0, 0, 0]
ent = (
    rca_tables
    .sort_values("period")
    .groupby(["iso2_code", "topic"])["rca01"]
    .agg(list)
    .reset_index()
)
ent["entry01"] = ent["rca01"].map(lambda flags: flags == entry_pattern).astype(int)
ent["consider00"] = ent["rca01"].map(lambda flags: flags == consider_pattern).astype(int)


# every country x topic combination, ordered by country then topic
all_countries = ent["iso2_code"].unique()
all_topics = ent["topic"].unique()
all_combinations = list(product(all_countries, all_topics))
full_df = pd.DataFrame(
    all_combinations, columns=["iso2_code", "topic"]
).sort_values(["iso2_code", "topic"])

# attach the entry / consider flags; pairs without a match get 0
full_df = full_df.merge(
    ent[["iso2_code", "topic", "entry01", "consider00"]],
    how="left",
    on=["iso2_code", "topic"],
).fillna(0)

# attach PCI and ubiquity from the 2020 complexity table
cdf = pd.read_csv("../../data/outputs/eci_topics_2020_2023.csv", sep=";")
cdf = cdf.loc[cdf["year"] == 2020]
full_df = full_df.merge(
    cdf[["iso2_code", "topic", "pci", "ubiquity"]],
    how="left",
    on=["iso2_code", "topic"],
)

# attach the rca01 flag of period 3 (called the baseline period in the
# original notebook); pairs without a match get 0
full_df = full_df.merge(
    rca_tables.loc[rca_tables["period"] == 3, ["iso2_code", "topic", "rca01"]],
    how="left",
    on=["iso2_code", "topic"],
)
full_df = full_df.assign(rca01=full_df["rca01"].fillna(0))

# drop topics with no complexity value
full_df = full_df.dropna(subset=["pci"])

# attach the 2020 relatedness density
full_df = full_df.merge(
    rel_dens,
    how="left",
    on=["iso2_code", "topic"],
)

# export for entry models -- keep only the 0011 (entry) and 0000 (consider) pairs
full_df = full_df.astype({"entry01": int, "consider00": int})
export_df = full_df.loc[full_df[["entry01", "consider00"]].eq(1).any(axis=1)]
export_df.to_csv(
    "../../data/outputs/data_entry_regressions_0011_topics.csv",
    index=False,
    sep=";",
)

(61857, 5)
(25445, 6)
1
2
3
4


In [12]:
### EXIT -- based on topics
#
# Builds the regression table for topic exits: country-topic pairs whose
# period-ordered rca01 flags are 1,1,0,0 (exit) or 1,1,1,1 (stayed present),
# enriched with PCI, ubiquity, period-3 RCA and relatedness density, and
# written to data_exit_regressions_1100_topics.csv.
#
# FIX: the original cell computed `ext` but then built the country x topic
# grid and joined the flags from `ent` (the ENTRY cell's frame), so the
# exported "exit01" column actually held the entry flags and the cell only
# ran if the ENTRY cell had been executed first. Everything below uses `ext`.

# 2020 slice of the complexity output written earlier in this notebook
cdf = pd.read_csv("../../data/outputs/eci_topics_2020_2023.csv", sep=";")
cdf = cdf[cdf["year"] == 2020]

# relatedness density -- as in Hidalgo et al. (2007) Science
rel_dens = cdf[["iso2_code", "topic", "density"]].drop_duplicates()


# data IN
# NOTE(review): with pandas' default NA parsing an iso2_code of "NA" (Namibia)
# is read as NaN -- confirm whether the input contains that code.
data = pd.read_csv("../../data/inputs/topics.csv")

# same cleaning pipeline (project helpers from utils.utils) as used for the
# complexity calculation
print(data.shape)
data = data[data["year"].isin([2020, 2021, 2022, 2023])]
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter, key_column="topic")
df = top_languages_filter(df, nr_languages=200, key_column="topic")
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)


# RCA table per period, threshold for RCA : 1
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(df, periods=[p], key_column="topic")
    temp["period"] = p
    rca_tables.append(
        rca_calculation(
            temp,
            c_column="iso2_code",
            p_column="topic",
            value_column="num_pushers",
            threshold=1,
        )
    )
rca_tables = pd.concat(rca_tables)


# identify exits: compare each pair's period-ordered rca01 list with the
# exit pattern (1,1,0,0) and with the always-present pattern (1,1,1,1)
exit_pattern = [1, 1, 0, 0]
consider_pattern = [1, 1, 1, 1]
ext = (
    rca_tables
    .sort_values(["period"], ascending=True)
    .groupby(["iso2_code", "topic"])["rca01"]
    .agg(list)
    .reset_index()
)
ext["exit01"] = ext["rca01"].apply(lambda x: x == exit_pattern).astype(int)
# the column keeps the name "consider00" so the exported CSV schema is
# unchanged, although here it marks the 1,1,1,1 pattern
ext["consider00"] = ext["rca01"].apply(lambda x: x == consider_pattern).astype(int)


# full combination of the countries and topics present in the exit table
all_countries = ext["iso2_code"].unique()
all_topics = ext["topic"].unique()

all_combinations = list(product(all_countries, all_topics))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "topic"])\
    .sort_values(["iso2_code", "topic"])

# join exit flags; pairs without a match get 0
full_df = pd.merge(
    full_df,
    ext[["iso2_code", "topic", "exit01", "consider00"]],
    on=["iso2_code", "topic"],
    how="left"
).fillna(0)

# join complexity (2020 values)
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "topic", "pci", "ubiquity"]],
    on=["iso2_code", "topic"],
    how="left"
)

# join RCA from period 3
# NOTE(review): the original comment calls this the "baseline period", while
# PCI, ubiquity and density above come from 2020 -- confirm period 3 is intended.
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"] == 3].loc[:, ["iso2_code", "topic", "rca01"]],
    on=["iso2_code", "topic"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop topics with no complexity value
full_df = full_df.dropna(subset=["pci"])

# join relatedness density
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "topic"],
    how="left"
)

# export for exit models -- only consider the 1100 (exit) and 1111 patterns
full_df["exit01"] = full_df["exit01"].astype(int)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["exit01"] == 1) | (full_df["consider00"] == 1)]
export_df.to_csv("../../data/outputs/data_exit_regressions_1100_topics.csv", index=False, sep=";")

(61857, 5)
(25445, 6)
1
2
3
4
