In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from itertools import product
from ecomplexity import ecomplexity
from ecomplexity import proximity
from ecomplexity import calc_density
import country_converter as coco
import itertools

# spatial autocorrelation
import geopandas as gpd
from pysal.lib import weights
from libpysal.io import open as psopen
from splot.esda import (
    moran_scatterplot, lisa_cluster, plot_local_autocorrelation, plot_moran
)
from splot.libpysal import plot_spatial_weights
import esda

import yaml
import sys, os
sys.path.append(os.path.abspath(".."))

from utils.utils import *
from utils.config_utils import load_config

In [2]:
# parameters
config = load_config()
focal_year = config["focal_year"]
selected_period = config["selected_period"]

**different RCA thresholds**

In [3]:
# saved in data prep
eci_software = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023.csv", sep=";")

In [4]:
# year focus
eci_df = eci_software[eci_software["year"]==2020]
eci_df["mcp075"] = np.where(eci_df["rca"]>=0.75, 1, 0)
eci_df["mcp125"] = np.where(eci_df["rca"]>=1.25, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eci_df["mcp075"] = np.where(eci_df["rca"]>=0.75, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eci_df["mcp125"] = np.where(eci_df["rca"]>=1.25, 1, 0)


In [5]:
def mcp_crosstable(df, columns):
    mcp_crosstable = df[columns].value_counts().reset_index().sort_values(by=columns).rename(columns={"count":"obs"})
    mcp_crosstable["obs_share"] = round(mcp_crosstable["obs"] / mcp_crosstable["obs"].sum(), 2)
    return mcp_crosstable

In [6]:
mcp_crosstable(eci_df, columns=["mcp", "mcp075"])

Unnamed: 0,mcp,mcp075,obs,obs_share
0,0,0,5876,0.7
2,0,1,589,0.07
1,1,1,1914,0.23


In [7]:
mcp_crosstable(eci_df, columns=["mcp", "mcp125"])

Unnamed: 0,mcp,mcp125,obs,obs_share
0,0,0,6465,0.77
2,1,0,596,0.07
1,1,1,1318,0.16


In [8]:
# data IN -- for threshold 0.75 AND 1.25

# data IN
data = pd.read_csv("../../data/inputs/languages.csv")

# use data_prep_functions to clean the dataframe of ECI_software calculation
data = data[data["year"].isin([2020, 2021, 2022, 2023])]
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

(93076, 7)


In [9]:
# clusters of languages
cl_df = pd.read_csv("../../data/outputs/language_clusters_gh_cos_hier_ward_d1.csv")\
    .rename(columns={"Language":"language", "Cluster":"cluster_id"})\
    .iloc[:,1:]

# combine
cl_df = pd.merge(
    df,
    cl_df,
    on="language",
    how="left"
)

# drop NAs... -- not so great
cl_df.dropna(subset=["cluster_id", "cluster_id"], inplace=True)

# aggregate by clusters
cl_df = cl_df.groupby(["iso2_code", "period", "cluster_id"])["num_pushers"].agg("mean").reset_index()

# for ecomplexity calculcation
key_cols = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "cluster_id",
    "val": "num_pushers",
}

# software complexity calculation -- period IDs -- 1 means 2020 on yearly basis
thresholds = [0.75, 1.25]
for t in thresholds:
    print(t)
    ccdf = []
    ppdf = []
    year_dict = {1 : 2020, 2 : 2021, 3 : 2022, 4 : 2023}
    for k in year_dict.keys():
        dfb = cl_df[cl_df["period"]==k]
        cdf = ecomplexity(dfb, key_cols, rca_mcp_threshold=t)
        cdf["year"] = year_dict[k]

        pdf = proximity(dfb, key_cols, rca_mcp_threshold=t)
        pdf["year"] = year_dict[k]

        # combine yearly dataframes
        ccdf.append(cdf)
        ppdf.append(pdf)
        print(year_dict[k], " DONE")

        # combine and save -- complexity
        cdf = pd.concat(ccdf, axis=0, ignore_index=True)
        cdf.to_csv(f"../../data/outputs/eci_clusters_cooc_2020_2023_threshold_{int(t*100)}.csv", sep=";", index=False)

        # combine and save -- language proximity
        prox_df = pd.concat(ppdf, axis=0, ignore_index=True)
        prox_df.to_csv(f"../../data/outputs/proximity_clusters_2020_2023_threshold_{int(t*100)}.csv", sep=";", index=False)

0.75
1
Percentage of pairs compared that meet log-supermodularity condition: 6.16%
1
2020  DONE
2




Percentage of pairs compared that meet log-supermodularity condition: 6.64%
2
2021  DONE
3




Percentage of pairs compared that meet log-supermodularity condition: 7.07%
3
2022  DONE
4




Percentage of pairs compared that meet log-supermodularity condition: 7.68%
4
2023  DONE




1.25
1
Percentage of pairs compared that meet log-supermodularity condition: 6.72%
1
2020  DONE
2




Percentage of pairs compared that meet log-supermodularity condition: 7.64%
2
2021  DONE
3




Percentage of pairs compared that meet log-supermodularity condition: 8.27%
3
2022  DONE
4




Percentage of pairs compared that meet log-supermodularity condition: 9.98%
4
2023  DONE




In [10]:
# correlation matrix
cdf100 = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
cdf100 = cdf100[cdf100["year"]==2020]
cdf075 = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023_threshold_75.csv", sep=";")
cdf075 = cdf075[cdf075["year"]==2020]
cdf125 = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023_threshold_125.csv", sep=";")
cdf125 = cdf125[cdf125["year"]==2020].rename(columns={"eci":"eci125"})

full_cdf = pd.merge(
    cdf100[["iso2_code", "eci"]].drop_duplicates(),
    cdf075[["iso2_code", "eci"]].drop_duplicates(),
    on=["iso2_code"],
    how="left",
    suffixes=["100", "075"]
)
full_cdf = pd.merge(
    full_cdf,
    cdf125,
    on=["iso2_code"],
    how="left"
)

# correlation matri -- ECI across thresholds
full_cdf[["eci100", "eci075", "eci125"]].corr()

Unnamed: 0,eci100,eci075,eci125
eci100,1.0,0.978869,0.903184
eci075,0.978869,1.0,0.881004
eci125,0.903184,0.881004,1.0


In [11]:
### ENTRY -- 0.75 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023_threshold_75.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_id", "density"]].drop_duplicates()



# data IN
# use cl_df from the previous section

# threshold for RCA : 0.75
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(cl_df, periods=[p], key_column="cluster_id")
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="cluster_id", value_column="num_pushers", threshold=0.75))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)



# identify entry following the given patterns
entry_pattern = [0,0,1,1]
consider_pattern = [0,0,0,0]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","cluster_id"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == entry_pattern).astype(int)
ent["consider00"] = ent["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ent["iso2_code"].unique()
all_clusters = ent["cluster_id"].unique()

all_combinations = list(product(all_countries, all_clusters))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "cluster_id"])\
    .sort_values(["iso2_code", "cluster_id"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "cluster_id", "entry01", "consider00"]],
    on=["iso2_code", "cluster_id"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023_threshold_75.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "cluster_id", "pci", "ubiquity"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "cluster_id", "rca01"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "cluster_id"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["entry01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../../data/outputs/data_cluster_entry_regressions_0011_threshold_075.csv", index=False, sep=";")

1
2
3
4


In [12]:
### ENTRY -- 1.25 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023_threshold_125.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_id", "density"]].drop_duplicates()



# data IN
# use cl_df from the previous section

# threshold for RCA : 1.25
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(cl_df, periods=[p], key_column="cluster_id")
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="cluster_id", value_column="num_pushers", threshold=1.25))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)



# identify entry following the given patterns
entry_pattern = [0,0,1,1]
consider_pattern = [0,0,0,0]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","cluster_id"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == entry_pattern).astype(int)
ent["consider00"] = ent["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ent["iso2_code"].unique()
all_clusters = ent["cluster_id"].unique()

all_combinations = list(product(all_countries, all_clusters))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "cluster_id"])\
    .sort_values(["iso2_code", "cluster_id"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "cluster_id", "entry01", "consider00"]],
    on=["iso2_code", "cluster_id"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023_threshold_125.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "cluster_id", "pci", "ubiquity"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "cluster_id", "rca01"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "cluster_id"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["entry01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../../data/outputs/data_cluster_entry_regressions_0011_threshold_125.csv", index=False, sep=";")

1
2
3
4


In [13]:
### EXIT -- 0.75 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023_threshold_75.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_id", "density"]].drop_duplicates()


# data IN
# use cl_df from the previous section


# threshold for RCA : 0.75
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(cl_df, periods=[p], key_column="cluster_id")
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="cluster_id", value_column="num_pushers", threshold=0.75))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)



# identify entry following the given patterns
exit_pattern = [1,1,0,0]
consider_pattern = [1,1,1,1]
ext = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","cluster_id"])["rca01"].agg(list).reset_index()
ext["entry01"] = ext["rca01"].apply(lambda x: x == exit_pattern).astype(int)
ext["consider00"] = ext["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ext["iso2_code"].unique()
all_clusters = ext["cluster_id"].unique()

all_combinations = list(product(all_countries, all_clusters))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "cluster_id"])\
    .sort_values(["iso2_code", "cluster_id"])

# join entries
full_df = pd.merge(
    full_df,
    ext[["iso2_code", "cluster_id", "entry01", "consider00"]],
    on=["iso2_code", "cluster_id"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023_threshold_75.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "cluster_id", "pci", "ubiquity"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "cluster_id", "rca01"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "cluster_id"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df.rename(columns={"entry01":"exit01"}, inplace=True)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["exit01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../../data/outputs/data_cluster_exit_regressions_1100_threshold_075.csv", index=False, sep=";")

1
2
3
4


In [14]:
### EXIT -- 1.25 threshold

# relatedness density -- as in Hidalgo et al. (2007) Science
cdf = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023_threshold_125.csv", sep=";")
rel_dens = cdf[cdf["year"] == 2020][["iso2_code", "cluster_id", "density"]].drop_duplicates()


# data IN
# use cl_df from the previous section


# threshold for RCA : 1.25
ps = [1, 2, 3, 4]
rca_tables = []
for p in ps:
    print(p)
    temp = bundle_data(cl_df, periods=[p], key_column="cluster_id")
    temp["period"] = p
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="cluster_id", value_column="num_pushers", threshold=0.75))

#    dfbs.append(temp)
rca_tables = pd.concat(rca_tables)



# identify entry following the given patterns
exit_pattern = [1,1,0,0]
consider_pattern = [1,1,1,1]
ext = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","cluster_id"])["rca01"].agg(list).reset_index()
ext["entry01"] = ext["rca01"].apply(lambda x: x == exit_pattern).astype(int)
ext["consider00"] = ext["rca01"].apply(lambda x: x == consider_pattern).astype(int)



# full combination
all_countries = ext["iso2_code"].unique()
all_clusters = ext["cluster_id"].unique()

all_combinations = list(product(all_countries, all_clusters))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "cluster_id"])\
    .sort_values(["iso2_code", "cluster_id"])

# join entries
full_df = pd.merge(
    full_df,
    ext[["iso2_code", "cluster_id", "entry01", "consider00"]],
    on=["iso2_code", "cluster_id"],
    how="left"
).fillna(0)

# join complexity
cdf = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023_threshold_125.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "cluster_id", "pci", "ubiquity"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "cluster_id", "rca01"]],
    on=["iso2_code", "cluster_id"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)



# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    rel_dens,
    on=["iso2_code", "cluster_id"],
    how="left"
)



# export for entry models -- only consider 00, 01 patterns
full_df["entry01"] = full_df["entry01"].astype(int)
full_df.rename(columns={"entry01":"exit01"}, inplace=True)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["exit01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../../data/outputs/data_cluster_exit_regressions_1100_threshold_125.csv", index=False, sep=";")

1
2
3
4


**spatial autocorrelation**

In [15]:
# data -- ECI_software
cdf = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
cdf = cdf[cdf["year"]==2020]
cdf = cdf[["iso2_code", "eci"]].drop_duplicates()

# data -- world map
cmap = gpd.read_file("../../data/inputs/world-administrative-boundaries.geojson")
cmap = cmap[["iso3", "iso_3166_1_alpha_2_codes", "name", "geometry"]].rename(columns={"iso_3166_1_alpha_2_codes" : "iso2"})

cmap = pd.merge(
    cmap,
    cdf,
    left_on="iso2",
    right_on="iso2_code",
    how="left"
)

# clean up
cmap = cmap.drop_duplicates(subset=["iso2"])
#cmap.dropna(subset="iso2", inplace=True)
cmap.dropna(subset="eci", inplace=True)

In [16]:
### spatial autocorrelation

# index setting
cmap = cmap.set_index("iso2", drop=False)

# create the spatial weights matrix
w = weights.Queen.from_dataframe(cmap, idVariable="iso2")

# drop island
cmap = cmap.drop(w.islands)

# create the spatial weights matrix
w = weights.Queen.from_dataframe(cmap, idVariable="iso2")

# row standardize the matrix
w.transform = "R"

# spatial lag
cmap["w_eci"] = weights.lag_spatial(w, cmap["eci"])

# z score
cmap["eci_std"] = (cmap["eci"] - cmap["eci"].mean()) / cmap["eci"].std()
cmap["w_eci_std"] = weights.lag_spatial(w, cmap["eci_std"])

# Moran I
mi = esda.Moran(cmap["eci"], w)
print(mi.I, "Moran's I")
print(mi.p_sim, "significance")

  w = weights.Queen.from_dataframe(cmap, idVariable="iso2")
 There are 27 disconnected components.
 There are 22 islands with ids: CY, JM, LK, AU, PH, MG, KR, MU, JP, MT, SG, BH, BB, PR, SN, MV, IS, CU, RE, TW, NZ, TT.
  W.__init__(self, neighbors, ids=ids, **kw)
  w = weights.Queen.from_dataframe(cmap, idVariable="iso2")


0.4826942565066437 Moran's I
0.001 significance


 There are 5 disconnected components.
  W.__init__(self, neighbors, ids=ids, **kw)


**comparison of ECI software measures**

In [2]:
# comparison
selected_year = 2021

cluster_cdf1 = pd.read_csv("../../data/outputs/eci_clusters_theory_2020_2023.csv", sep=";")
cluster_cdf = pd.read_csv("../../data/outputs/eci_clusters_cooc_2020_2023.csv", sep=";")
eci_software = pd.read_csv("../../data/outputs/eci_software_2020_2023.csv", sep=";")
topic_cdf = pd.read_csv("../../data/outputs/eci_topics_2020_2023.csv", sep=";")

cc_df = pd.merge(
    eci_software[eci_software["year"]==selected_year][["iso2_code", "eci"]].drop_duplicates(),
    cluster_cdf[cluster_cdf["year"]==selected_year][["iso2_code", "eci"]].drop_duplicates(),
    on=["iso2_code"],
    how="left",
    suffixes=["_software", "_cluster"]
)
temp = pd.merge(
    cc_df,
    cluster_cdf1[cluster_cdf1["year"]==selected_year][["iso2_code", "eci"]].drop_duplicates(),
    on=["iso2_code"],
    how="left"
).rename(columns={"eci":"eci_cluster_theory", "eci_cluster":"eci_cluster_cooccurrence"})
temp = pd.merge(
    temp,
    topic_cdf[topic_cdf["year"]==selected_year][["iso2_code", "eci"]].drop_duplicates(),
    on=["iso2_code"],
    how="left"
).rename(columns={"eci":"eci_topic"})

temp[["eci_software", "eci_cluster_theory", "eci_cluster_cooccurrence", "eci_topic"]].corr()

Unnamed: 0,eci_software,eci_cluster_theory,eci_cluster_cooccurrence,eci_topic
eci_software,1.0,0.981778,0.972761,0.839021
eci_cluster_theory,0.981778,1.0,0.968438,0.82259
eci_cluster_cooccurrence,0.972761,0.968438,1.0,0.816693
eci_topic,0.839021,0.82259,0.816693,1.0
