In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from itertools import product
from ecomplexity import ecomplexity
from ecomplexity import proximity

**Part 0 - general data preparation**

In [2]:
# parameters
selected_period = "year"
#period_for_complexity = 1
log_num_pushers = False

# for ecomplexity calculcation
key_cols = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "language",
    "val": "num_pushers",
}


In [3]:
# data IN
data = pd.read_csv("../data/languages.csv")

In [4]:
# filter functions
def drop_specifics_from_list(data, filter_list):
    """filter specific languages from list -- motivated by RM del Rio-Chanona et al 2023"""
    data = data[~data["language"].str.contains(filter_list, case=False, regex=True)]
    return data

def top_languages_filter(data, nr_languages):
    """keep top x number of languages ONLY"""
    top_languages = data.groupby(["language"])["num_pushers"].agg("sum").reset_index().sort_values(by="num_pushers", ascending=False)
    top_languages = list(top_languages["language"])[:nr_languages]
    data = data[data["language"].isin(top_languages)]
    return data
    
def drop_country_codes_from_list(data, country_list):
    data = data[~data["iso2_code"].isin(country_list)]
    data = data.dropna(subset="iso2_code")
    return data

def add_period_ids(data, period):
    """create missing semester ID and construct different period IDs"""
    if period=="year":
        year_to_period = dict(zip(data["year"].unique(), list(range(1, len(data["year"].unique()) + 1))))
        data["period"] = data["year"].map(year_to_period)
    if period=="semester":
        data["semester"] = np.where(data["quarter"] <= 2, 1, 2)
        data["semester_id"] = data["year"].astype(str).str.cat(data["semester"].astype(str), sep="s")
        semester_to_period = dict(zip(data["semester_id"].unique(), list(range(1, len(data["semester_id"].unique()) + 1))))
        data["period"] = data["semester_id"].map(semester_to_period)
    if period=="quarter":
        data["quarter_id"] = data["year"].astype(str).str.cat(data["quarter"].astype(str), sep="q")
        quarter_to_period = dict(zip(data["quarter_id"].unique(), list(range(1, len(data["quarter_id"].unique()) + 1))))
        data["period"] = data["quarter_id"].map(quarter_to_period)
    return data


# probably we can delete later
def dataframe_for_ecomplexity(data, period):
    """aggregate and transform dataframe for ecomplexity functions"""
    #data = data[(data["year"]==focal_year) & (data["quarter"].isin(quarter_list))]
    data = data[(data["period"]==period)]
    data = data\
        .groupby(["period", "iso2_code", "language"])["num_pushers"]\
        .agg("sum")\
        .reset_index()\
        .sort_values(by="num_pushers", ascending=False)
    return data

In [5]:
# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

(84934, 7)


In [6]:
# take the log of num_pushers
if log_num_pushers == True:
    df["num_pushers"] = np.log10(df["num_pushers"])

**Part 1 - M_{cl} - relatedness - complexity - 2020-2021**

In [7]:
# bundle data for M_{cl}
def bundle_data(data, periods):
    data = data[data["period"].isin(periods)]\
        .groupby(["iso2_code", "language"])["num_pushers"]\
        .agg("sum")\
        .reset_index()
    data["period"] = 1
    return data

dfb = bundle_data(df, periods=[1,2])

In [8]:
# software complexity calculation
cdf = ecomplexity(dfb, key_cols)
cdf.to_csv("../outputs/software_complexity_2020_2021_based.csv", index=False, sep=";")

# github space
space_df = proximity(dfb, key_cols)
space_df.to_csv("../outputs/software_space_2020_2021_based.csv", sep=";", index=False)

1
1


In [9]:
# cdf.drop_duplicates(subset="language").sort_values(by="pci").tail(20)

In [10]:
# function to get relatedness network from raw proximity values

def edgelist_for_github_space(data, key_columns):
    """transform the ecomplexity proximity output for visualization"""
    data = data[key_columns]

    # drop zero -- non-existing edges
    data = data[data[key_columns[2]] > 0]

    # drop self loops
    data = data[data[key_columns[0]] != data[key_columns[1]]]
    return data

def maximum_spanning_tree(data, key_columns):
    table = data.copy()
    table["distance"] = 1.0 / table[key_columns[2]]
    G = nx.from_pandas_edgelist(table, source = key_columns[0], target = key_columns[1], edge_attr = ["distance", key_columns[2]])
    T = nx.minimum_spanning_tree(G, weight = "distance")
    table2 = nx.to_pandas_edgelist(T)
    table2 = table2[table2[key_columns[2]] > 0]
    table2.rename(columns = {"source": key_columns[0], "target": key_columns[1], key_columns[2]: "score"}, inplace = True)
    table = pd.merge(
        table,
        table2,
        on=key_columns[0:2]
    )  
    table["edge"] = table.apply(lambda x: "%s-%s" % (min(x[key_columns[0]], x[key_columns[1]]), max(x[key_columns[0]], x[key_columns[1]])), axis = 1)
    table = table.drop_duplicates(subset = ["edge"])
    table = table.drop("edge", 1)
    return table[key_columns]

def add_edges(mst_edges, all_edges, nr_edges_to_add):
    # drop mst edges from the full edgelist
    mst_edges["drop"] = 1
    all_edges = pd.merge(
        all_edges,
        mst_edges,
        on = ["language_1", "language_2", "proximity"],
        how="left"
    )
    all_edges = all_edges[all_edges["drop"] != 1].drop(columns="drop")

    # sort and select
    all_edges = all_edges.sort_values(by="proximity", ascending=False).iloc[:nr_edges_to_add]

    # add to mst edgelist
    software_space_el = pd.concat([mst_el, all_edges])
    return software_space_el

In [11]:
# from space table to MST w/ additional edges
space_table = edgelist_for_github_space(space_df, key_columns=["language_1", "language_2", "proximity"])
mst_el = maximum_spanning_tree(space_table, key_columns=["language_1", "language_2", "proximity"])
mst_graph = nx.from_pandas_edgelist(mst_el, source="language_1", target="language_2")
n_nodes = mst_graph.number_of_nodes()
n_edges = n_nodes * 2
software_space_el = add_edges(mst_el, space_table, nr_edges_to_add=n_edges)

# export for Herr Wachs
software_space_el.to_csv("../outputs/software_space_edgelist_2020_2021_based.csv", index=False, sep=";")

TypeError: DataFrame.drop() takes from 1 to 2 positional arguments but 3 were given

**Part 2 - regression data for cross-sectional entry models**

In [12]:
# version 2 -- (2022) -- entry in (2023)

# bundle data for M_{cl}
dfb3 = bundle_data(df, periods=[3])
dfb4 = bundle_data(df, periods=[4])
dfb3["period"] = 3
dfb4["period"] = 4
dfbs = pd.concat([dfb3, dfb4])

In [13]:
def rca_calculation(table, c_column, p_column, value_column):
    """calculate RCA from an M_cp dataframe"""
    table["e_p"] = table.groupby(p_column)[value_column].transform("sum")
    table["e_c"] = table.groupby(c_column)[value_column].transform("sum")
    table["e"] = table[value_column].sum()

    table["rca"] = (table[value_column] / table["e_p"]) / (table["e_c"] / table["e"])
    table["rca01"] = np.where(table["rca"] >= 1, 1, 0)
    return table

In [14]:
# calculate RCA for each period
rca_tables = list()
for p in dfbs["period"].unique():
    rca_df = dfbs[dfbs["period"]==p]
    rca_tables.append(rca_calculation(rca_df, c_column="iso2_code", p_column="language", value_column="num_pushers"))
rca_tables = pd.concat(rca_tables)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_p"] = table.groupby(p_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_c"] = table.groupby(c_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e"] = table[value_column].sum()
A value is trying to be se

In [15]:
# identify the entry style
entry_pattern = [0,1]
consider_pattern = [0,0]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == entry_pattern).astype(int)
ent["consider00"] = ent["rca01"].apply(lambda x: x == consider_pattern).astype(int)

In [16]:
# full combination
all_countries = ent["iso2_code"].unique()
all_languages = ent["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "language", "entry01", "consider00"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
# cdf = pd.read_csv("../outputs/complexity_table2020.csv", sep=";")
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)

In [18]:
#software_space_el = pd.read_csv("../outputs/software_space_edgelist2020.csv", sep=";")
software_space_el = pd.read_csv("../outputs/software_space_edgelist.csv", sep=";")
software_space_el["proximity"] = 1

# symmetric relatedness matrix
relatedness = pd.pivot_table(
    software_space_el,
    values="proximity",
    index=["language_1"],
    columns=["language_2"],
    aggfunc=np.sum,
    margins=False
)
relatedness = relatedness.combine_first(relatedness.T).fillna(0).astype(int)

# matrix from RCA values in the baseline period
rca_tables = rca_tables[rca_tables["language"].isin(relatedness.columns)]
mat = pd.pivot_table(
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    values="rca01",
    index=["iso2_code"],
    columns=["language"],
    aggfunc=np.sum,
    margins=False
).fillna(0).astype(int)

# relatedness density
rel = np.dot(mat, relatedness)
reltot = np.sum(relatedness, axis=0)
reltot = reltot.values.flatten()
reldens = rel / reltot
reldens_df = pd.DataFrame(reldens)
reldens_df.index = mat.index
reldens_df
reldens_df.columns = mat.columns
reldens_df = reldens_df.rename_axis("iso2_code")\
  .reset_index()\
  .melt("iso2_code", value_name="rel_density", var_name="language")\
  .reset_index(drop=True)

  relatedness = pd.pivot_table(
  mat = pd.pivot_table(


In [19]:
# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    reldens_df,
    on=["iso2_code", "language"],
    how="left"
)

In [20]:
# export for entry models
full_df["entry01"] = full_df["entry01"].astype(int)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["entry01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_entry_regression_2022_to_2023.csv", index=False, sep=";")

In [32]:
export_df

Unnamed: 0,iso2_code,language,entry01,consider00,pci,rca01,rel_density
8,AE,Assembly,0,1,1.930404,0.0,0.000000
11,AE,Batchfile,0,1,1.600481,0.0,0.500000
15,AE,C,1,0,1.473052,0.0,0.333333
16,AE,C#,0,1,0.109249,0.0,1.000000
17,AE,C++,0,1,1.367525,0.0,0.000000
...,...,...,...,...,...,...,...
21900,ZA,XSLT,0,1,2.055318,0.0,0.000000
21918,ZM,C,1,0,1.473052,0.0,0.000000
21988,ZM,PHP,1,0,-0.713058,0.0,1.000000
22081,ZW,Dockerfile,0,1,1.199149,0.0,0.666667


**outdated versions -- will delete later**

In [21]:
# version 1 -- (2020-2021) -- entry in (2022-2023)

# bundle data for M_{cl}
def bundle_data(data, periods):
    data = data[data["period"].isin(periods)]\
        .groupby(["iso2_code", "language"])["num_pushers"]\
        .agg("sum")\
        .reset_index()
    data["period"] = 1
    return data

dfb1 = bundle_data(df, periods=[1,2])
dfb2 = bundle_data(df, periods=[3,4])
dfb2["period"] = 2
dfbs = pd.concat([dfb1, dfb2])

In [22]:
def rca_calculation(table, c_column, p_column, value_column):
    """calculate RCA from an M_cp dataframe"""
    table["e_p"] = table.groupby(p_column)[value_column].transform("sum")
    table["e_c"] = table.groupby(c_column)[value_column].transform("sum")
    table["e"] = table[value_column].sum()

    table["rca"] = (table[value_column] / table["e_p"]) / (table["e_c"] / table["e"])
    table["rca01"] = np.where(table["rca"] >= 1, 1, 0)
    return table

In [23]:
# calculate RCA for each period
rca_tables = list()
for p in dfbs["period"].unique():
    rca_df = dfbs[dfbs["period"]==p]
    rca_tables.append(rca_calculation(rca_df, c_column="iso2_code", p_column="language", value_column="num_pushers"))
rca_tables = pd.concat(rca_tables)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_p"] = table.groupby(p_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_c"] = table.groupby(c_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e"] = table[value_column].sum()
A value is trying to be se

In [24]:
# identify the entry style
pattern = [0,1]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == pattern).astype(int)

In [25]:
# full combination
all_countries = ent["iso2_code"].unique()
all_languages = ent["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "language", "entry01"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
# cdf = pd.read_csv("../outputs/complexity_table2020.csv", sep=";")
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==1].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)

In [26]:
#software_space_el = pd.read_csv("../outputs/software_space_edgelist2020.csv", sep=";")
software_space_el["proximity"] = 1

# symmetric relatedness matrix
relatedness = pd.pivot_table(
    software_space_el,
    values="proximity",
    index=["language_1"],
    columns=["language_2"],
    aggfunc=np.sum,
    margins=False
)
relatedness = relatedness.combine_first(relatedness.T).fillna(0).astype(int)

# matrix from RCA values in the baseline period
rca_tables = rca_tables[rca_tables["language"].isin(relatedness.columns)]
mat = pd.pivot_table(
    rca_tables[rca_tables["period"]==1].loc[:,["iso2_code", "language", "rca01"]],
    values="rca01",
    index=["iso2_code"],
    columns=["language"],
    aggfunc=np.sum,
    margins=False
).fillna(0).astype(int)

# relatedness density
rel = np.dot(mat, relatedness)
reltot = np.sum(relatedness, axis=0)
reltot = reltot.values.flatten()
reldens = rel / reltot
reldens_df = pd.DataFrame(reldens)
reldens_df.index = mat.index
reldens_df
reldens_df.columns = mat.columns
reldens_df = reldens_df.rename_axis("iso2_code")\
  .reset_index()\
  .melt("iso2_code", value_name="rel_density", var_name="language")\
  .reset_index(drop=True)

  relatedness = pd.pivot_table(
  mat = pd.pivot_table(


In [27]:
# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    reldens_df,
    on=["iso2_code", "language"],
    how="left"
)

In [28]:
# export for entry models
#full_df.to_csv("../outputs/data_entry_regression_version1_2023q_added.csv", index=False, sep=";")
full_df.to_csv("../outputs/data_entry_regression_version1_log.csv", index=False, sep=";")

In [29]:
# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    reldens_df,
    on=["iso2_code", "language"],
    how="left"
)

In [30]:
# export for entry models
full_df["entry01"] = full_df["entry01"].astype(int)
full_df["consider00"] = full_df["consider00"].astype(int)
export_df = full_df[(full_df["entry01"]==1) | (full_df["consider00"]==1)]
export_df.to_csv("../outputs/data_entry_regression_version3.csv", index=False, sep=";")

KeyError: 'consider00'

In [31]:
# export for entry models

#full_df.to_csv("../outputs/data_entry_regression_version2_2023q_added.csv", index=False, sep=";")
#full_df.to_csv("../outputs/data_entry_regression_version2_log.csv", index=False, sep=";")

**Part 3 - panel data for entry regressions**

In [None]:
# version 1 -- semester -- actually a cross-section ...
# version 2 -- quarter panel

In [None]:
# version 1 -- semester panel
selected_period = "semester"
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = df[df["year"]>2021]
df = add_period_ids(df, period=selected_period)
print(df.shape)

In [None]:
dfs = df.groupby(["iso2_code", "language", "period"])["num_pushers"]\
    .agg("sum")\
    .reset_index()

if log_num_pushers == True:
    dfs["num_pushers"] = np.log10(dfs["num_pushers"])
#dfb1 = bundle_data(df, periods=[1,2])
#dfb2 = bundle_data(df, periods=[3,4])
#dfb2["period"] = 2
#dfbs = pd.concat([dfb1, dfb2])
dfs

In [None]:
# calculate RCA for each period
rca_tables = list()
for p in dfs["period"].unique():
    print(p)
    rca_df = dfs[dfs["period"]==p]
    rca_tables.append(rca_calculation(rca_df, c_column="iso2_code", p_column="language", value_column="num_pushers"))
rca_tables = pd.concat(rca_tables)

In [None]:
# identify the entry style
pattern = [0,0,1]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == pattern).astype(int)

In [None]:
# full combination
cdf = pd.read_csv("../outputs/software_complexity_2020_2021_based_log.csv", sep=";")
all_countries = ent["iso2_code"].unique()
all_languages = ent["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "language", "entry01"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
# cdf = pd.read_csv("../outputs/complexity_table2020.csv", sep=";")
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==1].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)

In [None]:
#software_space_el = pd.read_csv("../outputs/software_space_2020_2021_based_log.csv", sep=";")
software_space_el = pd.read_csv("../outputs/software_space_edgelist.csv", sep=";")
software_space_el["proximity"] = 1

# symmetric relatedness matrix
relatedness = pd.pivot_table(
    software_space_el,
    values="proximity",
    index=["language_1"],
    columns=["language_2"],
    aggfunc=np.sum,
    margins=False
)
relatedness = relatedness.combine_first(relatedness.T).fillna(0).astype(int)

# matrix from RCA values in the baseline period
rca_tables = rca_tables[rca_tables["language"].isin(relatedness.columns)]
mat = pd.pivot_table(
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    values="rca01",
    index=["iso2_code"],
    columns=["language"],
    aggfunc=np.sum,
    margins=False
).fillna(0).astype(int)

# relatedness density
rel = np.dot(mat, relatedness)
reltot = np.sum(relatedness, axis=0)
reltot = reltot.values.flatten()
reldens = rel / reltot
reldens_df = pd.DataFrame(reldens)
reldens_df.index = mat.index
reldens_df
reldens_df.columns = mat.columns
reldens_df = reldens_df.rename_axis("iso2_code")\
  .reset_index()\
  .melt("iso2_code", value_name="rel_density", var_name="language")\
  .reset_index(drop=True)

In [None]:
# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    reldens_df,
    on=["iso2_code", "language"],
    how="left"
)

In [None]:
# export for entry models
full_df.to_csv("../outputs/data_entry_regression_version3_semester_based_log.csv", index=False, sep=";")

In [None]:
# version 2 -- quarter panel
selected_period = "semester"
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = df[df["year"]>2021]
df = add_period_ids(df, period=selected_period)
print(df.shape)

In [None]:
df