In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from itertools import product
from ecomplexity import ecomplexity
from ecomplexity import proximity

**Part 1 - M_{cl} - relatedness - complexity - 2020-2021**

In [6]:
# parameters
selected_period = "year"
#period_for_complexity = 1

# for ecomplexity calculcation
key_cols = {
    "time": "period",
    "loc": "iso2_code",
    "prod": "language",
    "val": "num_pushers",
}


In [7]:
# data IN
data = pd.read_csv("../data/languages.csv")

In [8]:
# filter functions
def drop_specifics_from_list(data, filter_list):
    """filter specific languages from list -- motivated by RM del Rio-Chanona et al 2023"""
    data = data[~data["language"].str.contains(filter_list, case=False, regex=True)]
    return data

def top_languages_filter(data, nr_languages):
    """keep top x number of languages ONLY"""
    top_languages = data.groupby(["language"])["num_pushers"].agg("sum").reset_index().sort_values(by="num_pushers", ascending=False)
    top_languages = list(top_languages["language"])[:nr_languages]
    data = data[data["language"].isin(top_languages)]
    return data
    
def drop_country_codes_from_list(data, country_list):
    data = data[~data["iso2_code"].isin(country_list)]
    data = data.dropna(subset="iso2_code")
    return data

def add_period_ids(data, period):
    """create missing semester ID and construct different period IDs"""
    if period=="year":
        year_to_period = dict(zip(data["year"].unique(), list(range(1, len(data["year"].unique()) + 1))))
        data["period"] = data["year"].map(year_to_period)
    if period=="semester":
        data["semester"] = np.where(data["quarter"] <= 2, 1, 2)
        data["semester_id"] = data["year"].astype(str).str.cat(data["semester"].astype(str), sep="s")
        semester_to_period = dict(zip(data["semester_id"].unique(), list(range(1, len(data["semester_id"].unique()) + 1))))
        data["period"] = data["semester_id"].map(semester_to_period)
    if period=="quarter":
        data["quarter_id"] = data["year"].astype(str).str.cat(data["quarter"].astype(str), sep="q")
        quarter_to_period = dict(zip(data["quarter_id"].unique(), list(range(1, len(data["quarter_id"].unique()) + 1))))
        data["period"] = data["quarter_id"].map(quarter_to_period)
    return data


# probably we can delete later
def dataframe_for_ecomplexity(data, period):
    """aggregate and transform dataframe for ecomplexity functions"""
    #data = data[(data["year"]==focal_year) & (data["quarter"].isin(quarter_list))]
    data = data[(data["period"]==period)]
    data = data\
        .groupby(["period", "iso2_code", "language"])["num_pushers"]\
        .agg("sum")\
        .reset_index()\
        .sort_values(by="num_pushers", ascending=False)
    return data

In [9]:
# steps to prep dataframe of ecomplexity
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
df = drop_specifics_from_list(data, filter_list=prev_filter)
df = top_languages_filter(df, nr_languages=150)
df = drop_country_codes_from_list(df, country_list=["EU"])
df = add_period_ids(df, period=selected_period)
print(df.shape)

(74265, 7)


In [16]:
# bundle data for M_{cl}
def bundle_data(data, periods):
    data = data[data["period"].isin(periods)]\
        .groupby(["iso2_code", "language"])["num_pushers"]\
        .agg("sum")\
        .reset_index()
    data["period"] = 1
    return data

dfb = bundle_data(df, periods=[1,2])

In [17]:
# software complexity calculation
cdf = ecomplexity(dfb, key_cols)
cdf.to_csv("../outputs/software_complexity_2020_2021_based.csv", index=False, sep=";")

# github space
space_df = proximity(dfb, key_cols)
space_df.to_csv("../outputs/software_space_2020_2021_based.csv", sep=";", index=False)

1


In [22]:
# function to get relatedness network from raw proximity values

def edgelist_for_github_space(data, key_columns):
    """transform the ecomplexity proximity output for visualization"""
    data = data[key_columns]

    # drop zero -- non-existing edges
    data = data[data[key_columns[2]] > 0]

    # drop self loops
    data = data[data[key_columns[0]] != data[key_columns[1]]]
    return data

def maximum_spanning_tree(data, key_columns):
    table = data.copy()
    table["distance"] = 1.0 / table[key_columns[2]]
    G = nx.from_pandas_edgelist(table, source = key_columns[0], target = key_columns[1], edge_attr = ["distance", key_columns[2]])
    T = nx.minimum_spanning_tree(G, weight = "distance")
    table2 = nx.to_pandas_edgelist(T)
    table2 = table2[table2[key_columns[2]] > 0]
    table2.rename(columns = {"source": key_columns[0], "target": key_columns[1], key_columns[2]: "score"}, inplace = True)
    table = pd.merge(
        table,
        table2,
        on=key_columns[0:2]
    )  
    table["edge"] = table.apply(lambda x: "%s-%s" % (min(x[key_columns[0]], x[key_columns[1]]), max(x[key_columns[0]], x[key_columns[1]])), axis = 1)
    table = table.drop_duplicates(subset = ["edge"])
    table = table.drop("edge", 1)
    return table[key_columns]

def add_edges(mst_edges, all_edges, nr_edges_to_add):
    # drop mst edges from the full edgelist
    mst_edges["drop"] = 1
    all_edges = pd.merge(
        all_edges,
        mst_edges,
        on = ["language_1", "language_2", "proximity"],
        how="left"
    )
    all_edges = all_edges[all_edges["drop"] != 1].drop(columns="drop")

    # sort and select
    all_edges = all_edges.sort_values(by="proximity", ascending=False).iloc[:nr_edges_to_add]

    # add to mst edgelist
    software_space_el = pd.concat([mst_el, all_edges])
    return software_space_el

In [23]:
# from space table to MST w/ additional edges
space_table = edgelist_for_github_space(space_df, key_columns=["language_1", "language_2", "proximity"])
mst_el = maximum_spanning_tree(space_table, key_columns=["language_1", "language_2", "proximity"])
mst_graph = nx.from_pandas_edgelist(mst_el, source="language_1", target="language_2")
n_nodes = mst_graph.number_of_nodes()
n_edges = n_nodes * 2
software_space_el = add_edges(mst_el, space_table, nr_edges_to_add=n_edges)

# export for Herr Wachs
software_space_el.to_csv("../outputs/software_space_edgelist_2020_2021_based.csv", index=False, sep=";")

  table = table.drop("edge", 1)


**Part 2 - regression data for cross-sectional entry models**

In [None]:
# version 1 -- (2020-2021) -- entry in (2022-2023)
# version 2 -- (2022) -- entry in (2023)

In [84]:
# version 1 -- (2020-2021) -- entry in (2022-2023)

# bundle data for M_{cl}
def bundle_data(data, periods):
    data = data[data["period"].isin(periods)]\
        .groupby(["iso2_code", "language"])["num_pushers"]\
        .agg("sum")\
        .reset_index()
    data["period"] = 1
    return data

dfb1 = bundle_data(df, periods=[1,2])
dfb2 = bundle_data(df, periods=[3,4])
dfb2["period"] = 2
dfbs = pd.concat([dfb1, dfb2])

In [85]:
def rca_calculation(table, c_column, p_column, value_column):
    """calculate RCA from an M_cp dataframe"""
    table["e_p"] = table.groupby(p_column)[value_column].transform("sum")
    table["e_c"] = table.groupby(c_column)[value_column].transform("sum")
    table["e"] = table[value_column].sum()

    table["rca"] = (table[value_column] / table["e_p"]) / (table["e_c"] / table["e"])
    table["rca01"] = np.where(table["rca"] >= 1, 1, 0)
    return table

In [86]:
# calculate RCA for each period
rca_tables = list()
for p in dfbs["period"].unique():
    rca_df = dfbs[dfbs["period"]==p]
    rca_tables.append(rca_calculation(rca_df, c_column="iso2_code", p_column="language", value_column="num_pushers"))
rca_tables = pd.concat(rca_tables)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_p"] = table.groupby(p_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_c"] = table.groupby(c_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e"] = table[value_column].sum()
A value is trying to be se

In [87]:
# identify the entry style
pattern = [0,1]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == pattern).astype(int)

In [88]:
# full combination
all_countries = ent["iso2_code"].unique()
all_languages = ent["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "language", "entry01"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
# cdf = pd.read_csv("../outputs/complexity_table2020.csv", sep=";")
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==1].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)

In [89]:
#software_space_el = pd.read_csv("../outputs/software_space_edgelist2020.csv", sep=";")
software_space_el["proximity"] = 1

# symmetric relatedness matrix
relatedness = pd.pivot_table(
    software_space_el,
    values="proximity",
    index=["language_1"],
    columns=["language_2"],
    aggfunc=np.sum,
    margins=False
)
relatedness = relatedness.combine_first(relatedness.T).fillna(0).astype(int)

# matrix from RCA values in the baseline period
rca_tables = rca_tables[rca_tables["language"].isin(relatedness.columns)]
mat = pd.pivot_table(
    rca_tables[rca_tables["period"]==1].loc[:,["iso2_code", "language", "rca01"]],
    values="rca01",
    index=["iso2_code"],
    columns=["language"],
    aggfunc=np.sum,
    margins=False
).fillna(0).astype(int)

# relatedness density
rel = np.dot(mat, relatedness)
reltot = np.sum(relatedness, axis=0)
reltot = reltot.values.flatten()
reldens = rel / reltot
reldens_df = pd.DataFrame(reldens)
reldens_df.index = mat.index
reldens_df
reldens_df.columns = mat.columns
reldens_df = reldens_df.rename_axis("iso2_code")\
  .reset_index()\
  .melt("iso2_code", value_name="rel_density", var_name="language")\
  .reset_index(drop=True)

In [90]:
# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    reldens_df,
    on=["iso2_code", "language"],
    how="left"
)

In [91]:
# export for entry models
full_df.to_csv("../outputs/data_entry_regression_version1.csv", index=False, sep=";")

In [97]:
# version 2 -- (2022) -- entry in (2023)

# bundle data for M_{cl}
dfb3 = bundle_data(df, periods=[3])
dfb4 = bundle_data(df, periods=[4])
dfb3["period"] = 3
dfb4["period"] = 4
dfbs = pd.concat([dfb3, dfb4])

In [98]:
# calculate RCA for each period
rca_tables = list()
for p in dfbs["period"].unique():
    rca_df = dfbs[dfbs["period"]==p]
    rca_tables.append(rca_calculation(rca_df, c_column="iso2_code", p_column="language", value_column="num_pushers"))
rca_tables = pd.concat(rca_tables)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_p"] = table.groupby(p_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_c"] = table.groupby(c_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e"] = table[value_column].sum()
A value is trying to be se

In [99]:
# identify the entry style
pattern = [0,1]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == pattern).astype(int)

In [100]:
# full combination
all_countries = ent["iso2_code"].unique()
all_languages = ent["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "language", "entry01"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
# cdf = pd.read_csv("../outputs/complexity_table2020.csv", sep=";")
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

# drop languages with no complexity value
full_df.dropna(subset=["pci"], inplace=True)

In [101]:
#software_space_el = pd.read_csv("../outputs/software_space_edgelist2020.csv", sep=";")
software_space_el["proximity"] = 1

# symmetric relatedness matrix
relatedness = pd.pivot_table(
    software_space_el,
    values="proximity",
    index=["language_1"],
    columns=["language_2"],
    aggfunc=np.sum,
    margins=False
)
relatedness = relatedness.combine_first(relatedness.T).fillna(0).astype(int)

# matrix from RCA values in the baseline period
rca_tables = rca_tables[rca_tables["language"].isin(relatedness.columns)]
mat = pd.pivot_table(
    rca_tables[rca_tables["period"]==3].loc[:,["iso2_code", "language", "rca01"]],
    values="rca01",
    index=["iso2_code"],
    columns=["language"],
    aggfunc=np.sum,
    margins=False
).fillna(0).astype(int)

# relatedness density
rel = np.dot(mat, relatedness)
reltot = np.sum(relatedness, axis=0)
reltot = reltot.values.flatten()
reldens = rel / reltot
reldens_df = pd.DataFrame(reldens)
reldens_df.index = mat.index
reldens_df
reldens_df.columns = mat.columns
reldens_df = reldens_df.rename_axis("iso2_code")\
  .reset_index()\
  .melt("iso2_code", value_name="rel_density", var_name="language")\
  .reset_index(drop=True)

In [102]:
# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    reldens_df,
    on=["iso2_code", "language"],
    how="left"
)

In [103]:
# export for entry models
full_df.to_csv("../outputs/data_entry_regression_version2.csv", index=False, sep=";")

In [33]:
def rca_calculation(table, c_column, p_column, value_column):
    """calculate RCA from an M_cp dataframe"""
    table["e_p"] = table.groupby(p_column)[value_column].transform("sum")
    table["e_c"] = table.groupby(c_column)[value_column].transform("sum")
    table["e"] = table[value_column].sum()

    table["rca"] = (table[value_column] / table["e_p"]) / (table["e_c"] / table["e"])
    table["rca01"] = np.where(table["rca"] >= 1, 1, 0)
    return table

In [34]:
# calculate RCA for each period
rca_tables = list()
for p in df["period"].unique():
    temp = dataframe_for_ecomplexity(df, period=p)
    #rca_df = df[df["period"]==p]
    rca_tables.append(rca_calculation(temp, c_column="iso2_code", p_column="language", value_column="num_pushers"))
rca_tables = pd.concat(rca_tables)

In [35]:
# drop 2023
if selected_period=="year":
    rca_tables = rca_tables[rca_tables["period"] < 4]

In [36]:
# identify the entry style
#pattern = [0,0,1,1]
pattern = [0,0,1]
ent = rca_tables.sort_values(["period"], ascending=True).groupby(["iso2_code","language"])["rca01"].agg(list).reset_index()
ent["entry01"] = ent["rca01"].apply(lambda x: x == pattern).astype(int)

In [37]:
# full combination
all_countries = ent["iso2_code"].unique()
all_languages = ent["language"].unique()

all_combinations = list(product(all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["iso2_code", "language"])\
    .sort_values(["iso2_code", "language"])

# join entries
full_df = pd.merge(
    full_df,
    ent[["iso2_code", "language", "entry01"]],
    on=["iso2_code", "language"],
    how="left"
).fillna(0)

# join complexity
# cdf = pd.read_csv("../outputs/complexity_table2020.csv", sep=";")
full_df = pd.merge(
    full_df,
    cdf[["iso2_code", "language", "pci"]],
    on=["iso2_code", "language"],
    how="left"
)

# join RCA from the baseline period
full_df = pd.merge(
    full_df,
    rca_tables[rca_tables["period"]==1].loc[:,["iso2_code", "language", "rca01"]],
    on=["iso2_code", "language"],
    how="left"
)
full_df["rca01"] = full_df["rca01"].fillna(0)

In [45]:
# matrix from RCA values in the baseline period
mat = pd.pivot_table(
    rca_tables[rca_tables["period"]==1].loc[:,["iso2_code", "language", "rca01"]],
    values="rca01",
    index=["iso2_code"],
    columns=["language"],
    aggfunc=np.sum,
    margins=False
).fillna(0).astype(int)

#software_space_el = pd.read_csv("../outputs/software_space_edgelist2020.csv", sep=";")
software_space_el["proximity"] = 1

# symmetric relatedness matrix
relatedness = pd.pivot_table(
    software_space_el,
    values="proximity",
    index=["language_1"],
    columns=["language_2"],
    aggfunc=np.sum,
    margins=False
)
relatedness = relatedness.combine_first(relatedness.T).fillna(0).astype(int)

# relatedness density
rel = np.dot(mat, relatedness)
reltot = np.sum(relatedness, axis=0)
reltot = reltot.values.flatten()
reldens = rel / reltot
reldens_df = pd.DataFrame(reldens)
reldens_df.index = mat.index
reldens_df
reldens_df.columns = mat.columns
reldens_df = reldens_df.rename_axis("iso2_code")\
  .reset_index()\
  .melt("iso2_code", value_name="rel_density", var_name="language")\
  .reset_index(drop=True)

In [47]:
# join to full_df with entries and PCI
full_df = pd.merge(
    full_df,
    reldens_df,
    on=["iso2_code", "language"],
    how="left"
)



In [50]:
full_df.to_csv("../outputs/regression_df01_2020_2022.csv", index=False, sep=";")

In [223]:
reldens_df = pd.DataFrame(reldens)
reldens_df.index = iso2s
reldens_df.columns = languages
reldens_df = reldens_df.rename_axis("iso2_code")\
  .reset_index()\
  .melt("iso2_code", value_name="rel_density", var_name="language")\
  .reset_index(drop=True)

In [250]:
mat = np.random.randint(2, size=(4, 3))
relatedness = np.random.choice([0, 1], size=(3, 3))

In [251]:
# relatedness density
rel = np.dot(mat, relatedness)
reltot = np.sum(relatedness, axis=0)
reltot = reltot.flatten()
reldens = rel / reltot
reldens_df = pd.DataFrame(reldens)


  reldens = rel / reltot


In [252]:
mat

array([[1, 0, 1],
       [0, 1, 1],
       [0, 0, 1],
       [1, 0, 1]])

In [254]:
relatedness

array([[0, 0, 0],
       [0, 1, 0],
       [0, 0, 0]])

In [253]:
reldens

array([[nan,  0., nan],
       [nan,  1., nan],
       [nan,  0., nan],
       [nan,  0., nan]])

In [245]:
reldens_df

Unnamed: 0,0,1,2
0,1.0,0.666667,1.0
1,0.0,0.333333,0.0
2,0.0,0.333333,0.0
3,1.0,0.666667,0.5
4,1.0,0.333333,0.5
5,1.0,0.333333,0.5
6,1.0,0.666667,0.5
7,1.0,1.0,1.0


In [191]:
reldens = pd.DataFrame(reldens)
iso2s = mat.index
languages = mat.columns
reldens.columns = languages
reldens.index = iso2s


In [197]:
reldens

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

language,Unnamed: 1,AMPL,ANTLR,ASP.NET,ActionScript,Ada,Apex,AppleScript,Assembly,AutoHotkey,...,VHDL,Verilog,Vim Script,Vim Snippet,Visual Basic .NET,Vue,XS,XSLT,Yacc,sed
0,AE,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
1,AF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
4,AO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,VN,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
143,YE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144,ZA,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
145,ZM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [194]:
reldens.rename_axis("")\
  .reset_index()\
  .melt('iso2_code', value_name='Weight', var_name='Target')\
  .query('Source != Target')\
  .reset_index(drop=True)

KeyError: 'iso2_code'

In [176]:
iso2s = mat.index
languages = mat.columns
reldens.columns = languages
reldens.index = iso2s
#pd.melt(reldens, id_vars="language", value_vars='reldens', var_name='iso2_code', value_name='weight')
reldens

language,AMPL,ANTLR,ASP.NET,ActionScript,Ada,Apex,AppleScript,Assembly,AutoHotkey,Awk,...,VHDL,Verilog,Vim Script,Vim Snippet,Visual Basic .NET,Vue,XS,XSLT,Yacc,sed
iso2_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AE,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
AF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
AO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VN,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
YE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZA,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
ZM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
temp = rca_tables[rca_tables["period"]==1].loc[:,["iso2_code", "language", "rca01"]]
temp.language.nunique()

142

In [123]:
rel.shape

(147, 142)

In [125]:
reltot.shape

(142,)

In [120]:
reltot

AMPL             6
ANTLR            1
ASP.NET          1
ActionScript     5
Ada              2
                ..
Vue              2
XS               1
XSLT            13
Yacc             3
sed              5
Length: 142, dtype: int64

In [101]:
relatedness

Unnamed: 0,AMPL,ANTLR,ASP.NET,ActionScript,Ada,Apex,AppleScript,Assembly,AutoHotkey,Awk,...,VHDL,Verilog,Vim Script,Vim Snippet,Visual Basic .NET,Vue,XS,XSLT,Yacc,sed
AMPL,,,,,,,,,,,...,,,,,,,,,,
ANTLR,,,,,,,,,,,...,,,,,,,,,,
ASP.NET,,,,,,,,,,,...,,,,,,,,,,
ActionScript,,,,,,,,,,,...,,,,,,,,,,
Ada,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vue,,,,,,,,,,,...,,,,,,,,,,
XS,,,,,,,,,,,...,,,,,,,,,,
XSLT,,,,,,,,,,1.0,...,,,1.0,,,,,,,
Yacc,,,,,,,,,,1.0,...,,,,,,,,,,


In [83]:
pd.DataFrame(rel).sum().sum()

0.0

In [69]:
reltot

AMPL             6.0
ANTLR            1.0
ASP.NET          1.0
ActionScript     5.0
Ada              2.0
                ... 
Vue              2.0
XS               1.0
XSLT            13.0
Yacc             3.0
sed              5.0
Length: 142, dtype: float64

In [63]:
relatedness.sum().sum()

498.0

In [61]:
pd.DataFrame(rel)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,132,133,134,135,136,137,138,139,140,141
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,,,,,,,,,,,...,,,,,,,,,,
143,,,,,,,,,,,...,,,,,,,,,,
144,,,,,,,,,,,...,,,,,,,,,,
145,,,,,,,,,,,...,,,,,,,,,,


In [57]:
mat

language,AMPL,ANTLR,ASP.NET,ActionScript,Ada,Apex,AppleScript,Assembly,AutoHotkey,Awk,...,VHDL,Verilog,Vim Script,Vim Snippet,Visual Basic .NET,Vue,XS,XSLT,Yacc,sed
iso2_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AE,,,,,,,,,,,...,,,,,,0.0,,,,
AF,,,,,,,,,,,...,,,,,,,,,,
AL,,,,,,,,,,,...,,,,,,0.0,,,,
AM,,,,,,,,,,,...,,,,,,1.0,,,,
AO,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VN,,,1.0,,,,,0.0,,0.0,...,,,0.0,,0.0,1.0,,0.0,,
YE,,,,,,,,,,,...,,,,,,,,,,
ZA,,,1.0,,,,,0.0,,,...,,,0.0,,,0.0,,0.0,,
ZM,,,,,,,,,,,...,,,,,,,,,,


In [56]:
relatedness

Unnamed: 0,AMPL,ANTLR,ASP.NET,ActionScript,Ada,Apex,AppleScript,Assembly,AutoHotkey,Awk,...,VHDL,Verilog,Vim Script,Vim Snippet,Visual Basic .NET,Vue,XS,XSLT,Yacc,sed
AMPL,,,,,,,,,,,...,,,,,,,,,,
ANTLR,,,,,,,,,,,...,,,,,,,,,,
ASP.NET,,,,,,,,,,,...,,,,,,,,,,
ActionScript,,,,,,,,,,,...,,,,,,,,,,
Ada,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vue,,,,,,,,,,,...,,,,,,,,,,
XS,,,,,,,,,,,...,,,,,,,,,,
XSLT,,,,,,,,,,1.0,...,,,1.0,,,,,,,
Yacc,,,,,,,,,,1.0,...,,,,,,,,,,


In [43]:
software_space_el[software_space_el["language_1"]=="Vue"]

Unnamed: 0,language_1,language_2,proximity,drop
140,Vue,Blade,1,1.0


In [168]:
# export
full_df.to_csv("../outputs/entry_table_smooth.csv", index=False, sep=";")

In [169]:
rca_tables

Unnamed: 0,period,iso2_code,language,num_pushers,e_p,e_c,e,rca,rca01
5403,1,US,HTML,1480236,7784335,11045990,52378735,0.901695,0
5410,1,US,JavaScript,1195526,6282550,11045990,52378735,0.902347,0
5374,1,US,CSS,1136806,5930031,11045990,52378735,0.909033,0
5449,1,US,Python,884897,3650926,11045990,52378735,1.149318,1
2780,1,IN,HTML,866219,7784335,4963716,52378735,1.174233,1
...,...,...,...,...,...,...,...,...,...
6017,4,TR,Mako,101,15280,234503,19752306,0.556759,0
5511,4,SD,Objective-C,101,164061,3100,19752306,3.922583,1
5505,4,SD,C++,101,496777,3100,19752306,1.295436,1
6555,4,VN,Haskell,101,15640,339873,19752306,0.375306,0


In [160]:
cdf.head(2)

Unnamed: 0,iso2_code,language,num_pushers,period,diversity,ubiquity,mcp,eci,pci,density,coi,cog,rca
0,AE,AMPL,0,1,17,6,0,-0.221688,2.695782,0.016348,-0.329448,1.208442,0.0
1,AE,ANTLR,0,1,17,14,0,-0.221688,2.299902,0.030796,-0.329448,1.273815,0.0


In [152]:
ent.entry01.sum()

129

In [141]:
rca_tables[(rca_tables["iso2_code"]=="US") & (rca_tables["language"]=="PostScript")]


Unnamed: 0,num_pushers,language,language_type,iso2_code,year,quarter,period,e_p,e_c,e,rca,rca01
407,2022,PostScript,markup,US,2020,2,1,20731,11045990,52378735,0.462499,0
1003,1815,PostScript,markup,US,2020,4,1,20731,11045990,52378735,0.415152,0
22174,1776,PostScript,markup,US,2020,3,1,20731,11045990,52378735,0.406231,0
30795,1935,PostScript,markup,US,2020,1,1,20731,11045990,52378735,0.4426,0
1519,1927,PostScript,markup,US,2021,2,2,20777,11404522,60049873,0.488353,0
17127,1886,PostScript,markup,US,2021,1,2,20777,11404522,60049873,0.477963,0
17847,1953,PostScript,markup,US,2021,4,2,20777,11404522,60049873,0.494942,0
52634,1947,PostScript,markup,US,2021,3,2,20777,11404522,60049873,0.493422,0
23701,1880,PostScript,markup,US,2022,3,3,21655,12933958,71433254,0.479478,0
54672,2068,PostScript,markup,US,2022,2,3,21655,12933958,71433254,0.527426,0


In [145]:
df.period.value_counts()

3    25230
2    22696
1    19630
4     6709
Name: period, dtype: int64

In [88]:
# 
entry_df = rca_tables\
    .sort_values(by=["iso2_code", "language", "period"])\
    .groupby(["iso2_code", "language")['value'].agg(list).reset_index()

Unnamed: 0,num_pushers,language,language_type,iso2_code,year,quarter,period,e_p,e_c,e,rca,rca01
158,119186,CSS,markup,CN,2020,1,1,5930031,4425445.0,52379076,0.237886,0
159,110283,CSS,markup,IN,2020,1,1,5930031,4963716.0,52379076,0.196247,0
160,27000,CSS,markup,KR,2020,1,1,5930031,1105796.0,52379076,0.215670,0
161,33798,Vue,markup,CN,2020,1,1,580559,4425445.0,52379076,0.689042,0
162,16465,Roff,markup,US,2020,1,1,249008,11045990.0,52379076,0.313546,0
...,...,...,...,...,...,...,...,...,...,...,...,...
81557,8970,Starlark,programming,US,2023,1,4,37696,3485319.0,19752746,1.348597,1
81560,919,Java,programming,AZ,2023,1,4,753319,20380.0,19752746,1.182388,1
81563,1251,Procfile,programming,PK,2023,1,4,90470,177676.0,19752746,1.537275,1
81565,631,PLpgSQL,programming,PL,2023,1,4,30106,337551.0,19752746,1.226491,1


In [82]:
rca_table.head(2)

Unnamed: 0,num_pushers,language,language_type,iso2_code,year,quarter,period,e_p,e_c,e,rca,rca01
4443,68649,JavaScript,programming,DE,2023,1,4,2411286,776943.0,19752746,0.723809,0
4444,21447,TypeScript,programming,GB,2023,1,4,686185,680536.0,19752746,0.907197,0


In [50]:
# period t+1 -- follow up
window_data2 = df[df["year"].isin(window2)]  
window_id2 = '-'.join(map(str, window2))
window_data2["year"] = window2[0]

# ecomplexity for window
df_wprep2 = dataframe_for_ecomplexity(window_data2, focal_year=window2[0], quarter_list=[1, 2, 3, 4])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window_data2["year"] = window2[0]


In [51]:
# construct baseline t - t+1 entry table
edf1 = rca_calculation(df_wprep, c_column="iso2_code", p_column="language", value_column="num_pushers")
edf2 = rca_calculation(df_wprep2, c_column="iso2_code", p_column="language", value_column="num_pushers")
edf = pd.concat([edf1, edf2])

In [52]:
# all possible combinations table
all_periods = edf["year"].unique()
all_countries = edf["iso2_code"].unique()
all_languages = edf["language"].unique()

all_combinations = list(product(all_periods, all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["year", "iso2_code", "language"])\
    .sort_values(["iso2_code", "language", "year"])

# join entries
full_df = pd.merge(
    full_df,
    edf,
    on=["year", "iso2_code", "language"],
    how="left"
).fillna(0)

In [53]:
# 0,0,0,1,0,0 style entry -- rca_entry010
full_df["prev_rca"] = full_df["rca01"].shift(1)
full_df["entry01"] = np.where((full_df["prev_rca"]==0) & (full_df["rca01"]==1), 1, 0)

In [56]:
# export entry dataframe
full_df.drop(["prev_rca"], axis=1).to_csv("../outputs/entry_table_2periods.csv", index=False, sep=";")

In [50]:
# semester level entry
entry_periods = sorted(rdf["semester_id"].unique())
edf = []
for ep in entry_periods:
    temp = rca_calculation(rdf.loc[(rdf["semester_id"]==ep),:], c_column="iso2_code", p_column="language", value_column="num_pushers")\
        .loc[:,["semester_id", "iso2_code", "language", "rca01"]]
    edf.append(temp)
edf = pd.concat(edf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_p"] = table.groupby(p_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e_c"] = table.groupby(c_column)[value_column].transform("sum")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["e"] = table[value_column].sum()
A value is trying to be se

In [51]:
# all possible combinations table
all_periods = edf["semester_id"].unique()
all_countries = edf["iso2_code"].unique()
all_languages = edf["language"].unique()

all_combinations = list(product(all_periods, all_countries, all_languages))
full_df = pd.DataFrame(all_combinations, columns=["semester_id", "iso2_code", "language"])\
    .sort_values(["iso2_code", "language", "semester_id"])

# join entries
full_df = pd.merge(
    full_df,
    edf,
    on=["semester_id", "iso2_code", "language"],
    how="left"
).fillna(0)

In [52]:
# 0,0,0,1,0,0 style entry -- rca_entry010
full_df["prev_rca"] = full_df["rca01"].shift(1)
full_df["entry010"] = np.where((full_df["prev_rca"]==0) & (full_df["rca01"]==1), 1, 0)

In [53]:
# 0,0,0,1,1,1 style entry -- rca_entry
full_df = full_df.sort_values(["iso2_code", "language", "semester_id"])
full_df["entry011"] = full_df.groupby(["iso2_code", "language"])["rca01"].transform(lambda x: x.cummax())
full_df["entry011"] = full_df["entry011"].astype(int)

In [55]:
# export entry dataframe
full_df.to_csv("../outputs/entry_table.csv", index=False, sep=";")