In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from ecomplexity import ecomplexity

In [2]:
# Read the raw input table; later cells use its "language", "num_pushers",
# "year" and "iso2_code" columns.
data = pd.read_csv(filepath_or_buffer="../data/languages.csv")

In [3]:
# Filter motivated by RM del Rio-Chanona et al 2023: remove every language
# whose name contains one of the tokens below. The match is a
# case-insensitive regex *substring* search, not an exact-name comparison.
prev_filter = "|".join(["yaml", "json", "text", "svg", "Markdown", "xml"])
# na=False makes a missing language name evaluate to False rather than NaN,
# so the negated mask is always boolean and safe to index with. Rows with a
# missing language are removed by the isin() step below in any case.
is_excluded = data["language"].str.contains(prev_filter, case=False, regex=True, na=False)
df = data[~is_excluded]

# Focus on TOP languages ONLY: rank languages by their total num_pushers over
# all rows that survived the filter, and keep the first 100.
pushers_by_language = (
    df.groupby(["language"])["num_pushers"]
    .agg("sum")
    .reset_index()
    .sort_values(by="num_pushers", ascending=False)
)
top_languages = list(pushers_by_language["language"])[:100]
df = df[df["language"].isin(top_languages)]

In [4]:
# Prepare the ecomplexity input: keep 2020 rows, collapse to one record per
# (year, iso2_code, language) with num_pushers summed, order by num_pushers
# descending, then discard the rows whose iso2_code is "EU".
only_2020 = df[df["year"] == 2020]
pushers = (
    only_2020
    .groupby(["year", "iso2_code", "language"])["num_pushers"]
    .sum()
    .reset_index()
    .sort_values(by="num_pushers", ascending=False)
)
df = pushers[pushers["iso2_code"] != "EU"]

In [5]:
# Sanity check: (rows, columns) of the table handed to ecomplexity
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(5093, 4)


In [11]:
# Mapping passed to ecomplexity that names which columns of df play the
# time / location / product / value roles.
key_cols = dict(
    time="year",
    loc="iso2_code",
    prod="language",
    val="num_pushers",
)

# Run the complexity computation; the result is used by the cells below
cdf = ecomplexity(df, key_cols)

2020


In [12]:
# Ten highest ECI values: one row per distinct (eci, iso2_code) pair
country_eci = cdf.loc[:, ["eci", "iso2_code"]].drop_duplicates()
country_eci.sort_values(by="eci", ascending=False).iloc[:10]

Unnamed: 0,eci,iso2_code
3325,2.059282,DE
2375,2.028812,CH
4370,1.991522,GB
11400,1.978975,SE
13110,1.974972,US
4275,1.963278,FR
9310,1.946025,NL
665,1.931133,AU
5985,1.897379,IT
2185,1.893506,CA


In [13]:
# Ten lowest ECI values: one row per distinct (eci, iso2_code) pair
country_eci = cdf.loc[:, ["eci", "iso2_code"]].drop_duplicates()
country_eci.sort_values(by="eci", ascending=False).iloc[-10:]

Unnamed: 0,eci,iso2_code
6935,-1.142078,LA
11875,-1.203049,SO
8740,-1.203049,MW
2090,-1.242559,BZ
1900,-1.242559,BW
1615,-1.242559,BN
7220,-1.242559,LR
1235,-1.242559,BF
950,-1.242559,BB
8645,-1.242559,MV


In [14]:
# Twenty highest PCI values, one row per distinct (pci, language) pair.
# Reads from `cdf`, the ecomplexity result computed above; the previous name
# `complexity_df` is not defined anywhere in this notebook and raised a
# NameError on a fresh kernel.
(
    cdf[["pci", "language"]]
    .drop_duplicates()
    .sort_values(by="pci", ascending=False)
    .head(20)
)

Unnamed: 0,pci,language
46,2.638637,Mathematica
50,2.630674,Nix
25,2.62654,Gnuplot
47,2.589877,Meson
70,2.589809,SWIG
48,2.566328,NASL
21,2.559011,GAP
73,2.540781,Scheme
22,2.523715,GDB
19,2.522103,Fortran


In [15]:
# Twenty lowest PCI values, one row per distinct (pci, language) pair.
# Reads from `cdf`, the ecomplexity result computed above; the previous name
# `complexity_df` is not defined anywhere in this notebook and raised a
# NameError on a fresh kernel.
(
    cdf[["pci", "language"]]
    .drop_duplicates()
    .sort_values(by="pci", ascending=False)
    .tail(20)
)

Unnamed: 0,pci,language
7,0.144629,C#
75,0.089469,Shell
1,0.059983,ASP.NET
87,0.05254,TypeScript
62,-0.003527,Python
83,-0.232902,TSQL
38,-0.311553,Kotlin
39,-0.327332,Less
82,-0.370785,Swift
31,-0.475622,Hack
