In [None]:
import itertools
import time

import catboost
import IPython.core.interactiveshell
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import sklearn.calibration
import sklearn.dummy
import sklearn.ensemble
import sklearn.inspection
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.neural_network
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.svm

from common import import_features_contributors, initialize, selected

In [None]:
IPython.core.interactiveshell.InteractiveShell.ast_node_interactivity = "all"
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = 1000
pd.options.display.max_rows = 1000
%matplotlib inline
initialize()
n_jobs = -1

In [None]:
# Materialise the selected projects and keep them in sorted order.
projects = list(selected())
projects.sort()
projects

In [None]:
# Repository slug -> display name. The insertion order matters: it is reused
# below as the row order of the exported tables.
projects_names = dict(
    [
        ("odoo/odoo", "Odoo"),
        ("kubernetes/kubernetes", "Kubernetes"),
        ("elastic/elasticsearch", "Elasticsearch"),
        ("pytorch/pytorch", "PyTorch"),
        ("rust-lang/rust", "Rust"),
        ("definitelytyped/definitelytyped", "DefinitelyTyped"),
        ("home-assistant/core", "HomeAssistant"),
        ("ansible/ansible", "Ansible"),
        ("cockroachdb/cockroach", "CockroachDB"),
        ("apple/swift", "Swift"),
        ("flutter/flutter", "Flutter"),
        ("apache/spark", "Spark"),
        ("python/cpython", "Python"),
        ("getsentry/sentry", "Sentry"),
        ("paddlepaddle/paddle", "PaddlePaddle"),
        ("godotengine/godot", "Godot"),
        ("rails/rails", "Rails"),
        ("grafana/grafana", "Grafana"),
        ("clickhouse/clickhouse", "ClickHouse"),
        ("symfony/symfony", "Symfony"),
    ]
)

In [None]:
# Load the features of every project, stack them, index the rows by
# (project, pull_number) and order them by the maintainer_responded_at column.
features_all = pd.concat(list(map(import_features_contributors, projects))).reset_index()
features_all = features_all.set_index(["project", "pull_number"]).sort_values("maintainer_responded_at")

features_all.describe().round(2).T

In [None]:
# Keep rows that are not from bots, not from the 'ghost' contributor, and that
# have a strictly positive contributor latency. Copy so later column
# assignments act on an independent frame.
features_all = features_all.query(
    "not is_bot "
    "and contributor != 'ghost' "
    "and contributor_latency > 0"
).copy()
features_all.describe().round(2).T

In [None]:
# Bucket the contributor latency into three ordered classes. The bin edges
# (24 and 7 * 24) suggest the latency is expressed in hours — TODO confirm.
labels = ["(1) Within 1 Day", "(2) 1 Day to 1 Week", "(3) More than 1 Week"]
features_all["label"] = pd.cut(features_all["contributor_latency"], bins=[0, 24, 7 * 24, np.inf], labels=labels)

# Overall class distribution, in percent.
(features_all["label"].value_counts(normalize=True, sort=False) * 100).round(2)

# Per-project class distribution (percent) plus the number of rows per project.
ratios = pd.DataFrame()
for project in projects:
    features = features_all.query("project == @project")
    ratios[project] = features["label"].value_counts(normalize=True, sort=False) * 100
    ratios.loc["size", project] = len(features)

# One row per project; the row counts are cast to int.
ratios = ratios.T.astype({"size": int})
ratios.sort_values("(1) Within 1 Day", ascending=False).round(1)
ratios.describe().T.round(1)

# LaTeX export: rows follow the key order of projects_names and are shown
# under their display names.
print(
    ratios.sort_index(key=lambda x: x.map({v: i for i, v in enumerate(projects_names.keys())}))
    .rename(projects_names)
    .to_latex(float_format="%.1f")
)

In [None]:
# Persist the filtered feature table (including the "label" column) as CSV.
features_all.to_csv("features_contributors.csv")

In [None]:
# Show the available columns.
features_all.columns

In [None]:
# Candidate predictors, grouped by the prefix of their column name.
characteristics = (
    # Pull request
    ["pr_description", "pr_commits", "pr_changed_lines", "pr_changed_files"]
    # Contributor
    + [
        "contributor_pulls",
        "contributor_open_pulls",
        "contributor_acceptance_rate",
        "contributor_median_latency",
    ]
    # Project
    + [
        "project_pulls",
        "project_open_pulls",
        "project_maintainers",
        "project_community",
        "project_median_latency",
    ]
    # Review
    + [
        "review_latency",
        "review_hour",
        "review_day",
        "review_contributor_events",
        "review_participants_events",
        "review_bots_events",
    ]
)

In [None]:
# Pairwise Spearman correlations between the candidate characteristics.
correlations = features_all.loc[:, characteristics].corr(method="spearman")

# Hide the diagonal and the upper triangle: the matrix is symmetric.
upper_triangle_mask = np.triu(np.ones_like(correlations))

_ = plt.figure(figsize=(13, 11))
_ = sns.heatmap(
    correlations,
    mask=upper_triangle_mask,
    cmap="RdBu",
    vmin=-1,
    vmax=1,
    square=True,
    annot=True,
    fmt=".2f",
)
plt.tight_layout()
plt.savefig("correlations_contributors.pdf")

In [None]:
# Characteristics excluded from modelling (presumably the strongly correlated
# ones from the heatmap above — TODO confirm the threshold used).
excluded_characteristics = {"pr_changed_lines", "pr_changed_files", "contributor_pulls", "project_pulls"}

# Filtering instead of list.remove keeps this cell re-runnable: executing it a
# second time is a no-op rather than a ValueError. Order is preserved.
characteristics = [name for name in characteristics if name not in excluded_characteristics]
characteristics

In [None]:
def evaluate_model(X, y, train_index, test_index, number):
    """Fit every candidate classifier on one split and score it on the held-out rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``iloc``.
    y : pandas.Series
        Class labels aligned with ``X``; values are expected to come from the
        module-level ``labels`` list.
    train_index, test_index : array-like of int
        Positional row indices of the training and test parts of the split.
    number : int
        Identifier of the split, copied into every returned record.

    Returns
    -------
    list of dict
        One record per (model, label, metric) with the keys ``model``,
        ``metric`` (``aucroc``, ``aucpr``, ``precision`` or ``recall``),
        ``number``, ``label`` and ``score``.
    """
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    y_test_bin = sklearn.preprocessing.label_binarize(y_test, classes=labels)

    models = {
        "CB": catboost.CatBoostClassifier(
            objective="MultiClassOneVsAll", random_state=1, thread_count=n_jobs, silent=True
        ),
        "DM": sklearn.dummy.DummyClassifier(strategy="most_frequent", random_state=1),
        "KNN": sklearn.pipeline.make_pipeline(
            sklearn.preprocessing.FunctionTransformer(np.log1p),
            sklearn.preprocessing.StandardScaler(),
            sklearn.neighbors.KNeighborsClassifier(n_jobs=n_jobs),
        ),
        "LR": sklearn.pipeline.make_pipeline(
            sklearn.preprocessing.FunctionTransformer(np.log1p),
            sklearn.preprocessing.StandardScaler(),
            sklearn.linear_model.LogisticRegression(random_state=1, n_jobs=n_jobs),
        ),
        "NB": sklearn.pipeline.make_pipeline(
            sklearn.preprocessing.FunctionTransformer(np.log1p),
            sklearn.preprocessing.StandardScaler(),
            sklearn.naive_bayes.GaussianNB(),
        ),
        "NN": sklearn.pipeline.make_pipeline(
            sklearn.preprocessing.FunctionTransformer(np.log1p),
            sklearn.preprocessing.StandardScaler(),
            sklearn.neural_network.MLPClassifier(random_state=1),
        ),
        "RF": sklearn.ensemble.RandomForestClassifier(random_state=1, n_jobs=n_jobs),
        "SVM": sklearn.pipeline.make_pipeline(
            sklearn.preprocessing.FunctionTransformer(np.log1p),
            sklearn.preprocessing.StandardScaler(),
            sklearn.svm.SVC(probability=True, random_state=1),
        ),
    }

    records = []

    for name, model in models.items():
        model.fit(X_train, y_train)

        # The dummy baseline is left uncalibrated.
        if name != "DM":
            # NOTE(review): the calibrator is fitted on the same rows the model
            # was trained on; scikit-learn's documentation asks for disjoint
            # data with cv="prefit" — confirm this is intended.
            model = sklearn.calibration.CalibratedClassifierCV(
                model, method="isotonic", cv="prefit", n_jobs=n_jobs
            ).fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # Assumes the probability columns follow the order of `labels` — verify
        # against `model.classes_` if the label names ever change.
        y_pred_proba = model.predict_proba(X_test)

        # `labels=labels` pins one score per class, in the order of `labels`.
        # Without it the result only covers classes present in y_test/y_pred,
        # so `precision[i]` could be misaligned with `labels[i]` (or missing).
        precision = sklearn.metrics.precision_score(
            y_test, y_pred, labels=labels, average=None, zero_division=0
        )
        recall = sklearn.metrics.recall_score(y_test, y_pred, labels=labels, average=None, zero_division=0)

        for i, label in enumerate(labels):
            # One-vs-rest scores for this class; insertion order is the record order.
            scores = {
                "aucroc": sklearn.metrics.roc_auc_score(y_test_bin[:, i], y_pred_proba[:, i]),
                "aucpr": sklearn.metrics.average_precision_score(y_test_bin[:, i], y_pred_proba[:, i]),
                "precision": precision[i],
                "recall": recall[i],
            }
            records.extend(
                {"model": name, "metric": metric, "number": number, "label": label, "score": score}
                for metric, score in scores.items()
            )

    return records

In [None]:
# Per-project evaluation over 10 time-ordered splits; one parallel job per split.
performances = {}
splitter = sklearn.model_selection.TimeSeriesSplit(n_splits=10)

with joblib.Parallel(n_jobs=n_jobs) as parallel:
    for project in projects:
        project

        features = features_all.query("project == @project")
        X, y = features[characteristics], features["label"]

        # Split numbers start at 1.
        jobs = [
            joblib.delayed(evaluate_model)(X, y, train_index, test_index, number)
            for number, (train_index, test_index) in enumerate(splitter.split(X), 1)
        ]
        records = itertools.chain.from_iterable(parallel(jobs))

        performances[project] = pd.DataFrame(records).pivot_table(
            values="score", index=["metric", "label"], columns=["model", "number"]
        )

        # Raw scores, then averaged over splits, then also averaged over labels.
        performances[project].round(2)
        by_model = performances[project].T.groupby("model").mean().T
        by_model.round(2)
        by_model.groupby("metric").mean().T.round(2)

joblib.dump(pd.concat(performances, names=["project"]), "performances_contributors.joblib")

In [None]:
def average_precision_ovr(y_true, y_pred_proba):
    """Return the unweighted mean of the one-vs-rest average precision over ``labels``.

    ``y_true`` is binarised against the module-level ``labels``; column ``i``
    of ``y_pred_proba`` is scored against the indicator of ``labels[i]``.
    """
    y_true_bin = sklearn.preprocessing.label_binarize(y_true, classes=labels)

    per_class = []
    for column in range(len(labels)):
        per_class.append(
            sklearn.metrics.average_precision_score(y_true_bin[:, column], y_pred_proba[:, column])
        )

    return np.mean(per_class)


# Rebind the name to a scorer that feeds predicted probabilities to the function above.
average_precision_ovr = sklearn.metrics.make_scorer(average_precision_ovr, needs_proba=True)

In [None]:
def measure_importance(X, y, train_index, test_index, number):
    """Measure permutation importances of a calibrated CatBoost model on one split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``iloc``.
    y : pandas.Series
        Class labels aligned with ``X``.
    train_index, test_index : array-like of int
        Positional row indices of the training and test parts of the split.
    number : int
        Identifier of the split, copied into every returned record.

    Returns
    -------
    list of dict
        One record per metric (``aucroc``, ``aucpr``) holding ``metric``,
        ``number`` and the mean importance of each column of ``X``.
    """
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = catboost.CatBoostClassifier(
        objective="MultiClassOneVsAll", random_state=1, thread_count=n_jobs, silent=True
    ).fit(X_train, y_train)
    model = sklearn.calibration.CalibratedClassifierCV(model, method="isotonic", cv="prefit", n_jobs=n_jobs).fit(
        X_train, y_train
    )

    records = []

    for metric, scoring in {"aucroc": "roc_auc_ovr", "aucpr": average_precision_ovr}.items():
        importances = sklearn.inspection.permutation_importance(
            model, X_test, y_test, scoring=scoring, n_repeats=10, random_state=1, n_jobs=n_jobs
        ).importances_mean

        # Name each importance after the column it was computed for. Reading the
        # names from the data (not from the global `characteristics`, which is
        # rebound to a dict further down the notebook) keeps values and names
        # aligned whatever frame is passed in.
        record = {"metric": metric, "number": number}
        record.update(zip(X_test.columns, importances))
        records.append(record)

    return records

In [None]:
# Per-project permutation importances over 10 time-ordered splits.
importances = {}
splitter = sklearn.model_selection.TimeSeriesSplit(n_splits=10)

with joblib.Parallel(n_jobs=n_jobs) as parallel:
    for project in projects:
        project

        features = features_all.query("project == @project")
        X, y = features[characteristics], features["label"]

        # Split numbers start at 1.
        jobs = [
            joblib.delayed(measure_importance)(X, y, train_index, test_index, number)
            for number, (train_index, test_index) in enumerate(splitter.split(X), 1)
        ]
        importances[project] = pd.DataFrame(itertools.chain.from_iterable(parallel(jobs)))

        importances[project].round(3)

# Replace the positional row level by (metric, number) before persisting.
joblib.dump(
    pd.concat(importances, names=["project"]).droplevel(1).set_index(["metric", "number"], append=True),
    "importances_contributors.joblib",
)

In [None]:
def evaluate_model_generic(project, features):
    """Train on every project except ``project`` and score on ``project``.

    Parameters
    ----------
    project : str
        Value of the ``project`` index level/column that is held out for testing.
    features : pandas.DataFrame
        Frame providing the ``characteristics`` columns and a ``label`` column
        for all projects.

    Returns
    -------
    list of dict
        One record per (model, label, metric) with the keys ``project``,
        ``model`` (``CB`` or ``DM``), ``metric`` (``aucroc``, ``aucpr``,
        ``precision`` or ``recall``), ``label`` and ``score``.
    """
    train, test = features.query("project != @project"), features.query("project == @project")
    X_train, X_test = train[characteristics], test[characteristics]
    y_train, y_test = train["label"], test["label"]
    y_test_bin = sklearn.preprocessing.label_binarize(y_test, classes=labels)

    models = {
        "CB": catboost.CatBoostClassifier(
            objective="MultiClassOneVsAll", random_state=1, thread_count=n_jobs, silent=True
        ),
        "DM": sklearn.dummy.DummyClassifier(strategy="most_frequent", random_state=1),
    }

    records = []

    for name, model in models.items():
        model.fit(X_train, y_train)

        # The dummy baseline is left uncalibrated.
        if name != "DM":
            # NOTE(review): the calibrator is fitted on the same rows the model
            # was trained on; scikit-learn's documentation asks for disjoint
            # data with cv="prefit" — confirm this is intended.
            model = sklearn.calibration.CalibratedClassifierCV(
                model, method="isotonic", cv="prefit", n_jobs=n_jobs
            ).fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # Assumes the probability columns follow the order of `labels` — verify
        # against `model.classes_` if the label names ever change.
        y_pred_proba = model.predict_proba(X_test)

        # `labels=labels` pins one score per class, in the order of `labels`.
        # Without it the result only covers classes present in y_test/y_pred,
        # so `precision[i]` could be misaligned with `labels[i]` (or missing).
        precision = sklearn.metrics.precision_score(
            y_test, y_pred, labels=labels, average=None, zero_division=0
        )
        recall = sklearn.metrics.recall_score(y_test, y_pred, labels=labels, average=None, zero_division=0)

        for i, label in enumerate(labels):
            # One-vs-rest scores for this class; insertion order is the record order.
            scores = {
                "aucroc": sklearn.metrics.roc_auc_score(y_test_bin[:, i], y_pred_proba[:, i]),
                "aucpr": sklearn.metrics.average_precision_score(y_test_bin[:, i], y_pred_proba[:, i]),
                "precision": precision[i],
                "recall": recall[i],
            }
            records.extend(
                {"project": project, "model": name, "metric": metric, "label": label, "score": score}
                for metric, score in scores.items()
            )

    return records

In [None]:
# Leave-one-project-out evaluation: one parallel job per held-out project.
with joblib.Parallel(n_jobs=n_jobs) as parallel:
    results = parallel(joblib.delayed(evaluate_model_generic)(project, features_all) for project in projects)
    performances = pd.DataFrame(itertools.chain.from_iterable(results))

    performances.round(2)

joblib.dump(performances, "performances_contributors_generic.joblib")

In [None]:
def measure_importance_generic(project, features):
    """Measure permutation importances on ``project`` for a model trained on all other projects.

    Parameters
    ----------
    project : str
        Value of the ``project`` index level/column that is held out for testing.
    features : pandas.DataFrame
        Frame providing the ``characteristics`` columns and a ``label`` column
        for all projects.

    Returns
    -------
    list of dict
        One record per metric (``aucroc``, ``aucpr``) holding ``metric``,
        ``project`` and the mean importance of each feature column.
    """
    train, test = features.query("project != @project"), features.query("project == @project")
    X_train, X_test = train[characteristics], test[characteristics]
    y_train, y_test = train["label"], test["label"]

    model = catboost.CatBoostClassifier(
        objective="MultiClassOneVsAll", random_state=1, thread_count=n_jobs, silent=True
    ).fit(X_train, y_train)
    model = sklearn.calibration.CalibratedClassifierCV(model, method="isotonic", cv="prefit", n_jobs=n_jobs).fit(
        X_train, y_train
    )

    records = []

    for metric, scoring in {"aucroc": "roc_auc_ovr", "aucpr": average_precision_ovr}.items():
        importances = sklearn.inspection.permutation_importance(
            model, X_test, y_test, scoring=scoring, n_repeats=10, random_state=1, n_jobs=n_jobs
        ).importances_mean

        # Name each importance after the column it was computed for, read from
        # the permuted frame itself rather than by position in a global list.
        record = {"metric": metric, "project": project}
        record.update(zip(X_test.columns, importances))
        records.append(record)

    return records

In [None]:
# Leave-one-project-out importances: one parallel job per held-out project.
with joblib.Parallel(n_jobs=n_jobs) as parallel:
    results = parallel(joblib.delayed(measure_importance_generic)(project, features_all) for project in projects)
    importances = pd.DataFrame(itertools.chain.from_iterable(results))

    importances.round(3)

joblib.dump(importances, "importances_contributors_generic.joblib")

In [None]:
import rpy2.robjects.packages
import rpy2.robjects.pandas2ri

# Activate rpy2's pandas conversion and load the R package that provides the
# sk_esd function used for the rankings below.
rpy2.robjects.pandas2ri.activate()
scottknottesd = rpy2.robjects.packages.importr("ScottKnottESD")

In [None]:
# NOTE(review): from here on `projects` and `characteristics` are dicts
# (raw name -> display name) instead of the lists used by the modelling cells
# above; re-running those earlier cells after this one needs a kernel restart.
projects = projects_names

# Column name -> label used in tables and figures, in the same order as the
# modelling feature list.
characteristics = {
    "pr_description": "Description Length",
    "pr_commits": "Commits",
    "contributor_open_pulls": "Contributor Backlog",
    "contributor_acceptance_rate": "Contributor Performance",
    "contributor_median_latency": "Contributor Responsiveness",
    "project_open_pulls": "Project Backlog",
    "project_maintainers": "Maintainers Availability",
    "project_community": "Community Size",
    "project_median_latency": "Maintainers Responsiveness",
    "review_latency": "Review Latency",
    "review_hour": "Review Hour",
    "review_day": "Review Day",
    "review_contributor_events": "Contributor Activity",
    "review_participants_events": "Participants Activity",
    "review_bots_events": "Bots Activity",
}

# Evaluated models, excluding the "DM" baseline; the "%"-suffixed names match
# the relative-improvement columns derived in the next cell.
models = ["CB", "KNN", "LR", "NB", "NN", "RF", "SVM"]
modelsp = [f"{model}%" for model in models]

In [None]:
performances = joblib.load("performances_contributors.joblib")

# Order the projects as in `projects` and show them under their display names.
performances = performances.sort_index(
    key=lambda x: x.map({v: i for i, v in enumerate(projects.keys())}), level="project"
).rename(projects, level="project")

precisions = performances.query("metric == 'precision'")
precisions.round(2)

recalls = performances.query("metric == 'recall'")
recalls.round(2)

# Copy before adding columns: assigning onto the bare result of `query` writes
# to a derived frame and can raise SettingWithCopyWarning.
performances = performances.query("metric in ['aucroc', 'aucpr']").copy()

# Relative improvement (in percent) of every model over the dummy baseline,
# computed per split.
for model in models:
    for number in performances.columns.unique("number"):
        performances[(f"{model}%", number)] = (
            (performances[(model, number)] / performances[("DM", number)]) - 1
        ) * 100

performances.round(2)
performances.T.groupby(["model"]).mean().T.round(2)

# Average over the labels, keeping one column per (model, split).
performances = performances.groupby(["project", "metric"], sort=False).mean()
performances.round(2)

# Per-split scores, kept for the Scott-Knott ESD rankings.
aucrocs = performances.query("metric == 'aucroc'")
aucprs = performances.query("metric == 'aucpr'")

# Average over the splits: one column per model.
performances = performances.T.groupby("model").mean().T
performances.round(2)

In [None]:
# AUC-ROC per project and model, with summary rows, exported as LaTeX.
performances_aucroc = performances.query("metric == 'aucroc'").droplevel("metric")
performances_aucroc.describe().T.round(2)

table = performances_aucroc.copy()
for summary, aggregate in (("Average", performances_aucroc.mean), ("Median", performances_aucroc.median)):
    table.loc[summary] = aggregate()

# Render each cell as "<score> (<improvement over the baseline>%)".
for model in models:
    score = table[model].round(2).astype(str)
    gain = table[f"{model}%"].round().astype(int).astype(str)
    table[model] = score + " (" + gain + "%)"

table[models]
print(table[models].to_latex())

In [None]:
# AUC-PR per project and model, with summary rows, exported as LaTeX.
performances_aucpr = performances.query("metric == 'aucpr'").droplevel("metric")
performances_aucpr.describe().T.round(2)

table = performances_aucpr.copy()
for summary, aggregate in (("Average", performances_aucpr.mean), ("Median", performances_aucpr.median)):
    table.loc[summary] = aggregate()

# Render each cell as "<score> (<improvement over the baseline>%)".
for model in models:
    score = table[model].round(2).astype(str)
    gain = table[f"{model}%"].round().astype(int).astype(str)
    table[model] = score + " (" + gain + "%)"

table[models]
print(table[models].to_latex())

In [None]:
# Keep the CatBoost and dummy columns, average over the labels of each project,
# then average over the splits of each model.
precisions = precisions.T.query("model in ['CB', 'DM']").T
precisions = precisions.groupby("project", sort=False).mean()
precisions = precisions.T.groupby("model").mean().T

recalls = recalls.T.query("model in ['CB', 'DM']").T
recalls = recalls.groupby("project", sort=False).mean()
recalls = recalls.T.groupby("model").mean().T

In [None]:
# Relative improvement over the dummy baseline, followed by an "Average" row.
for scores in (precisions, recalls):
    scores["%"] = ((scores["CB"] / scores["DM"]) - 1) * 100
    scores.loc["Average"] = scores.mean()

# Render each cell as "<score> (<improvement>%)".
table = pd.DataFrame()
for column, scores in (("precision", precisions), ("recall", recalls)):
    table[column] = scores["CB"].round(2).astype(str) + " (" + scores["%"].round().astype(int).astype(str) + "%)"

table
print(table.to_latex())

In [None]:
# Rank the models per project with Scott-Knott ESD, once per metric.
models_rankings_aucroc = []
models_rankings_aucpr = []

for project in projects.values():
    # Input: one column per model, one row per split score of this project.
    skesd_aucroc = scottknottesd.sk_esd(
        pd.DataFrame({model: aucrocs.query("project == @project")[model].values[0].tolist() for model in models}),
        version="np",
    )
    skesd_aucpr = scottknottesd.sk_esd(
        pd.DataFrame({model: aucprs.query("project == @project")[model].values[0].tolist() for model in models}),
        version="np",
    )

    # Presumably element [1] holds the rank groups and element [3] the 1-based
    # column order they refer to — TODO confirm against the ScottKnottESD docs.
    models_rankings_aucroc.append(
        pd.DataFrame(
            [skesd_aucroc[1].astype(int)],
            columns=[models[i] for i in [i - 1 for i in skesd_aucroc[3]]],
            index=[project],
        )
    )
    models_rankings_aucpr.append(
        pd.DataFrame(
            [skesd_aucpr[1].astype(int)],
            columns=[models[i] for i in [i - 1 for i in skesd_aucpr[3]]],
            index=[project],
        )
    )

# One row per project plus Average/Median rows; columns sorted by average rank.
print("auc-roc:")
models_rankings_aucroc = pd.concat(models_rankings_aucroc)
table = models_rankings_aucroc.copy()
table.loc["Average"] = models_rankings_aucroc.mean()
table.loc["Median"] = models_rankings_aucroc.median()
models_rankings_aucroc = table.sort_values("Average", axis=1)
models_rankings_aucroc

print("auc-pr:")
models_rankings_aucpr = pd.concat(models_rankings_aucpr)
table = models_rankings_aucpr.copy()
table.loc["Average"] = models_rankings_aucpr.mean()
table.loc["Median"] = models_rankings_aucpr.median()
models_rankings_aucpr = table.sort_values("Average", axis=1)
models_rankings_aucpr

In [None]:
importances = joblib.load("importances_contributors.joblib")

# Order the projects as in `projects`, then switch projects and columns to
# their display names.
project_order = {slug: position for position, slug in enumerate(projects)}
importances = importances.sort_index(key=lambda index: index.map(project_order), level="project")
importances = importances.rename(projects, level="project").rename(columns=characteristics)
importances.round(3)

In [None]:
# Rank the characteristics per project with Scott-Knott ESD, once per metric.
features_rankings_aucroc = []
features_rankings_aucpr = []

for project in projects.values():
    # Input: the per-split importances of this project, one column per characteristic.
    skesd_aucroc = scottknottesd.sk_esd(importances.query("project == @project and metric == 'aucroc'"), version="np")
    skesd_aucpr = scottknottesd.sk_esd(importances.query("project == @project and metric == 'aucpr'"), version="np")

    # Presumably element [1] holds the rank groups and element [3] the 1-based
    # column order they refer to — TODO confirm against the ScottKnottESD docs.
    # Columns are mapped back through the key order of `characteristics`.
    features_rankings_aucroc.append(
        pd.DataFrame(
            [skesd_aucroc[1].astype(int)],
            columns=[list(characteristics)[i] for i in [i - 1 for i in skesd_aucroc[3]]],
            index=[project],
        )
    )
    features_rankings_aucpr.append(
        pd.DataFrame(
            [skesd_aucpr[1].astype(int)],
            columns=[list(characteristics)[i] for i in [i - 1 for i in skesd_aucpr[3]]],
            index=[project],
        )
    )

# One row per project plus Average/Median rows; columns sorted by average rank.
print("auc-roc:")
features_rankings_aucroc = pd.concat(features_rankings_aucroc).rename(columns=characteristics)
table = features_rankings_aucroc.copy()
table.loc["Average"] = features_rankings_aucroc.mean()
table.loc["Median"] = features_rankings_aucroc.median()
features_rankings_aucroc = table.sort_values("Average", axis=1)
features_rankings_aucroc

print("auc-pr:")
features_rankings_aucpr = pd.concat(features_rankings_aucpr).rename(columns=characteristics)
table = features_rankings_aucpr.copy()
table.loc["Average"] = features_rankings_aucpr.mean()
table.loc["Median"] = features_rankings_aucpr.median()
features_rankings_aucpr = table.sort_values("Average", axis=1)
features_rankings_aucpr

In [None]:
# Distribution of the per-project AUC-ROC ranks of each characteristic
# (summary rows excluded).
fig, ax = plt.subplots(figsize=(14, 2.5))
_ = sns.violinplot(features_rankings_aucroc.drop(index=["Average", "Median"]), ax=ax, palette="YlOrRd_r")
_ = ax.set_ylabel("Rank")

# Put every word of a tick label on its own line.
xticklabels = ["\n".join(label.get_text().split(" ")) for label in ax.get_xticklabels()]
_ = ax.set_xticks(range(len(xticklabels)))
_ = ax.set_xticklabels(xticklabels)

# Ranks are integers, so restrict the y ticks to integers.
_ = ax.yaxis.get_major_locator().set_params(integer=True)
plt.tight_layout()
plt.savefig("importances_contributors.pdf", bbox_inches="tight", pad_inches=0.01)

In [None]:
# Switch the feature columns to their display names so the SHAP plot shows them.
# NOTE(review): this rebinds features_all, so the cell is not re-runnable as is.
features_all = features_all.rename(columns=characteristics)

combined_shap_values = []
combined_features = []

# Fit one CatBoost model per project on all of its rows and collect the SHAP
# values together with the matching feature rows.
for project in projects:
    features = features_all.query("project == @project")
    X = features[characteristics.values()]
    y = features["label"]

    model = catboost.CatBoostClassifier(
        objective="MultiClassOneVsAll", random_state=1, thread_count=n_jobs, silent=True
    ).fit(X, y)
    # NOTE(review): purpose of this pause is not visible here — confirm it is still needed.
    time.sleep(1)
    combined_shap_values.append(shap.TreeExplainer(model).shap_values(X))
    combined_features.append(X)

# Stack all projects and keep index 0 of the last axis — presumably the SHAP
# values of the first class; depends on the SHAP output layout, TODO confirm.
combined_shap_values = np.vstack(combined_shap_values)[:, :, 0]
combined_features = pd.concat(combined_features)

In [None]:
# Layered violin plot of the SHAP values for the five top-ranked characteristics.
shap.plots.violin(
    combined_shap_values,
    combined_features,
    plot_type="layered_violin",
    layered_violin_max_num_bins=100,
    max_display=5,
    plot_size=(8.5, 2.5),
    color_bar_label="Value",
    show=False,
)

# Fixed symmetric x range without ticks; the label explains the direction only.
_ = plt.xlim(-1.5, 1.5)
_ = plt.xticks([])
_ = plt.xlabel("⟵ More than 1 Day        Within 1 Day ⟶      ", fontsize=11)
plt.tight_layout()
plt.savefig("impacts_contributors.pdf", bbox_inches="tight")

In [None]:
performances = joblib.load("performances_contributors_generic.joblib")

# One row per (project, metric, label), one column per model; projects are
# ordered as in `projects` and shown under their display names.
performances = (
    performances.pivot_table(values="score", index=["project", "metric", "label"], columns=["model"])
    .sort_index(key=lambda x: x.map({v: i for i, v in enumerate(projects.keys())}), level="project")
    .rename(projects, level="project")
)

precisions = performances.query("metric == 'precision'")
precisions.round(2)

recalls = performances.query("metric == 'recall'")
recalls.round(2)

# Copy before adding a column: assigning onto the bare result of `query`
# writes to a derived frame and can raise SettingWithCopyWarning.
performances = performances.query("metric in ['aucroc', 'aucpr']").copy()

# Relative improvement (in percent) of CatBoost over the dummy baseline.
performances["CB%"] = ((performances["CB"] / performances["DM"]) - 1) * 100
performances.round(2)

# Average over the labels, drop the baseline and reshape to one row per
# project with (metric, model) columns.
performances = (
    performances.groupby(["project", "metric"], sort=False)
    .mean()
    .drop(columns="DM")
    .melt(col_level="model", ignore_index=False)
    .pivot_table(values="value", index="project", columns=["metric", "model"], sort=False)
)
performances.round(2)

In [None]:
# Scores per project with summary rows, exported as LaTeX.
table = performances.copy()
for summary, aggregate in (("Average", performances.mean), ("Median", performances.median)):
    table.loc[summary] = aggregate()

# Render each cell as "<score> (<improvement over the baseline>%)".
metrics = ["aucroc", "aucpr"]
for metric in metrics:
    score = table[metric, "CB"].round(2).astype(str)
    gain = table[metric, "CB%"].round().astype(int).astype(str)
    table[metric, "CB"] = score + " (" + gain + "%)"

# The percentage columns are now embedded in the text, so drop them.
table = table.drop(columns=[(metric, "CB%") for metric in metrics]).droplevel("model", axis=1)
table
print(table.to_latex())

In [None]:
# Average over the labels of each project. Rounding is deliberately left to the
# final formatting step: rounding here would make the relative improvements
# below be computed from already-rounded scores.
precisions = precisions.groupby("project", sort=False).mean()
recalls = recalls.groupby("project", sort=False).mean()

In [None]:
# Relative improvement over the dummy baseline, followed by an "Average" row.
for scores in (precisions, recalls):
    scores["%"] = ((scores["CB"] / scores["DM"]) - 1) * 100
    scores.loc["Average"] = scores.mean()

# Render each cell as "<score> (<improvement>%)".
table = pd.DataFrame()
for column, scores in (("precision", precisions), ("recall", recalls)):
    table[column] = scores["CB"].round(2).astype(str) + " (" + scores["%"].round().astype(int).astype(str) + "%)"

table
print(table.to_latex())

In [None]:
importances = joblib.load("importances_contributors_generic.joblib")

# Index by (project, metric), order the projects as in `projects`, then switch
# projects and columns to their display names.
project_order = {slug: position for position, slug in enumerate(projects)}
importances = importances.set_index(["project", "metric"]).sort_index(
    key=lambda index: index.map(project_order), level="project"
)
importances = importances.rename(projects, level="project").rename(columns=characteristics)
importances.round(3)

In [None]:
# Rank the characteristics per held-out project with Scott-Knott ESD, once per metric.
features_rankings_aucroc = []
features_rankings_aucpr = []

for project in projects.values():
    # The selected rows are duplicated before being passed to sk_esd —
    # presumably because it needs more than one observation per column; TODO confirm.
    skesd_aucroc = scottknottesd.sk_esd(
        pd.concat([importances.query("project == @project and metric == 'aucroc'")] * 2, ignore_index=True),
        version="np",
    )
    skesd_aucpr = scottknottesd.sk_esd(
        pd.concat([importances.query("project == @project and metric == 'aucpr'")] * 2, ignore_index=True),
        version="np",
    )

    # Presumably element [1] holds the rank groups and element [3] the 1-based
    # column order they refer to — TODO confirm against the ScottKnottESD docs.
    # Columns are mapped back through the key order of `characteristics`.
    features_rankings_aucroc.append(
        pd.DataFrame(
            [skesd_aucroc[1].astype(int)],
            columns=[list(characteristics)[i] for i in [i - 1 for i in skesd_aucroc[3]]],
            index=[project],
        )
    )
    features_rankings_aucpr.append(
        pd.DataFrame(
            [skesd_aucpr[1].astype(int)],
            columns=[list(characteristics)[i] for i in [i - 1 for i in skesd_aucpr[3]]],
            index=[project],
        )
    )

# One row per project plus Average/Median rows; columns sorted by average rank.
print("auc-roc:")
features_rankings_aucroc = pd.concat(features_rankings_aucroc).rename(columns=characteristics)
table = features_rankings_aucroc.copy()
table.loc["Average"] = features_rankings_aucroc.mean()
table.loc["Median"] = features_rankings_aucroc.median()
features_rankings_aucroc = table.sort_values("Average", axis=1)
features_rankings_aucroc

print("auc-pr:")
features_rankings_aucpr = pd.concat(features_rankings_aucpr).rename(columns=characteristics)
table = features_rankings_aucpr.copy()
table.loc["Average"] = features_rankings_aucpr.mean()
table.loc["Median"] = features_rankings_aucpr.median()
features_rankings_aucpr = table.sort_values("Average", axis=1)
features_rankings_aucpr

In [None]:
# Distribution of the per-project AUC-ROC ranks of each characteristic
# (summary rows excluded).
fig, ax = plt.subplots(figsize=(14, 2.5))
_ = sns.violinplot(features_rankings_aucroc.drop(index=["Average", "Median"]), ax=ax, palette="YlOrRd_r")
_ = ax.set_ylabel("Rank")

# Put every word of a tick label on its own line.
xticklabels = ["\n".join(label.get_text().split(" ")) for label in ax.get_xticklabels()]
_ = ax.set_xticks(range(len(xticklabels)))
_ = ax.set_xticklabels(xticklabels)

# Ranks are integers, so restrict the y ticks to integers.
_ = ax.yaxis.get_major_locator().set_params(integer=True)
plt.tight_layout()
plt.savefig("importances_contributors_generic.pdf", bbox_inches="tight", pad_inches=0.01)