In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
from pywaffle import Waffle
from collections import OrderedDict, Counter
import plot_likert
import numpy as np

In [None]:
# Make sure the output directory exists; missing parents are created and an
# already existing directory is left untouched.
out_dir = pathlib.Path("./output")
out_dir.mkdir(exist_ok=True, parents=True)

# Load data

In [None]:
# Read both survey exports and discard the columns that are not used below.
# NOTE(review): the dropped columns look like form metadata / respondent
# identifiers - confirm against the spreadsheets.
df_g = pd.read_excel("./input/g.xlsx").drop(columns=["Timestamp"])
df_m = pd.read_excel("./input/m.xlsx").drop(
    columns=["Id", "Heure de début", "Heure de fin", "Adresse de messagerie", "Nom"]
)

In [None]:
import re

# Maps the beginning of a survey column header to a short identifier.
# Each key is interpolated into a regular expression by ``mapper_f`` and
# matched against the start of the header, so a key only has to be the
# leading part of the header text.
# NOTE(review): because keys are used as regex patterns, the "?" in the
# job-role key acts as a quantifier rather than a literal question mark.
mapper_d = {
    "Issue:  Evaluating the quality of a model offline": "I1.1",
    "Issue: Defining a good business metric for evaluating an MLSS is difficult": "I1.2",
    "Issue: Trying to simulate the environment": "I1.3",
    "Issue: Evaluating the quality of a dataset": "I1.4",
    "Issue: Explaining a model's predictions to people without ML knowledge": "I2.1",
    "Issue: The explanation techniques sometimes": "I2.2",
    "Issue: Reproducing bugs in an MLSS": "I3.1",
    "Issue: Debugging data streaming systems": "I3.2",
    "Issue: Debugging an MLSS is time-consuming": "I3.3",
    "Issue: Training models consume a lot of resources": "I4.1",
    "Issue: The queries sent to an MLSS are not answered": "I4.2",
    "Issue: At inference time, ML models consume too much memory": "I4.3",
    "Issue: Maintaining an MLSS is difficult because": "I5.1",
    "Issue: Maintaining a model is difficult": "I5.2",
    "Issue: Managing the dependencies": "I5.3",
    "Issue: Having a reliable model is difficult because of concept or data drift": "I6.1",
    "Issue: Having a reliable model is difficult because of external data providers": "I6.2",
    "Issue: Having a reliable MLSS is difficult because of the data pipelines which are brittle and have technical": "I6.3",
    "What is your job role?": "D:job_role",
    "How many years of professional": "D:experience_g",
    "How many years of experience": "D:experience_ml",
    "If you are interested": "D:email",
}


def mapper_f(col_name):
    """Translate a raw survey column header into its short identifier.

    Non-breaking spaces in ``col_name`` are first replaced by regular spaces.
    Then, in order:

    * a header starting with "Do you have any" becomes ``"C:"`` followed by
      the last space-separated word of the header minus its final character;
    * a header whose start (after at most one leading whitespace character)
      matches a key of ``mapper_d`` becomes the associated value;
    * any other header is returned with only the space normalisation applied.

    Parameters
    ----------
    col_name : str
        Column header as read from the spreadsheet.

    Returns
    -------
    str
        The short identifier, or the normalised header when nothing matches.
    """
    col_name = col_name.replace("\xa0", " ")
    # Match comments questions
    if re.match("^Do you have any", col_name) is not None:
        return "C:" + col_name.split(" ")[-1][:-1]

    # Match RQs + demographic questions
    for k, v in mapper_d.items():
        # Raw f-string: "\s" in a non-raw literal is an invalid escape
        # sequence and triggers a SyntaxWarning on recent Python versions.
        if re.match(rf"^\s?{k}", col_name) is not None:
            return v

    return col_name


# Replace the verbose question headers of both frames by their short ids.
df_g = df_g.rename(columns=mapper_f)
df_m = df_m.rename(columns=mapper_f)

In [None]:
# NOTE(review): col_rq is reassigned right before the Likert plot call and is
# not read in between in the visible cells - this value looks unused.
col_rq = [name for name in df_m.columns if name.startswith("RQ")]

# Encode the textual frequency answers as integers from 1 to 5.
df_m = df_m.replace(
    to_replace={"never": 1, "rarely": 2, "sometimes": 3, "often": 4, "frequently": 5}
)

In [None]:
# Stack both surveys into one frame; a missing job role becomes "Unknown".
df = pd.concat([df_g, df_m])
df["D:job_role"] = df["D:job_role"].fillna("Unknown")

In [None]:
# Merge equivalent job titles under a single label.
df["D:job_role"] = df["D:job_role"].replace(
    to_replace={"AI Engineer": "ML Engineer", "Manager (e.g. Director)": "Manager"}
)

In [None]:
# Preview only the first rows, without the contact column, so that it is not
# persisted in the saved notebook output.
# NOTE(review): "D:email" presumably holds respondents' e-mail addresses - confirm.
df.drop(columns=["D:email"], errors="ignore").head()

# Likert plot

In [None]:
def make_likert_plot(df, sorted=False, out_path="./output/likert.pdf"):
    """Draw a percentage Likert chart of the given answers and save it.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per question. Values must be convertible to the nullable
        ``Int64`` dtype. NOTE(review): presumably answers on the 1-5 scale
        expected by ``plot_likert.scales.raw5`` - confirm.
    sorted : bool, default False
        When true, columns are ordered by decreasing mean answer before
        plotting.
    out_path : str or path-like, default "./output/likert.pdf"
        File the figure is written to.

    Returns
    -------
    None

    Notes
    -----
    ``plt.rcParams["axes.edgecolor"]`` is changed globally as a side effect.
    """

    def remove_label(xlabels, xvalues, label_value):
        # Keep only the (label, position) pairs whose text differs from
        # ``label_value``; returns the two filtered lists.
        kept = [
            (xlabel, xvalue)
            for xlabel, xvalue in zip(xlabels, xvalues)
            if xlabel.get_text() != label_value
        ]
        return [xlabel for xlabel, _ in kept], [xvalue for _, xvalue in kept]

    # ``astype`` already returns a new frame, so the caller's data is untouched.
    df = df.astype("Int64")
    if sorted:
        df = df[df.mean(axis="rows").sort_values(ascending=False).index]
    # The answers are handed to plot_likert as strings. Missing answers come
    # out of the string cast as "<NA>" and are turned back into real NaN.
    # ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
    df = df.astype("str").replace("<NA>", np.nan)

    # Change background
    plt.rcParams["axes.edgecolor"] = "lightgray"

    axes = plot_likert.plot_likert(
        df, plot_likert.scales.raw5, plot_percentage=True, linestyle="-"
    )
    axes.set_ylabel("Quality Issue Id")
    # Change limits: only the left bound is overridden.
    xlim = axes.get_xlim()
    axes.set_xlim((-22, xlim[-1]))
    # Rebuild the tick list: drop the "100%" tick and add one at x=0
    # labelled "75%".
    xvalues = axes.get_xticks()
    xlabels = axes.get_xticklabels()
    xlabels, xvalues = remove_label(xlabels, xvalues, label_value="100%")
    xvalues.append(0)
    xlabels.append("75%")
    axes.set_xticks(xvalues)
    axes.set_xticklabels(xlabels)
    axes.grid(axis="both", color="lightgray", linestyle="-")
    axes.set_axisbelow(True)
    axes.tick_params(color="white")

    axes.get_figure().savefig(out_path)


# Select the columns whose name starts with "I" and plot them unsorted.
col_rq = [name for name in df_g.columns if name.startswith("I")]
make_likert_plot(df[col_rq], sorted=False)