In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
from pywaffle import Waffle
from collections import OrderedDict, Counter
import plot_likert
import numpy as np

In [None]:
# Make sure the directory that receives the generated figures exists.
out_dir = pathlib.Path("./output")
out_dir.mkdir(exist_ok=True, parents=True)

# Apply seaborn's "whitegrid" style to the plots drawn below.
sns.set(style="whitegrid")

# Load data

In [None]:
# Read the two questionnaire exports and discard the columns listed below,
# which are not used in the rest of the notebook.
df_g = pd.read_excel("./input/g.xlsx").drop(columns=["Timestamp"])
df_m = pd.read_excel("./input/m.xlsx").drop(
    columns=["Id", "Heure de début", "Heure de fin", "Adresse de messagerie", "Nom"]
)

In [None]:
import re

# Lookup table used when renaming the questionnaire columns: each key is the
# leading text of a column header and each value is the short code that
# replaces the whole header. mapper_f matches every key against the start of
# a header, in the order listed here, and returns the code of the first hit.
#   "I<x>.<y>"  -> issue questions
#   "D:<name>"  -> demographic questions
# NOTE(review): the first key has two spaces after "Issue:" — presumably this
# mirrors the header in the spreadsheet; verify before "fixing" it.
mapper_d = {
    "Issue:  Evaluating the quality of a model offline": "I1.1",
    "Issue: Defining a good business metric for evaluating an MLSS is difficult": "I1.2",
    "Issue: Trying to simulate the environment": "I1.3",
    "Issue: Evaluating the quality of a dataset": "I1.4",
    "Issue: Explaining a model's predictions to people without ML knowledge": "I2.1",
    "Issue: The explanation techniques sometimes": "I2.2",
    "Issue: Reproducing bugs in an MLSS": "I3.1",
    "Issue: Debugging data streaming systems": "I3.2",
    "Issue: Debugging an MLSS is time-consuming": "I3.3",
    "Issue: Training models consume a lot of resources": "I4.1",
    "Issue: The queries sent to an MLSS are not answered": "I4.2",
    "Issue: At inference time, ML models consume too much memory": "I4.3",
    "Issue: Maintaining an MLSS is difficult because": "I5.1",
    "Issue: Maintaining a model is difficult": "I5.2",
    "Issue: Managing the dependencies": "I5.3",
    "Issue: Having a reliable model is difficult because of concept or data drift": "I6.1",
    "Issue: Having a reliable model is difficult because of external data providers": "I6.2",
    "Issue: Having a reliable MLSS is difficult because of the data pipelines which are brittle and have technical": "I6.3",
    # Demographic questions.
    "What is your job role?": "D:job_role",
    "How many years of professional": "D:experience_g",
    "How many years of experience": "D:experience_ml",
    "If you are interested": "D:email",
}


def mapper_f(col_name):
    """Translate a raw questionnaire column header into its short code.

    Non-breaking spaces in the header are first replaced by regular spaces.
    Then, in this order:

    * a header starting with "Do you have any" (free-text comment question)
      becomes "C:" followed by the last space-separated word of the header
      without its final character;
    * a header starting with one of the keys of ``mapper_d`` (optionally
      preceded by a single whitespace character) becomes the matching value;
    * any other header is returned unchanged (after the space normalisation).

    Args:
        col_name: the original column header (a string).

    Returns:
        The short code, or the normalised header when nothing matches.
    """
    col_name = col_name.replace("\xa0", " ")

    # Match comments questions
    if col_name.startswith("Do you have any"):
        return "C:" + col_name.split(" ")[-1][:-1]

    # Match RQs + demographic questions.
    # The keys are literal header prefixes, so they are escaped before being
    # used in the pattern; otherwise characters such as "?" would be read as
    # regex operators and headers that merely resemble a key would match.
    for prefix, code in mapper_d.items():
        if re.match(r"\s?" + re.escape(prefix), col_name) is not None:
            return code

    return col_name


# Replace the long headers of both frames by their short codes.
df_g = df_g.rename(columns=mapper_f)
df_m = df_m.rename(columns=mapper_f)

In [None]:
# Issue-question columns: every df_g column whose name starts with "I".
col_rq = [name for name in df_g.columns if name.startswith("I")]

# df_m stores the answers as words; convert them to the 1-5 numeric scale.
df_m = df_m.replace(dict(never=1, rarely=2, sometimes=3, often=4, frequently=5))

In [None]:
# Stack the answers of both questionnaires into a single frame and label
# respondents without a job role as "Unknown".
df = pd.concat([df_g, df_m], axis=0)
df["D:job_role"] = df["D:job_role"].fillna("Unknown")

In [None]:
# Merge job titles that should be counted under the same label.
job_role_aliases = {"AI Engineer": "ML Engineer", "Manager (e.g. Director)": "Manager"}
df["D:job_role"] = df["D:job_role"].replace(job_role_aliases)

In [None]:
# Exclude practitioners with less than 3 years of ML experience (the "0-2"
# bucket). The explicit .copy() makes the result an independent frame, so the
# column assignments made on `df` afterwards do not raise pandas'
# SettingWithCopyWarning.
df = df[df["D:experience_ml"] != "0-2"].copy()

In [None]:
# Relabel the experience buckets; values that have no entry in the mapping
# are kept unchanged.
experience_mapper = {"0-2": "1-3", "3-5": "3-6", "6-9": "6-10"}
for exp_col in ("D:experience_g", "D:experience_ml"):
    df[exp_col] = df[exp_col].apply(lambda k: experience_mapper.get(k, k))

# Demographics

### Experience (ML + general)

In [None]:
def generate_experience_total_plot(df):
    """Plot the number of practitioners per experience bucket.

    Draws a grouped count plot with one bar group per bucket and one hue per
    kind of experience ("in general" / "with ML"), saves it as
    ``experience_total_questionnaire.pdf`` in ``out_dir`` and shows it.

    Args:
        df: frame containing the "D:experience_g" and "D:experience_ml"
            columns.
    """
    # Long format for seaborn: one row per (kind of experience, bucket).
    long_df = (
        df[["D:experience_g", "D:experience_ml"]]
        .rename(columns={"D:experience_ml": "with ML", "D:experience_g": "in general"})
        .melt(var_name="Professional experience")
    )

    bucket_order = ["1-3", "3-6", "6-10", "10+"]
    two_colors = sns.color_palette("colorblind")[:2]

    # The rows are passed in reverse order — presumably so that "with ML" is
    # met first and therefore becomes the first hue level.
    ax = sns.countplot(
        data=long_df.iloc[::-1],
        x="value",
        hue="Professional experience",
        order=bucket_order,
        palette=two_colors,
    )
    ax.set(xlabel="years of experience", ylabel="# practitioners", ylim=(0, 15))

    plt.savefig(out_dir / "experience_total_questionnaire.pdf", bbox_inches="tight")
    plt.show()


generate_experience_total_plot(df)

In [None]:
def put_unkonwn_at_end(d):
    """Return the items of ``d`` as an OrderedDict with "Unknown" moved last.

    All other keys keep their relative order. When ``d`` has no "Unknown"
    key the items are returned in their original order (no key is added).

    Args:
        d: mapping whose iteration order is meaningful.

    Returns:
        A new OrderedDict; ``d`` itself is not modified.
    """
    res = OrderedDict((k, v) for k, v in d.items() if k != "Unknown")
    # Only re-append the key when it exists; looking it up unconditionally
    # would raise KeyError for data without any "Unknown" entry.
    if "Unknown" in d:
        res["Unknown"] = d["Unknown"]
    return res


def generate_waffle_plot(df, col_name, file_name=None):
    """Draw a waffle chart of how often each value occurs in a column.

    Missing values are ignored. Categories are ordered from most to least
    common, then passed through ``put_unkonwn_at_end``. Each legend entry
    shows the category and its share of the answers, rounded to a whole
    percentage.

    Args:
        df: frame holding the column to summarise.
        col_name: name of the column to count.
        file_name: when given (and non-empty), the figure is saved as
            ``<file_name>.pdf`` in ``out_dir``.
    """
    counts = OrderedDict(Counter(df[col_name].dropna().tolist()).most_common())
    counts = put_unkonwn_at_end(counts)

    total = sum(counts.values())
    legend_labels = [
        f"{category} ({100 * (n / total):.0f}%)" for category, n in counts.items()
    ]
    legend_options = dict(
        loc="lower center",
        bbox_to_anchor=(0.5, -0.4),
        fontsize=11,
        ncol=2,
    )

    plt.figure(
        FigureClass=Waffle,
        columns=6,
        values=counts,
        labels=legend_labels,
        colors=sns.color_palette("colorblind")[: len(counts)],
        legend=legend_options,
        icons="user",
        font_size=45,
        icon_legend=True,
        block_arranging_style="snake",
    )

    if file_name:
        plt.savefig(out_dir / f"{file_name}.pdf", bbox_inches="tight")


def generate_job_role_waffle_plot(df):
    """Draw the waffle chart of the "D:job_role" column and save it as
    ``job_role_questionnaire.pdf``."""
    generate_waffle_plot(df, col_name="D:job_role", file_name="job_role_questionnaire")


generate_job_role_waffle_plot(df)

# Number of answers per question

In [None]:
from matplotlib.ticker import MaxNLocator


def generate_n_question_plot(df):
    """Bar plot of the number of non-missing answers per question.

    Only the columns listed in ``col_rq`` are counted. The figure is saved
    as ``n_answers.pdf`` in ``out_dir`` and then shown.

    Args:
        df: frame containing the ``col_rq`` columns.
    """
    n_answers = df[col_rq].count(axis=0)
    bar_color = sns.color_palette("colorblind")[0]

    ax = sns.barplot(x=n_answers.index, y=n_answers, color=bar_color)
    ax.set_ylabel("# answers")
    # Counts are integers, so only integer ticks make sense on the y axis.
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    plt.xticks(rotation=45, ha="right", rotation_mode="anchor")

    plt.savefig(out_dir / "n_answers.pdf", bbox_inches="tight")
    plt.show()


generate_n_question_plot(df[col_rq])

# Average Likert score

In [None]:
def generate_average_score_plot(df):
    """Bar plot of the mean score of each column, highest mean first.

    The y axis is restricted to the [2, 4] range. The figure is saved as
    ``average_score.pdf`` in ``out_dir`` and then shown.

    Args:
        df: frame whose columns are the (numeric) scores to average.
    """
    mean_scores = df.mean(axis=0)
    ranked = mean_scores.sort_values(ascending=False)
    bar_color = sns.color_palette("colorblind")[0]

    ax = sns.barplot(x=ranked.index, y=ranked, color=bar_color)
    ax.set_ylabel("average score")
    ax.set_ylim(2, 4)
    plt.xticks(rotation=45, ha="right", rotation_mode="anchor")

    plt.savefig(out_dir / "average_score.pdf", bbox_inches="tight")
    plt.show()


generate_average_score_plot(df[col_rq])

In [None]:
from collections import Counter

# For every question, list how many times each score was given, sorted by
# score; missing answers are counted under the score -1.
# The per-question lists are built explicitly instead of through
# DataFrame.apply: when all lists happen to have the same length, apply
# expands them into a DataFrame, and pairing its items with its index would
# then print column names against row positions instead of the counts.
df_c = pd.Series(
    {
        question: sorted(Counter(answers).items(), key=lambda item: item[0])
        for question, answers in df[col_rq].fillna(-1).items()
    },
    dtype=object,
)

for i, x in df_c.items():
    print(f"{i} : {x}")

# Mode Likert Score

In [None]:
def generate_mode_score_plot(df):
    """Bar plot of the most common score of each column, highest first.

    When a column has several modes, the largest one is plotted: missing
    cells of the mode table are filled with 0 and the maximum is taken per
    column. The figure is saved as ``mode_score.pdf`` in ``out_dir`` and
    then shown.

    Args:
        df: frame whose columns are the scores to summarise.
    """
    mode_table = df.mode(axis=0, dropna=True)
    highest_mode = mode_table.fillna(0).max(axis=0)
    ranked = highest_mode.sort_values(ascending=False)
    bar_color = sns.color_palette("colorblind")[0]

    ax = sns.barplot(x=ranked.index, y=ranked, color=bar_color)
    ax.set_ylabel("most common score")
    ax.set_yticks(list(range(6)))
    plt.xticks(rotation=45, ha="right", rotation_mode="anchor")

    plt.savefig(out_dir / "mode_score.pdf", bbox_inches="tight")
    plt.show()


generate_mode_score_plot(df[col_rq])

# Median Likert Score

In [None]:
def generate_median_score_plot(df):
    """Bar plot of the median score of each column, highest median first.

    The figure is saved as ``median_score.pdf`` in ``out_dir`` and then
    shown.

    Args:
        df: frame whose columns are the (numeric) scores to summarise.
    """
    series = df.median(axis="rows").sort_values(ascending=False)

    # Create the plot
    ax = sns.barplot(x=series.index, y=series, color=sns.color_palette("colorblind")[0])
    ax.set_ylabel("median score")
    ax.set_yticks([0, 1, 2, 3, 4, 5])
    plt.xticks(rotation=45, ha="right", rotation_mode="anchor")

    # Saved under its own name: writing to "mode_score.pdf" would overwrite
    # the figure produced by the mode plot.
    plt.savefig(out_dir / "median_score.pdf", bbox_inches="tight")
    plt.show()


generate_median_score_plot(df[col_rq])

# 4-5 Likert score

In [None]:
def generate_4_5_score_plot(df):
    """Bar plot of the share of 4 and 5 ratings per question, highest first.

    For every column of ``col_rq`` the share is computed over the
    non-missing answers only. The figure is saved as
    ``percentage_frequent.pdf`` in ``out_dir``.

    Args:
        df: frame containing the ``col_rq`` columns.
    """
    # Imported locally so the function does not depend on an extra
    # notebook-level import.
    from matplotlib.ticker import PercentFormatter

    def count_n_4_5_scores(series):
        """Return the fraction (0-1) of non-missing values equal to 4 or 5."""
        vc = series.value_counts(dropna=True, normalize=True)
        return vc.get(4, 0) + vc.get(5, 0)

    series = df[col_rq].apply(count_n_4_5_scores).sort_values(ascending=False)

    # Create the plot
    ax = sns.barplot(x=series.index, y=series, color=sns.color_palette("colorblind")[0])
    ax.set_ylabel("percentage of ratings")
    # The values are fractions in [0, 1]; show the ticks as percentages so
    # that they agree with the axis label.
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))
    plt.xticks(rotation=45, ha="right", rotation_mode="anchor")

    plt.savefig(out_dir / "percentage_frequent.pdf", bbox_inches="tight")


generate_4_5_score_plot(df[col_rq])