In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
from pywaffle import Waffle
from collections import OrderedDict, Counter
import plot_likert
import numpy as np


In [None]:
# Folder that receives every exported figure; create it (and any missing
# parents) up front, and do nothing if it is already there.
out_dir = pathlib.Path('./output')
out_dir.mkdir(exist_ok=True, parents=True)

# Apply seaborn's "whitegrid" style to all plots drawn below.
sns.set(style='whitegrid')

# Load data

In [None]:
# Columns removed from each spreadsheet right after it is read.
g_unused_cols = ['Timestamp']
m_unused_cols = ['Id', 'Heure de début', 'Heure de fin', 'Adresse de messagerie', 'Nom']

df_g = pd.read_excel('./input/g.xlsx').drop(columns=g_unused_cols)
df_m = pd.read_excel('./input/m.xlsx').drop(columns=m_unused_cols)

In [None]:
import re 

# Lookup table: leading text of a survey question -> short column identifier.
# Each key is matched as a literal prefix of the column header by mapper_f().
# 'RQ…' identifiers are the ones later selected with startswith('RQ');
# 'D:…' identifiers are the demographic questions.
mapper_d = {
    "Issue:  Evaluating the quality of a model offline": 'RQ1.1',
    'Issue: Defining a good business metric for evaluating an MLSS is difficult':'RQ1.2',
    'Issue: Trying to simulate the environment':'RQ1.3',
    'Issue: Evaluating the quality of a dataset':'RQ1.4',
    "Issue: Explaining a model's predictions to people without ML knowledge":'RQ2.1',
    'Issue: The explanation techniques sometimes':'RQ2.2',
    'Issue: Reproducing bugs in an MLSS':'RQ3.1',
    'Issue: Debugging data streaming systems':'RQ3.2',
    'Issue: Debugging an MLSS is time-consuming':'RQ3.3',
    'Issue: Training models consume a lot of resources':'RQ4.1',
    'Issue: The queries sent to an MLSS are not answered':'RQ4.2',
    'Issue: At inference time, ML models consume too much memory':'RQ4.3',
    "Issue: Maintaining an MLSS is difficult because":"RQ5.1",
    "Issue: Maintaining a model is difficult":"RQ5.2",
    "Issue: Managing the dependencies":"RQ5.3",
    "Issue: Having a reliable model is difficult because of concept or data drift":"RQ6.1",
    "Issue: Having a reliable model is difficult because of external data providers":"RQ6.2",
    "Issue: Having a reliable MLSS is difficult because of the data pipelines which are brittle and have technical":"RQ6.3",
    'What is your job role?':'D:job_role',
    "How many years of professional":"D:experience_g",
    "How many years of experience":"D:experience_ml",
    "If you are interested":"D:email",
}

def mapper_f(col_name):
    """Translate a raw survey column header into its short identifier.

    Parameters
    ----------
    col_name : str
        Column header as read from the spreadsheet.

    Returns
    -------
    str
        * ``'C:<last word>'`` (last word minus its final character) for
          headers starting with "Do you have any";
        * the ``mapper_d`` value whose key is a prefix of the header
          (one optional leading whitespace character is tolerated);
        * the header itself, with non-breaking spaces replaced by plain
          spaces, when nothing matches.
    """
    # Non-breaking spaces would defeat the literal comparisons below.
    col_name = col_name.replace('\xa0', ' ')

    # Free-text comment questions: keep only the last word, without its final
    # character. Trailing whitespace is stripped first, otherwise the "last
    # word" would be empty and every such column would collapse to 'C:'.
    if col_name.startswith("Do you have any"):
        return 'C:' + col_name.rstrip().split(' ')[-1][:-1]

    # RQ + demographic questions. The keys are plain text, not patterns, so
    # they are escaped: an unescaped '?' (as in the job-role key) would act as
    # a regex quantifier instead of matching a literal question mark.
    for prefix, short_name in mapper_d.items():
        if re.match(r'^\s?' + re.escape(prefix), col_name) is not None:
            return short_name

    return col_name

# Give both survey frames the same short column names so they can be stacked.
df_g, df_m = (frame.rename(columns=mapper_f) for frame in (df_g, df_m))


In [None]:
# Names of the research-question columns found in df_m.
col_rq = [name for name in df_m.columns if name.startswith('RQ')]

# Recode the textual frequency answers as ordinal scores 1..5.
# Note: the replacement is applied to every column of df_m, not only col_rq.
frequency_scores = {"never": 1, 'rarely': 2, 'sometimes': 3, 'often': 4, 'frequently': 5}
df_m = df_m.replace(frequency_scores)

In [None]:
# Stack both surveys into one frame. ignore_index=True gives the result a
# fresh 0..n-1 row index; without it the row labels of df_g and df_m are kept
# as-is, so the same label can appear twice and label-based row lookups
# become ambiguous.
df = pd.concat([df_g, df_m], ignore_index=True)

# Respondents without a job role are grouped under an explicit 'Unknown' label.
df['D:job_role'] = df['D:job_role'].fillna(value='Unknown')

In [None]:
# Preview only the first rows, and leave the e-mail column out so that
# respondents' addresses do not end up in the saved notebook output.
df.drop(columns=['D:email'], errors='ignore').head()

# Demographics

### Experience (ML + general)

In [None]:
def generate_experience_total_plot(df):
    """Plot how many respondents fall into each experience bracket.

    Draws one group of bars per bracket ('0-2', '3-5', '6-9', '10+'), split
    by kind of experience ("in general" vs. "with ML"), saves the figure to
    ``out_dir / 'experience_total.pdf'`` and shows it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'D:experience_g' and 'D:experience_ml'.
    """
    # Long format for seaborn: one row per (kind of experience, bracket) answer.
    experience_long = (
        df[['D:experience_g', 'D:experience_ml']]
        .rename(columns={"D:experience_ml": "with ML", "D:experience_g": "in general"})
        .melt(var_name="Professional experience")
    )

    # Use an explicit figure/axes instead of pyplot's implicit "current" ones,
    # so the plot never lands on (or saves) a leftover figure.
    fig, ax = plt.subplots()
    sns.countplot(
        data=experience_long,
        x='value',
        hue='Professional experience',
        order=['0-2', '3-5', '6-9', '10+'],
        palette=sns.color_palette('colorblind')[:2],
        ax=ax,
    )
    ax.set_xlabel('years of experience')
    ax.set_ylabel('# interviewees')
    # Fixed y-range: bars taller than 15 would be clipped.
    ax.set_ylim(0, 15)

    fig.savefig(out_dir / 'experience_total.pdf', bbox_inches='tight')
    plt.show()

generate_experience_total_plot(df)

In [None]:
def put_unkonwn_at_end(d):
    """Return an OrderedDict copy of ``d`` with the 'Unknown' entry moved last.

    All other entries keep their original relative order. When ``d`` has no
    'Unknown' key the result simply mirrors ``d`` (no key is invented and no
    KeyError is raised).

    Parameters
    ----------
    d : Mapping
        Mapping whose iteration order is meaningful (e.g. counts sorted by
        frequency).

    Returns
    -------
    collections.OrderedDict
    """
    res = OrderedDict((k, v) for k, v in d.items() if k != 'Unknown')
    # Only re-append 'Unknown' if it was actually present in the input.
    if 'Unknown' in d:
        res['Unknown'] = d['Unknown']
    return res




def generate_waffle_plot(df, col_name, file_name=None):
    """Draw a waffle chart of the value counts of one column.

    Categories are ordered by decreasing frequency, with 'Unknown' moved to
    the end. Each legend entry shows the category and its rounded share of
    the non-missing answers.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    col_name : str
        Column whose non-missing values are counted.
    file_name : str, optional
        When given, the figure is saved as ``out_dir / f'{file_name}.pdf'``.
    """
    cnt = OrderedDict(Counter(df[col_name].dropna().tolist()).most_common())
    cnt = put_unkonwn_at_end(cnt)
    tot = sum(cnt.values())
    labels = [f'{name} ({100 * (count / tot):.0f}%)' for name, count in cnt.items()]

    # Ask seaborn for exactly one colour per category. Slicing the default
    # palette yields at most 10 colours, which no longer matches the number
    # of values once there are more than 10 categories; with n_colors the
    # palette cycles instead.
    colors = list(sns.color_palette('colorblind', n_colors=len(cnt)))

    fig = plt.figure(
        FigureClass=Waffle,
        columns=5,
        values=cnt,
        labels=labels,
        colors=colors,
        legend={'loc': 'lower center', 'bbox_to_anchor': (0.5, -0.4), 'fontsize':11, 'ncol': 2,},
        icons='user', font_size=40,
        icon_legend=True,
        block_arranging_style='snake'
    )

    if file_name:
        fig.savefig(out_dir/f'{file_name}.pdf', bbox_inches='tight')


def generate_job_role_waffle_plot(df):
    """Draw the waffle chart of the 'D:job_role' column and save it as 'job_role.pdf'."""
    role_column, output_stem = 'D:job_role', 'job_role'
    generate_waffle_plot(df, role_column, file_name=output_stem)

generate_job_role_waffle_plot(df)

# Likert plots

In [None]:
def make_likert_plot(df, sorted=False):
    """Draw a percentage Likert plot of the given answer columns and save it.

    The figure is written to ``out_dir / 'likert.png'``.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per question; values are expected to be castable to
        nullable integers (missing answers allowed).
    sorted : bool, default False
        When True, columns are reordered by decreasing mean score before
        plotting. (The name shadows the builtin inside this function; it is
        kept because it is part of the signature.)
    """
    # Nullable integers, so that the string conversion below yields '1', '2', …
    # for answers and '<NA>' for gaps.
    scores = df.astype('Int64')
    if sorted:
        scores = scores[scores.mean(axis='rows').sort_values(ascending=False).index]

    # plot_likert is given the answers as strings; turn the textual '<NA>'
    # back into real missing values. np.nan is used because the np.NaN alias
    # no longer exists in NumPy 2.0.
    answers = scores.astype('str').replace('<NA>', np.nan)

    axes = plot_likert.plot_likert(answers, plot_likert.scales.raw5, plot_percentage=True)
    # Save next to the other figures instead of repeating the folder literal.
    axes.get_figure().savefig(out_dir / 'likert.png')

col_rq = [col for col in df_g if col.startswith('RQ')]
make_likert_plot(df[col_rq], sorted=False)

In [None]:
# Column-wise mean score of every research-question column.
df[col_rq].mean(axis=0)