In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter, OrderedDict
from pywaffle import Waffle
import pathlib


In [None]:
# Load the interview sheet, drop the rows whose 'State' marks them as
# out of scope, and label every missing cell as 'Unknown'.
EXCLUDED_STATES = ['Refused', 'Reminder sent', 'excluded']
df = (
    pd.read_csv('./input/input.csv')
    .loc[lambda frame: ~frame['State'].isin(EXCLUDED_STATES)]
    .fillna('Unknown')
)

# Make sure the directory that receives the exported figures exists
out_dir = pathlib.Path('./output')
out_dir.mkdir(parents=True, exist_ok=True)

# White background with grid lines for every figure
# (sns.set_theme() is the preferred spelling from seaborn 0.11.0 onwards)
sns.set(style='whitegrid')

## Histograms

### Company size

In [None]:
def generate_company_size_plot(df):
    """Plot the number of interviewees per company-size bin and save it.

    The raw 'Company size' answers are first merged into coarser bins
    ('51-100' and '100-500' become '51-500'; '5000-10000' becomes
    '500-10000') and then drawn as a count plot in a fixed bin order.
    The figure is written to ``out_dir / 'company_size.pdf'`` and shown.

    The merging is done on a private plotting frame, so the caller's
    DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a 'Company size' column.
    """
    merged_bins = {
        '51-100': '51-500',
        '100-500': '51-500',
        '5000-10000': '500-10000',
    }
    bin_order = ['1-10', '11-50', '51-500', '500-10000', '10001+']

    # Series.replace returns a new Series. The previous chained assignment
    # (df[col][mask] = value) may write to a temporary object instead of the
    # frame, in which case the un-merged labels never reach the plot.
    sizes = df['Company size'].replace(merged_bins)
    plot_df = pd.DataFrame({'Company size': pd.Categorical(sizes, bin_order)})

    # Create the plot
    ax = sns.countplot(data=plot_df, x='Company size', color=sns.color_palette('colorblind')[0])
    ax.set_ylabel('# interviewees')
    ax.set_xlabel('number of employees')

    plt.savefig(out_dir / 'company_size.pdf', bbox_inches='tight')
    plt.show()


generate_company_size_plot(df)

### Experience (general)

In [None]:
def generate_experience_g_plot(df):
    """Draw, save and show the count plot of general professional experience.

    Note that the 'Experience (general)' column of ``df`` is overwritten in
    place with a categorical version so that the bars follow the bin order
    below. The figure is written to ``out_dir / 'experience_general.pdf'``.
    """
    column = 'Experience (general)'
    bins = ['0-2', '3-5', '6-9', '10+']

    # Fix the bar order by turning the column into a categorical
    df[column] = pd.Categorical(df[column], bins)

    bar_colour = sns.color_palette('colorblind')[0]
    ax = sns.countplot(x=column, data=df, color=bar_colour)
    ax.set(ylabel='# interviewees', ylim=(0, 12))

    plt.savefig(out_dir / 'experience_general.pdf', bbox_inches='tight')
    plt.show()


generate_experience_g_plot(df)

### Experience (ML)

In [None]:
def generate_experience_ml_plot(df):
    """Draw, save and show the count plot of ML-specific experience.

    Note that the 'Experience (ML)' column of ``df`` is overwritten in place
    with a categorical version so that the bars follow the bin order below.
    The figure is written to ``out_dir / 'experience_ml.pdf'``.
    """
    column = 'Experience (ML)'
    bins = ['0-2', '3-5', '6-9', '10+']

    # Fix the bar order by turning the column into a categorical
    df[column] = pd.Categorical(df[column], bins)

    bar_colour = sns.color_palette('colorblind')[0]
    ax = sns.countplot(x=column, data=df, color=bar_colour)
    ax.set(ylabel='# interviewees', ylim=(0, 15))

    plt.savefig(out_dir / 'experience_ml.pdf', bbox_inches='tight')
    plt.show()


generate_experience_ml_plot(df)

### Experience (ML + general)

In [None]:
def generate_experience_total_plot(df):
    """Plot ML and general experience side by side as grouped bars.

    The two experience columns are renamed to their legend labels and melted
    into long format (one row per answer) so that seaborn can use the column
    of origin as the hue. The figure is written to
    ``out_dir / 'experience_total.pdf'`` and shown.
    """
    legend_names = {"Experience (ML)": "with ML", "Experience (general)": "in general"}

    # Reshape to long format: 'Professional experience' holds the origin
    # column, 'value' holds the answer.
    long_df = (
        df[['Experience (ML)', 'Experience (general)']]
        .rename(columns=legend_names)
        .melt(var_name="Professional experience")
    )

    two_colours = sns.color_palette('colorblind')[:2]
    ax = sns.countplot(
        x='value',
        hue='Professional experience',
        data=long_df,
        order=['0-2', '3-5', '6-9', '10+'],
        palette=two_colours,
    )
    ax.set(xlabel='years of experience', ylabel='# interviewees', ylim=(0, 15))

    plt.savefig(out_dir / 'experience_total.pdf', bbox_inches='tight')
    plt.show()


generate_experience_total_plot(df)

## Pie and waffle charts

In [None]:
def put_unkonwn_at_end(d):
    """Return the items of ``d`` as an OrderedDict with 'Unknown' moved last.

    All other keys keep their relative order. When ``d`` has no 'Unknown'
    key the result holds the same items in the same order; no 'Unknown'
    entry is invented.

    Parameters
    ----------
    d : mapping
        Mapping whose iteration order is meaningful (e.g. an OrderedDict).

    Returns
    -------
    collections.OrderedDict
        A new mapping; ``d`` itself is not modified.
    """
    res = OrderedDict((k, v) for k, v in d.items() if k != 'Unknown')
    # Only append 'Unknown' when it is actually present; an unconditional
    # lookup raises KeyError for columns without missing answers.
    if 'Unknown' in d:
        res['Unknown'] = d['Unknown']
    return res




def generate_waffle_plot(df, col_name, file_name=None):
    """Draw a waffle chart of the value counts found in ``df[col_name]``.

    Categories are ordered by decreasing count, with 'Unknown' moved to the
    end by ``put_unkonwn_at_end``. Each legend entry shows the category name
    followed by its share of the total, formatted as a whole-number
    percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarise.
    col_name : str
        Name of the column whose values are counted.
    file_name : str, optional
        When given, the figure is saved as ``out_dir / f'{file_name}.pdf'``.
    """
    cnt = OrderedDict(Counter(df[col_name].dropna().tolist()).most_common())
    cnt = put_unkonwn_at_end(cnt)
    tot = sum(cnt.values())
    labels = [f'{name} ({100 * (n / tot):.0f}%)' for name, n in cnt.items()]

    plt.figure(
        FigureClass=Waffle,
        columns=7,
        values=cnt,
        labels=labels,
        # Ask seaborn for exactly one colour per category. Slicing the
        # default palette yields at most 10 colours, which no longer matches
        # the number of values once a column has more than 10 categories.
        colors=list(sns.color_palette('colorblind', n_colors=len(cnt))),
        legend={'loc': 'lower center', 'bbox_to_anchor': (0.5, -0.3), 'fontsize':11, 'ncol': 2,},
        icons='user', font_size=30,
        icon_legend=True,
        block_arranging_style='snake'
    )

    if file_name:
        plt.savefig(out_dir/f'{file_name}.pdf', bbox_inches='tight')

    

### Sector

In [None]:

def generate_sector_pie_plot(df):
    """Show a donut-style pie chart of the 'Sector' column.

    Sectors are ordered by decreasing count; every wedge is slightly exploded
    and annotated with its percentage. A white disc drawn over the centre
    turns the pie into a ring. The figure is only shown, not saved.
    """
    sector_counts = Counter(df['Sector'].dropna().tolist()).most_common()
    sectors = [name for name, _ in sector_counts]
    sizes = [count for _, count in sector_counts]

    fig1, ax1 = plt.subplots()
    ax1.pie(
        sizes,
        labels=sectors,
        colors=sns.color_palette(),
        explode=[.05] * len(sizes),
        autopct='%1.1f%%',
        startangle=90,
        pctdistance=0.85,
    )

    # White disc over the centre of the pie
    ax1.add_artist(plt.Circle((0, 0), 0.70, fc='white'))

    # Equal aspect ratio so the pie is drawn as a circle
    ax1.axis('equal')
    plt.tight_layout()
    plt.show()


generate_sector_pie_plot(df)

In [None]:
def generate_sector_waffle_plot(df):
    """Draw the waffle chart of the 'Sector' column, saved as 'sector_waffle'."""
    generate_waffle_plot(df, col_name='Sector', file_name='sector_waffle')


generate_sector_waffle_plot(df)

### Where

In [None]:
def generate_where_waffle_plot(df):
    """Draw the waffle chart of the 'Where' column, saved as 'world_region'.

    Uses its own function name so that it does not redefine (and shadow) the
    function that plots the 'Sector' column.
    """
    generate_waffle_plot(df, 'Where', file_name='world_region')


generate_where_waffle_plot(df)

### Job role

In [None]:
def generate_job_role_waffle_plot(df):
    """Draw the waffle chart of the 'Job title' column, saved as 'job_role'.

    Uses its own function name so that it does not redefine (and shadow) the
    function that plots the 'Sector' column.
    """
    generate_waffle_plot(df, 'Job title', file_name='job_role')


generate_job_role_waffle_plot(df)