In [1]:
import ast
import csv

import gender_guesser.detector as gender
import numpy as np
import pandas as pd
import pycountry
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.palettes import Spectral6, Viridis, Viridis6
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from gender_guesser.detector import NoCountryError
from rich.jupyter import print

In [2]:
# Route Bokeh's show() output to the notebook instead of a standalone HTML file.
output_notebook()

In [3]:
# Shared gender_guesser detector; get_att_gender() reads it as a module-level global.
d = gender.Detector()

In [4]:
def make_json(csvFilePath):
    """Read a CSV export into a dict of row dicts keyed by the row's ``id``.

    Every row gets an extra ``"ticket"`` entry: the ``"en"`` value of the
    dict literal stored in the ``"name"`` column, or ``None`` when that
    column is missing, is not a valid Python literal, or is not a dict.

    Args:
        csvFilePath: Path of a UTF-8 encoded CSV file with a header row that
            contains at least an ``id`` column.

    Returns:
        dict mapping each row's ``id`` string to the row dict (all CSV
        columns plus ``"ticket"``). Rows sharing an ``id`` overwrite each
        other; the last one wins.

    Raises:
        KeyError: if a row has no ``id`` column.
        OSError: if the file cannot be opened.
    """
    # Everything ast.literal_eval() is documented to raise, plus the errors
    # produced by a missing "name" column or a literal that is not a dict.
    unparsable = (
        ValueError,
        SyntaxError,
        TypeError,
        AttributeError,
        KeyError,
        MemoryError,
        RecursionError,
    )
    data = {}
    # newline="" lets the csv module handle newlines embedded in quoted fields.
    with open(csvFilePath, encoding="utf-8", newline="") as csvf:
        for row in csv.DictReader(csvf):
            try:
                row["ticket"] = ast.literal_eval(row["name"]).get("en")
            except unparsable:
                # Best effort: a malformed name must not abort the whole load.
                row["ticket"] = None
            data[row["id"]] = row
    return data


def get_att_gender(name, country):
    """Guess an attendee's gender from a first name and a country name.

    The country is translated to the spelling expected by the module-level
    detector ``d`` for three special cases and lower-cased otherwise.

    Returns ``"male"``, ``"female"``, ``"not_say"`` (for the detector's
    ``"andy"`` answer) or ``np.nan`` when the detector answers ``"unknown"``,
    rejects the country, or the country has no ``lower()`` method.
    """
    detector_country = {
        "United Kingdom": "great_britain",
        "United States": "usa",
        "Russian Federation": "russia",
    }
    # Only strings can match an alias; anything else falls through unchanged
    # and is handled by the AttributeError branch below.
    if isinstance(country, str):
        country = detector_country.get(country, country)

    try:
        guess = d.get_gender(name, country.lower())
    except (NoCountryError, AttributeError):
        guess = np.nan

    # this is weak --- :shrug:
    collapsed = {
        "unknown": np.nan,
        "mostly_male": "male",
        "mostly_female": "female",
        "andy": "not_say",
    }
    return collapsed.get(guess, guess)


def get_country_name(country):
    """Return the pycountry name for an alpha-2 country code.

    Falls back to ``"Italy"`` when the code is unknown or is not a usable
    string (for example NaN from a missing CSV cell).
    """
    try:
        return pycountry.countries.get(alpha_2=country).name
    except (AttributeError, LookupError, TypeError):
        # AttributeError: get() returned None, so there is no .name.
        # LookupError / TypeError: the lookup itself rejected the value.
        return "Italy"

In [5]:
# Load the 2022 attendee export: one dict per CSV row.
pycon2022_data = make_json("data/stats_users_latest.csv")
pycon2022 = pd.DataFrame(list(pycon2022_data.values()))
pycon2022["conf"] = "PyCon Italia 2022"
pycon2022 = pycon2022.drop(columns=["name", "item_id", "id", "is_speaker"])
# Trim whitespace in every column and turn empty cells into NaN.
pycon2022 = pycon2022.apply(lambda x: x.str.strip()).replace("", np.nan)
# First word of the ticket name. The .str accessor propagates NaN for rows
# without a ticket; a plain `x.split() if x else x` crashes there because
# NaN is truthy.
pycon2022["ticket_type"] = pycon2022["ticket"].str.split().str[0]
current_edition = pycon2022[
    ["conf", "country", "ticket", "ticket_type", "gender", "age"]
]

In [6]:
# Ticket export for earlier editions (presumably PyCon X, 9 and 8, going by
# the file name -- confirm against the data).
pycon1098 = pd.read_csv("data/pycon10-9-8.csv")
# Fare code -> ticket name; the first word of each name is later used as the
# ticket type.
fare_real = {
    "TESC": "Business Early Ticket",
    "TESP": "Personal Early Ticket",
    "TESS": "Student Early Ticket",
    "TRSC": "Business Regular Ticket",
    "TRSP": "Personal Regular Ticket",
    "TRSS": "Student Regular Ticket",
    "TOSC": "Business Late Ticket",
    "TOSP": "Personal Late Ticket",
    "TOSS": "Student Late Ticket",
}

In [7]:
# Positional rename: the CSV must have exactly these seven columns.
pycon1098.columns = [
    "name",
    "card_name",
    "conf",
    "fare",
    "d_name",
    "fare_name",
    "country",
]
# An unknown fare code raises KeyError here instead of silently becoming NaN.
pycon1098["ticket"] = pycon1098["fare"].apply(lambda code: fare_real[code])
# Prefer the attendee name, then the card name, then a placeholder.
pycon1098["name"] = pycon1098["name"].fillna(pycon1098["card_name"]).fillna("user")
pycon1098["country_name"] = pycon1098["country"].apply(get_country_name)
# First token of each name. A whitespace-only name has no tokens, so it falls
# back to the same "user" placeholder instead of raising IndexError.
first_names = pycon1098["name"].str.split().str[0].fillna("user")
pycon1098["gender"] = [
    get_att_gender(first_name, country_name)
    for first_name, country_name in zip(first_names, pycon1098["country_name"])
]
pycon1098["ticket_type"] = pycon1098["ticket"].str.split().str[0]
# Age was not collected for these editions.
pycon1098["age"] = np.nan
pycon1098 = pycon1098.drop(
    columns=["card_name", "d_name", "fare_name", "fare", "name", "country_name"]
)
last_3_editions = pycon1098[
    ["conf", "country", "ticket", "ticket_type", "gender", "age"]
]

In [8]:
# Stack the 2022 edition on top of the three earlier ones; both frames share
# the same six columns. Original row labels are kept, so the index of `data`
# is not unique.
frames = [current_edition, last_3_editions]
data = pd.concat(frames)

# Tickets Sold

In [9]:
# Non-null counts per edition, ordered newest first.
tickets_per_conf = (
    data.groupby("conf")
    .count()
    .reindex(["PyCon Italia 2022", "PyCon X", "PyCon 9", "PyCon 8"])
)
source = ColumnDataSource(
    data=dict(
        confs=list(tickets_per_conf.index),
        counts=list(tickets_per_conf.ticket),
        # One colour per edition (four editions -> four palette entries).
        color=Viridis6[1:5],
    )
)

# `width` / `height` replace the deprecated `plot_width` / `plot_height`
# keywords, which Bokeh 3 no longer accepts.
p = figure(
    x_range=list(tickets_per_conf.index),
    width=800,
    height=450,
    title="PyCon Italia - Numbers of Tickets",
)
p.vbar(
    x="confs",
    top="counts",
    width=0.6,
    color="color",
    source=source,
    legend_field="confs",
)

p.xgrid.grid_line_color = None
p.legend.visible = False

show(p)

In [10]:
# Non-null counts for every (edition, ticket name) pair present in the data.
gt = data.groupby(["conf", "ticket"]).count()
# Full x-axis grid: every edition crossed with every known fare name.
conf_order = ["PyCon Italia 2022", "PyCon X", "PyCon 9", "PyCon 8"]
groups = [(conf, ticket) for conf in conf_order for ticket in fare_real.values()]

In [11]:
# The MultiIndex (conf, ticket) of `gt` becomes the "conf_ticket" column.
source = ColumnDataSource(gt)
# Colour bars by the first factor level only (the edition), hence end=1.
index_cmap = factor_cmap(
    "conf_ticket", palette=Viridis6, factors=sorted(data.conf.unique()), end=1
)

# `width` / `height` replace the deprecated `plot_width` / `plot_height`
# keywords, which Bokeh 3 no longer accepts.
p = figure(
    width=800,
    height=400,
    title="No. Tickets Sold / Fare",
    x_range=FactorRange(*groups),
)

p.vbar(
    x="conf_ticket",
    top="ticket_type",
    width=1,
    source=source,
    line_color="white",
    fill_color=index_cmap,
)

p.y_range.start = 0
p.x_range.range_padding = 0.05
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1.4
p.outline_line_color = None

show(p)

In [12]:
# Non-null counts for every (edition, ticket type) pair.
gt = data.groupby(["conf", "ticket_type"]).count()
# Rows without a ticket have a NaN ticket_type. groupby() already drops them,
# and NaN is not a valid categorical factor, so keep it off the axis too.
groups = [
    (conf, ticket)
    for conf in ["PyCon Italia 2022", "PyCon X", "PyCon 9", "PyCon 8"]
    for ticket in data.ticket_type.dropna().unique()
]
source = ColumnDataSource(gt)
# Colour bars by the first factor level only (the edition), hence end=1.
index_cmap = factor_cmap(
    "conf_ticket_type", palette=Viridis6, factors=sorted(data.conf.unique()), end=1
)

# `width` / `height` replace the deprecated `plot_width` / `plot_height`
# keywords, which Bokeh 3 no longer accepts.
p = figure(
    width=800,
    height=400,
    title="No. Tickets Sold / Type",
    x_range=FactorRange(*groups),
)

p.vbar(
    x="conf_ticket_type",
    top="ticket",
    width=1,
    source=source,
    line_color="white",
    fill_color=index_cmap,
)

p.y_range.start = 0
p.x_range.range_padding = 0.05
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1.4
p.outline_line_color = None

show(p)

In [13]:
# Ticket counts per (gender, edition). Missing values are labelled "Unknown"
# first so that they are counted instead of being dropped.
grouped_twg = data.fillna("Unknown").groupby(["gender", "conf"]).count()
# Tickets per edition: the denominator of the percentage below.
totals = grouped_twg.groupby("conf")["ticket"].sum().to_dict()
grouped_twg = grouped_twg.reset_index()
grouped_twg["gender_perc"] = 100 * (
    grouped_twg["ticket"] / grouped_twg["conf"].map(totals)
)

# Bokeh summarises a GroupBy with describe(): the group keys become the
# "gender_conf" column and each value column gets "_mean", "_count", ...
# variants. Every group holds a single row, so "gender_perc_mean" is simply
# that row's percentage.
gender_conf_groups = grouped_twg.groupby(["gender", "conf"], dropna=False)
source = ColumnDataSource(gender_conf_groups)

# Colour bars by the first factor level only (the gender), hence end=1.
index_cmap = factor_cmap(
    "gender_conf",
    palette=Viridis6,
    factors=sorted(grouped_twg.gender.unique().astype(str)),
    end=1,
)

# `width` / `height` replace the deprecated `plot_width` / `plot_height`
# keywords, which Bokeh 3 no longer accepts.
p = figure(
    width=800,
    height=400,
    title="Ticket per Gender",
    x_range=gender_conf_groups,
    y_axis_label="%",
)

p.vbar(
    x="gender_conf",
    top="gender_perc_mean",
    width=1,
    source=source,
    line_color="white",
    fill_color=index_cmap,
)

p.y_range.start = 0
p.x_range.range_padding = 0.05
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1.4
p.outline_line_color = None

show(p)

# Ranking

In [159]:
# Rankings of the two earlier editions: "~"-separated text files whose first
# line is skipped and whose second line is the header.
ranking_pycon10 = pd.read_csv("data/ranking_pyconx.txt", sep="~", skiprows=1, header=0)
ranking_pycon10 = ranking_pycon10.drop(columns=["TID", "Title", "Speakers"])
ranking_pycon10["conf"] = "PyCon X"

ranking_pycon9 = pd.read_csv("data/ranking_pycon9.txt", sep="~", header=0, skiprows=1)
ranking_pycon9 = ranking_pycon9.drop(columns=["TID", "Title", "Speakers"])
ranking_pycon9["conf"] = "PyCon 9"

# The 2022 ranking comes from a spreadsheet with a different schema: drop the
# columns the old files do not have, then rename the rest positionally.
ranking_pycon2022 = pd.read_excel("data/ranking_pycon2022.xlsx")
ranking_pycon2022 = ranking_pycon2022.drop(
    columns=[
        "rank",
        "score",
        "submission__id",
        "submission__hashid",
        "submission__title",
        "vote_count",
        "tags",
        "submission__speaker_id",
        "full_name",
        "rank_request__conference__code",
    ]
)
ranking_pycon2022.columns = [
    "Track",
    "Level",
    "Type",
    "Duration",
    "Lang",
    "Gender",
]
ranking_pycon2022["conf"] = "PyCon Italia 2022"
# .copy() so the column assignments below write to an independent frame and
# do not trigger SettingWithCopyWarning.
ranking_pycon2022 = ranking_pycon2022[
    ["Type", "Duration", "Track", "Level", "Lang", "Gender", "conf"]
].copy()
# Characters 2-3 of the raw value are kept as the language code (presumably
# the raw value looks like "['en']" -- confirm against the spreadsheet).
# .str propagates NaN instead of failing on a missing cell.
ranking_pycon2022["Lang"] = ranking_pycon2022["Lang"].str[2:4]
# Everything that is not exactly "Talk" is treated as a training.
ranking_pycon2022["Type"] = ranking_pycon2022["Type"].where(
    ranking_pycon2022["Type"] == "Talk", "Training"
)
frames = [ranking_pycon2022, ranking_pycon10, ranking_pycon9]
data = pd.concat(frames)

union_jack = "\U0001F1EC\U0001F1E7"
tricolore = "\U0001F1EE\U0001F1F9"
woman_technologist = "\U0001f469\u200d\U0001f4bb"
man_technologist = "\U0001f468\u200d\U0001f4bb"
# One emoji per comma-separated speaker. Tokens are stripped so that
# "male, female" is not mis-read because of the space after the comma.
# NOTE: any token other than "female" -- including a missing value, which
# str() turns into "nan" -- is rendered with the man emoji.
data["Gender"] = data["Gender"].apply(
    lambda genders: "+".join(
        woman_technologist if token.strip() == "female" else man_technologist
        for token in str(genders).split(",")
    )
)
# Anything that is not "en" is shown with the Italian flag.
data["Lang"] = data["Lang"].apply(
    lambda code: union_jack if code == "en" else tricolore
)
pd.set_option("display.max_rows", data.index.size)
pd.set_option("display.max_colwidth", 200)

In [160]:
# Compare on the raw array: `data` has a non-unique index after the concat,
# so a positional boolean mask is used to select rows.
submission_type = data["Type"].to_numpy()
trainings = data[submission_type == "Training"]
talks = data[submission_type == "Talk"]

In [161]:
# Submissions per (track, edition); only the columns up to and including
# "Type" are kept from the count.
grouped_talks = talks.groupby(["Track", "conf"]).count().loc[:, :"Type"]
grouped_training = trainings.groupby(["Track", "conf"]).count().loc[:, :"Type"]

# x-axis factors: every track crossed with the three ranked editions.
ranked_confs = ["PyCon Italia 2022", "PyCon X", "PyCon 9"]
talk_tracks = grouped_talks.index.unique("Track")
training_tracks = grouped_training.index.unique("Track")
talk_groups = [(track, conf) for track in talk_tracks for conf in ranked_confs]
training_groups = [
    (track, conf) for track in training_tracks for conf in ranked_confs
]

In [162]:
# The MultiIndex (Track, conf) of each frame becomes the "Track_conf" column.
source_talks = ColumnDataSource(grouped_talks)
source_trainings = ColumnDataSource(grouped_training)

# Colour bars by the first factor level only (the track), hence end=1.
# NOTE: the factors come from the talks only; a track that appears only in
# trainings is not in the mapper.
index_cmap = factor_cmap(
    "Track_conf", palette=Viridis6, factors=sorted(talks.Track.unique()), end=1
)

# `width` / `height` replace the deprecated `plot_width` / `plot_height`
# keywords, which Bokeh 3 no longer accepts.
ptalks = figure(
    width=800,
    height=400,
    title="Talks submission / Tracks & Conf",
    x_range=FactorRange(*talk_groups),
)

ptalks.vbar(
    x="Track_conf",
    top="Type",
    width=1,
    source=source_talks,
    line_color="white",
    fill_color=index_cmap,
)

ptrainings = figure(
    width=800,
    height=400,
    title="Trainings submission / Tracks & Conf",
    x_range=FactorRange(*training_groups),
)

ptrainings.vbar(
    x="Track_conf",
    top="Type",
    width=1,
    source=source_trainings,
    line_color="white",
    fill_color=index_cmap,
)

# Identical axis styling for both charts.
for plot in (ptalks, ptrainings):
    plot.y_range.start = 0
    plot.x_range.range_padding = 0.05
    plot.xgrid.grid_line_color = None
    plot.xaxis.major_label_orientation = 1.4
    plot.outline_line_color = None

f = gridplot([[ptalks], [ptrainings]])
show(f)

In [163]:
# Submissions per (language flag, edition).
langs = data.groupby(["Lang", "conf"]).count()
# x-axis factors: both flags crossed with the three ranked editions.
lang_flags = ["🇬🇧", "🇮🇹"]
lang_confs = ["PyCon Italia 2022", "PyCon X", "PyCon 9"]
groups = [(lang, conf) for lang in lang_flags for conf in lang_confs]

In [164]:
# The MultiIndex (Lang, conf) of `langs` becomes the "Lang_conf" column.
source = ColumnDataSource(langs)

# Colour bars by the first factor level only (the language), hence end=1.
index_cmap = factor_cmap(
    "Lang_conf",
    palette=Viridis6,
    factors=sorted(langs.index.levels[0].unique()),
    end=1,
)

# `width` / `height` replace the deprecated `plot_width` / `plot_height`
# keywords, which Bokeh 3 no longer accepts.
p = figure(
    width=800,
    height=400,
    title="Lang / Conf",
    x_range=FactorRange(*groups),
)

p.vbar(
    x="Lang_conf",
    top="Type",
    width=1,
    source=source,
    line_color="white",
    fill_color=index_cmap,
)

p.y_range.start = 0
p.x_range.range_padding = 0.05
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1.4
p.outline_line_color = None

show(p)

In [165]:
# Submissions per (speaker-gender label, edition).
# NOTE(review): this rebinds `gender`, which the import cell uses as the alias
# of gender_guesser.detector; re-running `d = gender.Detector()` after this
# cell fails. The name is kept because the next cell reads it.
gender = data.groupby(["Gender", "conf"]).count()
# x-axis factors: every gender label crossed with the three ranked editions.
gender_labels = list(data.Gender.unique())
gender_confs = ["PyCon Italia 2022", "PyCon X", "PyCon 9"]
groups = [(label, conf) for label in gender_labels for conf in gender_confs]

In [166]:
# The MultiIndex (Gender, conf) of `gender` becomes the "Gender_conf" column.
source = ColumnDataSource(gender)

# Colour bars by the first factor level only (the gender label), hence end=1.
index_cmap = factor_cmap(
    "Gender_conf",
    palette=Viridis6,
    factors=sorted(gender.index.levels[0].unique()),
    end=1,
)

# `width` / `height` replace the deprecated `plot_width` / `plot_height`
# keywords, which Bokeh 3 no longer accepts.
p = figure(
    width=800,
    height=400,
    title="Speaker Gender / Conf",
    x_range=FactorRange(*groups),
)

p.vbar(
    x="Gender_conf",
    top="Type",
    width=1,
    source=source,
    line_color="white",
    fill_color=index_cmap,
)

p.y_range.start = 0
p.x_range.range_padding = 0.05
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1.4
p.outline_line_color = None

show(p)

In [167]:
# Submissions per speaker-gender label, 2022 edition only; the count ends up
# in the "conf" column.
is_2022 = data["conf"] == "PyCon Italia 2022"
gender_count_2022 = data.loc[is_2022, ["conf", "Gender"]].groupby("Gender").count()

In [168]:
# Flatten the index once: "Gender" holds the labels, "conf" the counts.
gender_rows = gender_count_2022.reset_index()
xvalues = list(gender_rows["Gender"])
counts = list(gender_rows["conf"])

p = figure(
    x_range=xvalues,
    height=250,
    title="Speakers Gender PyCon Italia 2022",
    toolbar_location=None,
    tools="",
)

p.vbar(x=xvalues, top=counts, width=0.9)

p.y_range.start = 0
p.xgrid.grid_line_color = None

show(p)