In [10]:
%matplotlib inline
import json
import tabulate
import datetime
import numpy as np
import matplotlib.pyplot as plt

def int2dur(x):
    """Format a duration given in seconds as a ``<minutes>m<seconds>s`` string.

    The value is rounded to the nearest whole second *before* it is split into
    minutes and seconds, so a fractional input such as 59.6 rolls over to one
    minute instead of producing an invalid "60s" seconds field.

    Args:
        x: Duration in seconds (int or float, including NumPy scalars).

    Returns:
        The duration as text: minutes right-aligned in a field of width 3 and
        seconds zero-padded to two digits, e.g. "  5m26s".
    """
    # Round first, then split: rounding only the remainder could yield 60.
    minutes, seconds = divmod(int(round(x)), 60)
    return f"{minutes: 3d}m{seconds:02d}s"


def analyze_speakers(data, title="", _filter=None):
    """Print a per-sex summary table of speaker counts and speaking time.

    Args:
        data: Iterable of meeting records. Each record is a mapping with a
            "speakers" sequence; each speaker is a mapping with a "sex" key
            (only "female" and "male" are tallied) and a numeric "duration"
            key, which is formatted with ``int2dur``.
        title: Optional heading printed before the table; nothing is printed
            when it is the empty string.
        _filter: Optional predicate called with each meeting record; only
            records for which it returns a truthy value are analyzed.

    Returns:
        None. The table is printed to stdout.
    """
    meetings = [m for m in data if _filter is None or _filter(m)]
    speakers = [s for m in meetings for s in m["speakers"]]

    if title != "":
        print(f"\n=====[ Analyzing '{title}' ]=====\n")

    sexes = ["female", "male"]

    # A meeting with no speakers has no first or last speaker; leave it out of
    # those tallies instead of raising IndexError.
    nonempty = [m for m in meetings if m["speakers"]]
    n_first = [sum(1 for m in nonempty if m["speakers"][0]["sex"] == sex) for sex in sexes]
    n_last = [sum(1 for m in nonempty if m["speakers"][-1]["sex"] == sex) for sex in sexes]

    # The total covers every speaker, including any whose "sex" is not tallied.
    total_duration = sum(s["duration"] for s in speakers)
    durations = [[s["duration"] for s in speakers if s["sex"] == sex] for sex in sexes]

    def percent(values):
        # With no speaking time at all there is no meaningful share; report 0
        # rather than dividing by zero.
        return sum(values) / total_duration * 100 if total_duration else 0.0

    def stat(func, values):
        # min/max/mean are undefined for an empty group; show a placeholder
        # instead of raising (min/max) or trying to format NaN (mean).
        return int2dur(func(values)) if values else "-"

    table = {
        "sex": sexes,
        "N": [len(d) for d in durations],
        "N first": n_first,
        "N last": n_last,
        "total": [int2dur(sum(d)) for d in durations],
        "percent": [percent(d) for d in durations],
        "min": [stat(min, d) for d in durations],
        "mean": [stat(np.mean, d) for d in durations],
        "max": [stat(max, d) for d in durations],
    }
    print(tabulate.tabulate(table, headers="keys"))

In [11]:
# Load the meeting records. JSON is UTF-8 by specification, so read it as such
# instead of relying on the platform's default text encoding.
with open("data.json", "r", encoding="utf-8") as stream:
    data = json.load(stream)

# Normalize each record in place: the "date" text becomes a datetime.date and
# every speaker's "MM:SS" duration text becomes an integer number of seconds.
for meeting in data:
    print(f"Processing {meeting['date']}")
    meeting["date"] = datetime.datetime.strptime(meeting["date"], "%Y-%m-%d").date()
    for speaker in meeting["speakers"]:
        minutes, seconds = speaker["duration"].split(":")
        speaker["duration"] = int(minutes) * 60 + int(seconds)

print(f"Found {len(data)} meetings")
meeting_types = {meeting["meeting"] for meeting in data}
meeting_types

Processing 2023-05-28
Processing 2023-06-25
Processing 2023-07-02
Found 3 meetings


{'fast sunday', 'regular'}

In [12]:
# Summary over every meeting, then one summary per meeting type.
analyze_speakers(data, title="all speakers")

# A set of strings has no stable iteration order across interpreter runs
# (string hashing is randomized), so sort to keep the order of the printed
# sections reproducible after a kernel restart.
for meeting_type in sorted(meeting_types):
    analyze_speakers(data, title=meeting_type, _filter=lambda x: x["meeting"] == meeting_type)


=====[ Analyzing 'all speakers' ]=====

sex       N    N first    N last  total      percent  min    mean    max
------  ---  ---------  --------  -------  ---------  -----  ------  ------
female   11          2         1  59m42s     59.8496  0m12s  5m26s   17m22s
male      7          1         2  40m03s     40.1504  0m16s  5m43s   22m40s

=====[ Analyzing 'fast sunday' ]=====

sex       N    N first    N last  total      percent  min    mean    max
------  ---  ---------  --------  -------  ---------  -----  ------  -----
female    8          0         1  17m39s     66.5201  0m12s  2m12s   4m33s
male      5          1         0  8m53s      33.4799  0m16s  1m47s   2m41s

=====[ Analyzing 'regular' ]=====

sex       N    N first    N last  total      percent  min     mean    max
------  ---  ---------  --------  -------  ---------  ------  ------  ------
female    3          2         0  42m03s     57.4323  10m55s  14m01s  17m22s
male      2          0         2  31m10s     42.5677  8m