In [23]:
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LogNorm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from scipy.stats import pointbiserialr


# The CSV is read from the "data" directory that sits next to the current
# working directory (i.e. one level up from where the notebook runs).
datapath = Path.cwd().parent.joinpath("data", "sunshines-v4.csv")
df = pd.read_csv(datapath)


def get_latest_totalcomp(row):
    """Return salary + benefits for the most recent year that has both.

    Years are tried from 2023 down to 2020. A year is used only when both
    ``salary_<year>`` and ``benefits_<year>`` exist in ``row.index`` and both
    values are non-null; otherwise the next (older) year is tried.

    Args:
        row: A pandas Series, e.g. one DataFrame row passed in by
            ``df.apply(..., axis=1)``. Must contain a ``"name"`` entry, which
            is used in the warning message.

    Returns:
        The sum of the two values for the first qualifying year, or ``pd.NA``
        (after printing a warning with the row's name) when no year qualifies.
    """
    for year in ("2023", "2022", "2021", "2020"):
        salary_key, benefits_key = f"salary_{year}", f"benefits_{year}"
        # Skip years whose columns are not part of this row at all.
        if salary_key not in row.index or benefits_key not in row.index:
            continue
        # Skip years where either component is missing.
        if not (pd.notna(row[salary_key]) and pd.notna(row[benefits_key])):
            continue
        return row[salary_key] + row[benefits_key]
    print(f"warning: {row['name']}")
    return pd.NA


def get_latest_salary(row):
    """Return the most recent non-null salary found in ``row``.

    ``salary_2023`` is checked first, then 2022, 2021 and 2020. Columns that
    are absent from ``row.index`` and null values are skipped.

    Args:
        row: A pandas Series, e.g. one DataFrame row passed in by
            ``df.apply(..., axis=1)``. Must contain a ``"name"`` entry, which
            is used in the warning message.

    Returns:
        The first non-null salary in newest-to-oldest order, or ``pd.NA``
        (after printing a warning with the row's name) when none exists.
    """
    columns = [f"salary_{year}" for year in ("2023", "2022", "2021", "2020")]
    # Lazily yields the usable salaries, newest first.
    usable = (
        row[column]
        for column in columns
        if column in row.index and pd.notna(row[column])
    )
    nothing_found = object()
    latest = next(usable, nothing_found)
    if latest is nothing_found:
        print(f"warning: {row['name']}")
        return pd.NA
    return latest


def get_latest_benefits(row):
    """Return the most recent non-null benefits value found in ``row``.

    ``benefits_2023`` is checked first, then 2022, 2021 and 2020. Columns
    that are absent from ``row.index`` and null values are skipped.

    Args:
        row: A pandas Series, e.g. one DataFrame row passed in by
            ``df.apply(..., axis=1)``. Must contain a ``"name"`` entry, which
            is used in the warning message.

    Returns:
        The first non-null benefits value in newest-to-oldest order, or
        ``pd.NA`` (after printing a warning with the row's name) when none
        exists.
    """
    wanted = [f"benefits_{year}" for year in ("2023", "2022", "2021", "2020")]
    # Keep only the columns this row actually has, preserving newest-first order.
    available = [column for column in wanted if column in row.index]
    for column in available:
        amount = row[column]
        if pd.isna(amount):
            continue
        return amount
    print(f"warning: {row['name']}")
    return pd.NA


def get_latest_role(row):
    """Return the most recent non-null role found in ``row``.

    ``role_2023`` is checked first, then 2022, 2021 and 2020. Columns that
    are absent from ``row.index`` and null values are skipped.

    Args:
        row: A pandas Series, e.g. one DataFrame row passed in by
            ``df.apply(..., axis=1)``. Must contain a ``"name"`` entry, which
            is used in the warning message.

    Returns:
        The first non-null role in newest-to-oldest order, or ``pd.NA``
        (after printing a warning with the row's name) when none exists.
    """
    # Count down 2023 -> 2020; the formatted column names match "role_<year>".
    for year in range(2023, 2019, -1):
        column = f"role_{year}"
        if column not in row.index:
            continue
        role = row[column]
        if pd.notna(role):
            return role
    print(f"warning: {row['name']}")
    return pd.NA


def get_latest_role_cluster(row):
    """Return the most recent non-null role cluster found in ``row``.

    ``role_cluster_2023`` is checked first, then 2022, 2021 and 2020. Columns
    that are absent from ``row.index`` and null values are skipped.

    Args:
        row: A pandas Series, e.g. one DataFrame row passed in by
            ``df.apply(..., axis=1)``. Must contain a ``"name"`` entry, which
            is used in the warning message.

    Returns:
        The first non-null role cluster in newest-to-oldest order, or
        ``pd.NA`` (after printing a warning with the row's name) when none
        exists.
    """
    newest_first = ("2023", "2022", "2021", "2020")
    for cluster_col in (f"role_cluster_{year}" for year in newest_first):
        # Usable only when the column exists for this row and holds a value.
        if cluster_col in row.index and pd.notna(row[cluster_col]):
            return row[cluster_col]
    print(f"warning: {row['name']}")
    return pd.NA


# Collapse the per-year columns into "latest available" columns by applying
# each getter row by row. Dict order fixes the order in which the new columns
# are appended to the frame.
_latest_getters = {
    "latest_totalcomp": get_latest_totalcomp,
    "latest_salary": get_latest_salary,
    "latest_benefits": get_latest_benefits,
    "latest_role": get_latest_role,
    "latest_role_cluster": get_latest_role_cluster,
}
for _column, _getter in _latest_getters.items():
    df[_column] = df.apply(_getter, axis=1)

# Plain, unweighted sum of the three publication metrics.
df["perf_combined"] = df["paper_count"].add(df["citation_count"]).add(df["h_index"])

# Total compensation for each individual year (salary plus benefits).
for year in range(2020, 2024):
    df[f"totalcomp_{year}"] = df[f"salary_{year}"].add(df[f"benefits_{year}"])

print(df.dtypes)

name                    object
sex                     object
paper_count              int64
citation_count           int64
h_index                  int64
role_2020               object
role_cluster_2020      float64
salary_2020            float64
benefits_2020          float64
role_2021               object
role_cluster_2021      float64
salary_2021            float64
benefits_2021          float64
role_2022               object
role_cluster_2022      float64
salary_2022            float64
benefits_2022          float64
role_2023               object
role_cluster_2023      float64
salary_2023            float64
benefits_2023          float64
latest_totalcomp       float64
latest_salary          float64
latest_benefits        float64
latest_role             object
latest_role_cluster    float64
perf_combined            int64
totalcomp_2020         float64
totalcomp_2021         float64
totalcomp_2022         float64
totalcomp_2023         float64
dtype: object


In [57]:
import altair as alt

# Lift Altair's default row limit so the whole frame can be embedded.
alt.data_transformers.disable_max_rows()

# Interval selection drawn on the query scatter; every result chart below is
# filtered down to the points inside it.
brush = alt.selection_interval()

# All four panels share the same pixel size.
panel_size = {"width": 600, "height": 400}

# query: quality and quantity of work
query_tooltip = [
    "name",
    "sex",
    "paper_count",
    "citation_count",
    "h_index",
    "latest_totalcomp",
    "latest_role",
    "latest_role_cluster",
]
query_scatter = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        x="citation_count:Q",
        y="paper_count:Q",
        color="perf_combined:Q",
        tooltip=query_tooltip,
    )
    .properties(title="Citation Count vs. Paper Count", **panel_size)
    .add_params(brush)
)

# result: latest_salary vs. latest_benefits
result_scatter = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        x="latest_benefits:Q",
        y="latest_salary:Q",
        color="perf_combined:Q",
    )
    .properties(title="Salary vs. Benefits", **panel_size)
    .transform_filter(brush)
)

# result: total m/f count
hex_blue = "#779ECB"
hex_pink = "#FF6961"
sex_color = alt.Color(
    'sex:N',
    scale=alt.Scale(domain=['M', 'F'], range=[hex_blue, hex_pink]),
    legend=None,
)
result_sex_bar = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('sex:N', title='Sex'),
        y=alt.Y('count():Q', title='Total Count'),
        color=sex_color,
        tooltip=['sex', 'count()'],
    )
    .properties(title="Sex Ratio", **panel_size)
    .transform_filter(brush)
)

# result: role count
role_color = alt.Color(
    'latest_role_cluster:N',
    scale=alt.Scale(scheme='tableau20'),
    legend=None,
)
result_role_bar = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('latest_role_cluster:N', title='Role'),
        y=alt.Y('count():Q', title='Total Count'),
        color=role_color,
        tooltip=['latest_role_cluster', 'count()'],
    )
    .properties(title="Role Cluster Count", **panel_size)
    .transform_filter(brush)
)

# Layout: scatters on top, bar charts underneath. The independent color
# resolution is applied to the bottom row only, so just the two bar charts
# get separate color scales.
top_row = query_scatter | result_scatter
bottom_row = (result_sex_bar | result_role_bar).resolve_scale(color="independent")
final_chart = top_row & bottom_row

# Written next to the working directory's parent as a standalone HTML page.
output_path = Path.cwd().parent / "index.html"
final_chart.save(output_path)