In [None]:
import pandas as pd
import plotly.express as px

# Load the job-search tracker straight from Google Sheets as CSV, without the Sheets API.
# See: https://towardsdatascience.com/read-data-from-google-sheets-into-pandas-without-the-google-sheets-api-5c468536550
sheet_id = "1q8MB-H49pd1ojqZzMbs0ExbMkWKl0Ll0cyJ3_OpF3_g"
sheet_name = "Job_Search_Status"
# The gviz endpoint is asked for CSV output (tqx=out:csv) of the named sheet.
url = (
    f"https://docs.google.com/spreadsheets/d/{sheet_id}"
    f"/gviz/tq?tqx=out:csv&sheet={sheet_name}"
)

# Only the first 11 columns are read; anything to the right of them is ignored.
df = pd.read_csv(url, usecols=range(11))

In [None]:
# Preview the loaded sheet; the rendered output elides the middle rows with "...".
df

Unnamed: 0,Company,Industry,Position,Application Link,Location,Status,Source,App Date,My TODO,Vivian's TODO,Notes
0,Fakeaccepted,,Fake,,Fake,Accepted,Fake,4/15/2022,,,Fake
1,Fakechosen,,Fake,,Fake,Chosen,Fake,4/15/2022,,,Fake
2,Fakechosen2,,Fake,,Fake,Chosen,Fake,4/15/2022,,,Fake
3,Fakeonsite,,Fake,,Fake,Onsite,Fake,4/15/2022,,,Fake
4,Fakeonsite2,,Fake,,Fake,Onsite,Fake,4/15/2022,,,Fake
...,...,...,...,...,...,...,...,...,...,...,...
129,Guided.gg,Gaming,Data Scientist,https://jobs.lever.co/guilded/3a85f86d-f141-41...,Remote,Applied,LinkedIn,5/2/2022,,,hn
130,Faire,,Data Scientist New Graduate,https://boards.greenhouse.io/faire/jobs/607276...,Remote,Applied,LinkedIn,5/2/2022,,,hn
131,Tonal,Gym,Data Scientist,https://jobs.lever.co/tonal/af574c9a-9a36-41af...,NYC,Applied,LinkedIn,5/2/2022,,,hn
132,Garner Health,Healthcare,Data Science Engineer,https://boards.greenhouse.io/garnerhealth/jobs...,NYC,Interview,NYC DSA,5/2/2022,Interview scheduled with Doug,,


In [None]:
# Pie chart: number of distinct companies per application source.
# Despite its name, `nycdsa` holds one row per source (columns: Source, Positions).
companies_per_source = df.groupby("Source")["Company"].nunique()
nycdsa = companies_per_source.rename("Positions").reset_index()

fig = px.pie(nycdsa, names="Source", values="Positions", title="Jobs via Source")

# Slice colours are assigned in order; show raw counts on the slices and
# label + percentage on hover.
colors = ["gold", "mediumturquoise", "darkorange", "lightgreen"]
slice_marker = {"colors": colors, "line": {"color": "#151515", "width": 2}}
fig.update_traces(
    marker=slice_marker,
    textinfo="value",
    textfont_size=20,
    hoverinfo="label+percent",
)
fig.show()

In [None]:
# Bar chart: the ten industries with the most distinct companies.
companies_per_industry = df.groupby("Industry")["Company"].nunique()
industry = (
    companies_per_industry.rename("Jobs")
    .reset_index()
    .sort_values(by="Jobs", ascending=False)
    .iloc[:10]  # keep only the top 10 after sorting
)

fig = px.bar(industry, x="Industry", y="Jobs", title="Jobs by Industry (top 10)")
fig.show()

In [None]:
# Line chart: running total of distinct companies applied to, by application date.
#
# "App Date" is read as text such as "4/15/2022", so it is parsed to datetimes
# *before* grouping. Grouping on the raw strings orders the groups
# lexicographically (e.g. "4/15/2022" ahead of "4/2/2022"), which would make the
# running total accumulate in the wrong order and the line zigzag.
app_dates = pd.to_datetime(df["App Date"]).rename("Date")
apps_time = (
    df.groupby(app_dates)[["Company"]]  # groupby sorts its keys, i.e. chronologically here
    .nunique()
    .reset_index()
    .rename(columns={"Company": "Applications"})
)
apps_time["Applications"] = apps_time["Applications"].cumsum()
# TODO: add a cumulative "Rejections" series. It has to be derived from df["Status"],
# because apps_time only has the Date and Applications columns.

fig = px.line(apps_time, x="Date", y="Applications")
fig.show()

In [None]:
# Bar chart: number of applications per status, with "Applied" shown as "No response".
# value_counts() returns the counts sorted from largest to smallest.
status_display = df["Status"].replace({"Applied": "No response"})
app_status = status_display.value_counts()

fig = px.bar(app_status, color=app_status.index)
fig.show()

In [None]:
import plotly.graph_objects as go

# Sankey diagram: application sources -> "Applications" -> per-status outcomes.
#
# Node order is [sources..., "Applications", statuses...].
# NOTE: assumes app_status is sorted and monotonically decreasing.
# NOTE: the colour lists, the status link pattern and the node x/y positions below
# are hand-tuned for 4 sources and 6 statuses.

# --- status nodes: the "Applications" total followed by one node per status ---
status_values = [app_status.sum()] + list(app_status.values)
status_labels = ["Applications"] + list(app_status.index)

status_colors = [
    "chartreuse",  # the "Applications" node
    "rgba(254,178,76,1)",
    "rgba(240,59,32,1)",
    "rgba(247,252,185,1)",
    "rgba(173,221,142,1)",
    "rgba(49,163,84,1)",
    "rgba(44,127,184,1)",
]
# One colour per status link (there is no link for the "Applications" total itself).
status_link_colors = [
    "rgba(254,178,76,.2)",
    "rgba(240,59,32,.2)",
    "rgba(247,252,185,.5)",
    "rgba(173,221,142,.2)",
    "rgba(49,163,84,.2)",
    "rgba(44,127,184,.2)",
]

# --- source nodes: every application source feeds the "Applications" node ---
source_counts = df.Source.value_counts()
source_values = list(source_counts.values)
source_labels = list(source_counts.index)
source_colors = [
    "rgba(239,237,245,1)",
    "rgba(239,237,245,1)",
    "rgba(188,189,220,1)",
    "rgba(117,107,177,1)",
]  # FAKE is duplicated color here
source_link_colors = [
    "rgba(239,237,245,.5)",
    "rgba(239,237,245,.5)",
    "rgba(188,189,220,.5)",
    "rgba(117,107,177,.5)",
]

# --- node attributes (one entry per node) ---
node_values = source_values + status_values
labels = source_labels + status_labels
labels_with_values = [f"{label}: {value}" for label, value in zip(labels, node_values)]
colors = source_colors + status_colors
link_colors = source_link_colors + status_link_colors

# --- links (one entry per link) ---
# Node indices are derived from the number of sources so they stay consistent
# with the node order above.
n_sources = len(source_labels)
applications_node = n_sources  # the "Applications" node follows the source nodes
source_sources = list(range(n_sources))
source_targets = [applications_node] * n_sources
# Status links as offsets from the "Applications" node: the first three statuses
# branch directly off "Applications"; each later status hangs off the one before it.
# Exactly one link per status, so these lists line up with `values` and never
# point past the last node.
status_sources = [applications_node + offset for offset in [0, 0, 0, 3, 4, 5]]
status_targets = [applications_node + offset for offset in [1, 2, 3, 4, 5, 6]]
sources = source_sources + status_sources
targets = source_targets + status_targets

# Link values: the "Applications" total labels a node but is not a link, so skip it.
values = source_values + status_values[1:]

fig = go.Figure(
    data=[
        go.Sankey(
            arrangement="freeform",
            valueformat=".0f",
            node=dict(
                pad=15,
                thickness=20,
                line=dict(color="#151515", width=0.5),
                label=labels_with_values,
                color=colors,
                # Manual node placement, one x/y pair per node in node order.
                x=[0.1, 0.1, 0.1, 0.1, 0.25, 0.5, 0.5, 0.5, 0.6, 0.7, 0.85],
                y=[0.3, 0.8, 0.9, 1, 0.1, 0.10, 0.60, 0.9, 0.8, 0.85, 0.95],
                # Hover shows the plain label, without the count.
                customdata=labels,
                hovertemplate="%{customdata}",
            ),
            link=dict(
                source=sources,
                target=targets,
                value=values,
                color=link_colors,
                hoverinfo="skip",
            ),
        )
    ]
)

fig.update_layout(
    title_text="<Fake Data><br>Rishi's Job Search Status<br>Spring 2022", font_size=10
)
fig.show()