In [1]:
import altair as alt
import pandas as pd
import numpy as np

In [2]:
# Colour assigned to each stated reason, listed in legend order.
reason_colors = {
    # 1-3: I choose to not go online
    "Don’t need it (not interested)": "#fb9",
    "Can use it somewhere else": "#fdd",
    "Privacy or security concerns": "#eec",
    # 4-6: I cannot go online
    "Too expensive": "#824",
    "Not available in area": "#d35",
    "No computer or computer inadequate": "#f66",
    # 7: catch-all bucket
    "Other reasons": "#eee",
}

# Shared colour scale for the "reasons" charts: domain and range are the
# mapping's keys and values, in the same order.
palette = alt.Scale(
    domain=list(reason_colors),
    range=list(reason_colors.values()),
)


# Income 
## setup

In [3]:
# Load the four CSV tables used by the income section, one frame per file.
income_access, income_no_access, income_reasons, reasons_doc = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/income-internet-access.csv",  # 2010, 2013, 2015, 2017
        "data/income-no-access.csv",
        "data/income-reasons.csv",  # 2010, 2013, 2015, 2017
        "data/reasons_doc.csv",
    )
)

# Display the no-access table as the cell output.
income_no_access

Unnamed: 0,income,2010_no_access,2015_no_access,2017_no_access
0,Less than 10k,55.901513,46.192595,37.072659
1,10-19k,49.661223,45.282512,38.71626
2,20-29k,39.724911,35.94716,28.275212
3,30-39k,28.927634,29.249005,24.932965
4,40-49k,18.764369,22.193286,16.122799
5,50-74k,14.451187,15.739641,13.895489
6,75-99k,8.73186,12.372852,10.18507
7,100k or more,4.761172,10.030907,9.532456


## Pre-process

In [4]:
# df1 = income_access[["income", "dial_up_2010", "high_speed_int_2010", "dial_up_2013", "high_speed_int_2013", "dial_up_2015", "high_speed_int_2015", "dial_up_2017", "high_speed_int_2017"]]
# df1 = pd.melt(df1, id_vars=["income"])
# df1["year"] = df1["variable"].map(lambda x: x[-4:])
# df1["label"] = df1["variable"].map(lambda x: "Dial up service" if x[0]=="d" else "High-speed internet service")
# df1["type"] = "access"
# df1.head()

# Long-format "access" table: one row per (income bracket, year).
df1 = (
    income_no_access
    # The CSV carries its saved row index as an "Unnamed: 0" column (visible
    # when the frame is displayed).  Without this drop, melt would turn it into
    # junk rows whose year is "Unna" and whose value is 100 minus the row index.
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .melt(id_vars=["income"])
    .assign(
        # Column names start with the four-digit year, e.g. "2010_no_access".
        year=lambda long: long["variable"].str[:4],
        # no access --> have access
        value=lambda long: 100 - long["value"],
        label="with access at home",
        type="access",
    )
)
# df1.head()

In [5]:
def annotate_reason(text):
    """Return the reason text for a melted column name.

    The last character of ``text`` is read as the integer reason id and looked
    up in the module-level ``reasons_doc`` frame (columns ``reason_id`` and
    ``reason``).  The first matching ``reason`` value is returned.

    Raises ``ValueError`` if the last character is not a digit and
    ``IndexError`` if no row of ``reasons_doc`` has that id.
    """
    wanted_id = int(text[-1])
    matching = reasons_doc.loc[reasons_doc["reason_id"] == wanted_id, "reason"]
    return matching.values[0]

# Long-format "reasons" table: one row per (income bracket, year, reason).
df2 = income_reasons.melt(id_vars=["income"]).assign(
    # First four characters of the column name are the survey year.
    year=lambda long: long["variable"].map(lambda name: name[:4]),
    # Translate the column name into the readable reason text.
    label=lambda long: long["variable"].map(annotate_reason),
    type="reasons",
)
# df2.head()

In [6]:
# Stack the access rows and the reasons rows into one long table; the charts
# tell them apart through the "type" column.
income_df = pd.concat([df1, df2])

# Income brackets from lowest to highest.  The position in this list is the
# rank, so the numbering cannot drift out of sync with the labels.
income_levels = [
    "Less than 10k",
    "10-19k",
    "20-29k",
    "30-39k",
    "40-49k",
    "50-74k",
    "75-99k",
    "100k or more",
]
income_order = {level: rank for rank, level in enumerate(income_levels, start=1)}

# Numeric rank used as the y-axis sort field in the charts.  Plain indexing is
# kept (rather than a dict-based map) so an unknown bracket raises KeyError
# instead of silently becoming NaN.
income_df["order"] = income_df["income"].map(lambda level: income_order[level])
# income_df

## charts

In [21]:
# Radio buttons that choose which survey year is displayed.
radio_year = alt.binding_radio(name="year: ", options=["2010", "2015", "2017"])
select_year = alt.selection_single(
    name="select",
    fields=["year"],
    bind=radio_year,
    init={"year": "2010"},
)

# Hover selections, one per bar chart; a click clears the selection.
select_bar1 = alt.selection_single(on="mouseover", nearest=True, clear="click")
select_bar2 = alt.selection_single(on="mouseover", clear="click")

# Selection driven by clicks on the colour legend.
select_reason = alt.selection_single(fields=["label"], bind="legend")

# Selected bars are fully opaque, all others slightly faded.
OpacityCondition1 = alt.condition(
    predicate=select_bar1, if_true=alt.value(1.0), if_false=alt.value(0.8)
)
OpacityCondition2 = alt.condition(
    predicate=select_bar2, if_true=alt.value(1.0), if_false=alt.value(0.8)
)

# Reasons keep their palette colour while selected; everything else is greyed.
ColorCondition = alt.condition(
    predicate=select_reason,
    if_true=alt.Color(
        "label",
        type="nominal",
        title="reasons",
        scale=palette,
        sort=alt.EncodingSortField(field="value", order="ascending"),
    ),
    if_false=alt.value("lightgrey"),
)

# Common starting point for the three income panels: filtered to the chosen year.
base = (
    alt.Chart(income_df)
    .add_selection(select_year)
    .transform_filter(select_year)
    .properties(width=250)
)

# Left panel: percentage of each income bracket with Internet access at home.
left1 = (
    base.transform_filter(alt.datum.type == "access")
    .mark_bar()
    .encode(
        x=alt.X(
            "value:Q",
            title=None,
            scale=alt.Scale(domain=[0, 100]),
            # Descending sort reverses the axis, giving the back-to-back layout.
            sort=alt.SortOrder("descending"),
        ),
        y=alt.Y(
            "income:N",
            title=None,
            axis=None,
            sort=alt.EncodingSortField(field="order", order="ascending"),
        ),
        color=alt.value("#136"),
        opacity=OpacityCondition1,
        tooltip="value",
    )
    .properties(title="% having access to Internet at home")
    .add_selection(select_bar1)
)

# Middle panel: the income bracket labels, shared by the two bar charts.
middle1 = (
    base.mark_text()
    .encode(
        y=alt.Y(
            "income:N",
            axis=None,
            sort=alt.EncodingSortField(field="order", order="ascending"),
        ),
        text=alt.Text("income:N"),
    )
    .properties(width=20)
)

# Right panel: stacked bars of the stated reasons for not going online at home.
right1 = (
    base.transform_filter(
        # Keep the reason rows and leave out the catch-all bucket.  The reason
        # text lives in the "label" column; the frame has no "reason" column,
        # so a test on datum.reason would never exclude anything.
        (alt.datum.type == "reasons") & (alt.datum.label != "Other reasons")
    )
    .mark_bar()
    .encode(
        x=alt.X(
            "value:Q",
            title=None,
            scale=alt.Scale(domain=[0, 100]),
        ),
        y=alt.Y(
            "income:N",
            title=None,
            axis=None,
            sort=alt.EncodingSortField(field="order", order="ascending"),
        ),
        color=ColorCondition,
        # Stack segments by value, largest first.
        order=alt.Order("value", sort="descending"),
        opacity=OpacityCondition2,
        tooltip=["label", "value"],
    )
    .properties(title="% reasons for not going online at home")
    .add_selection(select_bar2)
    .add_selection(select_reason)
)


# income_charts = alt.concat(left, middle, right, spacing=5, title="Digital Divides Across Income Levels").configure_legend(
#     orient='bottom'
# ).resolve_scale(
#     color="independent", 
# #     y="shared"
# ).configure_axis(
#     grid=False
# ).configure_view(
#     strokeOpacity=0
# ).configure_legend(
#     orient="right"
# ).configure_title(
#     fontSize=15,
# #     font="Courier",
# #     align="center",
# #     anchor="middle",
# #     color="gray"
# )

# .save("income.html")


back-to-back bar chart: https://altair-viz.github.io/gallery/us_population_pyramid_over_time.html

In [8]:
## TO DO: 
# 1. back-to-back [done]
# 2. color map [done]
# 3. add base layer [done]
# 4. axis sorting [done]
# 5. interactive legend [done]
# 6. more dimensions [done]
# 7. sort stacked bars [done]

# Age

## setup

In [9]:
# Load the two age-group tables.  Despite its name, `age_access` is read from
# the "no access" file.
age_access, age_reasons = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/age-no-access.csv",
        "data/age-reasons.csv",  # 2010, 2015, 2017
    )
)

In [10]:
# Long-format "access" table: one row per (age group, year).
df3 = (
    age_access
    # NOTE(review): the income no-access CSV carries a saved row index as an
    # "Unnamed: 0" column; presumably this file was written the same way.  If
    # so, melt would turn that column into junk rows.  The drop is a no-op
    # when the column is absent.
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .melt(id_vars=["age"])
    .assign(
        # First four characters of the column name are the survey year.
        year=lambda long: long["variable"].str[:4],
        # no access --> have access
        value=lambda long: 100 - long["value"],
        label="with access at home",
        type="access",
    )
)
# df3.head()

In [11]:
# Long-format "reasons" table: one row per (age group, year, reason).
df4 = age_reasons.melt(id_vars=["age"]).assign(
    # First four characters of the column name are the survey year.
    year=lambda long: long["variable"].map(lambda name: name[:4]),
    # Translate the column name into the readable reason text.
    label=lambda long: long["variable"].map(annotate_reason),
    type="reasons",
)
# df4.head(11)

In [12]:
# One long table holding both the access rows and the reasons rows.
age_df = pd.concat([df3, df4])

# Age groups from youngest to oldest; the list position is the rank.
age_groups = [
    "3 and 4",
    "5 to 10",
    "11 to 14",
    "15 to 18",
    "19 to 24",
    "25 to 29",
    "30 to 39",
    "40 to 49",
    "50 to 59",
    "60 to 69",
    "70 or older",
]
age_order = {group: rank for rank, group in enumerate(age_groups, start=1)}

# Numeric rank used as the y-axis sort field; an unknown group raises KeyError.
age_df["order"] = age_df["age"].map(lambda group: age_order[group])
# age_df.head()

## charts

In [13]:
# Radio buttons that choose which survey year is displayed.
radio_year = alt.binding_radio(name="year: ", options=["2010", "2015", "2017"])
select_year = alt.selection_single(
    name="select",
    fields=["year"],
    bind=radio_year,
    init={"year": "2010"},
)

# Hover selections, one per bar chart; a click clears the selection.
select_bar1 = alt.selection_single(on="mouseover", nearest=True, clear="click")
select_bar2 = alt.selection_single(on="mouseover", clear="click")

# Selection driven by clicks on the colour legend.
select_reason = alt.selection_single(fields=["label"], bind="legend")

# Selected bars are fully opaque, all others slightly faded.
OpacityCondition1 = alt.condition(
    predicate=select_bar1, if_true=alt.value(1.0), if_false=alt.value(0.8)
)
OpacityCondition2 = alt.condition(
    predicate=select_bar2, if_true=alt.value(1.0), if_false=alt.value(0.8)
)

# Reasons keep their palette colour while selected; everything else is greyed.
ColorCondition = alt.condition(
    predicate=select_reason,
    if_true=alt.Color(
        "label",
        type="nominal",
        title="reasons",
        scale=palette,
        sort=alt.EncodingSortField(field="value", order="ascending"),
    ),
    if_false=alt.value("lightgrey"),
)

# Common starting point for the three age panels: filtered to the chosen year.
base = (
    alt.Chart(age_df)
    .add_selection(select_year)
    .transform_filter(select_year)
    .properties(width=250)
)

# Left panel: percentage of each age group with Internet access at home.
left2 = (
    base.transform_filter(alt.datum.type == "access")
    .mark_bar()
    .encode(
        x=alt.X(
            "value:Q",
            title=None,
            scale=alt.Scale(domain=[0, 100]),
            # Descending sort reverses the axis, giving the back-to-back layout.
            sort=alt.SortOrder("descending"),
        ),
        y=alt.Y(
            "age:N",
            title=None,
            axis=None,
            sort=alt.EncodingSortField(field="order", order="ascending"),
        ),
        color=alt.value("#136"),
        opacity=OpacityCondition1,
        tooltip="value",
    )
    .properties(title="% having access to Internet at home")
    .add_selection(select_bar1)
)

# Middle panel: the age group labels, shared by the two bar charts.
middle2 = (
    base.mark_text()
    .encode(
        y=alt.Y(
            "age:N",
            axis=None,
            sort=alt.EncodingSortField(field="order", order="ascending"),
        ),
        text=alt.Text("age:N"),
    )
    .properties(width=20)
)

# Right panel: stacked bars of the stated reasons for not going online at home.
right2 = (
    base.transform_filter(alt.datum.type == "reasons")
    .mark_bar()
    .encode(
        x=alt.X(
            "value:Q",
            title=None,
            scale=alt.Scale(domain=[0, 100]),
        ),
        y=alt.Y(
            "age:N",
            title=None,
            axis=None,
            sort=alt.EncodingSortField(field="order", order="ascending"),
        ),
        color=ColorCondition,
        # Stack segments by value, largest first.
        order=alt.Order("value", sort="descending"),
        opacity=OpacityCondition2,
        tooltip=["label", "value"],
    )
    .properties(title="% reasons for not going online at home")
    .add_selection(select_bar2)
    .add_selection(select_reason)
)


# age_charts = alt.concat(left, middle, right, spacing=5, title="Digital Divides Across Age Groups").configure_legend(
#     orient='bottom'
# ).resolve_scale(
#     color="independent", 
# #     y="shared"
# ).configure_axis(
#     grid=False
# ).configure_view(
#     strokeOpacity=0
# ).configure_legend(
#     orient="right"
# ).configure_title(
#     fontSize=15,
# #     font="Courier",
# #     align="center",
# #     anchor="middle",
# #     color="gray"
# )

# .save("age.html")

# age_charts = alt.concat(left, middle, right, spacing=5, title="Digital Divides Across Age Groups")

# Race

## setup

In [14]:
# Load the two race tables.  Despite its name, `race_access` is read from the
# "no access" file.
race_access, race_reasons = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/race-no-access.csv",
        "data/race-reasons.csv",  # 2010, 2015, 2017
    )
)

In [15]:
# Long-format "access" table: one row per (race, year).
df5 = (
    race_access
    # NOTE(review): the income no-access CSV carries a saved row index as an
    # "Unnamed: 0" column; presumably this file was written the same way.  If
    # so, melt would turn that column into junk rows.  The drop is a no-op
    # when the column is absent.
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .melt(id_vars=["race"])
    .assign(
        # First four characters of the column name are the survey year.
        year=lambda long: long["variable"].str[:4],
        # no access --> have access
        value=lambda long: 100 - long["value"],
        label="with access at home",
        type="access",
    )
)
# df5.head(20)

In [16]:
# Long-format "reasons" table: one row per (race, year, reason).
df6 = race_reasons.melt(id_vars=["race"]).assign(
    # First four characters of the column name are the survey year.
    year=lambda long: long["variable"].map(lambda name: name[:4]),
    # Translate the column name into the readable reason text.
    label=lambda long: long["variable"].map(annotate_reason),
    type="reasons",
)
# df6.head()

In [17]:
# One long table holding both the access rows and the reasons rows.
race_df = pd.concat([df5, df6])

# Display order of the race categories (by internet access data); the list
# position is the rank.
race_groups = [
    "Asian",
    "Pacific Islander",
    "White",
    "Hispanic",
    "Black",
    "AI/AN",
]
race_order = {group: rank for rank, group in enumerate(race_groups, start=1)}

# Numeric rank used as the y-axis sort field; an unknown category raises KeyError.
race_df["order"] = race_df["race"].map(lambda group: race_order[group])
# race_df.head()

In [18]:
# Radio buttons that choose which survey year is displayed.
radio_year = alt.binding_radio(name="year: ", options=["2010", "2015", "2017"])
select_year = alt.selection_single(
    name="select",
    fields=["year"],
    bind=radio_year,
    init={"year": "2010"},
)

# Hover selections, one per bar chart; a click clears the selection.
select_bar1 = alt.selection_single(on="mouseover", nearest=True, clear="click")
select_bar2 = alt.selection_single(on="mouseover", clear="click")

# Selection driven by clicks on the colour legend.
select_reason = alt.selection_single(fields=["label"], bind="legend")

# Selected bars are fully opaque, all others slightly faded.
OpacityCondition1 = alt.condition(
    predicate=select_bar1, if_true=alt.value(1.0), if_false=alt.value(0.8)
)
OpacityCondition2 = alt.condition(
    predicate=select_bar2, if_true=alt.value(1.0), if_false=alt.value(0.8)
)

# Reasons keep their palette colour while selected; everything else is greyed.
ColorCondition = alt.condition(
    predicate=select_reason,
    if_true=alt.Color(
        "label",
        type="nominal",
        title="reasons",
        scale=palette,
        sort=alt.EncodingSortField(field="value", order="ascending"),
    ),
    if_false=alt.value("lightgrey"),
)

# Common starting point for the three race panels: filtered to the chosen year.
base = (
    alt.Chart(race_df)
    .add_selection(select_year)
    .transform_filter(select_year)
    .properties(width=250)
)

# Left panel: percentage of each race category with Internet access at home.
left3 = (
    base.transform_filter(alt.datum.type == "access")
    .mark_bar()
    .encode(
        x=alt.X(
            "value:Q",
            title=None,
            scale=alt.Scale(domain=[0, 100]),
            # Descending sort reverses the axis, giving the back-to-back layout.
            sort=alt.SortOrder("descending"),
        ),
        y=alt.Y(
            "race:N",
            title=None,
            axis=None,
            sort=alt.EncodingSortField(field="order", order="ascending"),
        ),
        color=alt.value("#136"),
        opacity=OpacityCondition1,
        tooltip="value",
    )
    .properties(title="% having access to Internet at home")
    .add_selection(select_bar1)
)

# Middle panel: the race labels, shared by the two bar charts.
middle3 = (
    base.mark_text()
    .encode(
        y=alt.Y(
            "race:N",
            axis=None,
            sort=alt.EncodingSortField(field="order", order="ascending"),
        ),
        text=alt.Text("race:N"),
    )
    .properties(width=20)
)

# Right panel: stacked bars of the stated reasons for not going online at home.
right3 = (
    base.transform_filter(
        # Keep the reason rows and leave out the catch-all bucket.  The reason
        # text lives in the "label" column; the frame has no "reason" column,
        # so a test on datum.reason would never exclude anything.
        (alt.datum.type == "reasons") & (alt.datum.label != "Other reasons")
    )
    .mark_bar()
    .encode(
        x=alt.X(
            "value:Q",
            title=None,
            scale=alt.Scale(domain=[0, 100]),
        ),
        y=alt.Y(
            "race:N",
            title=None,
            axis=None,
            sort=alt.EncodingSortField(field="order", order="ascending"),
        ),
        color=ColorCondition,
        # Stack segments by value, largest first.
        order=alt.Order("value", sort="descending"),
        opacity=OpacityCondition2,
        tooltip=["label", "value"],
    )
    .properties(title="% reasons for not going online at home")
    .add_selection(select_bar2)
    .add_selection(select_reason)
)


# race_charts = alt.concat(left, middle, right, spacing=5, title="Digital Divides Across Races").configure_legend(
#     orient='bottom'
# ).resolve_scale(
#     color="independent", 
# #     y="shared"
# ).configure_axis(
#     grid=False
# ).configure_view(
#     strokeOpacity=0
# ).configure_legend(
#     orient="right"
# ).configure_title(
#     fontSize=15,
# #     font="Courier",
# #     align="center",
# #     anchor="middle",
# #     color="gray"
# )

# .save("race.html")

color schemes: https://vega.github.io/vega/docs/schemes/

# Combined

In [22]:
# One row per demographic dimension: [access bars | labels | reason bars].
income_charts = alt.concat(
    left1, middle1, right1,
    spacing=20,
    title="Digital Divides Across Income Levels",
)
age_charts = alt.concat(
    left2, middle2, right2,
    spacing=28,
    title="Digital Divides Across Age Groups",
)
race_charts = alt.concat(
    left3, middle3, right3,
    spacing=18,
    title="Digital Divides Across Races",
)

# Stack the three rows vertically and apply the shared styling.  The legend
# orientation is configured once; the original set the same value twice.
(
    (income_charts & age_charts & race_charts)
    .resolve_scale(color="independent", y="shared")
    .configure_axis(grid=False)
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="right")
    .configure_title(fontSize=15)
)

## Notes

This back-to-back bar chart demonstrates digital divides across three demographic dimensions.
- left: a bar chart showing the percentage of people having access to Internet at home
- right: a stacked bar chart showing the percentage of each reason why people choose not to go online at home
    - the stacked bars are sorted, with the most common reason placed at the left
    - the color palette:
        - dark, salient colors: highlight those external reasons (people are not going online because they cannot go online due to limited budgets, no available facilities, etc.)
        - light colors: internal/subjective reasons (people don't **want** to go online at home)
- interactions:
    - hovering over a left-side bar: shows the exact percentage
    - hovering over a right-side bar segment: shows the reason and the exact percentage
    - clicking on a legend entry: keeps that reason's segments coloured and greys out all other reasons
    

Future plans:
- will highlight the gaps between the left bars (using auxiliary line and texts)
- will add more text annotations for the bars on the right
- will try to align bars when clicking on the legend (for easier comparisons)
- the axes are not yet sorted by the percentage of people with Internet access, partly because some demographic variables are ordinal and already have a natural order; a way to present the ranking still needs to be worked out