### Set Up

In [1]:
import pandas as pd
import numpy as np
import altair as alt
from IPython.display import Image

# Suppress FutureWarning
import warnings
warnings.filterwarnings("ignore")

### Read in Data

In [2]:
# Load the raw survey export. The file is decoded as ISO-8859-1 because
# reading it with the 'ascii' codec raised an error.
data = pd.read_csv(
    'data/my_data(v3).csv',
    encoding='ISO-8859-1',
)

# JADE - INTERACTIVE VIZ #1

In [3]:
# Keep only the rows whose 'is_athlete' label is exactly "Athlete" or
# "Non-Athlete"; any other value in that column is excluded.
athlete_status_mask = data['is_athlete'].isin(["Athlete", "Non-Athlete"])
filtered_data = data.loc[athlete_status_mask]

### Occupation

In [4]:
# Average 'Happy' score for every (Occupation, is_athlete) combination
grouped_occupation_data = (
    filtered_data
    .groupby(['Occupation', 'is_athlete'])['Happy']
    .mean()
    .reset_index()
)

# One row per occupation, one column per athlete status.
# The two columns are selected by label (reindex) instead of being overwritten
# by position: a positional rename raises when one of the groups is absent and
# silently depends on the alphabetical order produced by pivot. A missing group
# now yields an all-NaN column, which the dropna below turns into an empty frame.
pivot_occupation_data = (
    grouped_occupation_data
    .pivot(index='Occupation', columns='is_athlete', values='Happy')
    .reindex(columns=['Athlete', 'Non-Athlete'])
    .rename_axis(columns=None)  # drop the leftover 'is_athlete' columns label
)

# Make 'Occupation' a regular column again and keep only occupations that have
# an average for both athletes and non-athletes
occupation_df = pivot_occupation_data.reset_index().dropna()

# NOTE(review): the displayed result lists 'Civil Servant' and 'Civil servant'
# as separate occupations; consider normalising case/whitespace in 'Occupation'
# before grouping so that spelling variants are averaged together.

# Display the resulting DataFrame
occupation_df

Unnamed: 0,Occupation,Athlete,Non-Athlete
0,Academic,3.0,2.666667
3,Accountant,4.333333,3.333333
10,Administrator,2.5,3.0
23,Auditor,3.0,4.0
57,Civil Engineer,4.0,4.0
58,Civil Servant,3.5,3.571429
61,Civil servant,3.125,3.0
62,Civil service,4.0,2.5
68,Coach,4.0,4.0
86,Doctor,4.0,4.0


In [5]:
# Connector endpoints per occupation: x1 is the smaller and x2 the larger of the
# two group averages, whichever group they belong to.
occupation_scores = occupation_df[['Athlete', 'Non-Athlete']]
occupation_coordinate_df = occupation_df[['Occupation']].assign(
    x1=occupation_scores.min(axis=1),
    x2=occupation_scores.max(axis=1),
)

occupation_coordinate_df

Unnamed: 0,Occupation,x1,x2
0,Academic,2.666667,3.0
3,Accountant,3.333333,4.333333
10,Administrator,2.5,3.0
23,Auditor,3.0,4.0
57,Civil Engineer,4.0,4.0
58,Civil Servant,3.5,3.571429
61,Civil servant,3.0,3.125
62,Civil service,2.5,4.0
68,Coach,4.0,4.0
86,Doctor,4.0,4.0


In [6]:
# Dumbbell chart of average happiness per occupation:
# a red dot for athletes, a blue dot for non-athletes and a connector between them.
# Every layer uses the same x-axis title; layers with different titles on a
# shared axis get their titles concatenated.
chart_athlete = alt.Chart(occupation_df).mark_circle(color = "red", size = 70).encode(
    y = alt.Y('Occupation:N', title='Occupation'),
    x = alt.X('Athlete:Q', title = 'Average Happiness')
).properties(
    width=600,
    height=800
)

chart_non_athlete = alt.Chart(occupation_df).mark_circle(color = "blue", size = 70).encode(
    y = alt.Y('Occupation:N', title='Occupation'),
    x = alt.X('Non-Athlete:Q', title = 'Average Happiness')
).properties(
    width=600,
    height=800
)

# The connector is a rule mark: rules draw a segment from x to x2 on each row.
# A line mark does not use x2 and would instead join the x1 values of
# consecutive occupations into one zig-zag path.
chart_line = alt.Chart(occupation_coordinate_df).mark_rule().encode(
    y=alt.Y('Occupation:N', title='Occupation'),
    x=alt.X('x1:Q', title = 'Average Happiness'),
    x2=alt.X2('x2:Q')
)

# Connector first so that the dots are painted on top of it
chart_combined = chart_line + chart_athlete + chart_non_athlete

chart_combined = chart_combined.configure_axis(
    gridDash=[2, 2],  # Set dash style for grid lines
    tickCount=10,  # Set the number of ticks
    labelFlush=False,  # Prevent tick labels from being cut off
).configure_view(
    stroke=None  # Remove border
).properties(
    title = "TEMP TITLE" #TODO
)

chart_combined

### Gender

In [7]:
# Average 'Happy' score for every (Gender, is_athlete) combination
grouped_gender_data = (
    filtered_data
    .groupby(['Gender', 'is_athlete'])['Happy']
    .mean()
    .reset_index()
)

# One row per gender, one column per athlete status.
# The two columns are selected by label (reindex) instead of being overwritten
# by position: a positional rename raises when one of the groups is absent and
# silently depends on the alphabetical order produced by pivot.
pivot_gender_data = (
    grouped_gender_data
    .pivot(index='Gender', columns='is_athlete', values='Happy')
    .reindex(columns=['Athlete', 'Non-Athlete'])
    .rename_axis(columns=None)  # drop the leftover 'is_athlete' columns label
)

# Make 'Gender' a regular column again and keep only genders that have an
# average for both athletes and non-athletes.
# (A stray literal `2` that sat between these statements has been removed.)
gender_df = pivot_gender_data.reset_index().dropna()

# Display the resulting DataFrame
gender_df

Unnamed: 0,Gender,Athlete,Non-Athlete
0,Female,3.666667,3.481982
1,Male,3.55914,3.706767


In [8]:
# Connector endpoints per gender: x1 is the smaller and x2 the larger of the
# two group averages, whichever group they belong to.
gender_scores = gender_df[['Athlete', 'Non-Athlete']]
gender_coordinate_df = gender_df[['Gender']].assign(
    x1=gender_scores.min(axis=1),
    x2=gender_scores.max(axis=1),
)

gender_coordinate_df

Unnamed: 0,Gender,x1,x2
0,Female,3.481982,3.666667
1,Male,3.55914,3.706767


In [9]:
# Combine the occupation and gender averages into one wide frame.
# The join keys ('Occupation' vs 'Gender') hold different kinds of labels, so the
# outer join keeps the rows of both frames side by side; the suffixes tell the
# two sets of Athlete / Non-Athlete columns apart.
merged_df = occupation_df.merge(
    gender_df,
    how='outer',
    left_on='Occupation',
    right_on='Gender',
    suffixes=('_Occupation', '_Gender'),
)

merged_df

Unnamed: 0,Occupation,Athlete_Occupation,Non-Athlete_Occupation,Gender,Athlete_Gender,Non-Athlete_Gender
0,Academic,3.0,2.666667,,,
1,Accountant,4.333333,3.333333,,,
2,Administrator,2.5,3.0,,,
3,Auditor,3.0,4.0,,,
4,Civil Engineer,4.0,4.0,,,
5,Civil Servant,3.5,3.571429,,,
6,Civil servant,3.125,3.0,,,
7,Civil service,4.0,2.5,,,
8,Coach,4.0,4.0,,,
9,Doctor,4.0,4.0,,,


In [10]:
# Combine the occupation and gender connector endpoints into one wide frame,
# using the same outer join and suffixes as for the averages.
merged_coordinate_df = occupation_coordinate_df.merge(
    gender_coordinate_df,
    how='outer',
    left_on='Occupation',
    right_on='Gender',
    suffixes=('_Occupation', '_Gender'),
)

merged_coordinate_df

Unnamed: 0,Occupation,x1_Occupation,x2_Occupation,Gender,x1_Gender,x2_Gender
0,Academic,2.666667,3.0,,,
1,Accountant,3.333333,4.333333,,,
2,Administrator,2.5,3.0,,,
3,Auditor,3.0,4.0,,,
4,Civil Engineer,4.0,4.0,,,
5,Civil Servant,3.5,3.571429,,,
6,Civil servant,3.0,3.125,,,
7,Civil service,2.5,4.0,,,
8,Coach,4.0,4.0,,,
9,Doctor,4.0,4.0,,,


In [11]:
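# NOTE(review): this whole cell is a disabled draft. Its field names
# ('Athlete_occupation', 'Non-Athlete_occupation', 'start_occupation',
# 'end_occupation') do not match the columns of merged_df / merged_coordinate_df
# ('Athlete_Occupation', 'Non-Athlete_Occupation', 'x1_Occupation',
# 'x2_Occupation'); fix them before re-enabling, or delete the cell.
# chart_athlete = alt.Chart(merged_df).mark_circle(color = "red", size = 70).encode(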
#     y = alt.Y('Occupation:N', title='Occupation'),
#     x = alt.X('Athlete_occupation:Q', title = 'Average Happiness')
# ).properties(
#     width=600,
#     height=800
# )

# chart_non_athlete = alt.Chart(merged_df).mark_circle(color = "blue", size = 70).encode(
#     y = alt.Y('Occupation:N', title='Occupation'),
#     x = alt.X('Non-Athlete_occupation:Q')
# ).properties(
#     width=600,
#     height=800
# )

# chart_line = alt.Chart(merged_coordinate_df).mark_line().encode(
#     y=alt.Y('Occupation:N', title='Occupation'),
#     x=alt.X('start_occupation:Q'),
#     x2=alt.X2('end_occupation:Q')
# )

# chart_combined = chart_line + chart_athlete + chart_non_athlete

# chart_combined = chart_combined.configure_axis(
#     gridDash=[2, 2],  # Set dash style for grid lines
#     tickCount=10,  # Set the number of ticks
#     labelFlush=False,  # Prevent tick labels from being cut off
# ).configure_view(
#     stroke=None  # Remove border
# ).properties(
#     title = "TEMP TITLE" #TODO
# )

# chart_combined

### DROPDOWN

In [12]:
# Dropdown: every option is the name of a numeric merged_df column holding the
# athletes' average happiness for one breakdown. (The previous 'Gender' option
# pointed at the text column with the gender labels, which cannot be plotted on
# a quantitative axis.) `labels` are the texts shown to the user.
dropdown = alt.binding_select(
    options=['Athlete_Occupation', 'Athlete_Gender'],  # TODO: add more options
    labels=['Occupation', 'Gender'],
    name='Filter'
)
# alt.param creates a chart parameter whose current value is the column name
# picked in the dropdown; Vega expressions refer to it through `filter.name`.
# NOTE(review): `filter` shadows the Python builtin of the same name; the name
# is kept so that existing references to it keep working.
filter = alt.param(
    value='Athlete_Occupation',  # Default selected option
    bind=dropdown
)

chart_athlete = alt.Chart(merged_df).mark_circle(color = "red", size = 70).transform_calculate(
    # Value of the column currently selected in the dropdown
    x=f'datum[{filter.name}]',
    # Row label: merged_df keeps it in 'Occupation' for occupation rows and in
    # 'Gender' for gender rows (the other column is null on that row)
    category='isValid(datum.Occupation) ? datum.Occupation : datum.Gender'
).transform_filter(
    # Drop the rows that belong to the breakdown that is not selected
    'isValid(datum.x)'
).encode(
    y = alt.Y('category:N', title=None),
    x = alt.X('x:Q').title('Average Happiness'),
).add_params(
    filter
).properties(
    width=600,
    height=800
)

chart_athlete

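# NOTE(review): the layers below are a disabled draft. `filter.value` inside the
# f-strings is read once in Python when the chart is built, so these encodings
# would stay on the default option instead of following the dropdown.
# chart_non_athlete = alt.Chart(merged_df).mark_circle(color = "blue", size = 70).encode(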
#     y = alt.Y('Occupation:N', title='Occupation'),
#     x = alt.X(f'Non-Athlete_{filter.value}:Q')
# ).transform_calculate(
#     x=f'datum[{filter.name}]'
# ).add_params(
#     filter
# ).properties(
#     width=600,
#     height=800
# )

# chart_line = alt.Chart(merged_coordinate_df).mark_line().encode(
#     y=alt.Y('Occupation:N', title='Occupation'),
#     x=alt.X(f'x1_{filter.value}:Q'),
#     x2=alt.X2(f'x2_{filter.value}:Q')
# ).transform_calculate(
#     x=f'datum[{filter.name}]'
# ).add_params(
#     filter
# )

# chart_combined = chart_line + chart_athlete + chart_non_athlete

# chart_combined = chart_combined.configure_axis(
#     gridDash=[2, 2],  # Set dash style for grid lines
#     tickCount=10,  # Set the number of ticks
#     labelFlush=False,  # Prevent tick labels from being cut off
# ).configure_view(
#     stroke=None  # Remove border
# ).properties(
#     title = "TEMP TITLE" #TODO
# )

# chart_combined

In [13]:
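# NOTE(review): disabled earlier draft of the dropdown cell. It defines
# `filter_select` but then reads `filter.value` / `filter.name`, so it would
# rely on a `filter` created by another cell; fix or delete before publishing.
# # Dropdown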
# dropdown = alt.binding_select(
#     options=['Occupation', 'Gender'],  # TODO: add more options
#     name='Filter'
# )
# filter_select = alt.param( #TODO - what does this do?
#     value='Occupation',  # Default selected option
#     bind=dropdown
# )

# chart_athlete = alt.Chart(merged_df).mark_circle(color = "red", size = 70).encode(
#     y = alt.Y('Occupation:N', title='Occupation'),
#     x = alt.X(f'Athlete_{filter.value}:Q', title = 'Average Happiness')
# ).transform_calculate(
#     x=f'datum[{filter.name}]'
# ).add_params(
#     filter
# ).properties(
#     width=600,
#     height=800
# )

# chart_athlete

# # chart_non_athlete = alt.Chart(merged_df).mark_circle(color = "blue", size = 70).encode(
# #     y = alt.Y('Occupation:N', title='Occupation'),
# #     x = alt.X(f'Non-Athlete_{filter.value}:Q')
# # ).transform_calculate(
# #     x=f'datum[{filter.name}]'
# # ).add_params(
# #     filter
# # ).properties(
# #     width=600,
# #     height=800
# # )

# # chart_line = alt.Chart(merged_coordinate_df).mark_line().encode(
# #     y=alt.Y('Occupation:N', title='Occupation'),
# #     x=alt.X(f'x1_{filter.value}:Q'),
# #     x2=alt.X2(f'x2_{filter.value}:Q')
# # ).transform_calculate(
# #     x=f'datum[{filter.name}]'
# # ).add_params(
# #     filter
# # )

# # chart_combined = chart_line + chart_athlete + chart_non_athlete

# # chart_combined = chart_combined.configure_axis(
# #     gridDash=[2, 2],  # Set dash style for grid lines
# #     tickCount=10,  # Set the number of ticks
# #     labelFlush=False,  # Prevent tick labels from being cut off
# # ).configure_view(
# #     stroke=None  # Remove border
# # ).properties(
# #     title = "TEMP TITLE" #TODO
# # )

# # chart_combined