### Set Up

In [1]:
import pandas as pd
import numpy as np
import altair as alt
from IPython.display import Image

# Suppress FutureWarning
import warnings
warnings.filterwarnings("ignore")

### Read in Data

In [2]:
#Used more robust encoding 'ISO-8859-1' instead of 'ascii' (got error)
data = pd.read_csv('data/my_data(v4).csv', encoding='ISO-8859-1')

# JADE - INTERACTIVE VIZ #1

In [3]:
# Filter the DataFrame for rows where 'is_athlete' is Athlete, Non-Athlete
filtered_data = data[data['is_athlete'].isin(["Athlete", "Non-Athlete"])]

### Occupation

In [4]:
# Group by 'Occupation' and 'is_athlete', calculate the average of 'happy' for each group
grouped_occupation_data = filtered_data.groupby(['Occupation', 'is_athlete'])['Happy'].mean().reset_index()

# Round the 'Happy' column averages to the nearest 0.5, handling NaN values
grouped_occupation_data['Happy'] = grouped_occupation_data['Happy'].apply(lambda x: np.nan if pd.isna(x) else round(x * 4) / 4)

# Pivot the table to have 'Occupation' as rows, 'is_athlete' as columns, and 'happy' values in the cells
pivot_occupation_data = grouped_occupation_data.pivot(index='Occupation', columns='is_athlete', values='Happy')

# Rename columns to 'Athlete' and 'Non-Athlete'
pivot_occupation_data.columns = ['Athlete', 'Non-Athlete']

# Reset the index to make 'Occupation' a column again
occupation_df = pivot_occupation_data.reset_index()

# Drop rows with NaN values
occupation_df = occupation_df.dropna()

# Create a new DataFrame with 'y', 'Athlete', 'Non-Athlete', 'x1', and 'x2'
occupation_coordinate_df = pd.DataFrame({
    'y': occupation_df['Occupation'],
    'Athlete': occupation_df['Athlete'],
    'Non-Athlete': occupation_df['Non-Athlete'],
    'x1': occupation_df[['Athlete', 'Non-Athlete']].min(axis=1),
    'x2': occupation_df[['Athlete', 'Non-Athlete']].max(axis=1)
})

# Display the resulting DataFrame
occupation_coordinate_df

Unnamed: 0,y,Athlete,Non-Athlete,x1,x2
0,Academic,3.0,2.75,2.75,3.0
3,Accountant,4.25,3.25,3.25,4.25
10,Administrator,2.5,3.0,2.5,3.0
23,Auditor,3.0,4.0,3.0,4.0
57,Civil Engineer,4.0,4.0,4.0,4.0
58,Civil Servant,3.5,3.5,3.5,3.5
61,Civil servant,3.0,3.0,3.0,3.0
62,Civil service,4.0,2.5,2.5,4.0
68,Coach,4.0,4.0,4.0,4.0
86,Doctor,4.0,4.0,4.0,4.0


### Gender

In [5]:
# Group by 'Gender' and 'is_athlete', calculate the average of 'happy' for each group
grouped_gender_data = filtered_data.groupby(['Gender', 'is_athlete'])['Happy'].mean().reset_index()

# Round the 'Happy' column averages to the nearest 0.5, handling NaN values
grouped_gender_data['Happy'] = grouped_gender_data['Happy'].apply(lambda x: np.nan if pd.isna(x) else round(x * 4) / 4)

# Pivot the table to have 'Gender' as rows, 'is_athlete' as columns, and 'happy' values in the cells
pivot_gender_data = grouped_gender_data.pivot(index='Gender', columns='is_athlete', values='Happy')

# Rename columns to 'Athlete' and 'Non-Athlete'
pivot_gender_data.columns = ['Athlete', 'Non-Athlete']

# Reset the index to make 'Gender' a column again
gender_df = pivot_gender_data.reset_index()

# Drop rows with NaN values
gender_df = gender_df.dropna()

# Create a new DataFrame with 'Gender', 'Athlete', 'Non-Athlete', 'x1', and 'x2'
gender_coordinate_df = pd.DataFrame({
    'y': gender_df['Gender'],
    'Athlete': gender_df['Athlete'],
    'Non-Athlete': gender_df['Non-Athlete'],
    'x1': gender_df[['Athlete', 'Non-Athlete']].min(axis=1),
    'x2': gender_df[['Athlete', 'Non-Athlete']].max(axis=1)
})

# Display the resulting DataFrame
gender_coordinate_df


Unnamed: 0,y,Athlete,Non-Athlete,x1,x2
0,Female,3.75,3.5,3.5,3.75
1,Male,3.5,3.75,3.5,3.75


### Age Group

In [6]:
# Group by 'Age Group' and 'is_athlete', calculate the average of 'happy' for each group
grouped_age_group_data = filtered_data.groupby(['AgeGroup', 'is_athlete'])['Happy'].mean().reset_index()

# Round the 'Happy' column averages to the nearest 0.5, handling NaN values
grouped_age_group_data['Happy'] = grouped_age_group_data['Happy'].apply(lambda x: np.nan if pd.isna(x) else round(x * 4) / 4)

# Pivot the table to have 'Age Group' as rows, 'is_athlete' as columns, and 'happy' values in the cells
pivot_age_group_data = grouped_age_group_data.pivot(index='AgeGroup', columns='is_athlete', values='Happy')

# Rename columns to 'Athlete' and 'Non-Athlete'
pivot_age_group_data.columns = ['Athlete', 'Non-Athlete']

# Reset the index to make 'Age Group' a column again
age_group_df = pivot_age_group_data.reset_index()

# Drop rows with NaN values
age_group_df = age_group_df.dropna()

# Create a new DataFrame with 'Age Group', 'Athlete', 'Non-Athlete', 'x1', and 'x2'
age_group_coordinate_df = pd.DataFrame({
    'y': age_group_df['AgeGroup'],
    'Athlete': age_group_df['Athlete'],
    'Non-Athlete': age_group_df['Non-Athlete'],
    'x1': age_group_df[['Athlete', 'Non-Athlete']].min(axis=1),
    'x2': age_group_df[['Athlete', 'Non-Athlete']].max(axis=1)
})

# Display the resulting DataFrame
age_group_coordinate_df

Unnamed: 0,y,Athlete,Non-Athlete,x1,x2
0,18-20,3.5,3.5,3.5,3.5
1,21-30,3.75,3.5,3.5,3.75
2,31-40,3.5,3.5,3.5,3.5
3,41-50,3.5,3.5,3.5,3.5
4,51-60,4.0,3.5,3.5,4.0
5,61-70,4.25,4.0,4.0,4.25
6,71+,4.0,3.75,3.75,4.0


### Country During Lockdown

In [7]:
# Group by 'CountryDuringLockdown' and 'is_athlete', calculate the average of 'happy' for each group
grouped_country_data = filtered_data.groupby(['CountryDuringLockdown', 'is_athlete'])['Happy'].mean().reset_index()

# Round the 'Happy' column averages to the nearest 0.5, handling NaN values
grouped_country_data['Happy'] = grouped_country_data['Happy'].apply(lambda x: np.nan if pd.isna(x) else round(x * 4) / 4)

# Pivot the table to have 'CountryDuringLockdown' as rows, 'is_athlete' as columns, and 'happy' values in the cells
pivot_country_data = grouped_country_data.pivot(index='CountryDuringLockdown', columns='is_athlete', values='Happy')

# Rename columns to 'Athlete' and 'Non-Athlete'
pivot_country_data.columns = ['Athlete', 'Non-Athlete']

# Reset the index to make 'CountryDuringLockdown' a column again
country_df = pivot_country_data.reset_index()

# Drop rows with NaN values
country_df = country_df.dropna()

# Create a new DataFrame with 'CountryDuringLockdown', 'Athlete', 'Non-Athlete', 'x1', and 'x2'
country_during_lockdown_coordinate_df = pd.DataFrame({
    'y': country_df['CountryDuringLockdown'],
    'Athlete': country_df['Athlete'],
    'Non-Athlete': country_df['Non-Athlete'],
    'x1': country_df[['Athlete', 'Non-Athlete']].min(axis=1),
    'x2': country_df[['Athlete', 'Non-Athlete']].max(axis=1)
})

# Display the resulting DataFrame
country_during_lockdown_coordinate_df


Unnamed: 0,y,Athlete,Non-Athlete,x1,x2
0,Australia,4.0,2.0,2.0,4.0
2,Ireland,3.75,3.75,3.75,3.75
6,UK,3.5,3.5,3.5,3.5


### COMBINE DATAFRAMES

In [8]:
# Combine DataFrames
combined_df = pd.concat([occupation_coordinate_df.assign(Dataset='Occupation'),
                         gender_coordinate_df.assign(Dataset='Gender'),
                         age_group_coordinate_df.assign(Dataset='Age Group'),
                         country_during_lockdown_coordinate_df.assign(Dataset='Country During Lockdown')])

combined_df

Unnamed: 0,y,Athlete,Non-Athlete,x1,x2,Dataset
0,Academic,3.0,2.75,2.75,3.0,Occupation
3,Accountant,4.25,3.25,3.25,4.25,Occupation
10,Administrator,2.5,3.0,2.5,3.0,Occupation
23,Auditor,3.0,4.0,3.0,4.0,Occupation
57,Civil Engineer,4.0,4.0,4.0,4.0,Occupation
58,Civil Servant,3.5,3.5,3.5,3.5,Occupation
61,Civil servant,3.0,3.0,3.0,3.0,Occupation
62,Civil service,4.0,2.5,2.5,4.0,Occupation
68,Coach,4.0,4.0,4.0,4.0,Occupation
86,Doctor,4.0,4.0,4.0,4.0,Occupation


### DROPDOWN

In [9]:
dataset_selection = alt.selection_single(
    fields=['Dataset'],
    bind=alt.binding_select(options=['Occupation', 'Gender', 'Age Group', 'Country During Lockdown']),
    value = 'Occupation',
    name='Dataset'
)

chart_athlete = alt.Chart(combined_df).mark_circle(color= '#2459ed', size=70).encode(
    y=alt.Y('y:N', title='Category'),
    x=alt.X('Athlete:Q', title='Average Happiness'),
    tooltip = [
        alt.Tooltip('Athlete:Q', title = 'Average Happiness (Athlete)')
    ]   
).transform_filter(
    dataset_selection
)

chart_non_athlete = alt.Chart(combined_df).mark_circle(color = '#eda724', size = 70).encode(
    y = alt.Y('y:N', title='Category'),
    x = alt.X('Non-Athlete:Q'),
    tooltip = [
        alt.Tooltip('Non-Athlete:Q', title = 'Average Happiness (Non-Athlete)')
    ]   
).transform_filter(
    dataset_selection
)

chart_line = alt.Chart(combined_df).mark_line().encode(
    y=alt.Y('y:N', title='Category'),
    x=alt.X('x1:Q'),
    x2=alt.X2('x2:Q')
).transform_filter(
    dataset_selection
)

chart_combined = chart_line + chart_athlete + chart_non_athlete


chart_combined = chart_combined.add_selection(dataset_selection)

chart_combined

#TODO: add legend
#TODO: match color scheme
#TODO: add tooltip (exact value, amount of people, and for line: add difference)
#TODO: add back the config edit
#TODO: add another dropdown for different score?
#TODO: change dropdown title