### Set Up

In [18]:
import pandas as pd
import numpy as np
import altair as alt
from IPython.display import Image

# Suppress all warnings (filterwarnings("ignore") silences every category, not only FutureWarning)
import warnings
warnings.filterwarnings("ignore")

### Read in Data

In [37]:
# Decode the CSV as ISO-8859-1 (Latin-1); reading it as ASCII raised an error
data = pd.read_csv(
    filepath_or_buffer='data/my_data(v3).csv',
    encoding='ISO-8859-1',
)

# JADE - INTERACTIVE VIZ #1

In [38]:
# Keep only the rows whose 'is_athlete' value is 'Athlete' or 'Non-Athlete'.
# The explicit .copy() makes filtered_data an independent frame, so column
# assignments made on it later are not chained assignments on a selection
# of `data`.
filtered_data = data[data['is_athlete'].isin(["Athlete", "Non-Athlete"])].copy()

### Occupation

In [40]:
# Upper-case the occupation labels so differently-cased spellings share one
# group. `assign` returns a new frame rather than writing a column into the
# filtered selection, which avoids the chained-assignment pitfall.
filtered_data = filtered_data.assign(Occupation=filtered_data['Occupation'].str.upper())

# Average 'Happy' score for every (Occupation, is_athlete) pair, as a flat table
grouped_occupation_data = (
    filtered_data
    .groupby(['Occupation', 'is_athlete'])['Happy']
    .mean()
    .reset_index()
)

# Round each average to the nearest 0.25 (multiply by 4, round, divide by 4);
# missing averages stay NaN
grouped_occupation_data['Happy'] = grouped_occupation_data['Happy'].map(
    lambda avg: round(avg * 4) / 4 if pd.notna(avg) else np.nan
)

# One row per occupation, one column per 'is_athlete' value, 'Happy' in the cells
pivot_occupation_data = grouped_occupation_data.pivot(index='Occupation', columns='is_athlete', values='Happy')

# Give the pivoted columns plain labels (assumes both groups are present,
# otherwise the two-label assignment raises)
pivot_occupation_data.columns = ['Athlete', 'Non-Athlete']

# Make 'Occupation' a regular column again and keep only occupations that
# have an average for both groups
occupation_df = pivot_occupation_data.reset_index().dropna()

# Plotting frame: 'y' is the occupation label, 'x1'/'x2' are the smaller and
# larger of the two group averages
occupation_coordinate_df = occupation_df.rename(columns={'Occupation': 'y'}).assign(
    x1=lambda frame: frame[['Athlete', 'Non-Athlete']].min(axis=1),
    x2=lambda frame: frame[['Athlete', 'Non-Athlete']].max(axis=1),
)

### Gender

In [6]:
# Average 'Happy' score for every (Gender, is_athlete) pair, as a flat table
grouped_gender_data = (
    filtered_data
    .groupby(['Gender', 'is_athlete'])['Happy']
    .mean()
    .reset_index()
)

# Round each average to the nearest 0.25 (multiply by 4, round, divide by 4);
# missing averages stay NaN
grouped_gender_data['Happy'] = grouped_gender_data['Happy'].map(
    lambda avg: round(avg * 4) / 4 if pd.notna(avg) else np.nan
)

# One row per gender, one column per 'is_athlete' value, 'Happy' in the cells
pivot_gender_data = grouped_gender_data.pivot(index='Gender', columns='is_athlete', values='Happy')

# Give the pivoted columns plain labels
pivot_gender_data.columns = ['Athlete', 'Non-Athlete']

# Make 'Gender' a regular column again and drop rows containing NaN
gender_df = pivot_gender_data.reset_index().dropna()

# Plotting frame: 'y' is the gender label, 'x1'/'x2' are the smaller and
# larger of the two group averages
gender_coordinate_df = gender_df.rename(columns={'Gender': 'y'}).assign(
    x1=lambda frame: frame[['Athlete', 'Non-Athlete']].min(axis=1),
    x2=lambda frame: frame[['Athlete', 'Non-Athlete']].max(axis=1),
)

### Age Group

In [9]:
# Average 'Happy' score for every (Age Group, is_athlete) pair, as a flat table
grouped_age_group_data = (
    filtered_data
    .groupby(['Age Group', 'is_athlete'])['Happy']
    .mean()
    .reset_index()
)

# Round each average to the nearest 0.25 (multiply by 4, round, divide by 4);
# missing averages stay NaN
grouped_age_group_data['Happy'] = grouped_age_group_data['Happy'].map(
    lambda avg: round(avg * 4) / 4 if pd.notna(avg) else np.nan
)

# One row per age group, one column per 'is_athlete' value, 'Happy' in the cells
pivot_age_group_data = grouped_age_group_data.pivot(index='Age Group', columns='is_athlete', values='Happy')

# Give the pivoted columns plain labels
pivot_age_group_data.columns = ['Athlete', 'Non-Athlete']

# Make 'Age Group' a regular column again and drop rows containing NaN
age_group_df = pivot_age_group_data.reset_index().dropna()

# Plotting frame: 'y' is the age-group label, 'x1'/'x2' are the smaller and
# larger of the two group averages
age_group_coordinate_df = age_group_df.rename(columns={'Age Group': 'y'}).assign(
    x1=lambda frame: frame[['Athlete', 'Non-Athlete']].min(axis=1),
    x2=lambda frame: frame[['Athlete', 'Non-Athlete']].max(axis=1),
)

### Country During Lockdown

In [10]:
# Average 'Happy' score for every (Country During Lockdown, is_athlete) pair,
# as a flat table
grouped_country_data = (
    filtered_data
    .groupby(['Country During Lockdown', 'is_athlete'])['Happy']
    .mean()
    .reset_index()
)

# Round each average to the nearest 0.25 (multiply by 4, round, divide by 4);
# missing averages stay NaN
grouped_country_data['Happy'] = grouped_country_data['Happy'].map(
    lambda avg: round(avg * 4) / 4 if pd.notna(avg) else np.nan
)

# One row per country, one column per 'is_athlete' value, 'Happy' in the cells
pivot_country_data = grouped_country_data.pivot(index='Country During Lockdown', columns='is_athlete', values='Happy')

# Give the pivoted columns plain labels
pivot_country_data.columns = ['Athlete', 'Non-Athlete']

# Make 'Country During Lockdown' a regular column again and drop rows containing NaN
country_df = pivot_country_data.reset_index().dropna()

# Plotting frame: 'y' is the country value, 'x1'/'x2' are the smaller and
# larger of the two group averages
country_during_lockdown_coordinate_df = country_df.rename(columns={'Country During Lockdown': 'y'}).assign(
    x1=lambda frame: frame[['Athlete', 'Non-Athlete']].min(axis=1),
    x2=lambda frame: frame[['Athlete', 'Non-Athlete']].max(axis=1),
)

### COMBINE DATAFRAMES

In [11]:
# Stack the four per-category frames, tagging each row with the category it
# came from in a 'Dataset' column
combined_df = pd.concat([
    frame.assign(Dataset=label)
    for label, frame in (
        ('Occupation', occupation_coordinate_df),
        ('Gender', gender_coordinate_df),
        ('Age Group', age_group_coordinate_df),
        ('Country During Lockdown', country_during_lockdown_coordinate_df),
    )
])

In [12]:
# Save the combined frame for the first interactive chart, without the index column
combined_df.to_csv(
    path_or_buf='data/jade_viz_1.csv',
    index=False,
)

## Interactive #1 (v2)

In [41]:
# Signed gap between the two group averages: 'Athlete' minus 'Non-Athlete'
occupation_coordinate_df['Difference'] = occupation_coordinate_df['Athlete'].sub(
    occupation_coordinate_df['Non-Athlete']
)

# Rebuild the plotting frame from occupation_df ('y', 'Athlete', 'Non-Athlete',
# 'x1', 'x2') and attach the 'Difference' column computed above
occupation_coordinate_df_v2 = occupation_df.rename(columns={'Occupation': 'y'}).assign(
    x1=lambda frame: frame[['Athlete', 'Non-Athlete']].min(axis=1),
    x2=lambda frame: frame[['Athlete', 'Non-Athlete']].max(axis=1),
    Difference=occupation_coordinate_df['Difference'],
)

In [42]:
# Save the occupation frame (with 'Difference') for chart v2, without the index column
occupation_coordinate_df_v2.to_csv(
    path_or_buf='data/jade_viz_1(v2).csv',
    index=False,
)

## Interactive #2

In [14]:
# `rename` returns a new DataFrame, so `data` itself is left untouched.
# If the frame has no 'Athlete/Non-Athlete' column the rename is a no-op.
data3 = data.rename(columns={'Athlete/Non-Athlete': 'is_athlete'})

# Remove spaces from the column headers (e.g. 'Age Group' -> 'AgeGroup')
data4 = data3.copy()
data4.columns = data4.columns.str.replace(' ', '')

# AgeGroup: numeric code -> age-range label
data4['AgeGroup'] = data4['AgeGroup'].replace(
    {
        1: '18-20', 2: '21-30', 3: '31-40', 4: '41-50', 5: '51-60', 6: '61-70', 7: '71+'
    }
)

# CountryDuringLockdown: numeric code -> country name
data4['CountryDuringLockdown'] = data4['CountryDuringLockdown'].replace(
    {
        1: 'UK', 2: 'Ireland', 3: 'New Zealand',
        4: 'Australia', 5: 'Thailand', 6: 'Belgium', 7: 'Sweden'
    }
)

# MaritalStatus: numeric code -> label
data4['MaritalStatus'] = data4['MaritalStatus'].replace(
    {
        1: 'Single',
        2: 'Married/Cohabiting',
        3: 'Civil Partnership',
        4: 'Divorced',
        5: 'Widowed'
    }
)

# SmokingStatus: numeric code -> label
data4['SmokingStatus'] = data4['SmokingStatus'].replace(
    {
        1: 'Never',
        2: 'Ex-occasional smoker',
        3: 'Ex-smoker',
        4: 'Occasional',
        5: 'Half pack daily',
        6: 'Full pack daily',
        7: 'Multiple packs daily'
    }
)

# FiveFruitandVeg: numeric code -> Yes/No
data4['FiveFruitandVeg'] = data4['FiveFruitandVeg'].replace(
    {
        1: 'Yes',
        2: 'No'
    }
)

# Drop the rows whose PsychologicalWellbeing value is 5994
# (NOTE(review): presumably an invalid/sentinel entry - confirm).
# The explicit .copy() gives an independent frame, so the column assignments
# below are not chained assignments on a filtered selection.
data4 = data4[data4['PsychologicalWellbeing'] != 5994].copy()

# Bin hours of sleep into labelled ranges with a single mapping.
# NOTE(review): values outside the listed half-hour steps (e.g. below 1 or
# above 10.5) are left as raw numbers - confirm none occur in the data.
data4['Hourssleep'] = data4['Hourssleep'].replace(
    {
        **dict.fromkeys([1, 1.5, 2, 2.5, 3, 3.5], '< 4'),
        **dict.fromkeys([4, 4.5, 5, 5.5], '< 6'),
        **dict.fromkeys([6, 6.5, 7, 7.5], '< 8'),
        **dict.fromkeys([8, 8.5, 9, 9.5], '< 10'),
        **dict.fromkeys([10, 10.5], '10 +'),
    }
)

# Re-code the weeks-spent-social-distancing values.
# NOTE(review): 1 has no entry, so 0 and 1 both end up as 1, and the last
# step (8 -> 21) breaks the +3 pattern of the others - confirm intended.
data4['WeeksSocialDistancing'] = data4['WeeksSocialDistancing'].replace(
    {0: 1, 2: 4, 3: 7, 4: 10, 5: 13, 6: 16, 7: 19, 8: 21}
)

# Independent copy used for the second visualisation
data_viz2 = data4.copy()

# Define a function to map severity based on the given conditions
def map_severity(score):
    """Map a numeric score to a severity label.

    Bands:
        score <= 7        -> 'Normal'
        7 < score <= 10   -> 'Mild'
        score > 10        -> 'Severe'

    The bands are contiguous, so a fractional score between 7 and 8
    (possible when the input is an average) is labelled 'Mild' instead of
    falling through to 'Severe'. Integer scores are labelled as before.

    Args:
        score: Numeric score; may be missing (NaN/None).

    Returns:
        One of 'Normal', 'Mild' or 'Severe', or NaN when the score is
        missing, so that absent data is not reported as 'Severe'.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # score would fall through to the final 'Severe' branch.
    if pd.isna(score):
        return np.nan
    if score <= 7:
        return 'Normal'
    if score <= 10:
        return 'Mild'
    return 'Severe'

# Derive a severity label for each row from the two HADS average columns
data_viz2['Anxiety Severity'] = data_viz2['HADS-AAVERAGE'].map(map_severity)
data_viz2['Depression Severity'] = data_viz2['HADS-DAVERAGE'].map(map_severity)

In [15]:
# Save the prepared frame for the second interactive chart, without the index column
data_viz2.to_csv(
    path_or_buf='data/jade_viz_2.csv',
    index=False,
)