In [1]:
# import packages for analysis
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats


In [None]:
# Load the survey responses and discard rows that have no age value.
# Per the original author's note, the file contains one row that is entirely
# blank (age included); filtering on 'age' removes it.
df = (
    pd.read_csv('wellbeing_survey_responses.csv')
    .dropna(subset=['age'])
)

In [3]:
# Data Encoding

# Convert the timestamp column from object to datetime dtype.
df['response_timestamp'] = pd.to_datetime(df['response_timestamp'])

# Ordinal encoding for the age brackets; -1 marks "Prefer not to answer".
age_ordinal_values = {
    'Prefer not to answer': -1,
    '18 to 24 years': 1,
    '25 to 34 years': 2,
    '35 to 44 years': 3,
    '45 to 54 years': 4,
    '55 to 64 years': 5,
    '65+ years': 6,
}

# Any age label that is not a key of the mapping becomes NaN in age_encoded.
df['age_encoded'] = df['age'].map(age_ordinal_values)

# One-hot encode the categorical feature-learning column. dummy_na=True adds
# an extra indicator column for rows where feature_learn_method is missing.
flm_one_hot_df = pd.get_dummies(data=df['feature_learn_method'], prefix='flm', dummy_na=True)

# Remove any flm_* columns left over from a previous run of this cell before
# concatenating; otherwise every re-run appends another copy of the dummy
# columns and leaves the frame with duplicate column names.
# NOTE(review): assumes the raw CSV has no columns of its own named flm_<value>.
df = pd.concat(
    [df.drop(columns=flm_one_hot_df.columns, errors='ignore'), flm_one_hot_df],
    axis=1,
)

# encode the categorical phone type column as an iPhone flag; the question was required in the survey, so the data cannot be blank
def phone_type_mapping_func(ptype : str):
    """Translate a phone-type survey answer into a numeric iPhone flag.

    Parameters
    ----------
    ptype : str
        The raw phone-type answer.

    Returns
    -------
    int
        1 when the answer is 'Apple iPhone', -1 when it is
        'Prefer not to answer', and 0 for every other value.
    """
    if ptype == 'Apple iPhone':
        return 1
    if ptype == 'Prefer not to answer':
        return -1
    # Anything else is treated as a non-iPhone answer.
    return 0
    
# Run the mapper over every phone_type answer and store the result as a new column.
iphone_flag_values = df['phone_type'].apply(phone_type_mapping_func)
df['iphone_flag'] = iphone_flag_values

In [9]:
# Collapse the per-phone awareness and usage columns into one column each.
# For every row, back-filling across the columns and taking the first column
# yields the first non-null value in the listed order.
# source: https://www.statology.org/pandas-coalesce/
coalesce_sources = {
    'all_phone_feature_aware': ['iphone_aware', 'samsung_aware', 'pixel_aware', 'other_aware'],
    'all_phone_feature_usage': ['iphone_usage', 'samsung_usage', 'pixel_usage', 'other_usage'],
}
for target_col, source_cols in coalesce_sources.items():
    df[target_col] = df[source_cols].bfill(axis=1).iloc[:, 0]

In [11]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 353 entries, 0 to 353
Data columns (total 52 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   ID                                  353 non-null    int64         
 1   response_timestamp                  353 non-null    datetime64[ns]
 2   age                                 353 non-null    object        
 3   feature_learn_method                352 non-null    object        
 4   autonomy_avgscore                   345 non-null    float64       
 5   relatedness_avgscore                349 non-null    float64       
 6   competence_avgscore                 351 non-null    float64       
 7   pressured_to_use_more               348 non-null    float64       
 8   spend_more_time_than_should         351 non-null    float64       
 9   makes_me_do_unwanted_things         350 non-null    float64       
 10  life_intrusive            

In [12]:
# Proportion of responses in each age bracket (fractions sum to 1).
df['age'].value_counts(normalize=True)

25 to 34 years          0.331445
18 to 24 years          0.209632
35 to 44 years          0.138810
65+ years               0.127479
45 to 54 years          0.093484
55 to 64 years          0.087819
Prefer not to answer    0.011331
Name: age, dtype: float64

In [14]:
# Raw number of responses in each age bracket.
df['age'].value_counts()

25 to 34 years          117
18 to 24 years           74
35 to 44 years           49
65+ years                45
45 to 54 years           33
55 to 64 years           31
Prefer not to answer      4
Name: age, dtype: int64