In [61]:
# Data handling and manipulation library
import pandas as pd
# Data visualization library based on matplotlib
import seaborn as sns
# Basic plotting library in Python
import matplotlib.pyplot as plt
# Cross-validation function to evaluate model performance
from sklearn.model_selection import cross_val_score
# Library for numerical operations in Python
import numpy as np
# Preprocessing tool to standardize features (mean=0, variance=1)
from sklearn.preprocessing import StandardScaler
# Principal Component Analysis (PCA) for dimensionality reduction
from sklearn.decomposition import PCA
# Visualization tool for the elbow method to determine the optimal number of clusters
from yellowbrick.cluster.elbow import kelbow_visualizer
from yellowbrick.cluster import KElbowVisualizer
# Calculates the Silhouette Score which measures the quality of clusters
from sklearn.metrics import silhouette_score
# KMeans clustering algorithm
from sklearn.cluster import KMeans
# Imputation functions
from sklearn.impute import KNNImputer, SimpleImputer
# Library for interactive plotting 
import plotly
# Module for creating various chart types (like scatter plots)
import plotly.graph_objects as go
# Simplified module for creating visualizations in Plotly
import plotly.express as px
# k-Nearest Neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
# Splits data into random train and test subsets
from sklearn.model_selection import train_test_split
# Generates a confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder


### Data cleaning and preprocessing

In the first part of this notebook, the focus will be on preparing and cleaning the data for subsequent analysis. The main principles will be covered for some of the features, and it is up to you to do it for the remaining features.

In [62]:
df = pd.read_csv('data/descriptives.csv')

Step 1: Discard uniformative features. 

In [63]:
cols_to_drop = []

# Since patient_id, conversation_id and name are unique to each patient and conversation,
#  they are not useful for clustering/classification. They will become important later though
cols_to_drop.append('patient_id')
cols_to_drop.append('name')
cols_to_drop.append('conversation_id')

In [64]:
#To get a sense of which features offer meaningful information, we print out the frequency of each
#value for each feature

Nrows = df.shape[0]
# Go through each column except for the ones we want to drop
for col in df.columns.drop(cols_to_drop):
    # get unique values in column
    unique = df[col].unique();
    print("\n", col,":")
    # calculate frequency of each value
    for val in unique:
        if val not in ['nan']:
            N = len(df[df[col] == val])
            print("Value, N observations, frequency :", val," ,", N, " ,", np.round(N/Nrows,3))

    print("No. of NANs ", len(df[df[col].isna()]))



 gender :
Value, N observations, frequency : Female  , 159  , 0.53
Value, N observations, frequency : Male  , 141  , 0.47
No. of NANs  0

 age :
Value, N observations, frequency : 45  , 83  , 0.277
Value, N observations, frequency : 32  , 60  , 0.2
Value, N observations, frequency : 28  , 16  , 0.053
Value, N observations, frequency : 35  , 88  , 0.293
Value, N observations, frequency : 24  , 1  , 0.003
Value, N observations, frequency : 40  , 10  , 0.033
Value, N observations, frequency : 36  , 3  , 0.01
Value, N observations, frequency : 30  , 13  , 0.043
Value, N observations, frequency : 25  , 4  , 0.013
Value, N observations, frequency : 38  , 3  , 0.01
Value, N observations, frequency : 55  , 7  , 0.023
Value, N observations, frequency : 57  , 1  , 0.003
Value, N observations, frequency : 42  , 4  , 0.013
Value, N observations, frequency : 10  , 1  , 0.003
Value, N observations, frequency : 31  , 2  , 0.007
Value, N observations, frequency : 20  , 1  , 0.003
Value, N observation

In [65]:
# Going through the above, we can see that the following features are not useful for clustering/classification,
# simply because they have too many missing values. 
cols_to_drop.extend(['average_sleep_duration', 'systolic', 'diastolic', 'health_related_activities'])

# Similarly, since current_country_of_residence is the same for all patients, it is not useful for clustering/classification
cols_to_drop.append('current_country_of_residence')

# Since all non-nan values for drug_usage are False, it is not useful for clustering/classification
cols_to_drop.append('drug_usage')

# Since ethnicity and country_of_birth are almost equivalent, we can drop one of them
# Finally, since state_code and country_code contain the same information, we can drop one of them
cols_to_drop.append('ethnicity')
cols_to_drop.append('state_code')

# Drop the columns we don't need
df.drop(cols_to_drop, axis=1, inplace=True)

Step 2: Convert features with potential to be numeric to numeric features. In our case, this amounts to transforming the average_blood_pressure feature

In [66]:
# Let's look at the average_blood_presure feature
unique = df['average_blood_pressure'].unique();
# calculate frequency of each value
for val in unique:
    if val not in ['nan']:
        N = len(df[df['average_blood_pressure'] == val])
        print("Value, N observations, frequency :", val," ,", N, " ,", np.round(N/Nrows,3))

print("No. of NANs ", len(df[df['average_blood_pressure'].isna()]))


Value, N observations, frequency : nan  , 0  , 0.0
Value, N observations, frequency : 120/80  , 229  , 0.763
Value, N observations, frequency : 120/80 mmHg  , 2  , 0.007
Value, N observations, frequency : 130/85  , 5  , 0.017
Value, N observations, frequency : 130/80  , 7  , 0.023
No. of NANs  57


In [67]:
# To convert this feature into a numerical one (well, two), we will split it into two features: systolic and diastolic

# Get rid of the unit
df['average_blood_pressure'] = df['average_blood_pressure'].str.replace(' mmHg', '')

# Split the feature into two
df[['systolic','diastolic']] = df['average_blood_pressure'].str.split('/',expand=True)

# Convert the two new features to numeric
df['systolic'] = pd.to_numeric(df['systolic'])
df['diastolic'] = pd.to_numeric(df['diastolic'])

# Drop the now redundant average_blood_pressure feature
df.drop('average_blood_pressure', axis=1, inplace=True)

In [68]:
# Get rid of units and convert to numeric for the remaining relevant features

# remove ' cm' suffix, then convert to float
df['height'] = df['height'].str.replace(' cm', '')
df['height'] = df['height'].astype(float)

# Remove ' kg' suffix from 'weight' and convert to float
df['weight'] = df['weight'].str.replace(' kg', '')
df['weight'] = df['weight'].astype(float)



Step 3: Fill out (=impute) missing numeric values

In [69]:
# Let's look at the number of Nans for each numeric column
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

Nrows = df.shape[0]
# Go through each column except for the ones we want to drop
for col in num_cols:
    # get unique values in column
    unique = df[col].unique();
    print(col,":")
    print("No. of NANs ", len(df[df[col].isna()]))



age :
No. of NANs  0
weight :
No. of NANs  1
height :
No. of NANs  1
bmi :
No. of NANs  1
average_daily_step_count :
No. of NANs  0
resting_heart_rate :
No. of NANs  0
heart_rate_variability :
No. of NANs  0
average_blood_glucose :
No. of NANs  0
average_fasting_glucose :
No. of NANs  1
number_of_children :
No. of NANs  88
screen_time_per_day :
No. of NANs  1
average_sleep_duration_hours :
No. of NANs  85
systolic :
No. of NANs  57
diastolic :
No. of NANs  57


In [70]:
# We see that some features have as much as 88 (= 30 %) missing values, which is much more than we would like. One could certainly
# argue that these features should be dropped. We choose to keep them in the following, however.

# Too impute missing values, we introduce the following functions

# This imputer only simply replace the missing values with mean, median or most frequent value of that feature
simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean') 

# This imputer considers all numeric features and assigns to each missing value the feature mean of the n nearest neighbors
knn_imputer = KNNImputer(missing_values = np.nan, n_neighbors = 5)

# We will use the knn imputer, since it considers all numeric features to find the most similar neighbors:
df.loc[:, num_cols] = knn_imputer.fit_transform(df.loc[:, num_cols] )

# Finally, since non-integer number of childrens are not very meaningful, we'll round the estimated no of children
df['number_of_children'] = df['number_of_children'].round(decimals = 0)

Step 4: Convert categorical features to numeric ones
* Convert features with two values to boolean features
* Label encode features with order
* (Cleverly) group and one hot encode features without order

In [71]:
# Convert features with two values to boolean

bool_cols = ['gender', 'stress_level', 'physical_activity', 'alcohol_consumption']

alcohol_consumption_order = {'Low': 0, 'Moderate': 1}
df['alcohol_consumption'] = df['alcohol_consumption'].map(alcohol_consumption_order)

gender_order = {'Male': 0, 'Female': 1}
df['gender'] = df['gender'].map(gender_order)

# Group low and moderate together since only 1 person reported low stress
stress_level_order = {'Low': 0, 'Moderate': 0, 'High': 1}
df['stress_level'] = df['stress_level'].map(stress_level_order)

physical_activity_order = {'Low': 0, 'Moderate': 1}
df['physical_activity'] = df['physical_activity'].map(physical_activity_order)


#### Ranking ordered features (ordinal features) using label encoding:

Since one-hot encoding can increase the no. of features dramatically,
we will use label encoding for ordinal features. Sometimes, the order is 
very clear (e.g. low, mid, high, very high), and sometimes we have to be
a bit creative.

For example, we could rank country_of_birth by GDP per capita, and we could
rank state_name by average BMI. We could also rank housing by the average
size of each housing type, and tenure from renting to owning to owning without mortgage.

In the following, we'll convert the activity and state_name features by finding a metric for the order and then using label encoding. We'll start by reformatting the activities feature to make them easier to work with, after which we'll transform category to a numerical one.

In [72]:
## Reformatting the activity feature ##

unique_list = []
# List of extra stuff we want to drop
discard_list = ['"activities":', ' (sprints)', '(jazz)', '(hip-hop)', \
                '(marathons, sprints, trail running)', '(ballet, hip-hop, jazz, ballroom)']

# Go through each row in the activities column
for i, x in enumerate(df.loc[:,'activities']):
    x_cleaned = []
    # Avoid nans
    if type(x) is float:
        continue
    # Remove the extra stuff
    for el in discard_list:
        x = x.replace(el,'')
    # Split the string into a list of activities
    for el in list(x.split(',')):
        # Remove extra characters
        el = el.strip("['").strip("']").strip('{"').strip('}').strip('"').strip()
        # Add to activity to list of activities
        x_cleaned.append(el)
        # Add to list of unique activities if not already there
        if el not in unique_list:
            unique_list.append(el)

    # Replace the original string with the cleaned list
    df.loc[i,'activities'] = str(x_cleaned)


# Let's take a look at some of the reformatted rows
print(df.loc[:5,'activities'])

print("\nUnique activities: ", unique_list)

0    ['Running', 'Walking', 'Yoga', 'Swimming', 'Da...
1    ['Walking', 'Swimming', 'Running', 'Yoga', 'Da...
2    ['Running', 'Walking', 'Swimming', 'Dancing', ...
3    ['Walking', 'Yoga', 'Swimming', 'Dancing', 'Cy...
4    ['Running', 'Walking', 'Yoga', 'Swimming', 'Da...
5                   ['Running', 'Walking', 'Swimming']
Name: activities, dtype: object

Unique activities:  ['Running', 'Walking', 'Yoga', 'Swimming', 'Dancing', 'Cycling', 'Weightlifting', 'Hiking', 'Pilates', 'Tai Chi', 'Aerobics', 'Strength Training']


The goal is to translate the activities into a measure of peoples physical fitness/health.
We will very naively assume that the number of listed activites correspond to the physical activity level
of each patient. 
There are many other ways of transforming these categories. One could e.g. map each activity to the average calorie burning rate and then add the burning rates for each row. Feel free to play around and try different things!

In [73]:
## Map each activity list to the number of activities on that list
for i, x in enumerate(df.loc[:,'activities']):
    if type(x) is not float:
        df.loc[i,'activities'] = len(list(x.split(',')))


As another example, let's introduce order into the state name feature and then use label encoding. In particular, let's map the state_names, which are not necessarily very informative as is, onto the obesity prevalences (% of obese people) of the corresponding states

In [74]:
# As an example, let us map the state_name feauture onto the prevalence of obsese people (BMI > 30) in the given stats
# The obesity prevalence data is from 2021 and can be found and downloaded on
#  https://www.cdc.gov/obesity/data/prevalence-maps.html

obesity_df = pd.read_csv('data/Obesity-prevalence-by-state-2021.csv')
obesity_order = {}

# Make obesity order dictionary
for row in range(len(obesity_df)):
    try:
        obesity_order.update({f'{obesity_df.loc[row,"State"]}': float(obesity_df.loc[row,'Prevalence'])})
    except:
        pass

del_list = []
for key in obesity_order.keys():
    if key not in df['state_name'].unique():
        del_list.append(key)
   
for key in del_list:
    del obesity_order[key]

df['state_name'] = df['state_name'].map(obesity_order)

In the following, we'll show two examples of how to group and transform features using one-hot (or dummy) encoding. One-hot encoding has the advantage
that is does not introduce any order between features like label encoding does, but the drawback that K different categories of a feature will be transformed into K or K-1 new features, and so the total number of features can become very large, which can hurt performance. For this reasons, if a feature has many categories, these categories are often grouped in straightforward (simply group rare categories) or clever (use domain knowledge to make groups).



In [75]:
## Group the observations of the mother feature

# Print out each category and its frequency 
for col in ['mother']:
    # get unique values in column
    unique = df[col].unique();
    print("\n", col,":")
    # calculate frequency of each value
    for val in unique:
        if val not in ['nan']:
            N = len(df[df[col] == val])
            print("Value, N observations, frequency :", val," ,", N, " ,", np.round(N/Nrows,3))

    print("No. of NANs ", len(df[df[col].isna()]))

# Group different spellings
mother_map = {'Type 2 Diabetes': 'Type 2 diabetes', }
df['mother'].replace({'Type 2 Diabetes': 'Type 2 diabetes'}, inplace=True)

# Group all values except for 'Type 2 diabetes' and 'Hypertension' into 'No known health issues'
mask = (df['mother'] == 'Type 2 diabetes') | ( df['mother'] == 'Hypertension')
df.loc[~mask, 'mother'] = 'No known health issues'

print("\nvalues of mother after grouping: ", df['mother'].unique())



 mother :
Value, N observations, frequency : Hypertension  , 7  , 0.023
Value, N observations, frequency : No known health issues  , 90  , 0.3
Value, N observations, frequency : No known medical conditions  , 170  , 0.567
Value, N observations, frequency : No history of diabetes  , 6  , 0.02
Value, N observations, frequency : No known health conditions  , 2  , 0.007
Value, N observations, frequency : Type 2 diabetes  , 13  , 0.043
Value, N observations, frequency : No significant medical history  , 1  , 0.003
Value, N observations, frequency : No history of hypertension  , 2  , 0.007
Value, N observations, frequency : No history of heart disease  , 1  , 0.003
Value, N observations, frequency : Type 2 Diabetes  , 3  , 0.01
Value, N observations, frequency : No history of CVD  , 3  , 0.01
Value, N observations, frequency : No history of heart disease or diabetes  , 1  , 0.003
Value, N observations, frequency : No history of cardiovascular diseases  , 1  , 0.003
No. of NANs  0

values of

In [76]:
## Group similar (and also rare) categories of the diet_type features

# Go through each column except for the ones we want to drop
for col in ['diet_type']:
    # get unique values in column
    unique = df[col].unique();
    print("\n", col,":")
    # calculate frequency of each value
    for val in unique:
        if val not in ['nan']:
            N = len(df[df[col] == val])
            print("Value, N observations, frequency :", val," ,", N, " ,", np.round(N/Nrows,3))

    print("No. of NANs ", len(df[df[col].isna()]))

# Group different spellings
df['diet_type'].replace({'Balanced Diet': 'Balanced'}, inplace=True)

# Group plant based diets together
df['diet_type'].replace({'Vegan': 'Plant-based', 'Vegetarian': 'Plant-based'}, inplace=True)

# Group the remaining values into 'Other'
mask = (df['diet_type'] == 'Balanced') | ( df['diet_type'] == 'Plant-based') | ( df['diet_type'] == 'Mediterranean') | ( df['diet_type'] == 'Low-carb')
df.loc[~mask, 'diet_type'] = 'Other'

print("\nvalues of diet_type after grouping: ", df['diet_type'].unique())



 diet_type :
Value, N observations, frequency : Mediterranean  , 47  , 0.157
Value, N observations, frequency : Balanced  , 177  , 0.59
Value, N observations, frequency : Gluten-free  , 15  , 0.05
Value, N observations, frequency : Keto  , 1  , 0.003
Value, N observations, frequency : Diabetic-friendly  , 6  , 0.02
Value, N observations, frequency : Omnivore  , 7  , 0.023
Value, N observations, frequency : Low-carb  , 8  , 0.027
Value, N observations, frequency : Low FODMAP  , 1  , 0.003
Value, N observations, frequency : Crohn's disease-friendly diet  , 1  , 0.003
Value, N observations, frequency : Low Protein  , 1  , 0.003
Value, N observations, frequency : Vegetarian  , 11  , 0.037
Value, N observations, frequency : Low-FODMAP  , 2  , 0.007
Value, N observations, frequency : Anti-inflammatory diet  , 1  , 0.003
Value, N observations, frequency : Gestational  , 1  , 0.003
Value, N observations, frequency : Plant-based  , 2  , 0.007
Value, N observations, frequency : Salt-restricted 

In [77]:
## We need this function to do one-hot (or dummy) encoding
def do_dummy_encoding(dataframe, target_columns, prefix_list, drop_first = False):
    """
    For each cateogrical column with categories (l1,...,ln), this function transform the category into n ( if drop_first = False)
    or n-1 columns, where [0,0,...,0] represents l1,
    [1,0,...,0] rep. l2, [0,1,0,...,0] rep l3 etc
    """
    data = dataframe.copy()
    # Build dummy index dataframe
    dummy_columns = pd.get_dummies(data[target_columns], prefix = prefix_list, drop_first=drop_first)

    ## Append new columns to dataframe
    data = pd.concat([data, dummy_columns], axis = 1)

    ## Drop original columns
    data.drop(columns = target_columns, inplace=True)
    return data

Having grouped the categories of mother and diet_type into fewer categories, let us transform the features using one hot encoding

In [78]:
onehot_cols = ['mother', 'diet_type']

for i, col in enumerate(onehot_cols):
    # Use onehot_cols[i] as prefix for the new columns
    # If drop_first = False, the first category will be included in the new columns
    # If drop_first = True, the first category will be dropped, meaning that
    #  if all new columns are 0, the category is the first one. This is the most
    #  efficient way of doing it, but it has the price that we have to remember that
    # the first category is represented like this
    df = do_dummy_encoding(df, col, onehot_cols[i], drop_first = False)

# Let's take a look at the new columns
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 40 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   gender                         300 non-null    int64  
 1   age                            300 non-null    int64  
 2   country_of_birth               300 non-null    object 
 3   weight                         300 non-null    float64
 4   height                         300 non-null    float64
 5   bmi                            300 non-null    float64
 6   average_daily_step_count       300 non-null    int64  
 7   resting_heart_rate             300 non-null    int64  
 8   heart_rate_variability         300 non-null    float64
 9   average_blood_glucose          300 non-null    float64
 10  average_fasting_glucose        300 non-null    float64
 11  specific_preferences           292 non-null    object 
 12  challenges                     275 non-null    obj

In the following, we simply encode the remaining categorical features to numerical ones by label encoding.
This naive approach introduces false order between different categories and is not in general the right approach
when no inherent order is present between categories.

#### **You are expected to encode the remaining categorical features in the best way, e.g. using the methods illustrated above.**

**NB:** Make sure you remove a feature from the list below once you have found another way to encode it!

In [79]:
# Use LabelEncoder to convert the remaining columns to numerical values
labels = ['country_of_birth', 'current_city_of_residence', 'housing_type', 'housing_tenure', 'marital_status', 'sexual_orientation',
              'father', 'specific_preferences', 'challenges', 'activities']
labelencoder = LabelEncoder()
for label in labels:
    df[label] = labelencoder.fit_transform(df[label])

In [80]:
# Having converted all features to numerical values, we can now imputate the remaining NaNs

# This imputer considers all numeric features and assigns to each missing value the feature mean of the n nearest neighbors
knn_imputer = KNNImputer(missing_values = np.nan, n_neighbors = 5)

# We will use the knn imputer, since it considers all numeric features to find the most similar neighbors:
df.loc[:] = knn_imputer.fit_transform(df.loc[:] )

# Finally, since non-integer number of childrens are not very meaningful, we'll round the estimated no of children
#df['number_of_children'] = df['number_of_children'].round(decimals = 0)

#### At this point, the data cleaning is completed, and we can proceed by scaling the features and potentially applying PCA analysis to reduce the total number of features. 


### Excercises:
- There are some indications that the BMI feature contains errors. Use the height and weight features to calculate the BMI of all patients and replace the potentially wrong BMI values your results
- Can you fig. out what the nan values in number_of_children should be (before we impute them), given that you also have access to the has_children feature? Once you do, change the nan values before imputation.
- Just like we mapped the state_name feature onto obesity prevalences, map the country_of_origin to a metric you think might be useful for predicting type 2 diabetes. 
- Inspired by how we grouped the mother and diet_type features, group the father feature, and then move it from the label_encoding list to the one-hot encoding list
 and speficic_preferances features
- (**To be done on your own time**) Go through the remaining features 'country_of_birth', 'current_city_of_residence', 'housing_type', 'housing_tenure', 'marital_status', 'sexual_orientation',
               'specific_preferences', 'challenges', 'activities', and figure out the best way to encode them