# Unsupervised Learning notebook

This notebook explores the given obesity dataset with two unsupervised learning algorithms, K-Means and hierarchical clustering, to see what structure and inferences can be drawn from it.

#### Initial import and data check

In [1]:
# initial imports
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score

import matplotlib.pyplot as plt
import seaborn as sn

In [2]:
# Read the raw obesity CSV (path is relative to the notebook's working directory)
raw_csv_path = "assignment_dataset/ObesityDataSet_raw_and_data_sinthetic.csv"
obesity_df = pd.read_csv(raw_csv_path)

In [3]:
# Dimensions of the loaded frame as (rows, columns)
obesity_df.shape

(2111, 17)

In [4]:
# Display the raw frame to eyeball the column names and value formats
obesity_df

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


#### Set feature types

We need to define which type of variable each feature is so that we can transform it correctly for the distance-based algorithms.

In [5]:
# Group the predictor columns by data type; each group is encoded differently
# further down before the distance-based clustering.

# Columns already stored as numbers
numerical_features = [
    'Age',
    'Height',
    'Weight',
    'FCVC',
    'NCP',
    'CH2O',
    'FAF',
    'TUE',
]

# yes/no columns
binary_features = [
    'family_history_with_overweight',
    'FAVC',
    'SMOKE',
    'SCC',
]

# Ordered categories
ordinal_features = [
    'CAEC',
    'CALC',
]

# Unordered categories
nominal_features = [
    'Gender',
    'MTRANS',
]

In [6]:
# Sanity check: the four groups together should account for all 16 features
sum(map(len, (numerical_features, binary_features, ordinal_features, nominal_features)))

16

#### Encodings

In [7]:
# Encode on a deep copy so the raw frame stays untouched
encoded_df = obesity_df.copy(deep=True)

##### Binary Features

In [8]:
# Encode yes/no answers as 1/0
binary_map = {'yes':1, 'no':0}

for col in binary_features:
    # Map from the untouched raw frame rather than from encoded_df itself so the
    # cell can be re-run safely: mapping already-encoded 0/1 values would give
    # NaN and make astype(int) raise.
    encoded_df[col] = obesity_df[col].map(binary_map).astype(int)

In [9]:
# Confirm the binary columns now hold 0/1 integers
encoded_df[binary_features]

Unnamed: 0,family_history_with_overweight,FAVC,SMOKE,SCC
0,1,0,0,0
1,1,0,1,1
2,1,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
2106,1,1,0,0
2107,1,1,0,0
2108,1,1,0,0
2109,1,1,0,0


##### Ordinal features

The two ordinal features (`CAEC` and `CALC`) share the same ordered categories, which are mapped to integers as follows:

no < sometimes < frequently < always -> 0, 1, 2, 3

In [10]:
# Shared ordering for both ordinal columns: no < Sometimes < Frequently < Always
ordinal_map = {
    'no':0,
    'Sometimes':1,
    'Frequently':2,
    'Always':3
}

for col in ordinal_features:
    # Map from the untouched raw frame so re-running this cell is idempotent;
    # mapping the already-encoded integers would silently give all-NaN columns.
    raw_col = obesity_df[col]
    encoded_col = raw_col.map(ordinal_map)

    # Series.map turns labels missing from the dict into NaN without warning.
    # Fail loudly on any non-null label that was not mapped (e.g. a different
    # capitalisation) instead of passing NaN on to the scaler and clustering.
    unmapped = raw_col[encoded_col.isna() & raw_col.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped categories in '{col}': {list(unmapped)}")

    encoded_df[col] = encoded_col

In [11]:
# Confirm the ordinal columns now hold their integer ranks
encoded_df[ordinal_features]

Unnamed: 0,CAEC,CALC
0,1,0
1,1,1
2,1,2
3,1,2
4,1,1
...,...,...
2106,1,1
2107,1,1
2108,1,1
2109,1,1


##### Nominal features - One hot encoding

Use one-hot encoding for the nominal features and drop the first category of each. A row with 0 in all of a feature's dummy columns then implies the dropped category (e.g. `Gender_Male = 0` means Female).

In [12]:
# One-hot encode the nominal columns; drop_first removes one dummy per feature,
# so all-zero dummies for a feature stand for its dropped category.
encoded_df = pd.get_dummies(
    data=encoded_df,
    columns=nominal_features,
    drop_first=True,
)

In [13]:
# List the columns after encoding to confirm the new dummy columns
encoded_df.columns.to_list()

['Age',
 'Height',
 'Weight',
 'family_history_with_overweight',
 'FAVC',
 'FCVC',
 'NCP',
 'CAEC',
 'SMOKE',
 'CH2O',
 'SCC',
 'FAF',
 'TUE',
 'CALC',
 'NObeyesdad',
 'Gender_Male',
 'MTRANS_Bike',
 'MTRANS_Motorbike',
 'MTRANS_Public_Transportation',
 'MTRANS_Walking']

#### Create feature dataset - a.k.a X

In [14]:
# Every column other than NObeyesdad is used as a clustering feature
feature_cols = encoded_df.columns[encoded_df.columns != 'NObeyesdad'].tolist()

len(feature_cols) # should be 19

19

In [15]:
# Feature matrix: all rows, feature columns only
X = encoded_df.loc[:, feature_cols]

In [16]:
# Confirm the feature matrix dimensions
X.shape # should be 2111 rows by 19 columns

(2111, 19)

#### Scale X

In [17]:
# Standardise every feature column with StandardScaler:
# learn the per-column statistics first, then apply them to the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# The scaler returns a bare array, so wrap it in a DataFrame to keep the feature names
X_scaled_df = pd.DataFrame(data=X_scaled, columns=feature_cols)

In [18]:
# Scaling must not change the dimensions of the feature matrix
X_scaled_df.shape

(2111, 19)

In [19]:
# Peek at the first rows of the scaled features
X_scaled_df.head()

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,Gender_Male,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,-0.522124,-0.875589,-0.862558,0.472291,-2.759769,-0.785019,0.404153,-0.300346,-0.1459,-0.013073,-0.218272,-1.188039,0.561997,-1.419172,-1.011914,-0.05768,-0.072375,0.579721,-0.165078
1,-0.522124,-1.947599,-1.168077,0.472291,-2.759769,1.088342,0.404153,-0.300346,6.853997,1.618759,4.581439,2.33975,-1.080625,0.52116,-1.011914,-0.05768,-0.072375,0.579721,-0.165078
2,-0.206889,1.054029,-0.36609,0.472291,-2.759769,-0.785019,0.404153,-0.300346,-0.1459,-0.013073,-0.218272,1.16382,0.561997,2.461491,0.988227,-0.05768,-0.072375,0.579721,-0.165078
3,0.423582,1.054029,0.015808,-2.117337,-2.759769,1.088342,0.404153,-0.300346,-0.1459,-0.013073,-0.218272,1.16382,-1.080625,2.461491,0.988227,-0.05768,-0.072375,-1.724969,6.057758
4,-0.364507,0.839627,0.12274,-2.117337,-2.759769,-0.785019,-2.167023,-0.300346,-0.1459,-0.013073,-0.218272,-1.188039,-1.080625,0.52116,0.988227,-0.05768,-0.072375,0.579721,-0.165078


In [20]:
# Check scaling has been correct. Mean of 0 and sd of 1
# Means will show as tiny floating-point values (~1e-16) rather than exactly 0.
# std will show as ~1.000237 rather than exactly 1: describe() reports the sample
# standard deviation (ddof=1) while StandardScaler divides by the population
# standard deviation (ddof=0), giving sqrt(2111 / 2110) for 2111 rows.
X_scaled_df.describe()

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,Gender_Male,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,0.0,-5.385449e-16,-5.385449e-16,-8.078174000000001e-17,1.346362e-16,-1.3463620000000001e-17,4.964711e-16,-1.884907e-16,-2.0195440000000002e-17,1.07709e-16,-6.731812000000001e-17,2.6927250000000003e-17,-1.3463620000000001e-17,5.048859e-17,-5.385449000000001e-17,-1.0097720000000001e-17,2.0195440000000002e-17,6.731812000000001e-17,0.0
std,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237,1.000237
min,-1.625448,-2.698006,-1.817304,-2.117337,-2.759769,-2.658379,-2.167023,-2.435125,-0.1459003,-1.644905,-0.218272,-1.188039,-1.080625,-1.419172,-1.011914,-0.05768012,-0.07237469,-1.724969,-0.165078
25%,-0.688066,-0.7683883,-0.8062914,0.4722913,0.3623491,-0.7850187,-0.03456952,-0.3003456,-0.1459003,-0.6905894,-0.218272,-1.04163,-1.080625,-1.419172,-1.011914,-0.05768012,-0.07237469,-1.724969,-0.165078
50%,-0.241897,-0.01263207,-0.136951,0.4722913,0.3623491,-0.0628345,0.4041527,-0.3003456,-0.1459003,-0.01307326,-0.218272,-0.01210937,-0.05341135,0.5211595,0.9882266,-0.05768012,-0.07237469,0.5797206,-0.165078
75%,0.265964,0.7159595,0.7960531,0.4722913,0.3623491,1.088342,0.4041527,-0.3003456,-0.1459003,0.7659959,-0.218272,0.7718565,0.5619968,0.5211595,0.9882266,-0.05768012,-0.07237469,0.5797206,-0.165078
max,5.782584,2.983646,3.300136,0.4722913,0.3623491,1.088342,1.68974,3.969213,6.853997,1.618759,4.581439,2.33975,2.204618,4.401822,0.9882266,17.337,13.81699,0.5797206,6.057758


## K-Means