# Multi-Class Prediction of Obesity Risk
Input files - [Downloaded here](https://www.kaggle.com/competitions/playground-series-s4e2/data)
1. *train.csv*
2. *test.csv*

## Development Notes
* Save DataFrame as pickle (to_pickle) to preserve column data types -- load with read_pickle

In [1]:
### libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load and Preview Data

In [2]:
### load data
train_raw = pd.read_csv('Data Download/train.csv')
test_raw=pd.read_csv('Data Download/test.csv')

### data info
train_raw.info()
print("\n")
test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [3]:
### preview data
train_raw.head(5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
### preview data cont.
train_raw.describe(include='all')

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
count,20758.0,20758,20758.0,20758.0,20758.0,20758,20758,20758.0,20758.0,20758,20758,20758.0,20758,20758.0,20758.0,20758,20758,20758
unique,,2,,,,2,2,,,4,2,,2,,,3,5,7
top,,Female,,,,yes,yes,,,Sometimes,no,,no,,,Sometimes,Public_Transportation,Obesity_Type_III
freq,,10422,,,,17014,18982,,,17529,20513,,20071,,,15066,16687,4046
mean,10378.5,,23.841804,1.700245,87.887768,,,2.445908,2.761332,,,2.029418,,0.981747,0.616756,,,
std,5992.46278,,5.688072,0.087312,26.379443,,,0.533218,0.705375,,,0.608467,,0.838302,0.602113,,,
min,0.0,,14.0,1.45,39.0,,,1.0,1.0,,,1.0,,0.0,0.0,,,
25%,5189.25,,20.0,1.631856,66.0,,,2.0,3.0,,,1.792022,,0.008013,0.0,,,
50%,10378.5,,22.815416,1.7,84.064875,,,2.393837,3.0,,,2.0,,1.0,0.573887,,,
75%,15567.75,,26.0,1.762887,111.600553,,,3.0,3.0,,,2.549617,,1.587406,1.0,,,


## Pre-processing

In [5]:
## create new tables for manipulation
train_clean = train_raw.drop('id',axis=1)
test_clean = test_raw.drop('id',axis=1)

### Fill NULLS

In [6]:
## identify columns with NaN -- no NaNs
train_clean.columns[train_clean.isna().any()]
test_clean.columns[test_clean.isna().any()]

Index([], dtype='object')

### Fix Column DataTypes

In [7]:
## convert object columns to category
for col in train_clean.columns[train_clean.dtypes==object]:
    train_clean[col] = train_clean[col].astype('category')

for col in test_clean.columns[test_clean.dtypes==object]:
    test_clean[col] = test_clean[col].astype('category')

## display categories of category columns
for col in train_clean.columns[train_clean.dtypes=='category']:
    print(col, ': \t', train_clean[col].cat.categories)

Gender : 	 Index(['Female', 'Male'], dtype='object')
family_history_with_overweight : 	 Index(['no', 'yes'], dtype='object')
FAVC : 	 Index(['no', 'yes'], dtype='object')
CAEC : 	 Index(['Always', 'Frequently', 'Sometimes', 'no'], dtype='object')
SMOKE : 	 Index(['no', 'yes'], dtype='object')
SCC : 	 Index(['no', 'yes'], dtype='object')
CALC : 	 Index(['Frequently', 'Sometimes', 'no'], dtype='object')
MTRANS : 	 Index(['Automobile', 'Bike', 'Motorbike', 'Public_Transportation', 'Walking'], dtype='object')
NObeyesdad : 	 Index(['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I',
       'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I',
       'Overweight_Level_II'],
      dtype='object')


In [8]:
## set ordering for ordered categorical columns
for col in ['CAEC', 'CALC']:
    train_clean[col] = pd.Categorical(train_clean[col], categories=['no','Sometimes','Frequently','Always'], ordered=True)
    test_clean[col] = pd.Categorical(test_clean[col], categories=['no','Sometimes','Frequently','Always'], ordered=True)


train_clean['NObeyesdad'] = pd.Categorical(train_clean['NObeyesdad'], categories=['Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 
                                                                              'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III'], ordered=True)

## Save data

In [9]:
train_clean.to_pickle('train_clean.pkl')
test_clean.to_pickle('test_clean.pkl')