In [87]:
import numpy as np
import pandas as pd

In [101]:
# Read the three CSV files (train split, test split, sample submission)
# from the relative data/ directory, in that order.
train, test, submission = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv'),
)

# Preview the first five training rows (head() defaults to n=5).
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


### BMI Feature 생성

In [102]:
# Add a BMI column (Weight divided by Height squared) to both splits.
# NOTE(review): presumably Weight is in kg and Height in metres — confirm.
for frame in (train, test):
    frame['BMI'] = frame['Weight'].div(frame['Height'].pow(2))

### 명목형 변수 인코딩

In [103]:
def _encode_ordinal(series, mapping):
    """Return `series` with its labels replaced by the integer codes in `mapping`.

    Re-running the cell is safe: a column whose non-null values are already
    codes from `mapping` is returned unchanged instead of being mapped a
    second time (which would turn every value into NaN).

    Args:
        series: pandas Series holding the raw labels (or already-encoded codes).
        mapping: dict from label to integer code.

    Returns:
        The encoded Series; null inputs stay null.

    Raises:
        ValueError: if a non-null label has no entry in `mapping`, because
            `Series.map` would otherwise silently convert it to NaN.
    """
    present = series.dropna()
    if present.isin(list(mapping.values())).all():
        # Already encoded (or nothing to encode): leave untouched.
        return series

    encoded = series.map(mapping)
    unmapped = series[encoded.isna() & series.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"{series.name}: labels without an ordinal code: {sorted(map(str, unmapped))}"
        )
    return encoded


# CAEC: frequency scale encoded as an ordered integer.
caec_mapping = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
train['CAEC'] = _encode_ordinal(train['CAEC'], caec_mapping)
test['CAEC'] = _encode_ordinal(test['CAEC'], caec_mapping)

# CALC: same frequency scale as CAEC. 'Always' was previously missing from
# this mapping, so any row carrying it would have been mapped to NaN.
calc_mapping = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
train['CALC'] = _encode_ordinal(train['CALC'], calc_mapping)
test['CALC'] = _encode_ordinal(test['CALC'], calc_mapping)

# MTRANS: transport mode ranked by the physical activity it involves.
mtrans_mapping = {
    'Walking': 4,  # most active
    'Bike': 3,  # considerable activity
    'Public_Transportation': 2,  # moderate activity
    'Motorbike': 1,  # little activity
    'Automobile': 0  # least active
}
train['MTRANS'] = _encode_ordinal(train['MTRANS'], mtrans_mapping)
test['MTRANS'] = _encode_ordinal(test['MTRANS'], mtrans_mapping)

# NObeyesdad: target label, ordered from lowest to highest weight class.
# Only the training frame is encoded here.
nobeyesdad_mapping = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}
train['NObeyesdad'] = _encode_ordinal(train['NObeyesdad'], nobeyesdad_mapping)

In [111]:
# Nominal (unordered) columns to expand into one-hot indicator columns.
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']

train_encoded = pd.get_dummies(train, columns=categorical_features)
test_encoded = pd.get_dummies(test, columns=categorical_features)

# The two frames are encoded independently, so a category that occurs in only
# one of them would leave train and test with different indicator columns.
# Rebuild the test frame on the train indicator columns: indicators missing
# from test are added as all-zero columns, and indicators seen only in test
# are dropped. When both frames share the same categories this is a no-op.
dummy_columns = [col for col in train_encoded.columns if col not in train.columns]
passthrough_columns = [col for col in test.columns if col not in categorical_features]
test_encoded = test_encoded.reindex(
    columns=passthrough_columns + dummy_columns, fill_value=0
)