### Import & Data Load

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the training and test tables from the local data directory.
train, test = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

In [3]:
# Preview the first rows of the training table (head() defaults to 5 rows).
train.head()

Unnamed: 0,ID,gender,age,height,weight,cholesterol,systolic_blood_pressure,diastolic_blood_pressure,glucose,bone_density,activity,smoke_status,medical_history,family_medical_history,sleep_pattern,edu_level,mean_working,stress_score
0,TRAIN_0000,F,72,161.49,58.47,279.84,165,100,143.35,0.87,moderate,ex-smoker,high blood pressure,diabetes,sleep difficulty,bachelors degree,,0.63
1,TRAIN_0001,M,88,179.87,77.6,257.37,178,111,146.94,0.07,moderate,ex-smoker,,diabetes,normal,graduate degree,,0.83
2,TRAIN_0002,M,47,182.47,89.93,226.66,134,95,142.61,1.18,light,ex-smoker,,,normal,high school diploma,9.0,0.7
3,TRAIN_0003,M,69,185.78,68.63,206.74,158,92,137.26,0.48,intense,ex-smoker,high blood pressure,,oversleeping,graduate degree,,0.17
4,TRAIN_0004,F,81,164.63,71.53,255.92,171,116,129.37,0.34,moderate,ex-smoker,diabetes,diabetes,sleep difficulty,bachelors degree,,0.36


### Check Data

In [4]:
# Number of missing values in each training column.
train.isna().sum()

ID                             0
gender                         0
age                            0
height                         0
weight                         0
cholesterol                    0
systolic_blood_pressure        0
diastolic_blood_pressure       0
glucose                        0
bone_density                   0
activity                       0
smoke_status                   0
medical_history             1289
family_medical_history      1486
sleep_pattern                  0
edu_level                    607
mean_working                1032
stress_score                   0
dtype: int64

In [5]:
# Identify the columns that contain at least one missing value.
has_missing = train.isna().any()
missing_columns_train = train.columns[has_missing]
missing_columns_train

Index(['medical_history', 'family_medical_history', 'edu_level',
       'mean_working'],
      dtype='object')

In [6]:
# Show dtype and non-null count for just the columns with missing values.
train.loc[:, missing_columns_train].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   medical_history         1711 non-null   object 
 1   family_medical_history  1514 non-null   object 
 2   edu_level               2393 non-null   object 
 3   mean_working            1968 non-null   float64
dtypes: float64(1), object(3)
memory usage: 93.9+ KB


In [7]:
# Split the columns that contain missing values by dtype, so that each group
# can be imputed with a strategy that fits it.

# The dtype selections do not depend on the column being checked, so they are
# computed once instead of on every loop iteration.
object_like_cols = set(train.select_dtypes(include=['object', 'category']).columns)
# 'number' matches every numeric dtype (int32, float32, ...), not only the
# default int/float dtypes, so no numeric column silently falls through.
numeric_cols = set(train.select_dtypes(include='number').columns)

# Preserve the original column order of missing_columns_train in both lists.
categorical_na_cols = [col for col in missing_columns_train if col in object_like_cols]
numerical_na_cols = [col for col in missing_columns_train if col in numeric_cols]

print("결측값이 있는 범주형 변수:", categorical_na_cols)
print("결측값이 있는 수치형 변수:", numerical_na_cols)

결측값이 있는 범주형 변수: ['medical_history', 'family_medical_history', 'edu_level']
결측값이 있는 수치형 변수: ['mean_working']


### Data Preprocessing

In [8]:
# Replace missing values in the categorical columns with the most frequent value.
for col in categorical_na_cols:
    # The fill value is computed from the training data only and then reused
    # for the test data.
    most_frequent = train[col].mode().iloc[0]

    for frame in (train, test):
        frame[col] = frame[col].fillna(most_frequent)

In [None]:
# Fill missing mean_working values with the median of the training column;
# the same training-derived value is applied to the test data.
median_value = train['mean_working'].median()

for frame in (train, test):
    frame['mean_working'] = frame['mean_working'].fillna(median_value)

In [10]:
# Label-encode the categorical (object dtype) columns. ID is dropped from the
# list so it is not encoded.
categorical_cols = train.select_dtypes(include='object').columns.drop('ID')

for feature in categorical_cols:
    # Fit on the training data only; the fitted classes define the base codes.
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature])

    # Build an explicit label -> code table instead of appending to
    # le.classes_: the encoder expects classes_ to stay sorted, and mutating
    # it relies on scikit-learn implementation details.
    label_to_code = {label: code for code, label in enumerate(le.classes_)}

    # Labels that occur only in the test data get new codes after the training
    # ones, assigned in sorted order. dropna() keeps missing values out of the
    # comparison; sorting a mix of strings and NaN would raise a TypeError.
    unseen_labels = sorted(set(test[feature].dropna()) - set(label_to_code))
    for label in unseen_labels:
        label_to_code[label] = len(label_to_code)

    # Values without an entry in the table (i.e. missing values) remain NaN.
    test[feature] = test[feature].map(label_to_code)

In [11]:
# Training features are every column except the row identifier and the target.
x_train = train.drop(['ID', 'stress_score'], axis = 1)
y_train = train['stress_score']

# Drop the identifier from the test features as well. errors='ignore' keeps
# this cell re-runnable: on a second run 'ID' is already gone, and a plain
# drop would raise a KeyError.
test = test.drop('ID', axis = 1, errors = 'ignore')

### Train / Predict

In [12]:
# Fit a LightGBM regressor with a fixed random seed on the training data
# (fit returns the fitted estimator), then predict on the test features.
model = LGBMRegressor(random_state = 42).fit(x_train, y_train)
pred = model.predict(test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000804 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1454
[LightGBM] [Info] Number of data points in the train set: 3000, number of used features: 16
[LightGBM] [Info] Start training from score 0.482130


### Submission

In [None]:
# Load the sample submission file to use as the output template.
submission = pd.read_csv(filepath_or_buffer='./data/sample_submission.csv')

In [14]:
# Put the model predictions into the stress_score column and preview the result.
submission = submission.assign(stress_score=pred)
submission.head()

Unnamed: 0,ID,stress_score
0,TEST_0000,0.412394
1,TEST_0001,0.689009
2,TEST_0002,0.183309
3,TEST_0003,0.522015
4,TEST_0004,0.572112


In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf='./data/submit.csv', index=False)