## 작업형 2유형 최종정리
- 작업형1 : 3문제 (30점), 데이터 전처리
- `작업형2 : 1문제 (40점), 분류/회귀 예측 모델링`
- 작업형3 : 2문제 (30점), 가설 검정

## 주요 라이브러리
- palmerpenguins : 팔머펭귄 데이터셋의 목표는 iris 데이터셋의 대안으로 데이터 탐색 및 시각화를 위한 데이터셋 제공.
- scikit-learn : 머신러닝을 위한 라이브러리
- lightgbm : LightGBM은 Microsoft에서 개발한 오픈 소스 기계 학습 라이브러리로, 대용량 데이터셋에서 빠른 속도와 높은 성능을 제공하는 것이 특징

## 주의
- 각 코드에 대한 설명은 별도로 하지 않습니다.

## 데이터 파일 불러오기

In [1]:
import pandas as pd 
from palmerpenguins import load_penguins 

penguins = load_penguins()
penguins['ID'] = penguins.reset_index().index + 1
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,ID
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007,2
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007,3
3,Adelie,Torgersen,,,,,,2007,4
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007,5


In [2]:
cols = penguins.columns.tolist()
cols = cols[-1:] + cols[:-1]
print(cols)

['ID', 'species', 'island', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex', 'year']


- 컬럼의 순서를 변경한다. ID가 가장 먼저 오도록 한다.

In [3]:
penguins = penguins[cols]
penguins

Unnamed: 0,ID,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...,...
339,340,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,341,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,342,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,343,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


### 데이터 가공
- 지금까지 열린 대회에서는 결측치가 존재 하지 않았던 것으로 기억
    + 만약 잘못된 정보라면 알려주세요
    + 결측치를 제거한다.

In [4]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 344 non-null    int64  
 1   species            344 non-null    object 
 2   island             344 non-null    object 
 3   bill_length_mm     342 non-null    float64
 4   bill_depth_mm      342 non-null    float64
 5   flipper_length_mm  342 non-null    float64
 6   body_mass_g        342 non-null    float64
 7   sex                333 non-null    object 
 8   year               344 non-null    int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 24.3+ KB


In [5]:
penguins.dropna().reset_index(drop=True)

Unnamed: 0,ID,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
4,6,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
...,...,...,...,...,...,...,...,...,...
328,340,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
329,341,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
330,342,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
331,343,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


## 데이터셋 분리
- 기사시험과 같이 데이터셋을 만들기 위해 데이터셋을 분리하고 저장한다. 

### 회귀모형을 위한 데이터셋 정리

In [6]:
from sklearn.model_selection import train_test_split

X = penguins.drop(['body_mass_g'], axis = 1)
y = penguins['body_mass_g']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3,random_state=42
)

X_train.to_csv("penguin_reg_X_train.csv", index=False)
X_test.to_csv("penguin_reg_X_test.csv", index=False)
y_train.to_csv("penguin_reg_y_train.csv", index=False)
y_test.to_csv("penguin_reg_y_test.csv", index=False)

## 회귀모형 만들기 정리
- 기본적으로 아래 데이터셋 불러오기는 제공된다. 

In [60]:
import pandas as pd

X_train = pd.read_csv("penguin_reg_X_train.csv")
X_test = pd.read_csv("penguin_reg_X_test.csv")
y_train = pd.read_csv("penguin_reg_y_train.csv")

In [61]:
print(X_train.head(1))

   ID species island  bill_length_mm  bill_depth_mm  flipper_length_mm   sex  \
0  32  Adelie  Dream            37.2           18.1              178.0  male   

   year  
0  2007  


In [62]:
print(X_test.head(1))

    ID species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  195  Gentoo  Biscoe            45.3           13.7              210.0   

      sex  year  
0  female  2008  


In [63]:
print(y_train.head(1))

   body_mass_g
0       3900.0


## ID 제거
- ID가 임의적이고 예측에 유용한 정보를 제공하지 않으므로 모델 학습에 도움이 되지 않고, 오히려 과적합을 초래할 수 있기 때문에 제거해야 한다.

In [64]:
X_train_id = X_train.pop("ID")
X_test_id = X_test.pop("ID")

### 데이터 확인

In [65]:
X_train.head(5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,sex,year
0,Adelie,Dream,37.2,18.1,178.0,male,2007
1,Gentoo,Biscoe,49.5,16.1,224.0,male,2009
2,Chinstrap,Dream,50.0,19.5,196.0,male,2007
3,Gentoo,Biscoe,48.4,14.6,213.0,male,2007
4,Gentoo,Biscoe,48.4,14.4,203.0,female,2009


In [66]:
X_test.head(5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,sex,year
0,Gentoo,Biscoe,45.3,13.7,210.0,female,2008
1,Gentoo,Biscoe,46.5,13.5,210.0,female,2007
2,Gentoo,Biscoe,46.5,14.8,217.0,female,2008
3,Gentoo,Biscoe,43.8,13.9,208.0,female,2008
4,Chinstrap,Dream,50.9,19.1,196.0,male,2008


In [67]:
y_train.head(5)

Unnamed: 0,body_mass_g
0,3900.0
1,5650.0
2,3900.0
3,5850.0
4,4625.0


### 결측치 확인

In [68]:
print(X_train.isnull().sum())

species              0
island               0
bill_length_mm       1
bill_depth_mm        1
flipper_length_mm    1
sex                  8
year                 0
dtype: int64


In [69]:
print(X_test.isnull().sum())

species              0
island               0
bill_length_mm       1
bill_depth_mm        1
flipper_length_mm    1
sex                  3
year                 0
dtype: int64


In [70]:
print(y_train.isnull().sum())

body_mass_g    1
dtype: int64


### 컬럼 분리
- 범주형 컬럼과 숫자형 컬럼으로 분리

In [71]:
import numpy as np

cat_cols = X_train.select_dtypes(exclude = np.number).columns.tolist()
num_cols = X_train.select_dtypes(include = np.number).columns.tolist()

print(cat_cols, num_cols)

['species', 'island', 'sex'] ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'year']


- year은 num_cols에서 제거한다.

In [72]:
num_cols.remove('year')
print(num_cols)

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']


### 데이터셋 분리

In [78]:
import pandas as pd
from sklearn.model_selection import train_test_split

numeric_cols = X_train.select_dtypes(include=['number']).columns
X_train[numeric_cols] = X_train[numeric_cols].fillna(X_train[numeric_cols].mean())

# If y_train['body_mass_g'] is numeric, fill its missing values
y_train['body_mass_g'].fillna(y_train['body_mass_g'].mean(), inplace=True)

# Proceed with train_test_split as before
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train['body_mass_g'],
    test_size=0.3, 
    random_state=42
)

X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((168, 7), (72, 7), (168,), (72,))

In [80]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
# from sklearn.utils.fixes import loguniform
from scipy.stats import loguniform
from sklearn.metrics import make_scorer, mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV

import numpy as np

def rmse(y_tr, y_val):
    return np.sqrt(mean_squared_error(y_tr, y_val))

param_grid = {
    "clf__learning_rate": loguniform(0.0001, 0.1), 
    "clf__n_estimators" : np.arange(30, 50), 
    "clf__max_depth" : np.arange(3, 30, 2), 
    "clf__num_leaves" : np.arange(30, 50), 
    "clf__min_split_gain" : np.arange(0, 1.1, 0.1), 
    "clf__subsample" : np.arange(0.6, 1.0, 0.1)
}

column_transformer = ColumnTransformer([
    ("scaler", StandardScaler(), num_cols), 
    ("ohd_encoder", OneHotEncoder(), cat_cols)
], remainder="passthrough")

pipeline = Pipeline([
    ("preprocessing", column_transformer), 
    ("clf", LGBMRegressor(random_state=42))
])

random_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions = param_grid, 
    n_iter = 10, 
    scoring = make_scorer(rmse, greater_is_better=False),
    cv=5, 
    verbose=0, 
    n_jobs=-1
)

random_search.fit(X_tr, y_tr)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000545 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 150
[LightGBM] [Info] Number of data points in the train set: 168, number of used features: 12
[LightGBM] [Info] Start training from score 4223.185646


### 평가확인

In [85]:
def get_score(model, X_tr, X_val, y_tr, y_val):
    tr_pred = model.predict(X_tr)
    val_pred = model.predict(X_val)
    tr_score = rmse(y_tr, tr_pred)
    val_score = rmse(y_val, val_pred)
    return f"train: {tr_score}, validation : {val_score}"

get_score(random_search, X_tr, X_val, y_tr, y_val)

'train: 243.44817470359953, validation : 319.57180417269'

## 평가 제출