# Import

### lib

In [2]:
import sys
import os
import datetime

import pandas as pd
import numpy as np
# ML
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
# from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
# from sklearn.preprocessing import PowerTransformer, MinMaxScaler
import lightgbm as lgb
import mlflow
import mlflow.lightgbm

In [3]:
sys.path.append("../../")


# from ... import ...

### env & settings

In [4]:
# Root directory of the insurance dataset. Set the INSURANCE_DATA_DIR
# environment variable to run the notebook on another machine; the original
# absolute path is kept as the default so existing setups behave the same.
DATA_DIR = os.environ.get("INSURANCE_DATA_DIR", "/workspace/Storage/kaggle/Data/insurance")
PATH_TRAIN = os.path.join(DATA_DIR, "raw", "train.csv")  # read into `train`
PATH_TEST = os.path.join(DATA_DIR, "raw", "test.csv")  # read into `X_submit`

In [5]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

### Data

In [6]:
# Read the two raw CSV files: the training table and the table to predict for submission.
train = pd.read_csv(filepath_or_buffer=PATH_TRAIN)
X_submit = pd.read_csv(filepath_or_buffer=PATH_TEST)

In [7]:
train.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


# 1. preprocess

In [8]:
EDA_result = pd.read_csv("EDA_result.csv").fillna("")

In [9]:
EDA_result

Unnamed: 0,name,type,mv_process,transform
0,id,interval,,drop
1,Age,interval,median,
2,Gender,nominal,random(uniform),encoding(nominal)
3,Annual Income,ratio,median,box-cox
4,Marital Status,nominal,random(uniform),encoding(nominal)
5,Number of Dependents,interval,random(uniform),
6,Education Level,ordinal,mode,encoding(ordinal)
7,Occupation,nominal,random(uniform),encoding(nominal)
8,Health Score,ratio,mean,
9,Location,nominal,mode,encoding(nominal)


In [10]:
def preprocess(df):
    """Encode, impute and one-hot expand the insurance feature table.

    Steps, in order:
      1. Map the ordinal string columns to integer ranks.
      2. Convert ``Policy Start Date`` to the number of seconds between that
         date and the fixed reference date 2024-08-16.
      3. Fill missing values with the column mean, median, mode, the
         constant 0, or a random draw from the observed values, depending
         on the column.
      4. Replace the nominal columns with one-hot indicator columns.

    Args:
        df: DataFrame containing the raw columns named in this function.
            Any other column (e.g. ``id``, ``Premium Amount``) passes
            through untouched.

    Returns:
        A new DataFrame. The caller's ``df`` is not modified.

    Raises:
        KeyError: if an ordinal column holds a label that is missing from
            its mapping (for example when the frame was already encoded).

    Note:
        Imputation statistics and one-hot columns are computed from ``df``
        itself, so two frames with different observed categories can come
        back with different columns.
    """
    random_state = 42
    ref = "2024-08-16"

    # Work on a copy so re-running the calling cell starts from the raw data.
    df = df.copy()

    # Ordinal encoding
    ordinal_dict = {
        "Education Level": {
            "High School": 1,
            "Bachelor's": 2,
            "Master's": 3,
            "PhD": 4,
        },
        "Policy Type": {
            "Basic": 1,
            "Comprehensive": 2,
            "Premium": 3,
        },
        "Customer Feedback": {
            "Poor": 1,
            "Average": 2,
            "Good": 3,
        },
        "Exercise Frequency": {
            'Rarely': 1,
            'Monthly': 2,
            'Weekly': 3,
            'Daily': 4,
        },
    }
    for col, mapping in ordinal_dict.items():
        # Series.map would silently turn unknown labels into NaN; fail loudly instead.
        unknown = set(df[col].dropna().unique()) - set(mapping)
        if unknown:
            raise KeyError(f"unexpected categories in {col!r}: {sorted(unknown, key=str)}")
        df[col] = df[col].map(mapping)

    # Datetime encoding: seconds from the policy start date to the reference date
    ref = datetime.datetime.strptime(ref, "%Y-%m-%d")
    df['Policy Start Date'] = (
        pd.Timestamp(ref) - pd.to_datetime(df['Policy Start Date'])
    ).dt.total_seconds()

    # Missing-value handling
    for col in ['Health Score', 'Policy Start Date']:
        df[col] = df[col].fillna(df[col].mean())
    for col in ['Age', 'Annual Income', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
                'Customer Feedback', 'Exercise Frequency']:
        df[col] = df[col].fillna(df[col].median())
    for col in ['Education Level', 'Location', 'Policy Type', 'Smoking Status', 'Property Type']:
        # mode() returns a Series; passing it to fillna directly aligns on the
        # index and leaves most gaps unfilled, so take the first mode as a scalar.
        modes = df[col].mode()
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])
    for col in ['Previous Claims']:
        df[col] = df[col].fillna(0)
    for col in ['Gender', 'Marital Status', 'Number of Dependents', 'Occupation']:
        missing_mask = df[col].isna()
        n_missing = int(missing_mask.sum())
        if not n_missing:
            continue

        # Draw replacement values at random from the observed data
        observed = df[col].dropna()
        if observed.empty:
            continue  # nothing to sample from; leave the column as it is
        imputes = observed.sample(
            n=n_missing,
            # Sample with replacement only when there are more gaps than observed values.
            replace=n_missing > len(observed),
            random_state=random_state,
        ).tolist()

        # Write the sampled values into the missing positions
        df.loc[missing_mask, col] = imputes

    # One-hot encoding, built from the frame being processed (not a global)
    nominal_cols = ['Gender', 'Marital Status', 'Occupation', 'Location', 'Smoking Status', 'Property Type']
    df = pd.get_dummies(df, columns=nominal_cols)
    return df

In [11]:
train = preprocess(train)

## 1.3 split

In [12]:
# 60 % train / 40 % hold-out, then the hold-out is halved into validation and test.
train, holdout = train_test_split(train, test_size=0.4, random_state=42)
val, test = train_test_split(holdout, test_size=0.5, random_state=42)

In [13]:
# Separate the predictors from the "Premium Amount" target for each partition.
X_train = train.drop(columns="Premium Amount")
y_train = train["Premium Amount"]
X_val = val.drop(columns="Premium Amount")
y_val = val["Premium Amount"]
X_test = test.drop(columns="Premium Amount")
y_test = test["Premium Amount"]

# 2. Baseline

## 2.1 statistic model

In [14]:
from sklearn.base import BaseEstimator, RegressorMixin
import numpy as np

# Custom regression model
class Model_mean:
    """Baseline regressor that predicts the mean of the training target for every row."""

    def __init__(self):  # add hyperparameters here
        pass

    def fit(self, X, y):
        """Store the mean of ``y``; ``X`` is accepted but not used.

        Returns:
            self, so that ``Model_mean().fit(X, y).predict(X)`` can be chained
            the same way as with scikit-learn estimators.
        """
        self.mean = np.mean(y)
        return self

    def predict(self, X):
        """Return a list holding the stored mean once per element of ``X``.

        Raises:
            AttributeError: if called before ``fit``.
        """
        return [self.mean] * len(X)

In [15]:
baselinemodel1 = Model_mean()

In [16]:
baselinemodel1.fit(X_train, y_train)

In [17]:
y_pred = baselinemodel1.predict(X_test)

In [18]:
# RMSLE of the mean baseline on the test split: square root of the mean squared log error.
msle = mean_squared_log_error(y_true=y_test, y_pred=y_pred)
rmsle = np.sqrt(msle)
print("RMSLE:", rmsle)

RMSLE: 1.1692104217092885


- 2024-12-17 15:52 기준 리더보드 1등 RMSLE: 1.02226

## 2.2 linear model

In [19]:
baselinemodel2 = LinearRegression()

In [20]:
baselinemodel2.fit(X_train, y_train)

In [21]:
y_pred = baselinemodel2.predict(X_test)

In [22]:
# RMSLE of the linear-regression baseline on the test split.
msle = mean_squared_log_error(y_true=y_test, y_pred=y_pred)
rmsle = np.sqrt(msle)
print("RMSLE:", rmsle)

RMSLE: 1.1663660829446776


## 2.3 tree model

In [23]:
# Build the LightGBM datasets; the validation set references the training set.
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(data=X_val, label=y_val, reference=train_data)


In [24]:

# Custom callback: log the evaluation metric to MLflow at every boosting round
class MLflowLoggingCallback:
    """LightGBM training callback that forwards the first evaluation score to MLflow.

    The instance counts its own invocations in ``self.epoch`` and uses that
    count as the MLflow ``step``. The counter is never reset, so create a
    fresh instance for each training run.
    """

    def __init__(self):
        self.epoch = 0

    def __call__(self, env):
        """Log ``env.evaluation_result_list[0][2]`` under the name "Validation_RMSE".

        Args:
            env: object passed by LightGBM on each iteration; only its
                ``evaluation_result_list`` attribute is read. The first
                entry is assumed to be the validation RMSE, which holds
                when a single validation set and the 'rmse' metric are
                used — verify if more sets or metrics are added.
        """
        self.epoch += 1
        results = env.evaluation_result_list
        if not results:
            # No evaluation data this round (e.g. training without valid_sets):
            # skip logging rather than fail with IndexError.
            return
        # Validation RMSE for this round
        validation_score = results[0][2]
        mlflow.log_metric("Validation_RMSE", validation_score, step=self.epoch)

In [25]:
# Train the model

# Start the MLflow experiment.
# The tracking server can be overridden with the MLFLOW_TRACKING_URI
# environment variable; the original address is kept as the default.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://175.214.62.133:50001/"))
mlflow.set_experiment("lightgbm_rmsle_experiment")

with mlflow.start_run():

    # Hyperparameters
    params = {
        'objective': 'regression',  # regression task
        'metric': 'rmse',           # evaluation metric
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9
    }
    # Log the configuration before training so that a run which fails
    # mid-training still records the parameters it was started with.
    mlflow.log_params(params)

    # Callbacks
    callbacks = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
        MLflowLoggingCallback()                 # custom per-round MLflow logger
    ]
    model = lgb.train(
        params,
        train_data,
        valid_sets=[valid_data],
        num_boost_round=1000,
        callbacks=callbacks
    )

    # Score the test split using the best iteration found by early stopping
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    msle = mean_squared_log_error(y_test, y_pred)
    rmsle = np.sqrt(msle)
    print("RMSLE:", rmsle)

    # Log the final metric to MLflow
    mlflow.log_metric("RMSLE", rmsle)

    # Save the model
    mlflow.lightgbm.log_model(model, "lightgbm_model")
    print("Model saved to MLflow")

2024/12/17 11:36:37 INFO mlflow.tracking.fluent: Experiment with name 'lightgbm_rmsle_experiment' does not exist. Creating a new experiment.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008310 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1412
[LightGBM] [Info] Number of data points in the train set: 720000, number of used features: 30
[LightGBM] [Info] Start training from score 1102.863665
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 843.582
[200]	valid_0's rmse: 843.14
[300]	valid_0's rmse: 843.019
[400]	valid_0's rmse: 842.941
Early stopping, best iteration is:
[391]	valid_0's rmse: 842.938
RMSLE: 1.14045065451937




Model saved to MLflow
🏃 View run magnificent-grub-386 at: http://175.214.62.133:50001/#/experiments/286555346072408604/runs/75e10eed2aa3454b9820029214d6584d
🧪 View experiment at: http://175.214.62.133:50001/#/experiments/286555346072408604


# 3. Conclusion

1. 세 모델이 큰 차이는 나지 않고 1등에 비해 크게 떨어지는 것으로 보아 분석, 실험, 전처리가 많이 필요할 것으로 보임
2. linear regression은 가정을 확인하지 않고 진행함. 추가 분석과 전처리 필요
3. lgbm에서 step별 loss 감소는 log 모양으로 잘 일어남

# 4. Plan

- non-agile
    1. 코드 정리, 추상화, 모듈화
- agile
    1. 하이퍼파라미터튜닝
    2. Diagnostic Analysis(이후 심화 분석 및 전처리)
    3. 실험계획 리스트업