In [None]:
!pip install optuna category_encoders

In [None]:
#Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, cross_val_score, train_test_split, TimeSeriesSplit
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor, AdaBoostRegressor, HistGradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder,OrdinalEncoder, OneHotEncoder, PolynomialFeatures, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.inspection import PartialDependenceDisplay
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import optuna

# 1. 데이터

In [None]:
# Single place to change when the data lives somewhere else.
# NOTE(review): this looks like a Google Drive mount inside Colab — confirm the
# drive is mounted before this cell runs.
DATA_DIR = "/content/drive/MyDrive/SKT AI FLY/선박 대기"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

## 1) 시계열 데이터(ATA) 열 분리

In [None]:
import datetime

def transform_time(df):
    """Add calendar features and ship-derived ratio features to ``df``.

    The frame is modified in place (new columns are written onto ``df`` and
    ``ATA`` is converted to datetime) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ATA`` (anything ``pd.to_datetime`` accepts),
        ``BUILT``, ``LENGTH``, ``BREADTH``, ``DEPTH``, ``DEADWEIGHT``, ``GT``,
        ``DIST`` and ``PORT_SIZE``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns added: ``YEAR``, ``MONTH``, ``DAY``,
        ``HOUR``, ``MINUTES``, ``WOY`` (ISO week of year), ``WEEKDAY``
        (Monday=0), ``WEEKEND`` (1 for Saturday/Sunday), ``HOLIDAY`` (1 on
        Jan 1 and Dec 25 only), ``AGE``, ``SHIP_VOLUME``, ``EFFICIENT`` and
        ``RELATIVE_DISTANCE``.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # --- Calendar features taken from the ATA timestamp ---------------------
    df['ATA'] = pd.to_datetime(df['ATA'])
    ata = df['ATA'].dt
    df['YEAR'] = ata.year
    df['MONTH'] = ata.month
    df['DAY'] = ata.day
    df['HOUR'] = ata.hour
    df['MINUTES'] = ata.minute

    # `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week`
    # yields the same ISO week number but as nullable UInt32. Cast back to a
    # plain numeric dtype (float only when missing timestamps force NaN) so
    # downstream estimators see an ordinary numeric column.
    iso_week = ata.isocalendar().week
    df['WOY'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    df['WEEKDAY'] = ata.weekday
    # Vectorized flags; comparisons against NaN are False, so missing
    # timestamps map to 0 exactly as the former row-wise lambdas did.
    df['WEEKEND'] = (df['WEEKDAY'] >= 5).astype('int64')
    is_new_year = (df['MONTH'] == 1) & (df['DAY'] == 1)
    is_christmas = (df['MONTH'] == 12) & (df['DAY'] == 25)
    df['HOLIDAY'] = (is_new_year | is_christmas).astype('int64')

    # --- Ship / voyage derived features -------------------------------------
    df['AGE'] = df['YEAR'] - df['BUILT']
    df['SHIP_VOLUME'] = df['LENGTH'] * df['BREADTH'] * df['DEPTH']
    # NOTE(review): a zero GT or PORT_SIZE produces inf/NaN in the two ratios
    # below — confirm the data never contains zeros in those columns.
    df['EFFICIENT'] = df['DEADWEIGHT'] / df['GT']
    df['RELATIVE_DISTANCE'] = df['DIST'] / df['PORT_SIZE']

    return df

In [None]:
# Run the same feature engineering on both splits so they end up with the
# same set of derived columns.
train, test = (transform_time(frame) for frame in (train, test))

In [None]:
# Columns removed from both frames before the feature matrix is built; the
# drop happens in place on each frame.
_dropped_cols = ['ID','SHIPMANAGER','FLAG','SAMPLE_ID','ATA']
for _frame in (train, test):
    _frame.drop(columns=_dropped_cols, inplace=True)

In [None]:
# Separate the regression target from the predictor columns.
target_name = 'CI_HOUR'
y = train[target_name]
X = train.drop(columns=[target_name])

## 2) 타겟 인코딩->라벨 인코딩

In [None]:
from category_encoders.target_encoder import TargetEncoder

# Every object-dtype predictor is replaced by its target encoding.
cat_cols = X.select_dtypes(include='object').columns.tolist()

# Fit ONE encoder on all categorical columns at once. Re-fitting a single
# instance column by column left `enc` holding only the last column's mapping,
# so it could not be reused to transform new data. Each column is still
# encoded independently, so the resulting values are unchanged.
enc = TargetEncoder(cols=cat_cols)
if cat_cols:  # nothing to encode when there are no object columns
    encoded_train = enc.fit_transform(X[cat_cols], y)
    encoded_test = enc.transform(test[cat_cols])
    # Assign column by column so each column takes the encoded numeric dtype.
    for col in cat_cols:
        X[col] = encoded_train[col]
        test[col] = encoded_test[col]

# NOTE(review): the encoder is fitted on the full training target before any
# cross-validation split is made, so validation folds evaluated on this X have
# already contributed their targets to the encoding (optimistic CV score).
# Moving the encoder inside the modelling pipeline would remove that leak.

# 2. Training using Optuna

In [None]:
# Hyperparameters recorded from an earlier Optuna search; the search itself is
# not run in this cell.
params = dict(
    max_iter=1969,
    max_leaf_nodes=96,
    max_depth=11,
    min_samples_leaf=43,
    l2_regularization=0.03503250404129518,
)

# Min-max scale every column of X. Since all columns are listed explicitly,
# the passthrough remainder has nothing left to forward.
preprocessor = ColumnTransformer(
    transformers=[('ss', MinMaxScaler(), X.columns)],
    remainder='passthrough',
)

# Two-stage pipeline: scaling, then a histogram gradient-boosting regressor
# trained with absolute-error loss.
pipe = Pipeline(
    steps=[
        ('MIN', preprocessor),
        ('HIST', HistGradientBoostingRegressor(loss='absolute_error', random_state=42, **params)),
    ]
)

# Fit on the full training data; left as the last expression so the notebook
# displays the fitted pipeline.
pipe.fit(X, y)

# 3. Evaluation

In [None]:
# 5-fold shuffled cross-validation with a fixed seed. cross_val_score fits
# clones of `pipe`, so the pipeline fitted earlier is left untouched.
fold = KFold(n_splits=5, shuffle=True, random_state=42)

# Scores come back as NEGATED mean absolute error (one value per fold); negate
# them again (-score) before taking np.mean / np.std for reporting.
score = cross_val_score(pipe, X, y, scoring='neg_mean_absolute_error', cv=fold)

In [None]:
# `score` holds negated MAE values, so flip the sign of their average to
# report a positive error.
mean_mae = -1 * np.average(score)
print(f"MAE: {mean_mae}")