In [1]:
import pandas as pd
import autogluon
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
import bisect
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _load_split(parquet_path):
    """Read one parquet file and return it without the 'SAMPLE_ID' column."""
    return pd.read_parquet(parquet_path).drop(columns=['SAMPLE_ID'])


# Load both splits through the same helper so they are treated identically.
train_df = _load_split("data/train.parquet")
test_df = _load_split("data/test.parquet")

In [3]:
# Expand the 'ATA' timestamp column into numeric calendar/clock features and
# then remove the raw datetime column from each frame (both frames are
# modified in place).
ata_components = ('year', 'month', 'day', 'hour', 'minute', 'weekday')

for frame in (train_df, test_df):
    ata_parsed = pd.to_datetime(frame['ATA'])
    # New columns are appended in the order listed in `ata_components`.
    for component in ata_components:
        frame[component] = getattr(ata_parsed.dt, component)
    # The parsed timestamp itself is not kept as a feature.
    frame.drop(columns='ATA', inplace=True)

# Encode categorical columns as integer codes.
#
# The encoder vocabulary is the set of training categories plus a reserved
# '-1' bucket for categories that occur only in the test frame. The vocabulary
# is fixed BEFORE either frame is transformed, so a given category always maps
# to the same integer in train and test. (Inserting '-1' into `classes_` after
# the training column has already been encoded shifts every class index and
# leaves the two frames with inconsistent codes.)
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
encoders = {}  # feature name -> fitted LabelEncoder (vocabulary includes '-1')

for feature in categorical_features:
    # Cast both frames to str up front so membership tests and encoding all
    # operate on the same string representation (e.g. NaN -> 'nan').
    train_values = train_df[feature].astype(str)
    test_values = test_df[feature].astype(str)

    known_categories = set(train_values.unique())

    le = LabelEncoder()
    le.fit(sorted(known_categories | {'-1'}))

    # Anything not seen during training falls into the reserved '-1' bucket.
    test_values = test_values.where(test_values.isin(known_categories), '-1')

    train_df[feature] = le.transform(train_values)
    test_df[feature] = le.transform(test_values)
    encoders[feature] = le
# Impute missing values.
# Column means are computed once from the training frame (before it is filled)
# and reused for the test frame, so both frames are imputed with exactly the
# same training statistics. numeric_only=True keeps a stray non-numeric column
# from breaking the mean computation on newer pandas versions.
# NOTE(review): train_df still contains the target column ('CI_HOUR') at this
# point, so missing targets (if any) are mean-filled as well -- confirm that
# this is intended.
train_means = train_df.mean(numeric_only=True)
train_df.fillna(train_means, inplace=True)
test_df.fillna(train_means, inplace=True)

In [7]:
# Target column name and the metric later handed to the predictor.
eval_metric = "mean_absolute_error"
label = "CI_HOUR"

# Wrap the preprocessed frames as AutoGluon tabular datasets.
test_data = TabularDataset(test_df)
train_data = TabularDataset(train_df)

In [8]:
# Build a regression predictor for the target column, scored with the chosen
# metric, and fit it on the training data using the "best_quality" preset.
# No time_limit is passed, so training runs until AutoGluon finishes on its own.
predictor_options = {
    'label': label,
    'problem_type': 'regression',
    'eval_metric': eval_metric,
}
predictor = TabularPredictor(**predictor_options).fit(
    train_data,
    presets="best_quality",
)

No path specified. Models will be saved in: "AutogluonModels/ag-20230926_003757/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230926_003757/"
AutoGluon Version:  0.8.2
Python Version:     3.10.13
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.2.0: Fri Nov 11 02:04:44 PST 2022; root:xnu-8792.61.2~4/RELEASE_ARM64_T8103
Disk Space Avail:   28.93 GB / 245.11 GB (11.8%)
Train Data Rows:    367441
Train Data Columns: 30
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:

[1000]	valid_set's l1: 53.7464
[2000]	valid_set's l1: 51.1683
[3000]	valid_set's l1: 49.4552
[4000]	valid_set's l1: 48.124
[5000]	valid_set's l1: 47.0637
[6000]	valid_set's l1: 46.2787
[7000]	valid_set's l1: 45.586
[8000]	valid_set's l1: 44.9676
[9000]	valid_set's l1: 44.4582
[10000]	valid_set's l1: 44.0386
[1000]	valid_set's l1: 53.1819
[2000]	valid_set's l1: 51.0168
[3000]	valid_set's l1: 49.1289
[4000]	valid_set's l1: 47.8111
[5000]	valid_set's l1: 46.7126
[6000]	valid_set's l1: 45.896
[7000]	valid_set's l1: 45.2795
[8000]	valid_set's l1: 44.7178
[9000]	valid_set's l1: 44.2266
[10000]	valid_set's l1: 43.703
[1000]	valid_set's l1: 54.0638
[2000]	valid_set's l1: 51.4087
[3000]	valid_set's l1: 49.588
[4000]	valid_set's l1: 48.0526
[5000]	valid_set's l1: 46.8886
[6000]	valid_set's l1: 46.0192
[7000]	valid_set's l1: 45.2891
[8000]	valid_set's l1: 44.6527
[9000]	valid_set's l1: 44.1453
[10000]	valid_set's l1: 43.7178
[1000]	valid_set's l1: 54.1301
[2000]	valid_set's l1: 51.55
[3000]	valid

	-43.8289	 = Validation score   (-mean_absolute_error)
	575.4s	 = Training   runtime
	55.5s	 = Validation runtime
Fitting model: LightGBM_BAG_L1 ...
	Memory not enough to fit LGBModel folds in parallel. Will do sequential fitting instead. 	Consider decreasing folds trained in parallel by passing num_folds_parallel to ag_args_ensemble when calling predictor.fit
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy


[1000]	valid_set's l1: 46.2716
[2000]	valid_set's l1: 40.2931
[3000]	valid_set's l1: 36.3474
[4000]	valid_set's l1: 34.0801
[5000]	valid_set's l1: 32.2888
[6000]	valid_set's l1: 31.0332
[7000]	valid_set's l1: 30.0751
[8000]	valid_set's l1: 29.1074
[9000]	valid_set's l1: 28.3619
[10000]	valid_set's l1: 27.8328
[1000]	valid_set's l1: 44.6309
[2000]	valid_set's l1: 39.585
[3000]	valid_set's l1: 35.9446
[4000]	valid_set's l1: 33.7409
[5000]	valid_set's l1: 32.3143
[6000]	valid_set's l1: 30.898
[7000]	valid_set's l1: 30.0355
[8000]	valid_set's l1: 29.2411
[9000]	valid_set's l1: 28.673
[10000]	valid_set's l1: 28.0967
[1000]	valid_set's l1: 45.4007
[2000]	valid_set's l1: 40.1501
[3000]	valid_set's l1: 36.4074
[4000]	valid_set's l1: 33.9071
[5000]	valid_set's l1: 32.455
[6000]	valid_set's l1: 31.2144
[7000]	valid_set's l1: 30.2918
[8000]	valid_set's l1: 29.5533
[9000]	valid_set's l1: 28.9883
[10000]	valid_set's l1: 28.4841
[1000]	valid_set's l1: 46.1463
[2000]	valid_set's l1: 39.9051
[3000]	va

	-28.1047	 = Validation score   (-mean_absolute_error)
	568.36s	 = Training   runtime
	56.79s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L1 ...
	-53.5117	 = Validation score   (-mean_absolute_error)
	161.84s	 = Training   runtime
	4.43s	 = Validation runtime
Fitting model: CatBoost_BAG_L1 ...
	Memory not enough to fit CatBoostModel folds in parallel. Will do sequential fitting instead. 	Consider decreasing folds trained in parallel by passing num_folds_parallel to ag_args_ensemble when calling predictor.fit
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy


KeyboardInterrupt: 