In [1]:
import pandas as pd
import autogluon
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
import bisect
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_parquet("data/train.parquet").drop(columns=['SAMPLE_ID'])
test_df = pd.read_parquet("data/test.parquet").drop(columns=['SAMPLE_ID'])

In [3]:
train_df['ATA'] = pd.to_datetime(train_df['ATA'])
test_df['ATA'] = pd.to_datetime(test_df['ATA'])

# datetime을 여러 파생 변수로 변환
for df in [train_df, test_df]:
    df['year'] = df['ATA'].dt.year
    df['month'] = df['ATA'].dt.month
    df['day'] = df['ATA'].dt.day
    df['hour'] = df['ATA'].dt.hour
    df['minute'] = df['ATA'].dt.minute
    df['weekday'] = df['ATA'].dt.weekday

# datetime 컬럼 제거
train_df.drop(columns='ATA', inplace=True)
test_df.drop(columns='ATA', inplace=True)

# Categorical 컬럼 인코딩
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
encoders = {}

for feature in categorical_features:
    le = LabelEncoder()
    train_df[feature] = le.fit_transform(train_df[feature].astype(str))
    le_classes_set = set(le.classes_)
    test_df[feature] = test_df[feature].map(lambda s: '-1' if s not in le_classes_set else s)
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, '-1')
    le.classes_ = np.array(le_classes)
    test_df[feature] = le.transform(test_df[feature].astype(str))
    encoders[feature] = le
# 결측치 처리
train_df.fillna(train_df.mean(), inplace=True)
test_df.fillna(train_df.mean(), inplace=True)

In [4]:
train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df)

label = "CI_HOUR"
eval_metric = "mean_absolute_error"

In [5]:
predictor = TabularPredictor(
    label=label, problem_type='regression', eval_metric=eval_metric
).fit(train_data, presets="best_quality")

No path specified. Models will be saved in: "AutogluonModels\ag-20230926_031846\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20230926_031846\"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   36.47 GB / 498.74 GB (7.3%)
Train Data Rows:    367441
Train Data Columns: 30
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    5980.06 MB
	Train Data (Original)  Memory Usage: 79.37 MB (1.3% of avai

[1000]	valid_set's l1: 53.7464
[2000]	valid_set's l1: 51.1683
[3000]	valid_set's l1: 49.4552
[4000]	valid_set's l1: 48.124
[5000]	valid_set's l1: 47.0637
[6000]	valid_set's l1: 46.2787
[7000]	valid_set's l1: 45.586
[8000]	valid_set's l1: 44.9676
[9000]	valid_set's l1: 44.4582
[10000]	valid_set's l1: 44.0386
[1000]	valid_set's l1: 53.1819
[2000]	valid_set's l1: 51.0168
[3000]	valid_set's l1: 49.1289
[4000]	valid_set's l1: 47.8111
[5000]	valid_set's l1: 46.7126
[6000]	valid_set's l1: 45.896
[7000]	valid_set's l1: 45.2795
[8000]	valid_set's l1: 44.7178
[9000]	valid_set's l1: 44.2266
[10000]	valid_set's l1: 43.703
[1000]	valid_set's l1: 54.0638
[2000]	valid_set's l1: 51.4087
[3000]	valid_set's l1: 49.588
[4000]	valid_set's l1: 48.0526
[5000]	valid_set's l1: 46.8886
[6000]	valid_set's l1: 46.0192
[7000]	valid_set's l1: 45.2891
[8000]	valid_set's l1: 44.6527
[9000]	valid_set's l1: 44.1453
[10000]	valid_set's l1: 43.7178
[1000]	valid_set's l1: 54.1301
[2000]	valid_set's l1: 51.55
[3000]	valid

	-43.8289	 = Validation score   (-mean_absolute_error)
	505.56s	 = Training   runtime
	38.08s	 = Validation runtime
Fitting model: LightGBM_BAG_L1 ...
	Memory not enough to fit LGBModel folds in parallel. Will do sequential fitting instead. 	Consider decreasing folds trained in parallel by passing num_folds_parallel to ag_args_ensemble when calling predictor.fit
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy


[1000]	valid_set's l1: 46.2716
[2000]	valid_set's l1: 40.2931
[3000]	valid_set's l1: 36.3474
[4000]	valid_set's l1: 34.0801
[5000]	valid_set's l1: 32.2888
[6000]	valid_set's l1: 31.0332
[7000]	valid_set's l1: 30.0751
[8000]	valid_set's l1: 29.1074
[9000]	valid_set's l1: 28.3619
[10000]	valid_set's l1: 27.8328
[1000]	valid_set's l1: 44.6309
[2000]	valid_set's l1: 39.585
[3000]	valid_set's l1: 35.9446
[4000]	valid_set's l1: 33.7409
[5000]	valid_set's l1: 32.3143
[6000]	valid_set's l1: 30.898
[7000]	valid_set's l1: 30.0355
[8000]	valid_set's l1: 29.2411
[9000]	valid_set's l1: 28.673
[10000]	valid_set's l1: 28.0967
[1000]	valid_set's l1: 45.4007
[2000]	valid_set's l1: 40.1501
[3000]	valid_set's l1: 36.4074
[4000]	valid_set's l1: 33.9071
[5000]	valid_set's l1: 32.455
[6000]	valid_set's l1: 31.2144
[7000]	valid_set's l1: 30.2917
[8000]	valid_set's l1: 29.5532
[9000]	valid_set's l1: 28.9882
[10000]	valid_set's l1: 28.484
[1000]	valid_set's l1: 46.1463
[2000]	valid_set's l1: 39.9051
[3000]	val

	-28.1047	 = Validation score   (-mean_absolute_error)
	487.71s	 = Training   runtime
	33.44s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L1 ...
	-53.3076	 = Validation score   (-mean_absolute_error)
	274.88s	 = Training   runtime
	11.31s	 = Validation runtime
Fitting model: CatBoost_BAG_L1 ...
	Memory not enough to fit CatBoostModel folds in parallel. Will do sequential fitting instead. 	Consider decreasing folds trained in parallel by passing num_folds_parallel to ag_args_ensemble when calling predictor.fit
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	-29.523	 = Validation score   (-mean_absolute_error)
	2365.82s	 = Training   runtime
	0.39s	 = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L1 ...
	-53.2973	 = Validation score   (-mean_absolute_error)
	166.53s	 = Training   runtime
	10.17s	 = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L1 ...
	Memory not enough to fit NNFastAiTabularModel folds in parallel. Will 

[1000]	valid_set's l1: 40.7133
[2000]	valid_set's l1: 35.5466
[3000]	valid_set's l1: 32.9017
[4000]	valid_set's l1: 31.3898
[5000]	valid_set's l1: 30.3815
[6000]	valid_set's l1: 29.6742
[7000]	valid_set's l1: 29.2232
[8000]	valid_set's l1: 28.8626
[9000]	valid_set's l1: 28.5404
[10000]	valid_set's l1: 28.2451
[1000]	valid_set's l1: 39.9101
[2000]	valid_set's l1: 34.764
[3000]	valid_set's l1: 32.3903
[4000]	valid_set's l1: 30.7745
[5000]	valid_set's l1: 29.7227
[6000]	valid_set's l1: 29.0201
[7000]	valid_set's l1: 28.4168
[8000]	valid_set's l1: 27.9879
[9000]	valid_set's l1: 27.6624
[10000]	valid_set's l1: 27.3932
[1000]	valid_set's l1: 40.1395
[2000]	valid_set's l1: 35.3545
[3000]	valid_set's l1: 32.6326
[4000]	valid_set's l1: 31.049
[5000]	valid_set's l1: 29.9869
[6000]	valid_set's l1: 29.2681
[7000]	valid_set's l1: 28.6773
[8000]	valid_set's l1: 28.3
[9000]	valid_set's l1: 27.9325
[10000]	valid_set's l1: 27.7019
[1000]	valid_set's l1: 40.3102
[2000]	valid_set's l1: 35.2308
[3000]	val

	-27.7354	 = Validation score   (-mean_absolute_error)
	677.01s	 = Training   runtime
	69.5s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-26.4861	 = Validation score   (-mean_absolute_error)
	6.4s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting 9 L2 models ...
Fitting model: LightGBMXT_BAG_L2 ...
	Memory not enough to fit LGBModel folds in parallel. Will do sequential fitting instead. 	Consider decreasing folds trained in parallel by passing num_folds_parallel to ag_args_ensemble when calling predictor.fit
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy


[1000]	valid_set's l1: 20.0747
[2000]	valid_set's l1: 19.8525
[3000]	valid_set's l1: 19.7294
[4000]	valid_set's l1: 19.6566
[5000]	valid_set's l1: 19.6316
[6000]	valid_set's l1: 19.5843
[7000]	valid_set's l1: 19.564
[8000]	valid_set's l1: 19.5506
[9000]	valid_set's l1: 19.52
[10000]	valid_set's l1: 19.5112
[1000]	valid_set's l1: 19.7612
[2000]	valid_set's l1: 19.4969
[3000]	valid_set's l1: 19.3841
[4000]	valid_set's l1: 19.3029
[5000]	valid_set's l1: 19.2486
[6000]	valid_set's l1: 19.2228
[7000]	valid_set's l1: 19.2038
[8000]	valid_set's l1: 19.1695
[9000]	valid_set's l1: 19.1482
[10000]	valid_set's l1: 19.1296
[1000]	valid_set's l1: 20.1111
[2000]	valid_set's l1: 19.8158
[3000]	valid_set's l1: 19.6728
[4000]	valid_set's l1: 19.6062
[5000]	valid_set's l1: 19.5606
[6000]	valid_set's l1: 19.5254
[7000]	valid_set's l1: 19.4784
[8000]	valid_set's l1: 19.4591
[9000]	valid_set's l1: 19.4385
[10000]	valid_set's l1: 19.4084
[1000]	valid_set's l1: 19.9145
[2000]	valid_set's l1: 19.6371
[3000]	v

	-19.3745	 = Validation score   (-mean_absolute_error)
	583.49s	 = Training   runtime
	27.34s	 = Validation runtime
Fitting model: LightGBM_BAG_L2 ...
	Memory not enough to fit LGBModel folds in parallel. Will do sequential fitting instead. 	Consider decreasing folds trained in parallel by passing num_folds_parallel to ag_args_ensemble when calling predictor.fit
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy


[1000]	valid_set's l1: 18.6802
[2000]	valid_set's l1: 18.4777
[3000]	valid_set's l1: 18.3443
[4000]	valid_set's l1: 18.2294
[5000]	valid_set's l1: 18.1427
[6000]	valid_set's l1: 18.0785
[7000]	valid_set's l1: 17.9964
[8000]	valid_set's l1: 17.946
[9000]	valid_set's l1: 17.8967
[10000]	valid_set's l1: 17.8644
[1000]	valid_set's l1: 18.4702
[2000]	valid_set's l1: 18.2438
[3000]	valid_set's l1: 18.1003
[4000]	valid_set's l1: 18.0122
[5000]	valid_set's l1: 17.9213
[6000]	valid_set's l1: 17.8511
[7000]	valid_set's l1: 17.7909
[8000]	valid_set's l1: 17.7214
[9000]	valid_set's l1: 17.6883
[10000]	valid_set's l1: 17.6496
[1000]	valid_set's l1: 18.8168
[2000]	valid_set's l1: 18.6427
[3000]	valid_set's l1: 18.5012
[4000]	valid_set's l1: 18.4174
[5000]	valid_set's l1: 18.3145
[6000]	valid_set's l1: 18.2277
[7000]	valid_set's l1: 18.1572
[8000]	valid_set's l1: 18.0962
[9000]	valid_set's l1: 18.0557
[10000]	valid_set's l1: 18.0034
[1000]	valid_set's l1: 18.9316
[2000]	valid_set's l1: 18.749
[3000]	

	-17.9127	 = Validation score   (-mean_absolute_error)
	597.28s	 = Training   runtime
	30.54s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L2 ...
	-18.5474	 = Validation score   (-mean_absolute_error)
	381.01s	 = Training   runtime
	11.71s	 = Validation runtime
Fitting model: CatBoost_BAG_L2 ...
	Memory not enough to fit CatBoostModel folds in parallel. Will do sequential fitting instead. 	Consider decreasing folds trained in parallel by passing num_folds_parallel to ag_args_ensemble when calling predictor.fit
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	-18.2609	 = Validation score   (-mean_absolute_error)
	2561.25s	 = Training   runtime
	0.42s	 = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L2 ...
	-18.5437	 = Validation score   (-mean_absolute_error)
	198.01s	 = Training   runtime
	11.11s	 = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L2 ...
	Memory not enough to fit NNFastAiTabularModel folds in parallel. Will

[1000]	valid_set's l1: 18.3169
[2000]	valid_set's l1: 18.0034
[3000]	valid_set's l1: 17.7992
[4000]	valid_set's l1: 17.6961
[5000]	valid_set's l1: 17.5893
[6000]	valid_set's l1: 17.4971
[7000]	valid_set's l1: 17.4297
[8000]	valid_set's l1: 17.3768
[9000]	valid_set's l1: 17.3329
[10000]	valid_set's l1: 17.2936
[1000]	valid_set's l1: 18.0393
[2000]	valid_set's l1: 17.7232
[3000]	valid_set's l1: 17.5113
[4000]	valid_set's l1: 17.3776
[5000]	valid_set's l1: 17.2648
[6000]	valid_set's l1: 17.2013
[7000]	valid_set's l1: 17.128
[8000]	valid_set's l1: 17.0799
[9000]	valid_set's l1: 17.036
[10000]	valid_set's l1: 17.0023
[1000]	valid_set's l1: 18.3371
[2000]	valid_set's l1: 18.0329
[3000]	valid_set's l1: 17.8378
[4000]	valid_set's l1: 17.7009
[5000]	valid_set's l1: 17.5874
[6000]	valid_set's l1: 17.5013
[7000]	valid_set's l1: 17.4305
[8000]	valid_set's l1: 17.3796
[9000]	valid_set's l1: 17.3499
[10000]	valid_set's l1: 17.3167
[1000]	valid_set's l1: 18.419
[2000]	valid_set's l1: 18.1095
[3000]	v

	-17.2692	 = Validation score   (-mean_absolute_error)
	810.99s	 = Training   runtime
	56.25s	 = Validation runtime
Fitting model: WeightedEnsemble_L3 ...
	-16.7326	 = Validation score   (-mean_absolute_error)
	5.39s	 = Training   runtime
	0.01s	 = Validation runtime
AutoGluon training complete, total runtime = 15734.67s ... Best model: "WeightedEnsemble_L3"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20230926_031846\")


In [6]:
print(predictor.leaderboard(silent=True))

                     model  score_val  pred_time_val      fit_time  \
0      WeightedEnsemble_L3 -16.732647     426.244046  10836.986019   
1     LightGBMLarge_BAG_L2 -17.269230     370.007979   7397.364897   
2           XGBoost_BAG_L2 -17.880388     365.693279   8067.608506   
3          LightGBM_BAG_L2 -17.912685     344.295493   7183.648377   
4    NeuralNetTorch_BAG_L2 -18.142438     315.958883   7309.651216   
5          CatBoost_BAG_L2 -18.260932     314.171013   9147.620538   
6     ExtraTreesMSE_BAG_L2 -18.543676     324.859503   6784.378687   
7   RandomForestMSE_BAG_L2 -18.547410     325.464979   6967.377995   
8   NeuralNetFastAI_BAG_L2 -18.750613     315.839683   7816.081948   
9        LightGBMXT_BAG_L2 -19.374462     341.096298   7169.864645   
10     WeightedEnsemble_L2 -26.486058     103.325940   3536.943009   
11    LightGBMLarge_BAG_L1 -27.735368      69.495028    677.013994   
12         LightGBM_BAG_L1 -28.104701      33.437971    487.711438   
13         CatBoost_

In [7]:
predictor.feature_importance(train_data)

Computing feature importance via permutation shuffling for 30 features using 5000 rows with 5 shuffle sets...
	7379.98s	= Expected runtime (1476.0s per shuffle set)
	5070.36s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
WTI,1542.44968,15.830515,1.33124e-09,5,1575.04493,1509.854431
DUBAI,1231.431942,19.073053,6.903695e-09,5,1270.703622,1192.160262
BRENT,1122.740263,15.986617,4.931431e-09,5,1155.656928,1089.823598
BDI_ADJ,724.223061,17.941961,4.516647e-08,5,761.165806,687.280315
month,683.459804,18.388919,6.282537e-08,5,721.322842,645.596766
year,553.260356,8.958789,8.247238e-09,5,571.706627,534.814085
day,88.723683,1.491332,9.575397e-09,5,91.794356,85.65301
AIR_TEMPERATURE,32.369699,1.759908,1.044423e-06,5,35.993374,28.746025
DIST,24.377373,0.297105,2.647218e-09,5,24.989118,23.765629
BN,18.259519,0.884043,6.572959e-07,5,20.079775,16.439263


In [8]:
model_to_use = predictor.get_model_best()
model_pred = predictor.predict(test_data, model=model_to_use)

FileNotFoundError: [Errno 2] No such file or directory: '../data/sample_submission.csv'

In [11]:
model_pred

0           88.273972
1          336.836700
2            0.113809
3            0.043623
4           30.806232
             ...     
244984     151.110321
244985     382.482239
244986       0.175596
244987      -4.401476
244988    1436.849365
Name: CI_HOUR, Length: 244989, dtype: float32

In [13]:
submit = pd.read_csv("data/sample_submission.csv")
submit["CI_HOUR"] = [i if i > 0 else 0.0 for i in model_pred]
submit.to_csv("./csv/autogluon.csv", index=False)