v2 + 결측행 복구

In [1]:
import warnings
warnings.filterwarnings(action="ignore")
import datetime
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bisect
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Read both splits from parquet and discard the SAMPLE_ID column right away.
train_df, test_df = (
    pd.read_parquet(path).drop(columns=['SAMPLE_ID'])
    for path in ('../data/train.parquet', '../data/test.parquet')
)

In [3]:
# Expand the ATA timestamp into calendar/time components, then drop ATA itself.
DATETIME_PARTS = ("year", "month", "day", "hour", "minute", "weekday")

for frame in (train_df, test_df):
    # Parse once per frame; the parsed values are only needed to derive the parts.
    arrival = pd.to_datetime(frame['ATA'])
    for part in DATETIME_PARTS:
        frame[part] = getattr(arrival.dt, part)
    # The raw datetime column is not kept as a feature.
    frame.drop(columns=['ATA'], inplace=True)

In [4]:
# WTI handling: add an indicator for rows whose WTI is not strictly positive,
# then replace those WTI values with 0.
# NOTE(review): the indicator column is spelled "WITlt0" (presumably meant
# "WTIlt0"); the name is kept unchanged because it ends up in the saved frame.
for frame in (train_df, test_df):
    # NaN compares False against 0, so missing WTI is flagged and zeroed too.
    wti_positive = frame["WTI"] > 0
    frame["WITlt0"] = (~wti_positive).astype("int64")
    frame["WTI"] = frame["WTI"].where(wti_positive, 0)

In [5]:
# Standardize the numeric features. The scaler statistics come from the train
# split only and are then reused unchanged for the test split.
NUMERIC_COLUMNS = ["DIST", "BUILT", "DEADWEIGHT", "GT", "LENGTH", "DUBAI", "BRENT", "WTI", "BDI_ADJ", "PORT_SIZE"]

scaler = StandardScaler()
train_df[NUMERIC_COLUMNS] = scaler.fit_transform(train_df[NUMERIC_COLUMNS])
test_df[NUMERIC_COLUMNS] = scaler.transform(test_df[NUMERIC_COLUMNS])

In [6]:
# Coarsen the hull dimensions by floor-dividing each value by 10.
for frame in (train_df, test_df):
    for dimension in ("BREADTH", "DEPTH", "DRAUGHT"):
        frame[dimension] = frame[dimension].floordiv(10)

In [7]:
ONE_HOT_COLUMNS = ["ARI_CO", "ARI_PO", "SHIP_TYPE_CATEGORY"]


def _share_categories(train, test, columns):
    """Return copies of ``train`` and ``test`` with ``columns`` cast to categoricals
    that use one common, sorted list of levels (every level seen in either frame).

    With identical levels on both sides, ``get_dummies(..., drop_first=True)``
    drops the same reference level and emits the same dummy columns for both
    frames, so an all-zero row means the same category in train and in test.
    """
    train_casts, test_casts = {}, {}
    for col in columns:
        seen = pd.concat([train[col], test[col]], ignore_index=True).dropna()
        levels = sorted(seen.astype(object).unique())
        train_casts[col] = pd.Categorical(train[col], categories=levels)
        test_casts[col] = pd.Categorical(test[col], categories=levels)
    return train.assign(**train_casts), test.assign(**test_casts)


# One-hot encode with a shared reference level. Encoding each frame on its own
# would let drop_first remove a different level per frame whenever their
# first (sorted) levels differ.
train_df, test_df = _share_categories(train_df, test_df, ONE_HOT_COLUMNS)
train_df = pd.get_dummies(train_df, columns=ONE_HOT_COLUMNS, drop_first=True)
test_df = pd.get_dummies(test_df, columns=ONE_HOT_COLUMNS, drop_first=True)

In [8]:
# One-hot encode FLAG on each frame, keeping every level (nothing is dropped).
train_dummies, test_dummies = (
    pd.get_dummies(frame, columns=["FLAG"]) for frame in (train_df, test_df)
)

In [9]:
# Give both frames the union of their columns; a column present in only one
# frame is created in the other and filled with 0.
train_df, test_df = train_dummies.align(
    test_dummies, join="outer", axis=1, fill_value=0
)

In [10]:
categorical_features = ['ID', 'SHIPMANAGER']
UNSEEN_LABEL = '-1'  # sentinel class for test values that never occur in train
encoders = {}

for feature in categorical_features:
    # Compare and encode on the string form in both frames, so the seen/unseen
    # check uses the same representation as the fitted classes.
    train_labels = train_df[feature].astype(str)
    test_labels = test_df[feature].astype(str)

    # Fit on the train labels plus the sentinel. Because the sentinel is part of
    # the classes before anything is transformed, train and test share a single
    # label -> integer mapping.
    le = LabelEncoder()
    le.fit(np.append(train_labels.unique(), UNSEEN_LABEL))

    train_df[feature] = le.transform(train_labels)
    # Any test label outside the fitted classes is routed to the sentinel.
    test_labels = test_labels.where(test_labels.isin(le.classes_), UNSEEN_LABEL)
    test_df[feature] = le.transform(test_labels)
    encoders[feature] = le

In [11]:
# Write the processed test split to parquet; the train export below is disabled.
# train_df.to_parquet("../data/train_v2_0.parquet")
test_output_path = "../data/test_v2_0.parquet"
test_df.to_parquet(test_output_path)