날짜 변환<br/>
결측 값 행 제거 + 결측 값 많은 열 제거 + 수치 데이터 정규화 작업<br/>
음수 WTI는 새로운 컬럼으로 만들어 처리<br/>
BREADTH, DEPTH, DRAUGHT 값을 10 단위에서 1 단위로 변환 (10으로 나눈 몫 사용)<br/>
null data 분리<br/>
ATA 자체를 삭제<br/>

In [1]:
import warnings
warnings.filterwarnings(action="ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bisect
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Load the raw train/test tables and discard the SAMPLE_ID column from each.
train_df = pd.read_parquet('../../data/HD_data/train.parquet')
train_df = train_df.drop('SAMPLE_ID', axis=1)
test_df = pd.read_parquet('../../data/HD_data/test.parquet')
test_df = test_df.drop('SAMPLE_ID', axis=1)

In [3]:
# Remove the datetime column (ATA) from both frames, in place.
for _frame in (train_df, test_df):
    _frame.drop('ATA', axis=1, inplace=True)

In [4]:
# Sanity check: each of these columns must contain exactly the same set of
# values in train and test; the separate one-hot encoding of the two frames
# relies on this to produce matching dummy columns.
for _col in ("ARI_CO", "ARI_PO", "SHIP_TYPE_CATEGORY"):
    _train_values = sorted(train_df[_col].unique())
    _test_values = sorted(test_df[_col].unique())
    assert _train_values == _test_values

In [5]:
# Standardize the numeric columns. The scaler is fitted on the training data
# only and the same fitted transform is then applied to the test data.
numeric_cols = ["DIST", "BUILT", "DEADWEIGHT", "GT", "LENGTH"]

x = train_df[numeric_cols]
scaler = StandardScaler().fit(x)
train_df[numeric_cols] = scaler.transform(x)

test_feautre = test_df[numeric_cols]
test_df[numeric_cols] = scaler.transform(test_feautre)

In [6]:
# One-hot encode the columns whose value sets are identical in train and test.
# The first level of every column is dropped (drop_first=True).
onehot_cols = ["ARI_CO", "ARI_PO", "SHIP_TYPE_CATEGORY"]
train_df = pd.get_dummies(train_df, columns=onehot_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=onehot_cols, drop_first=True)

In [7]:
# Convert the size columns from units of 10 to units of 1 by floor-dividing by
# 10. The division is applied to the whole column at once instead of calling a
# Python lambda per element; the resulting values are the same and missing
# values stay missing.
for name in ["BREADTH", "DEPTH", "DRAUGHT"]:
    train_df[name] = train_df[name] // 10
    test_df[name] = test_df[name] // 10

In [8]:
# Discard every training row that is missing any of the size measurements.
required_cols = ["BREADTH", "DEPTH", "DRAUGHT", "LENGTH"]
train_df = train_df.dropna(axis=0, how="any", subset=required_cols)

In [9]:
# One-hot encode FLAG separately for each frame. The two results may end up
# with different dummy columns, so they are kept under separate names until
# their columns have been reconciled.
flag_cols = ["FLAG"]
train_dummies = pd.get_dummies(train_df, columns=flag_cols)
test_dummies = pd.get_dummies(test_df, columns=flag_cols)

In [10]:
# Give both frames the same set of columns. With an outer join on the column
# axis, any column that exists in only one frame is added to the other one
# and filled with 0.
train_df, test_df = train_dummies.align(
    test_dummies,
    join="outer",
    axis=1,
    fill_value=0,
)

In [11]:
# Sanity check: after alignment both frames must expose the same column names.
assert sorted(train_df.columns) == sorted(test_df.columns)

In [7]:
# Integer-encode the remaining categorical columns.
#
# For each column a LabelEncoder is fitted on the training values (as
# strings). The sentinel class '-1' is added to stand for test values never
# seen in training. The fitted encoders are kept in `encoders`, keyed by
# column name.
#
# Two fixes compared with the previous version of this cell:
#   * The sentinel is inserted into the class list BEFORE either frame is
#     transformed. Previously train was encoded first and '-1' inserted
#     afterwards, which shifted the index of every class sorting after '-1'
#     and gave the same category different integers in train and test.
#   * Test values are cast to str before the "seen in training?" check. The
#     classes are strings, so checking raw non-string values (numbers, NaN)
#     always failed and mapped all of them to the sentinel.
categorical_features = ['ID', 'SHIPMANAGER', 'PORT_SIZE']
encoders = {}
_UNKNOWN_LABEL = '-1'

for feature in categorical_features:
    le = LabelEncoder()
    train_values = train_df[feature].astype(str)
    test_values = test_df[feature].astype(str)

    le.fit(train_values)

    # Keep the class list sorted, and do not add the sentinel twice if the
    # training data happens to contain the literal string '-1'.
    le_classes = le.classes_.tolist()
    if _UNKNOWN_LABEL not in le_classes:
        bisect.insort_left(le_classes, _UNKNOWN_LABEL)
    le.classes_ = np.array(le_classes)

    # Replace test categories that were not seen in training by the sentinel.
    test_values = test_values.where(test_values.isin(le_classes), _UNKNOWN_LABEL)

    # Encode both frames with the same, final class list.
    train_df[feature] = le.transform(train_values)
    test_df[feature] = le.transform(test_values)
    encoders[feature] = le

In [9]:
# Select the training rows in which all four weather columns are present and
# keep them (with every column) as a separate frame.
_weather_cols = ["U_WIND", "V_WIND", "AIR_TEMPERATURE", "BN"]
_has_weather = train_df[_weather_cols].notna().all(axis=1)
not_null_idx = train_df.index[_has_weather]
train_not_null_df = train_df.loc[not_null_idx]

In [10]:
# Remove the four weather columns from the full training frame, in place.
train_df.drop(
    ["U_WIND", "V_WIND", "AIR_TEMPERATURE", "BN"],
    axis=1,
    inplace=True,
)

In [11]:
# Select the test rows in which all four weather columns are present and keep
# them (with every column) as a separate frame.
_weather_cols = ["U_WIND", "V_WIND", "AIR_TEMPERATURE", "BN"]
_has_weather = test_df[_weather_cols].notna().all(axis=1)
not_null_idx = test_df.index[_has_weather]
test_not_null_df = test_df.loc[not_null_idx]

In [12]:
# Remove the four weather columns from the full test frame, in place.
test_df.drop(
    ["U_WIND", "V_WIND", "AIR_TEMPERATURE", "BN"],
    axis=1,
    inplace=True,
)

In [13]:
# Write the four result tables to parquet.
#   *_remove: all rows, weather columns deleted
#   *_remain: only rows with complete weather data, weather columns kept
_outputs = [
    ("../../data/HD_data/train_v1_remove.parquet", train_df),
    ("../../data/HD_data/train_v1_remain.parquet", train_not_null_df),
    ("../../data/HD_data/test_v1_remove.parquet", test_df),
    ("../../data/HD_data/test_v1_remain.parquet", test_not_null_df),
]
for _path, _frame in _outputs:
    _frame.to_parquet(_path)