In [9]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import io
import zipfile
from io import BytesIO
from zipfile import ZipFile
from sklearn.preprocessing import LabelEncoder
import bisect

# 데이터 로드
# 특정 파일만 압축풀기
with ZipFile('../data/open.zip','r') as zipObj:
    list_filenames = zipObj.namelist()
    for filename in list_filenames:
        if filename.endswith('.csv'):
            zipRead = zipObj.read(filename)
            globals()[f'{filename[:-4]}'] = pd.read_csv(BytesIO(zipRead))
# train.csv >> train , test.csv >> test

# 날짜 데이터 전처리
# datetime 컬럼 처리
train['ATA'] = pd.to_datetime(train['ATA'])
test['ATA'] = pd.to_datetime(test['ATA'])

# datetime을 여러 파생 변수로 변환
for df in [train, test]:
    df['year'] = df['ATA'].dt.year
    df['month'] = df['ATA'].dt.month
    df['day'] = df['ATA'].dt.day
    df['hour'] = df['ATA'].dt.hour
    # ARI_CO와 ARI_PO 컬럼 병합
    df['ARI'] = df['ARI_CO']+df['ARI_PO']

# datetime 컬럼 제거
train.drop(columns='ATA', inplace=True)
test.drop(columns='ATA', inplace=True)

# 불필요 컬럼 제거 = 'ATA_LT','ID','SHIPMANAGER','FLAG'
train.drop(columns=['SAMPLE_ID','ATA_LT','ID','SHIPMANAGER','FLAG'], inplace=True)
test.drop(columns=['SAMPLE_ID','ATA_LT','ID','SHIPMANAGER','FLAG'], inplace=True)
# ARI_CO와 ARI_PO 컬럼 제거
train.drop(columns=['ARI_CO','ARI_PO'],axis=1,inplace=True)
test.drop(columns=['ARI_CO','ARI_PO'],axis=1,inplace=True)

# 같은 종류의 SHIP_TYPE으로 결측치 처리
Cargo_ships=train[train['SHIP_TYPE_CATEGORY']==train[train['BREADTH'].isnull()]['SHIP_TYPE_CATEGORY'].iloc[0]]
# 결측치 채우기
idx=train[train['BREADTH'].isnull()].index[0]
train.loc[idx,['BREADTH','DEPTH','DRAUGHT','LENGTH']] = Cargo_ships[['BREADTH','DEPTH','DRAUGHT','LENGTH']].mean()

# ARI와 SHIP_TYPE_CATEGORY 인코딩
categorical_features = ['ARI','SHIP_TYPE_CATEGORY']
encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature].astype(str))
    le_classes_set = set(le.classes_)
    test[feature] = test[feature].map(lambda s: '-1' if s not in le_classes_set else s)
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, '-1')
    le.classes_ = np.array(le_classes)
    test[feature] = le.transform(test[feature].astype(str))
    encoders[feature] = le



In [10]:
train.info()
# SHIP_TYPE과 ARI만 인코딩하면 됨.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391939 entries, 0 to 391938
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   SHIP_TYPE_CATEGORY  391939 non-null  int32  
 1   DIST                391939 non-null  float64
 2   BREADTH             391939 non-null  float64
 3   BUILT               391939 non-null  int64  
 4   DEADWEIGHT          391939 non-null  int64  
 5   DEPTH               391939 non-null  float64
 6   DRAUGHT             391939 non-null  float64
 7   GT                  391939 non-null  int64  
 8   LENGTH              391939 non-null  float64
 9   U_WIND              228251 non-null  float64
 10  V_WIND              228251 non-null  float64
 11  AIR_TEMPERATURE     227309 non-null  float64
 12  BN                  228251 non-null  float64
 13  PORT_SIZE           391939 non-null  float64
 14  CI_HOUR             391939 non-null  float64
 15  year                391939 non-nul

In [13]:
len(train)

391939

In [5]:
train.isnull().sum()

SAMPLE_ID                  0
ARI_CO                     0
ARI_PO                     0
SHIP_TYPE_CATEGORY         0
DIST                       0
ID                         0
BREADTH                    0
BUILT                      0
DEADWEIGHT                 0
DEPTH                      0
DRAUGHT                    0
GT                         0
LENGTH                     0
SHIPMANAGER                0
FLAG                       0
U_WIND                163688
V_WIND                163688
AIR_TEMPERATURE       164630
BN                    163688
PORT_SIZE                  0
CI_HOUR                    0
year                       0
month                      0
day                        0
hour                       0
dtype: int64

In [11]:
wind_bn = train.drop(columns='AIR_TEMPERATURE',axis=1)
air = train.drop(columns=['U_WIND','V_WIND','BN'],axis=1)

In [12]:
wind_bn_train = wind_bn[wind_bn['U_WIND'].notnull()]
len(train)-len(wind_bn_train)

163688

In [13]:
air_train = air[air['AIR_TEMPERATURE'].notnull()]
len(train)-len(air_train)

164630

In [14]:
# fill nan by KNN prediction
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [15]:
knn = KNeighborsClassifier(n_neighbors=5,n_jobs=-1)

In [16]:
# wind prediction
x_wind_bn_train = wind_bn_train.drop(columns=['U_WIND','V_WIND','BN'],axis=1)
y_wind_bn_train = wind_bn_train[['U_WIND','V_WIND','BN']]
x_wind_bn_train.shape,y_wind_bn_train.shape

((228251, 16), (228251, 3))

In [17]:
knn.fit(x_wind_bn_train,y_wind_bn_train)

ValueError: Unknown label type: continuous-multioutput. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [23]:
# air_temperature prediction
x_air_train = air_train.drop(columns=['AIR_TEMPERATURE'],axis=1)
y_air_train = air_train['AIR_TEMPERATURE']
x_air_train.shape,y_air_train.shape

((227309, 21), (227309,))