In [1]:
import os
import sys
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import style
style.use('ggplot')
import seaborn as sns
import tqdm
import random

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance, plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.model_selection import RepeatedStratifiedKFold

import warnings
warnings.filterwarnings("ignore")

data_dir = '/mnt/elice/dataset'

In [2]:
# 재현성 확보를 위해 시드값을 고정합니다.
seed = 42
np.random.seed(seed)
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

# 데이터 전처리

In [115]:
# train_df: serial number, timestamp, X1, X2~18, Y
train_df = pd.read_csv(os.path.join(data_dir, "train.csv"), index_col='Serial Number')
# test_x: serial number, timestamp, X1, X2~18
test_x = pd.read_csv(os.path.join(data_dir, "test_x.csv"), index_col='Serial Number')

''' timestamp 열 형식 바꾸기 '''
train_df['TIMESTAMP'] = pd.to_datetime(train_df['TIMESTAMP'])
train_df['TIMESTAMP'] = train_df['TIMESTAMP'].map(lambda t: t.strftime('%Y-%m-%d %H:%M'))
test_x['TIMESTAMP'] = pd.to_datetime(test_x['TIMESTAMP'])
test_x['TIMESTAMP'] = test_x['TIMESTAMP'].map(lambda t: t.strftime('%Y-%m-%d %H:%M'))

''' 컬럼 키 추출 '''
serial_key = train_df.index.name
date_time_key = list(train_df.columns)[0]
feature_keys = list(train_df.columns)[2:-1]
target_key = list(train_df.columns)[-1]

# train_x: serial number, timestamp, X1, X2~18
train_x = train_df.drop(columns='Y')
# train_y: serial_number, Y
train_y = pd.read_csv(os.path.join(data_dir, "train_y.csv"), index_col='Serial Number')

In [32]:
train_x.describe()

Unnamed: 0,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
count,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0,555456.0
mean,35.749554,30.785502,22.94668,18.633593,32.675382,98.44359,75528.356478,41399.023717,4382.493629,839.373504,102.041683,3164.118,0.297592,101.505356,0.096595,36.435725,37.003419
std,97.954977,97.452965,2.880192,2.215788,4.230633,5.573999,42494.833905,32048.903648,22216.038517,483.931525,219.097431,250387.4,8.460394,218.376455,9.047217,3048.619571,2768.514097
min,7.0,6.0,13.0,0.0,24.0,3.0,3882.0,1728.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,20.0,21.0,18.0,30.0,99.0,60216.0,30377.0,1470.0,438.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29.0,25.0,23.0,19.0,32.0,99.0,68739.5,36130.0,1742.0,851.0,4.0,4.0,0.0,4.0,0.0,0.0,2.0
75%,38.0,31.0,25.0,20.0,35.0,99.0,80103.0,44510.0,2158.0,1264.0,92.0,15.0,0.0,91.0,0.0,0.0,6.0
max,8262.0,8258.0,37.0,25.0,63.0,100.0,935156.0,742300.0,453934.0,1683.0,4138.0,25463980.0,592.0,4138.0,1083.0,313826.0,329989.0


x7, x14, x16, x17 제거해볼까?

## 장비 이름을 나타내는 X1 변수를 제거

In [116]:
train_x.drop(columns='X1', inplace=True)
test_x.drop(columns='X1', inplace=True)

## 변수 제거

In [117]:
train_x.drop(columns=['X7'], inplace=True)
test_x.drop(columns=['X7'], inplace=True)
feature_keys = list(train_x.columns)[1:-1]

## `StandardScaler` 를 활용해 표준화 

In [118]:
scaler = StandardScaler()

train_x[feature_keys] = scaler.fit_transform(train_x[feature_keys])
test_x[feature_keys] = scaler.transform(test_x[feature_keys])

## Serial Number를 기준으로 분리

In [119]:
# 데이터를 index가 같은 컬럼 별로 분리해서 리스트에 저장
# group: (그룹 이름, 그룹 데이터프레임) => group[1]: 그룹 데이터프레임
train_x_by_serial = [group[1] for group in train_x.groupby(train_x.index)]
test_x_by_serial = [group[1] for group in test_x.groupby(test_x.index)]

# TIMESTAMP 값을 기준으로 정렬
train_x_by_serial = [group.sort_values('TIMESTAMP') for group in train_x_by_serial]
test_x_by_serial = [group.sort_values('TIMESTAMP') for group in test_x_by_serial]

In [6]:
train_x_by_serial

[                      TIMESTAMP  X2  X3  X4  X5  X6  X7      X8     X9   X10  \
 Serial Number                                                                  
 19             2020-02-09 16:24  22  18  22  16  32  99   91219  43095  2132   
 19             2020-02-10 16:39  22  18  22  16  32  99   91413  43214  2133   
 19             2020-02-11 16:54  22  18  22  16  32  99   91606  43332  2135   
 19             2020-02-12 17:09  22  18  22  16  32  99   91799  43449  2136   
 19             2020-02-13 17:24  22  18  23  16  32  99   91991  43566  2137   
 ...                         ...  ..  ..  ..  ..  ..  ..     ...    ...   ...   
 19             2020-04-14 08:24  22  18  16  15  32  98  104116  51127  2290   
 19             2020-04-15 08:39  22  18  16  15  32  98  104314  51251  2291   
 19             2020-04-16 08:54  22  18  16  15  32  98  104516  51376  2292   
 19             2020-04-17 09:09  22  18  16  15  32  98  104717  51500  2294   
 19             2020-04-18 0

In [120]:
train_x_by_serial[2]

Unnamed: 0_level_0,TIMESTAMP,X2,X3,X4,X5,X6,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
Serial Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
48,2020-02-09 08:01,0.237359,0.176644,1.060111,1.067976,1.258588,0.845695,0.629975,0.059574,-1.734490,-0.465737,-0.012637,-0.035175,-0.464819,-0.010677,-0.011952,0.0
48,2020-02-10 08:16,0.237359,0.176644,1.060111,1.067976,1.258588,0.849884,0.633251,0.059664,-1.684896,-0.410967,-0.012621,-0.035175,-0.409868,-0.010677,-0.011952,2.0
48,2020-02-11 08:31,0.237359,0.176644,1.060111,1.067976,1.258588,0.854143,0.636589,0.059844,-1.635302,-0.315119,-0.012613,-0.035175,-0.313703,-0.010677,-0.011952,3.0
48,2020-02-12 08:46,0.237359,0.176644,1.060111,1.067976,1.258588,0.858803,0.640521,0.060115,-1.583642,-0.219271,-0.012589,-0.035175,-0.217539,-0.010677,-0.011952,6.0
48,2020-02-13 09:01,0.237359,0.176644,1.060111,1.067976,1.258588,0.863227,0.644140,0.060295,-1.534048,-0.127987,-0.012589,-0.035175,-0.125954,-0.010677,-0.011952,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,2020-04-14 00:01,0.237359,0.176644,1.407310,1.067976,1.258588,1.133848,0.867393,0.078570,1.480844,4.029984,-0.012373,-0.035175,4.045745,-0.010677,-0.011952,33.0
48,2020-04-15 00:16,0.237359,0.176644,1.407310,1.067976,1.258588,1.138296,0.870981,0.078840,1.530438,4.198859,-0.012345,-0.035175,4.215177,-0.010677,-0.011952,36.0
48,2020-04-16 00:31,0.237359,0.176644,1.407310,1.067976,1.258588,1.142649,0.874445,0.079110,1.580032,4.317527,-0.012345,-0.035175,4.334238,-0.010677,-0.011952,36.0
48,2020-04-17 00:46,0.237359,0.176644,1.407310,1.067976,1.258588,1.147262,0.878283,0.079335,1.631692,4.490966,-0.012337,-0.035175,4.508249,-0.010677,-0.011952,37.0


In [18]:
train_x_by_serial[2].index.unique()

Int64Index([48], dtype='int64', name='Serial Number')

In [19]:
train_x_by_serial[2].index.unique().item()

48

## train, valid 분할

학습용 데이터셋과 검증용 데이터셋을 생성합니다. 데이터의 Serial Number를 기준으로 8:2 비율로 분할하고, 동시에 학습용 데이터셋과 검증용 데이터셋의 Y 비율이 Serial Number 수 기준으로 비슷하도록 분할합니다.

In [121]:
def train_test_split(Xs, ys, test_ratio=0.2):
    ''' 각 (x, y) 쌍을 label 별로 딕셔너리에 저장 '''
    data_per_label = {}

    for x, y in zip(Xs, ys):
        label = y
        if label not in data_per_label:
            data_per_label[label] = []
        # key: label, value: 해당 label에 해당하는 (x, y) 쌍의 리스트
        data_per_label[label].append((x, y))

    train = []
    test = []

    for label in data_per_label:
        # label에 해당하는 데이터 가져온다
        data = data_per_label[label]
        # 테스트 데이터의 개수
        n_test = int(len(data) * test_ratio)
        test += data[:n_test]
        train += data[n_test:]

    X_train, y_train = zip(*train)
    X_test, y_test = zip(*test)

    return X_train, X_test, y_train, y_test

In [122]:
X_train, X_val, y_train, y_val = train_test_split(train_x_by_serial, train_y['Y'], test_ratio=0.2)

# X data에서 Timestamp를 제거합니다.
X_train = [x.drop(columns='TIMESTAMP') for x in X_train]
X_val = [x.drop(columns='TIMESTAMP') for x in X_val]
X_test = [x.drop(columns='TIMESTAMP') for x in test_x_by_serial]

print("Train Data의 개수 :", len(X_train))
print("Validation Data의 개수 :", len(X_val))
print("Test Data의 개수 :", len(X_test))

Train Data의 개수 : 6618
Validation Data의 개수 : 1654
Test Data의 개수 : 2069


## 머신러닝 모델에 적용하기 위해 학습, 검증, 테스트용 데이터를 각각 하나의 numpy array로 합칩니다.

In [123]:
def align_data(data, series_length):
    # X2~18열에 해당하는 시계열 데이터 값만 추출
    data_features = [x[feature_keys] for x in data]
    len_data = len(data_features)
    length_aligned_X = []
    for x in data_features:
        # 시계열 데이터 길이가 series_length 이상이면 > 뒷부분 잘라냄
        if len(x) >= series_length:
            length_aligned_X.append(x[:series_length])
        # 시계열 데이터 길이가 series_length보다 작으면 > 마지막 행을 반복하여 길이를 맞춤
        else:
            length_aligned_X.append(x.append([x.iloc[-1]] * (series_length - len(x))))
    return np.array(length_aligned_X).reshape(len_data, -1)

In [124]:
series_length = 67

X_train = align_data(X_train, series_length)
X_val = align_data(X_val, series_length)
X_test = align_data(X_test, series_length)

In [25]:
17 * 50

850

In [26]:
len(X_train[0])

850

y data도 numpy array로 변환합니다.

In [125]:
y_train = np.array(y_train)
y_val = np.array(y_val)

In [28]:
y_train

array([1, 1, 1, ..., 0, 0, 0])

# 최종 모델

In [17]:
!pip install xgb
from xgboost import XGBClassifier

Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement xgb (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for xgb[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [126]:
model = XGBClassifier(scale_pos_weight = 1,
                      colsample_bytree = 1, 
                      max_depth = 3, 
                      min_child_weight = 4,
                      subsample = 1,
                      eta = 0.22,
                      random_state = 42)
model.fit(X_train, y_train)

# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val)
# 검증 데이터에 대한 F1 점수 계산
f1_val = f1_score(y_val, y_val_pred, average='macro')
print('Validation F1 score = %.3f' % f1_val)

Validation F1 score = 0.887


# 모델 평가 및 결과 저장


In [127]:
submission = pd.read_csv(os.path.join(data_dir, "test_y.csv"), index_col='Serial Number')

# 테스트 데이터에 대해 예측하고, 결과를 저장합니다.
y_test_pred = model.predict(X_test)
submission["Y"] = y_test_pred
submission.to_csv("submission.csv", index_label='Serial Number')

### 결과 검증

`submission.csv` 파일을 다시 불러와 올바르게 값을 채웠는지 다시 한번 확인합니다.

In [128]:
submission = pd.read_csv("submission.csv", index_col='Serial Number')
submission

Unnamed: 0_level_0,Y
Serial Number,Unnamed: 1_level_1
100122I,1
100368G,0
101403L,0
101426G,0
101505B,0
...,...
997719U,0
998737L,0
999308S,0
999800H,0


In [129]:
# 예측 결과 중 1의 비율을 계산하고, 학습용 데이터의 비율과 비교합니다.
print(submission["Y"].mean())
print(train_y.mean())

0.17738037699371678
Y    0.146518
dtype: float64


### 제출

우측 상단의 제출 버튼을 눌러, `competition.ipynb` 파일과 `submission.csv` 파일을 제출합니다.