In [1]:
import os
import sys
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import style
style.use('ggplot')
import seaborn as sns
import tqdm
import random

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

!pip install xgboost
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance, plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.model_selection import RepeatedStratifiedKFold

import warnings
# Suppress every warning for the rest of the session. This keeps cell output short,
# but it also hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

# Directory that the CSV files (train.csv, test_x.csv, train_y.csv, test_y.csv) are read from.
data_dir = '/mnt/elice/dataset'

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Pin every source of randomness to one seed so repeated runs are reproducible.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

# About Dataset

- Info :
    - X 변수들은 SSD신뢰성에 영향을 줄 수 있는 SMART Attribute (비식별화된 상태)
    - Y는 특정 기준에 따라 분류된 Pass(0)/Fail(1)
    - 한 Serial Number에 대해 여러 개의 Row가 존재하지만 Y 값은 Serial Number 별로 동일한 값을 가짐
    - X 변수는 18개

|Index|Features|Format|Description|
|----|----|----|:----|
|1|Serial Number|15|Serial Number|
|2|TIMESTAMP|2020.2.9  4:59:00 AM|Date-time reference|
|3|X1|EI83N072710203N8D|Equipment name|
|4|X2~18|27|Features|

In [3]:
# train_df: TIMESTAMP, X1, X2..X18, Y - indexed by 'Serial Number'.
train_df = pd.read_csv(os.path.join(data_dir, "train.csv"), index_col='Serial Number')
# test_x: TIMESTAMP, X1, X2..X18 - indexed by 'Serial Number'.
test_x = pd.read_csv(os.path.join(data_dir, "test_x.csv"), index_col='Serial Number')

# Normalise TIMESTAMP to zero-padded 'YYYY-MM-DD HH:MM' strings. The vectorised
# .dt accessor replaces a per-row Python lambda; the zero-padded format sorts
# lexicographically in chronological order, which the later sort_values('TIMESTAMP') needs.
for _frame in (train_df, test_x):
    _frame['TIMESTAMP'] = pd.to_datetime(_frame['TIMESTAMP']).dt.strftime('%Y-%m-%d %H:%M')

# Column-name bookkeeping, derived from the layout of train_df:
# first column = timestamp, second = X1, last = target, everything between = X2..X18.
serial_key = train_df.index.name
date_time_key = train_df.columns[0]
feature_keys = list(train_df.columns[2:-1])
target_key = train_df.columns[-1]

# train_x: the training table without the target column.
train_x = train_df.drop(columns=target_key)
# train_y: one row per serial number holding its Y label.
train_y = pd.read_csv(os.path.join(data_dir, "train_y.csv"), index_col='Serial Number')

In [4]:
# Show the extracted column-name keys, one per line.
print(serial_key, date_time_key, feature_keys, target_key, sep='\n')

Serial Number
TIMESTAMP
['X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18']
Y


In [5]:
# Shapes of every table loaded so far.
print('----------------DF Shape-----------------\n')
print('shape of train_df:', train_df.shape)
print('shape of train_x:', train_x.shape)
print('shape of train_y:', train_y.shape)
print('shape of test_x:', test_x.shape)

print('\n---------------DF Summary----------------\n')
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
train_df.info()
print(train_df.describe())

# Distinct values of the two non-numeric columns.
print('\n---------------Unique Values-------------\n')
unique_list = {col: train_df[col].unique() for col in ['TIMESTAMP', 'X1']}
print(unique_list)

# Per-column count of missing values.
print('\n---------------Missing Values-------------\n')
print(train_df.isnull().sum())

----------------DF Shape-----------------

shape of train_df: (555456, 20)
shape of train_x: (555456, 19)
shape of train_y: (8272, 1)
shape of test_x: (138880, 19)

---------------DF Summary----------------

<class 'pandas.core.frame.DataFrame'>
Int64Index: 555456 entries, 19 to 113926
Data columns (total 20 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   TIMESTAMP  555456 non-null  object 
 1   X1         555456 non-null  object 
 2   X2         555456 non-null  int64  
 3   X3         555456 non-null  int64  
 4   X4         555456 non-null  int64  
 5   X5         555456 non-null  int64  
 6   X6         555456 non-null  int64  
 7   X7         555456 non-null  int64  
 8   X8         555456 non-null  int64  
 9   X9         555456 non-null  int64  
 10  X10        555456 non-null  int64  
 11  X11        555456 non-null  float64
 12  X12        555456 non-null  float64
 13  X13        555456 non-null  float64
 14  X14        555456 n

In [6]:
# Preview the first rows of the training features (train_df with the Y column dropped).
train_x.head()

Unnamed: 0_level_0,TIMESTAMP,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
Serial Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
19,2020-02-09 16:24,EI83N072710203N8H,22,18,22,16,32,99,91219,43095,2132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,2020-02-10 16:39,EI83N072710203N8H,22,18,22,16,32,99,91413,43214,2133,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,2020-02-11 16:54,EI83N072710203N8H,22,18,22,16,32,99,91606,43332,2135,49.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
19,2020-02-12 17:09,EI83N072710203N8H,22,18,22,16,32,99,91799,43449,2136,73.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0
19,2020-02-13 17:24,EI83N072710203N8H,22,18,23,16,32,99,91991,43566,2137,97.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0


In [7]:
# Preview the first rows of the test features read from test_x.csv.
test_x.head()

Unnamed: 0_level_0,TIMESTAMP,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
Serial Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
100122I,2020-02-09 13:08,EJ86N538510606DC8,76,74,22,12,30,99,62160,27565,1980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100122I,2020-02-10 13:23,EJ86N538510606DC8,76,74,22,12,30,99,62332,27652,1983,24.0,20.0,0.0,0.0,20.0,0.0,0.0,0.0
100122I,2020-02-11 13:38,EJ86N538510606DC8,76,74,22,12,30,99,62501,27737,1985,48.0,30.0,0.0,0.0,30.0,0.0,0.0,0.0
100122I,2020-02-12 13:53,EJ86N538510606DC8,76,74,22,12,30,99,62670,27819,1988,73.0,43.0,0.0,0.0,43.0,0.0,0.0,0.0
100122I,2020-02-13 14:08,EJ86N538510606DC8,76,74,22,12,30,99,62843,27906,1989,97.0,61.0,0.0,0.0,61.0,0.0,0.0,0.0


In [8]:
# Count the distinct serial numbers (index values) in each split.
n_train_serials = len(train_x.index.unique())
n_test_serials = len(test_x.index.unique())
print("Train Data의 Serial Number의 unique 값 :", n_train_serials)
print("Test Data의 Serial Number의 unique 값 :", n_test_serials)

Train Data의 Serial Number의 unique 값 : 8272
Test Data의 Serial Number의 unique 값 : 2069


# 데이터 전처리

## 장비 이름을 나타내는 X1 변수를 제거

In [4]:
# X1 holds the equipment name (a string identifier), so it is removed from the inputs.
# Re-assigning with errors='ignore' instead of dropping in place keeps this cell
# idempotent: running it a second time is a no-op rather than a KeyError.
train_x = train_x.drop(columns='X1', errors='ignore')
test_x = test_x.drop(columns='X1', errors='ignore')

## `MinMaxScaler`를 활용해 정규화 (Min-Max 스케일링)

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training rows only, then apply that same
# mapping to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(train_x[feature_keys])

train_x[feature_keys] = scaler.transform(train_x[feature_keys])
test_x[feature_keys] = scaler.transform(test_x[feature_keys])

## Serial Number를 기준으로 분리

In [6]:
# Split each table into one DataFrame per serial number. groupby yields
# (group key, group frame) pairs; only the frame is kept, with its rows
# ordered by TIMESTAMP.
train_x_by_serial = [
    frame.sort_values('TIMESTAMP') for _, frame in train_x.groupby(train_x.index)
]
test_x_by_serial = [
    frame.sort_values('TIMESTAMP') for _, frame in test_x.groupby(test_x.index)
]

In [7]:
# Inspect the group at list position 2: the rows of one serial number, ordered by TIMESTAMP.
train_x_by_serial[2]

Unnamed: 0_level_0,TIMESTAMP,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
Serial Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
48,2020-02-09 08:01,0.006299,0.00509,0.541667,0.84,0.358974,0.979381,0.115523,0.080831,0.012374,0.000000,0.000000,0.000000e+00,0.0,0.000000,0.0,0.0,0.000000
48,2020-02-10 08:16,0.006299,0.00509,0.541667,0.84,0.358974,0.979381,0.115715,0.080973,0.012379,0.014260,0.002900,1.570847e-07,0.0,0.002900,0.0,0.0,0.000006
48,2020-02-11 08:31,0.006299,0.00509,0.541667,0.84,0.358974,0.979381,0.115909,0.081117,0.012388,0.028520,0.007975,2.356270e-07,0.0,0.007975,0.0,0.0,0.000009
48,2020-02-12 08:46,0.006299,0.00509,0.541667,0.84,0.358974,0.979381,0.116122,0.081287,0.012401,0.043375,0.013050,4.712540e-07,0.0,0.013050,0.0,0.0,0.000018
48,2020-02-13 09:01,0.006299,0.00509,0.541667,0.84,0.358974,0.979381,0.116323,0.081444,0.012410,0.057635,0.017883,4.712540e-07,0.0,0.017883,0.0,0.0,0.000018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,2020-04-14 00:01,0.006299,0.00509,0.583333,0.84,0.358974,0.979381,0.128672,0.091105,0.013304,0.924540,0.238038,2.591897e-06,0.0,0.238038,0.0,0.0,0.000100
48,2020-04-15 00:16,0.006299,0.00509,0.583333,0.84,0.358974,0.979381,0.128875,0.091261,0.013317,0.938800,0.246979,2.866795e-06,0.0,0.246979,0.0,0.0,0.000109
48,2020-04-16 00:31,0.006299,0.00509,0.583333,0.84,0.358974,0.979381,0.129074,0.091410,0.013331,0.953060,0.253262,2.866795e-06,0.0,0.253262,0.0,0.0,0.000109
48,2020-04-17 00:46,0.006299,0.00509,0.583333,0.84,0.358974,0.979381,0.129284,0.091577,0.013342,0.967914,0.262446,2.945337e-06,0.0,0.262446,0.0,0.0,0.000112


In [18]:
# Confirm that this group's index holds a single distinct serial number.
train_x_by_serial[2].index.unique()

Int64Index([48], dtype='int64', name='Serial Number')

In [19]:
# Extract that single serial number as a plain scalar.
train_x_by_serial[2].index.unique().item()

48

In [20]:
# Look up the label row that train_y stores for serial number 54509.
train_y.loc[54509]

Y    0
Name: 54509, dtype: int64

## train, valid 분할

학습용 데이터셋과 검증용 데이터셋을 생성합니다. 데이터의 Serial Number를 기준으로 8:2 비율로 분할하고, 동시에 학습용 데이터셋과 검증용 데이터셋의 Y 비율이 Serial Number 수 기준으로 비슷하도록 분할합니다.

In [8]:
def train_test_split(Xs, ys, test_ratio=0.2):
    """Split paired samples into train and test parts, stratified by label.

    Samples are bucketed by label in their original order. For each label the
    first ``int(count * test_ratio)`` samples go to the test part and the rest
    go to the train part. No shuffling is performed, so the result is
    deterministic.

    NOTE: this definition shadows ``sklearn.model_selection.train_test_split``,
    which is imported at the top of the notebook; after this cell runs, the
    name refers to this function.

    Args:
        Xs: iterable of samples.
        ys: iterable of hashable labels, paired with ``Xs`` by position.
        test_ratio: fraction of each label's samples assigned to the test part.

    Returns:
        ``(X_train, X_test, y_train, y_test)``, each a tuple. A part that
        receives no samples is returned as an empty tuple.
    """
    # label -> list of (x, y) pairs carrying that label, in input order.
    data_per_label = {}
    for x, y in zip(Xs, ys):
        data_per_label.setdefault(y, []).append((x, y))

    train = []
    test = []
    for data in data_per_label.values():
        # Number of samples of this label that go to the test part.
        n_test = int(len(data) * test_ratio)
        test.extend(data[:n_test])
        train.extend(data[n_test:])

    def _unzip(pairs):
        # zip(*[]) yields nothing, so unpacking it into two names would raise;
        # return two empty tuples for an empty part instead.
        return tuple(zip(*pairs)) if pairs else ((), ())

    X_train, y_train = _unzip(train)
    X_test, y_test = _unzip(test)

    return X_train, X_test, y_train, y_test

In [9]:
# Look up each group's label by its serial number instead of relying on train_y.csv
# being in the same row order as train_x_by_serial (groupby orders groups by key).
# .loc raises KeyError if any serial number has no label.
train_serials = [group.index[0] for group in train_x_by_serial]
train_labels = train_y.loc[train_serials, 'Y']
assert len(train_labels) == len(train_x_by_serial), "expected exactly one label per serial number"

X_train, X_val, y_train, y_val = train_test_split(train_x_by_serial, train_labels, test_ratio=0.2)

# TIMESTAMP was only needed for ordering the rows; remove it from the model inputs.
X_train = [x.drop(columns='TIMESTAMP') for x in X_train]
X_val = [x.drop(columns='TIMESTAMP') for x in X_val]
X_test = [x.drop(columns='TIMESTAMP') for x in test_x_by_serial]

print("Train Data의 개수 :", len(X_train))
print("Validation Data의 개수 :", len(X_val))
print("Test Data의 개수 :", len(X_test))

Train Data의 개수 : 6618
Validation Data의 개수 : 1654
Test Data의 개수 : 2069


## 머신러닝 모델에 적용하기 위해 학습, 검증, 테스트용 데이터를 각각 하나의 numpy array로 합칩니다.

In [10]:
def align_data(data, series_length, feature_cols=None):
    """Flatten variable-length per-serial frames into one fixed-width 2-D array.

    Each frame is cut or padded to exactly ``series_length`` rows and then
    flattened row by row (all features of step 0, then step 1, ...).

    Padding is done with NumPy rather than ``DataFrame.append``, which was
    removed in pandas 2.0.

    Args:
        data: sequence of DataFrames, one per serial number.
        series_length: number of rows (time steps) kept per frame.
        feature_cols: columns to extract; defaults to the notebook-level
            ``feature_keys`` when omitted.

    Returns:
        ``np.ndarray`` of shape ``(len(data), series_length * n_features)``.

    Raises:
        IndexError: if a frame has no rows (there is no last row to repeat).
    """
    cols = feature_keys if feature_cols is None else list(feature_cols)

    flattened = []
    for frame in data:
        values = frame[cols].to_numpy()
        if len(values) == 0:
            raise IndexError("cannot align an empty series: no last row to repeat")
        if len(values) >= series_length:
            # Long enough: keep the first series_length rows, drop the tail.
            values = values[:series_length]
        else:
            # Too short: repeat the last row until the length matches.
            padding = np.repeat(values[-1:], series_length - len(values), axis=0)
            values = np.concatenate([values, padding], axis=0)
        flattened.append(values.reshape(-1))

    if not flattened:
        # np.stack rejects an empty list; return a correctly shaped empty array.
        return np.empty((0, series_length * len(cols)))
    return np.stack(flattened)

In [11]:
# Number of time steps kept per serial number.
series_length = 50

# Convert every split from a list of per-serial frames into one flat 2-D array.
X_train, X_val, X_test = (
    align_data(split, series_length) for split in (X_train, X_val, X_test)
)

In [25]:
# Expected flattened width: 17 feature columns (X2..X18) times series_length (50) time steps.
17 * 50

850

In [26]:
# Actual width of one flattened training sample, to compare against the expected value.
len(X_train[0])

850

y data도 numpy array로 변환합니다.

In [12]:
# The split returned the labels as tuples; turn both into NumPy arrays.
y_train, y_val = map(np.array, (y_train, y_val))

In [28]:
# Display the training label array created above.
y_train

array([1, 1, 1, ..., 0, 0, 0])

# 모델 학습

# 최종 모델

In [14]:
# Hyper-parameters of the final model, gathered in one mapping.
xgb_params = {
    'scale_pos_weight': 1,
    'colsample_bytree': 1,
    'max_depth': 3,
    'min_child_weight': 4,
    'subsample': 1,
    'eta': 0.22,
    'random_state': 42,
}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Predict on the validation split and report the macro-averaged F1 score.
y_val_pred = model.predict(X_val)
f1_val = f1_score(y_val, y_val_pred, average='macro')
print('Validation F1 score = %.3f' % f1_val)

Validation F1 score = 0.852


# 모델 평가 및 결과 저장


In [15]:
# Template of the submission file: one row per test serial number.
submission = pd.read_csv(os.path.join(data_dir, "test_y.csv"), index_col='Serial Number')

# Predict on the test data.
y_test_pred = model.predict(X_test)

# X_test rows follow the order of test_x_by_serial (groupby order), which is not
# guaranteed to match the row order of test_y.csv. Key the predictions by serial
# number and look them up in the submission's own order; .loc raises KeyError if a
# serial number in the template has no prediction.
test_serials = [group.index[0] for group in test_x_by_serial]
pred_by_serial = pd.Series(y_test_pred, index=test_serials)
submission["Y"] = pred_by_serial.loc[submission.index].to_numpy()

submission.to_csv("submission.csv", index_label='Serial Number')

### 결과 검증

`submission.csv` 파일을 다시 불러와 올바르게 값을 채웠는지 다시 한번 확인합니다.

In [16]:
# Re-read the file that was just written and display it, to check what was actually saved.
submission = pd.read_csv("submission.csv", index_col='Serial Number')
submission

Unnamed: 0_level_0,Y
Serial Number,Unnamed: 1_level_1
100122I,1
100368G,0
101403L,0
101426G,0
101505B,0
...,...
997719U,0
998737L,1
999308S,0
999800H,0


In [17]:
# Sanity check: share of predicted 1s in the submission versus the share of 1s
# among the training labels.
print(submission["Y"].mean(), train_y.mean(), sep="\n")

0.19913001449975834
Y    0.146518
dtype: float64


### 제출

우측 상단의 제출 버튼을 눌러, `competition.ipynb` 파일과 `submission.csv` 파일을 제출합니다.