# Data

## before(ifft) : raw data

In [None]:
import collections
from collections import Counter

import numpy as np  # used below; the original cell relied on `np` leaking in from elsewhere
import pandas as pd

# Load the labelled training set; every column except `leaktype` is treated as a feature.
train = pd.read_csv('./leaktype_train.csv')
X = np.array(train.drop('leaktype', axis=1))
y = np.array(train.leaktype)  # labels are final at this point
print(y[:10])
print(X.shape)

# Flatten all feature values into a single row so the normalization cell can
# standardize them together.
X_1d = X.reshape(1, -1)
print(X_1d.shape)

# Count the samples per label in the training data. The "naive accuracy" is the
# share of the most frequent label, i.e. the score of always predicting it.
label_counts = collections.Counter(train['leaktype'])
print('Counts by label:', dict(label_counts))
print(f'Naive Accuracy: {100*max(label_counts.values())/sum(label_counts.values()):0.2f}%')

### data augmentation : before-aug

In [None]:
import collections
from collections import Counter

import pandas as pd

# Fixed seed so the randomly drawn extra rows are identical on every run.
AUGMENT_SEED = 42

train = pd.read_csv('./leaktype_train.csv', header=0)

# Split the training rows by label. The multipliers are the original author's
# target oversampling factors for each class.
leak_out = train[train.leaktype == 'out']      # 10x
leak_in = train[train.leaktype == 'in']        # 10x
leak_noise = train[train.leaktype == 'noise']  # 3.6x
leak_other = train[train.leaktype == 'other']  # 2.57x

# Fractional part of the oversampling: draw extra rows with replacement.
# (Replaces the unseeded np.random.choice over index labels; `np` was also
# never imported in this cell.)
noise_extra = leak_noise.sample(n=3000, replace=True, random_state=AUGMENT_SEED)
other_extra = leak_other.sample(n=4000, replace=True, random_state=AUGMENT_SEED)

# Integer part of the oversampling: plain duplication. `train` itself already
# contributes one copy of every class, so 9 extra copies give 10x in total.
df_out = pd.concat([leak_out] * 9, ignore_index=True)
df_in = pd.concat([leak_in] * 9, ignore_index=True)
df_noise = pd.concat([leak_noise] * 2, ignore_index=True)
df_other = pd.concat([leak_other] * 1, ignore_index=True)

train_over = pd.concat(
    [train, df_out, df_in, df_noise, df_other, noise_extra, other_extra],
    axis=0,
    ignore_index=True,
)

#train_over.to_csv('./oversampled.csv')

# Check the label counts after oversampling.
label_counts = collections.Counter(train_over['leaktype'])
print('Counts by label:', dict(label_counts))
print(f'Naive Accuracy: {100*max(label_counts.values())/sum(label_counts.values()):0.2f}%')

## generate after(ifft)

In [None]:
import matplotlib.pyplot as plt
import numpy as np  # needed for np.array / np.fft; the original cell never imported it
import pandas as pd

train = pd.read_csv('./leaktype_train.csv')

# Feature matrix: every column except the label.
X_tmp = np.array(train.drop('leaktype', axis=1))

# Inverse FFT of each row (np.fft.ifft operates on the last axis by default).
# The result is complex; only the real component is kept, so the imaginary
# part is discarded here.
X_ifft = np.fft.ifft(X_tmp).real

# Sanity-check the first transformed sample. Plotting after taking the real
# part avoids matplotlib's "discarding the imaginary part" warning.
plt.plot(X_ifft[0])

after = pd.DataFrame(X_ifft)

## normalization

In [None]:
import os

import joblib
from sklearn.preprocessing import StandardScaler

# Width of one sample, taken from the second dimension of the original
# hard-coded reshape. The row count is now inferred, so the cell no longer has
# to be edited when the number of samples changes.
N_FEATURES = 513

# X_1d is a single row holding every feature value, so X_1d.T is a single
# column: the scaler learns one mean/std over all values rather than one per
# feature column.
scaler = StandardScaler()
normalized_X = scaler.fit_transform(X_1d.T).T

# Persist the fitted scaler; the test cell loads it from exactly this path.
os.makedirs('./models', exist_ok=True)
joblib.dump(scaler, './models/standard_scalar.pkl')

norm_X = normalized_X.reshape((-1, N_FEATURES))
# `to3d` is defined outside this cell (presumably tsai's 2D -> 3D helper — TODO confirm).
norm_X = to3d(norm_X)  # X is ready for the training cell

# Train

In [None]:
# Runs after the preprocessed data (`norm_X`, `y`) has been prepared by the
# earlier cells. The tsai/fastai names used here (get_splits, ts_learner, Path,
# ...) are expected to be in scope from an import outside this cell.

X, y = norm_X, y

model_name = 'InceptionTimePlus'
data_type = 'before-aug'
PATH = Path(f'./models/{model_name}_{data_type}.pkl')

# Create the stratified, shuffled train/validation splits (20% validation).
splits = get_splits(y, valid_size=0.2, stratify=True, random_state=42, shuffle=True)

# Prepare the dataloaders.
tfms = [None, TSClassification()]  # TSClassification == Categorize
batch_tfms = TSStandardize()
dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=batch_tfms, bs=[64, 128])
print(f'dls.dataset:\n{dls.dataset}')
dls.show_batch(sharey=True)  # plot a batch of samples
plt.show()

# Build ONE learner and use it for both the learning-rate curve and training.
# The original ran lr_find on a throw-away Learner and then trained a second
# learner whose architecture was left implicit; naming the architecture here
# keeps it in step with `model_name`.
learn = ts_learner(dls, InceptionTimePlus, metrics=accuracy, cbs=ShowGraph())

# Learning-rate curve.
learn.lr_find()

# Train.
learn.fit_one_cycle(10, lr_max=1e-3)

# Save the trained learner.
PATH.parent.mkdir(parents=True, exist_ok=True)
learn.export(PATH)

# Visualize results.
learn.show_results(sharey=True)
learn.show_probas()

interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()
# A bare expression in the middle of a cell is not displayed, so print it.
print(interp.most_confused(min_val=3))
interp.print_classification_report()

# Reload the exported learner and create predictions.
learn_gpu = load_learner(PATH, cpu=False)
# NOTE(review): splits[0] is presumably the training split — confirm that the
# validation split (splits[1]) was not intended here.
probas, _, preds = learn_gpu.get_X_preds(X[splits[0]])
print(preds[-10:])

# Test

In [None]:
import joblib
import numpy as np
import pandas as pd

# `to3d`, `load_learner` and `Path` are expected to be in scope from an import
# outside this cell.

test = pd.read_csv('./leaktype_test.csv', index_col=0, header=0)
X_test = np.array(test)
print(X_test.shape)
n_samples, n_features = X_test.shape  # remembered so the reshape below is not hard-coded

# Flatten to a single row, mirroring how the scaler was fitted. Test-specific
# names are used so the training cells' `X_1d` / `normalized_X` are not overwritten.
X_test_1d = X_test.reshape(1, -1)
print(X_test_1d.shape)

# Normalization with the scaler stored at this path.
scaler = joblib.load('./models/standard_scalar.pkl')
normalized_X_test = scaler.transform(X_test_1d.T).T
X_test = normalized_X_test.reshape((n_samples, n_features))

# to3d
X_test = to3d(X_test)
print(X_test.shape)

# Create predictions. model_name / data_type must match the values used in the
# training cell, because together they form the exported file name.
model_name = 'InceptionTimePlus'
data_type = 'before-aug'

PATH = Path(f'./models/{model_name}_{data_type}.pkl')
learn_gpu = load_learner(PATH, cpu=False)
probas, _, preds = learn_gpu.get_X_preds(X_test)
print(model_name)
print(preds[-10:])
print('-'*20)
preds_df = pd.DataFrame(preds)
print(preds_df.head(3))

# Put the predictions next to the sample ids. `id` is expected to be the name
# of the index column read from the CSV (it becomes a column after reset_index).
t_temp = test.reset_index()
test_preds = (
    pd.concat([t_temp[['id']], preds_df], axis=1, ignore_index=False)
    .rename(columns={0: 'preds_label'})
)
print(test_preds.head(3))

#test_preds.to_csv(f'./{model_name}_{data_type}_preds.csv')