# Simple EDA

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt

# Read the four competition CSV files (sample submission, labels, train, test),
# in that order, from the Kaggle input directory.
submission, train_labels, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2022/sample_submission.csv',
        '/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv',
        '/kaggle/input/tabular-playground-series-apr-2022/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2022/test.csv',
    )
)

In [None]:
train_labels.shape, train.shape, test.shape

In [None]:
train['sequence'].unique()

In [None]:
# Print the min/max of the identifier columns in each table, plus the label values.
# The first line reports train_labels (not train), so it gets its own label;
# previously both lines were printed as "train sequence" and could not be told apart.
print(f'Number of train_labels sequence: from {train_labels.sequence.min()} to {train_labels.sequence.max()}')
print(f'Number of train sequence: from {train.sequence.min()} to {train.sequence.max()}')
print(f'Number of test sequence: from {test.sequence.min()} to {test.sequence.max()}')
print()

# Subject id ranges of the train and test tables.
print(f'Number of train subjects: from {train.subject.min()} to {train.subject.max()}')
print(f'Number of test subjects: from {test.subject.min()} to {test.subject.max()}')
print()

# Range of the step counter inside the train table.
print(f'Step(seconds): from {train.step.min()} to {train.step.max()}')
print()

# Distinct target values found in the labels.
print(f'States: {train_labels.state.unique()}')

In [None]:
train.head()

In [None]:
# Inspect which values each of the 13 sensors takes.
figure = plt.figure(figsize=(16, 8))
for position in range(1, 14):
    sensor_name = f'sensor_{position - 1:02d}'
    # Panels are laid out on a 4x4 grid; only the first 13 cells are filled.
    axis = figure.add_subplot(4, 4, position)
    # Histogram of the raw readings, split into 100 bins.
    axis.hist(train[sensor_name], bins=100)
    axis.set_title(f'{sensor_name} hist')
# Vertical / horizontal padding between the panels.
figure.tight_layout(h_pad=1.0, w_pad=0.5)
plt.show()

* As you can see, the histograms show that every sensor has outliers.
* 見てわかる通り、各ヒストグラムには外れ値が存在する
* Next: re-plot the histograms with the outliers excluded (外れ値の除外)

In [None]:
# Same histograms with the outliers cut away: only the "inner" part between the
# 2 % and 98 % quantiles of each sensor is plotted.
figure = plt.figure(figsize=(16, 8))
for position in range(1, 14):
    sensor_name = f'sensor_{position - 1:02d}'
    readings = train[sensor_name]
    # hist() spans (min, max) of the data by default, so the range is passed
    # explicitly to trim 2 % on either side.
    lower_bound = readings.quantile(0.02)
    upper_bound = readings.quantile(0.98)
    axis = figure.add_subplot(4, 4, position)
    axis.hist(readings, bins=100, range=(lower_bound, upper_bound))
    axis.set_title(f'{sensor_name} hist')
figure.tight_layout(h_pad=1.0, w_pad=0.5)
plt.show()

In [None]:
# Report how many distinct values each sensor column contains.
print('Count of unique values per sensor:')
for sensor_name in (f'sensor_{idx:02d}' for idx in range(13)):
    distinct_count = np.unique(train[sensor_name]).size
    print(f'{sensor_name}: {distinct_count:6d}')

In [None]:
# Plot all 13 sensors over the steps of a few hand-picked sequences
# (one column per sequence, one row per sensor) to see how the signals move.
# The sequence ids are arbitrary picks.
sequences = [0, 10, 1568, 12345, 25967]
palette = plt.rcParams['axes.prop_cycle'].by_key()['color']
# figure: the whole drawing area, axes: 2-D array of the individual panels.
# squeeze=False keeps `axes` two-dimensional even for a single sequence.
figure, axes = plt.subplots(13, len(sequences), sharex=True,
                            figsize=(16, 16), squeeze=False)
for col, sequence in enumerate(sequences):
    # Filter the rows once per sequence instead of once per (sequence, sensor).
    sequence_rows = train[train.sequence == sequence]
    # One colour per sequence, wrapping around however long the colour cycle is.
    line_color = palette[col % len(palette)]
    for row in range(13):
        sensor_name = f'sensor_{row:02d}'
        # Draw on the axes returned by plt.subplots rather than re-selecting
        # each panel through the pyplot state machine.
        axis = axes[row, col]
        # The x axis follows the number of rows actually found for the sequence.
        axis.plot(range(len(sequence_rows)), sequence_rows[sensor_name],
                  color=line_color)
        if row == 0:
            axis.set_title(f'Sequence {sequence}')
        # Only the left-most column carries the sensor name as y label.
        if col == 0:
            axis.set_ylabel(sensor_name)
figure.tight_layout(w_pad=0.1)
figure.suptitle('Selected Time Series', y=1.02)
plt.show()

* Sensors that look characteristic (特徴的なセンサー)?
    * sensor_02
    * sensor_07
    * sensor_12

# Predict with LightGBM

In [None]:
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold,KFold,train_test_split,GroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
from datetime import datetime

In [None]:
# Two-digit suffixes of the 13 sensor columns ('00' .. '12').
sensor = [f'{idx:02d}' for idx in range(13)]

# Names of the raw per-step sensor columns ('sensor_00' .. 'sensor_12').
columns = [f'sensor_{suffix}' for suffix in sensor]


def feature_engineer(df):
    """Collapse the per-step rows into one row of statistics per (sequence, subject).

    For every sensor column listed in ``columns`` the mean, standard deviation,
    skewness, maximum and minimum are computed within each (sequence, subject)
    group and stored as ``sensor_XX_mean``, ``sensor_XX_std``, ``sensor_XX_skew``,
    ``sensor_XX_max`` and ``sensor_XX_min`` (in that order, sensor by sensor).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``sequence`` and ``subject`` columns plus the sensor columns
        listed in ``columns``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (sequence, subject) group: the first row of the group with
        the raw sensor columns removed (other columns and the original index
        label are kept), followed by the statistic columns.
    """
    keys = ['sequence', 'subject']
    stat_names = ['mean', 'std', 'skew', 'max', 'min']

    # A single grouped aggregation replaces one groupby per sensor and statistic.
    stats = df.groupby(keys)[columns].agg(stat_names)
    # Flatten the (sensor, statistic) column MultiIndex into 'sensor_XX_stat'.
    stats.columns = [f'{column}_{stat}' for column, stat in stats.columns]

    # Keep the first row of every group instead of slicing every 60th row, so the
    # result no longer depends on each group having exactly 60 contiguous rows.
    first_rows = df.drop(columns, axis=1).drop_duplicates(subset=keys)

    # Attach the statistics to the small per-group frame (not the per-step frame).
    return first_rows.join(stats, on=keys)

In [None]:
# Build the aggregated feature tables for the training and the test data.
df_train, df_test = map(feature_engineer, (train, test))

In [None]:
df_train

In [None]:
df_test

In [None]:
# Drop the identifier columns so that only model inputs remain.
# NOTE(review): any other non-sensor column kept by feature_engineer (e.g. 'step')
# is still part of X_train / X_test — confirm that this is intended.
id_columns = ["sequence", "subject"]
X_train = df_train.drop(id_columns, axis=1).reset_index(drop=True)

# Look the labels up by sequence id instead of relying on train_labels being in
# the same row order as df_train. validate= guards against duplicated label rows.
y_train = (
    df_train[["sequence"]]
    .merge(train_labels, on="sequence", how="left", validate="many_to_one")
    .drop(["sequence"], axis=1)
)
assert len(y_train) == len(X_train), "features and labels differ in length"
assert y_train.notna().all().all(), "some training sequences have no label"

X_test = df_test.drop(id_columns, axis=1).reset_index(drop=True)

In [None]:
X_train

In [None]:
y_train

In [None]:
X_test

## Model

In [None]:
# LightGBM training parameters.
params = dict(
    # Use gradient boosted decision trees (GBDT).
    boosting_type='gbdt',
    # binary -> two-class classification.
    objective='binary',
    # Learning rate (library default: 0.1).
    learning_rate=0.001,
    # Maximum tree depth. The default of -1 means "no limit", so it is set explicitly.
    max_depth=8,
    # Fraction of the samples drawn for bagging. The default 1.0 disables bagging;
    # "bagging_freq" must also be positive for bagging to take effect.
    bagging_fraction=0.8,
    # Perform bagging every k iterations; requires "bagging_fraction" below 1.
    bagging_freq=1,
    # Values below 1.0 train each tree on a random subset of the features (default: 1.0).
    feature_fraction=0.8,
    # Evaluation metric -> aim to maximise AUC.
    metric='auc',
)

# 5-fold cross-validation: KFold(number of splits, shuffle the rows before
# splitting, fixed seed for the shuffle). One booster is trained per fold and
# collected in `models`.
# NOTE(review): the folds are split by row only; if the same subject can appear in
# several sequences, a grouped split may give a more honest validation score — confirm.
kf = KFold(n_splits=5, shuffle=True, random_state=70)
models = []
for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train), start=1):
    print(f'--------fold:{fold}--------')
    tr_x, va_x = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]
    data_train = lgb.Dataset(tr_x, tr_y)
    data_val = lgb.Dataset(va_x, va_y)

    # Filled by the record_evaluation callback with the metric history of this fold.
    lgb_results = {}
    model = lgb.train(
        params=params,
        train_set=data_train,
        valid_sets=[data_val, data_train],
        valid_names=['eval', 'train'],
        num_boost_round=1000,
        # Early stopping, logging and metric recording are passed as callbacks:
        # the former keyword arguments (early_stopping_rounds, verbose_eval,
        # evals_result) are deprecated and no longer accepted by newer LightGBM.
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
            lgb.record_evaluation(lgb_results),
        ],
    )
    models.append(model)

## Predict

In [None]:
# Average the predictions of the per-fold models on the test features.
prediction = np.zeros(len(X_test))
for booster in models:
    prediction = prediction + booster.predict(X_test)
prediction = prediction / len(models)

In [None]:
prediction

In [None]:
# Original intent: the submission was expected to hold binary 0/1 values.
# NOTE: the rounded array is only shown as the cell output; it is not assigned,
# so tmp_prediction (an alias of `prediction`, not a copy) still holds the
# averaged probabilities.
# NOTE(review): the models are trained with metric 'auc'; if the competition is
# scored the same way, keeping probabilities is the better choice — confirm.
tmp_prediction = prediction
np.round(tmp_prediction)

In [None]:
# Put the predictions into the 'state' column of the sample submission and save
# it as CSV without the index column.
submission = submission.assign(state=tmp_prediction)
submission.to_csv('submission.csv', index=False)