In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Step 1. Library and Dataset

## Import Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import *
import seaborn as sns
from collections import Counter
from scipy.stats import kde, kurtosis
from scipy.stats.mstats import gmean


import os

print("Numpy ver.", np.__version__)
print("Pandas ver.", pd.__version__)
print("Matplotlib ver.", matplotlib.__version__)
print("Seaborn ver.", sns.__version__)

print(os.listdir('../input/tabular-playground-series-apr-2022'))

In [None]:
# !pip install pycaret==2.2.3

In [None]:
# from pycaret.utils import version
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint

from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.pipeline import Pipeline
import time, logging, gc
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf

from tensorflow.keras.layers import *
from tensorflow.keras import *
from tensorflow.keras.callbacks import *
from sklearn.model_selection import KFold, GroupKFold
from tensorflow.keras.metrics import AUC


## Load Dataset

In [None]:
# Location of the competition files (read-only Kaggle input directory).
BASE_DIR = '../input/tabular-playground-series-apr-2022/'

# Read the four CSV files; the unpacking order matches the file-name order.
train, train_labels, test, submission = (
    pd.read_csv(BASE_DIR + csv_name)
    for csv_name in ("train.csv", "train_labels.csv", "test.csv", "sample_submission.csv")
)

# Report the (rows, columns) shape of every frame.
for caption, frame in (
    ("Train Data:", train),
    ("Train Label Data:", train_labels),
    ("Test Data:", test),
    ("Sample Data:", submission),
):
    print(caption, frame.shape)

In [None]:
train.info()

In [None]:
train_labels.info()

In [None]:
test.info()

In [None]:
submission.info()

In [None]:
train.head()

In [None]:
train_labels.head()

In [None]:
test.head()

In [None]:
submission.head()

In [None]:
# train = train.set_index(['sequence', 'subject', 'step'])
# test = test.set_index(['sequence', 'subject', 'step'])

# Step 2. EDA

## Overview of data

In [None]:
# Summary statistics of the training frame, rendered as a styled table.
# The colour gradient is restricted to rows df.index[3:] and columns
# df.columns[3:], i.e. it skips the first three statistics rows and the first
# three columns (presumably sequence / subject / step - TODO confirm).
# vmin/vmax clamp the colour scale to [-700, 700]; axis=1 applies it row-wise.
df = train.describe()
display(df.style.format('{:,.2f}')\
        .background_gradient(subset=(df.index[3:], df.columns[3:]),
                             cmap="RdBu", vmin=-700, vmax=700, axis=1))

In [None]:
# Same styled summary as for the training frame, computed on the test frame.
# The gradient again skips the first three statistics rows and the first three
# columns, and uses the same [-700, 700] colour scale so both tables compare.
df2 = test.describe()
display(df2.style.format('{:,.2f}')\
        .background_gradient(subset=(df2.index[3:], df2.columns[3:]),
                             cmap="RdBu", vmin=-700, vmax=700, axis=1))

In [None]:
# Count missing values per column for both frames; the two Series are aligned
# on column name when combined into one DataFrame.
missing = pd.DataFrame({
    'train_miss' : train.isna().sum(),
    'test_miss' : test.isna().sum(),
})
print("Missing Value :")
# Transposed so each original column becomes a table column (one row per frame).
missing.T

- train 데이터와 test 데이터 모두 결측치는 존재하지 않는다.
- 13가지의 sensor 변수는 대부분이 0 주위의 값을 가지지만, 정상 범위를 벗어난 이상치도 존재하는 것으로 보인다.

## Target Distribution

In [None]:
# Attach the per-sequence label to every row of `train`.
# Guarded so that re-running this cell does not merge a second time, which
# would produce duplicate `state_x` / `state_y` columns and break later cells.
if 'state' not in train.columns:
    train = train.merge(train_labels, on='sequence')
train['state'].value_counts()

In [None]:
# Class balance of the target, drawn as a donut chart.
state_counts = train['state'].value_counts()
labels = [bool(state) for state in state_counts.index]  # 0/1 -> False/True wedge labels
lst = state_counts.to_list()

fig, ax = plt.subplots(figsize=(5, 5))
pie = ax.pie(
    lst,
    labels=labels,
    autopct='%.2f%%',
    textprops=dict(color="white", fontsize=15, weight="bold"),
    colors=['#6B8DFF', '#F5B3B8'],
    shadow=True,
    wedgeprops=dict(width=0.75),
    startangle=45,
)
ax.set_title("Target Distribution", size=20)
ax.legend(title="State", title_fontsize=12, loc="best", fontsize=12)

plt.show()

## Sequence, Subject, Step

In [None]:
# Density of the three identifier columns: top row = train, bottom row = test.
# The bottom row previously re-plotted `train` although it is titled
# "Test data"; it now draws from `test`.
fig, axes = plt.subplots(2, 3, figsize=(12, 3*2))

id_columns = (('sequence', 'black'), ('subject', 'red'), ('step', 'green'))
for row_idx, frame in enumerate((train, test)):
    for col_idx, (column, color) in enumerate(id_columns):
        sns.kdeplot(data=frame, x=column, shade=True, bw=10, color=color,
                    ax=axes[row_idx][col_idx])

axes[0][1].set_title("Train data", pad = 10, size=20)
axes[1][1].set_title("Test data", pad = 10, size=20)

plt.tight_layout()
plt.show()

- train 데이터와 test 데이터 모두 sequence, subject, step은 정규분포의 형태를 보인다.

In [None]:
# One panel per identifier column, with a separate density curve per state.
fig, axes = plt.subplots(3, 1, figsize=(12, 5*3))

for panel, column in zip(axes, ('sequence', 'subject', 'step')):
    sns.kdeplot(data=train, x=column, hue='state', ax=panel)

plt.show()

- train 데이터에서 sequence는 state의 값에 따른 분포의 차이가 크지 않지만, <br/>subject는 다소 상이한 분포를 보이며, step은 state의 값과 상관 없이 균일한 분포를 보인다.

In [None]:
# Number of distinct sequences for every (subject, state) pair.
ss = (
    train.groupby(['subject', 'state'])['sequence']
    .nunique()
    .reset_index()
)
fig, ax = plt.subplots(figsize=(12, 7))

# Draw state 1 first and state 0 second so the curves match the legend order [1, 0].
for state_value in (1, 0):
    per_state = ss[ss['state'] == state_value]
    ax.plot(per_state['subject'], per_state['sequence'])

# Hide every spine; the left one is also moved outward by 10 points.
ax.spines['left'].set_position(("outward", 10))
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)

ax.set_ylim(0, 200)
ax.set_title("Unique Sequences per Subject (Train)", pad = 10, size=20)
ax.legend([1, 0], title="State", title_fontsize=15,
          loc="upper right", fontsize=12)
ax.grid(axis="y", linewidth=0.3, color="gray")
plt.show()

- 각 subject에 대한 고유 sequence의 수는 State 값이 1인 경우에 대체로 큰 값을 보이며, State 값이 0인 경우에는 50 이내에서 비교적 고른 분포를 보인다.

In [None]:
# Number of distinct sequences per subject in the test set.
ss = test.groupby(['subject'])['sequence'].nunique().reset_index()
fig, ax = plt.subplots(figsize=(12,7))

# The test set is not split by state here, so there is exactly one curve.
# (The loop copied from the train cell used to draw this same line twice.)
ax.plot(ss['subject'], ss['sequence'])

# Hide every spine; the left one is also moved outward by 10 points.
ax.spines['top'].set_visible(False)
ax.spines['left'].set_position(("outward", 10))
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.set_ylim(0,200)
ax.set_title("Unique Sequences per Subject (Test)", pad = 10, size=20)
ax.grid(axis="y", linewidth=0.3, color="gray")
plt.show()

- 같은 맥락으로, Test 데이터에서도 subject에 대한 고유 sequence의 수가 State 값에 영향을 미칠 것으로 보인다.

## Sensor Distributions

In [None]:
# Every column whose name starts with 'sensor' is treated as a sensor channel.
sensors = [col for col in train if col.startswith('sensor')]

# One row per sensor: train (split by state) on the left, test on the right.
# The grid is sized from `sensors` instead of a hard-coded 13 so that it keeps
# working if the sensor list changes; squeeze=False keeps `axes` 2-D even for
# a single sensor.
fig, axes = plt.subplots(len(sensors), 2, figsize=(12, len(sensors)*4), squeeze=False)

for row, sensor in enumerate(sensors):
    sns.kdeplot(data=train, x=sensor, hue='state', ax=axes[row][0])
    sns.kdeplot(data=test, x=sensor, ax=axes[row][1])

axes[0][0].set_title("Sensor Distribution in Train", pad = 10, size = 20)
axes[0][1].set_title("Sensor Distribution in Test", pad = 10, size = 20)

fig.tight_layout()
plt.show()

- 대부분의 sensor는 0을 기준으로 좌우 대칭의 형태를 띠고 있으나, sensor_02의 경우 왼쪽 꼬리가 긴 분포를 보인다.
- 각 sensor마다 이상치로 인해 전체적인 분포가 잘 드러나지 않기 때문에 적절한 이상치 처리가 필요하다.

In [None]:
def plot_sequence(sequence: int) -> None:
    """Plot every sensor channel of one training sequence.

    Selects the rows of the global ``train`` frame whose ``sequence`` column
    equals ``sequence`` and draws each column listed in the global ``sensors``
    in its own subplot, all sharing the x axis.

    Args:
        sequence: Value of the ``sequence`` column to select.
    """
    sensor_rows = train.loc[train.sequence == sequence, sensors]
    panels = sensor_rows.plot(subplots=True, sharex=True, figsize=(12, 20))
    # The title goes on the first subplot so it appears at the top of the figure.
    panels[0].set_title(f'Sequence {sequence}', size=22)
    plt.show()

plot_sequence(5)

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))

# Compute the correlation matrix once and reuse it for the mask and the plot.
sensor_corr = train[sensors].corr()
# Mask the upper triangle (diagonal included) so each pair is shown only once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(sensor_corr, dtype=bool))
sns.heatmap(sensor_corr, mask = mask,
            cmap = "RdBu", vmin = -1, vmax = 1, annot = True, fmt = '.3f', ax = ax)
ax.set_title('Correlation Heatmap between sensors', pad = 12, size=20)

plt.show()

- 대부분의 변수들 사이에는 상관관계가 약하거나 거의 없는 것으로 나타나지만, <br/>
  (sensor_00, sensor_06, sensor_09), (sensor_03, sensor_07, sensor_11)은 뚜렷한 양의 상관관계를 가진다.
- 따라서 상관계수가 가장 큰 sensor_00과 sensor_03만 선택하고 sensor_06, sensor_07, sensor_09, sensor_11은 제거한다.

## Outlier Distribution
- Sensor 데이터의 이상치를 IQR 방식으로 탐지한다.
    - 하한값 : 1분위수 - IQR * 1.5
    - 상한값 : 3분위수 + IQR * 1.5
    - 하한값보다 작거나 상한값보다 큰 값을 이상치로 간주한다.

In [None]:
def get_outlier(df, col, weight=1.5):
    """Return the index labels of IQR outliers in one column.

    A value is an outlier when it lies strictly below ``Q1 - weight * IQR`` or
    strictly above ``Q3 + weight * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame holding the column.
        col: Name of the column to scan.
        weight: Multiplier applied to the IQR to set the fences.

    Returns:
        List of index labels (not positions) of the outlying rows.
    """
    column = df[col]
    first_q, third_q = np.percentile(column.values, [25, 75])

    margin = (third_q - first_q) * weight
    lower_fence = first_q - margin
    upper_fence = third_q + margin

    is_outlier = (column < lower_fence) | (column > upper_fence)
    return column[is_outlier].index.to_list()

# One list of outlier index labels per sensor, in the same order as `sensors`.
out_sensor = [get_outlier(train, sensor) for sensor in sensors]

In [None]:
# Boolean table: one column per sensor, True where that row is an outlier for it.
out_df = pd.DataFrame(
    {
        sensor: train.index.isin(flagged_labels)
        for sensor, flagged_labels in zip(sensors, out_sensor)
    },
    index=train.index,
)
# Number of sensors flagged as outlying in each row.
out_df['tot_count'] = out_df.sum(axis=1)

In [None]:
fig, ax = plt.subplots(figsize=(12,5))

# How many rows have 0, 1, 2, ... sensors flagged as outliers.
sns.countplot(x="tot_count", data=out_df, ax=ax)
ax.set_title("Distribution of outlier counts", pad = 10, size=20)
ax.set_xlabel("the number of outlier", fontsize=15)
ax.set_ylabel("")
ax.set_ylim(-1000,550000)

# Annotate every bar with its count. Bar heights are floats, so they are cast
# to int to avoid labels such as "123,456.0"; non-finite heights are skipped.
for p in ax.patches:
    height = p.get_height()
    if not np.isfinite(height):
        continue
    ax.text(p.get_x() + p.get_width() / 2., height + 5000, f'{int(height):,}', ha = 'center', size = 10)

plt.show()

- train 데이터의 레코드별 이상치 개수를 분석한 결과,<br/>전체의 약 3분의 1 정도가 이상치가 없고, 반대로 모든 값이 정상 범위를 벗어나는 경우는 7개가 있다.

In [None]:
out_df['state'] = train['state']
fig, ax = plt.subplots(3,5,figsize=(12, 9))

# One panel per sensor: state counts among the rows flagged for that sensor.
# The rows are selected with the boolean column already stored in `out_df`
# instead of `train.iloc[get_outlier(...)]`: get_outlier returns index labels,
# so positional `.iloc` was only correct for a default RangeIndex, and the
# outlier scan was being repeated for every sensor.
for i, sensor in enumerate(sensors):
    panel = ax[i//5][i%5]
    sns.countplot(x="state", data=train.loc[out_df[sensor]], ax=panel)
    panel.set_title(sensor)

# The last two panels compare rows with at least one outlier against rows with none.
sns.countplot(x="state", data=out_df[out_df['tot_count']!=0], ax=ax[2][3])
sns.countplot(x="state", data=out_df[out_df['tot_count']==0], ax=ax[2][4])

ax[2][3].set_title("with outlier")
ax[2][4].set_title("without outlier")

fig.tight_layout()
plt.show()

- 각 센서별로 이상치가 있는 sequence의 state 비율은 대부분 차이가 크지 않지만,<br/> sensor_02의 경우에는 state 값이 0인 sequence가 1인 sequence보다 월등히 많다.
- 해당 sequence가 이상치를 포함하느냐 포함하지 않느냐는 state의 비율에 거의 영향을 주지 않는 것으로 보인다.

# Step 3. Feature Engineering

## Removal of correlated variables

In [None]:
'''train = train.drop(['sensor_06', 'sensor_07', 'sensor_09', 'sensor_11'], axis=1)
test = test.drop(['sensor_06', 'sensor_07', 'sensor_09', 'sensor_11'], axis=1)'''

In [None]:
'''sensors = [col for col in train if col.startswith('sensor')]
fig, ax = plt.subplots(figsize=(10, 7))

mask = np.triu(np.ones_like(train[sensors].corr(), dtype=np.bool))
sns.heatmap(train[sensors].corr(), mask = mask,
            cmap = "RdBu", vmin = -1, vmax = 1, annot = True, fmt = '.3f')
ax.set_title('Correlation Heatmap between sensors', pad = 12, size=20)

plt.show()'''

## Sequence Count for each Subject

In [None]:
def count_sequences(df, steps_per_sequence=60):
    """Count how many sequences each subject has.

    The count is derived from the number of rows per subject divided by the
    number of rows that make up one sequence (truncated to an integer).

    Args:
        df: Frame with ``subject`` and ``sequence`` columns, one row per step.
        steps_per_sequence: Rows per sequence; defaults to 60, the value that
            was previously hard-coded.

    Returns:
        DataFrame with columns ``subject`` and ``num_sequences``, one row per
        subject, sorted by subject.
    """
    # A separate local name is used so the function name is not shadowed.
    rows_per_subject = df.groupby('subject').sequence.size()
    return (
        (rows_per_subject / steps_per_sequence)
        .astype(int)
        .rename('num_sequences')
        .reset_index()
    )

# Add `num_sequences` (sequences per subject) to both frames.
# Any existing `num_sequences` column is dropped first so that re-running this
# cell does not create `num_sequences_x` / `num_sequences_y` duplicates.
train_count_sequences = count_sequences(train)
train = (
    train.drop(columns='num_sequences', errors='ignore')
    .merge(train_count_sequences, on='subject', how='left')
)

test_count_sequences = count_sequences(test)
test = (
    test.drop(columns='num_sequences', errors='ignore')
    .merge(test_count_sequences, on='subject', how='left')
)

## Add statistical variables for each sensor

In [None]:
# Per-sequence features for every sensor, added identically to train and test:
#   <sensor>_step_diff    difference to the previous step within the sequence
#   <sensor>_step_<stat>  statistic over the whole sequence, broadcast to each row
# Columns are created in the same order as before (diff, mean, median, std,
# min, max for each sensor) so the feature layout of both frames is unchanged.
# Skew and kurtosis features were tried previously and remain disabled.
GROUP_KEYS = ['sequence', 'subject']
STEP_STATS = ('mean', 'median', 'std', 'min', 'max')

for frame in (train, test):
    for sensor in sensors:
        per_sequence = frame.groupby(GROUP_KEYS)[sensor]

        # The first step of each sequence has no predecessor, so its diff is
        # NaN; fill it with the median diff of the whole frame. The result is
        # assigned back explicitly: `frame[col].fillna(..., inplace=True)` is
        # chained assignment and does not update the frame under pandas
        # Copy-on-Write.
        step_diff = per_sequence.diff()
        frame[f'{sensor}_step_diff'] = step_diff.fillna(step_diff.median())

        for stat in STEP_STATS:
            frame[f'{sensor}_step_{stat}'] = per_sequence.transform(stat)

## Whether sensor 2 is a constant

In [None]:
'''df = train.groupby(['sequence', 'step'])['sensor_02'].sum()
constant_s02 = df.unstack(-1).std(axis=1)[df.unstack(-1).std(axis=1) == 0]
isin_ = train['sequence'].isin(constant_s02.index.to_list())'''

In [None]:
'''train['Constant_s02'] = np.zeros(len(train))
train['Constant_s02'][isin_] = 1
train['Constant_s02'].value_counts()'''

In [None]:
# train[['Constant_s02', 'state']].value_counts()

# Step 4. PCA

In [None]:
'''y = train['state'].copy()
X = train.drop('state', axis=1).copy()
X_test = test.copy()'''

In [None]:
'''sc = StandardScaler()
X_scaled = sc.fit_transform(X)'''

In [None]:
'''pca = PCA().fit(X)

fig, ax = plt.subplots(figsize=(10,4))
xi = np.arange(1, 1+X.shape[1], step=1)
yi = np.cumsum(pca.explained_variance_ratio_)

ax.plot(xi, yi, marker='o', linestyle='--', color='b')
ax.set_label('Number of Components')
ax.set_ylabel('Cumulative variance (%)')
ax.set_title('Explained variance by each component')
plt.show()

print('Explained variance by each component')
for i, val in enumerate(pca.explained_variance_ratio_):
    print(i, f'{val:.4f}')
    if i == 5:
        break'''

In [None]:
'''pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print(X_pca.shape)'''

In [None]:
'''total_var = pca.explained_variance_ratio_.sum() * 100
print(f'Total Explained Variance : {total_var:.2f} %')'''

# Step 5. Modeling

## Data Split

In [None]:
'''# Existing Data
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42)
print("Train data\t:", X_train.shape, y_train.shape)
print("Validation data\t:", X_val.shape, y_val.shape)'''

In [None]:
'''# PCA Data
X_pca_train, X_pca_val, y_train, y_val = train_test_split(
    X_pca, y, test_size=0.3, random_state=42)
print("Train data\t:", X_pca_train.shape, y_train.shape)
print("Validation data\t:", X_pca_val.shape, y_val.shape)'''

## Pycaret

In [None]:
'''from pycaret.classification import *
setup_clf = setup(data=train, target='state', fold_shuffle=True)'''

In [None]:
'''best_model = compare_models(sort='AUC', fold=5,
                            include=['rf', 'knn', 'lr', 'svm', 'lightgbm'])'''

In [None]:
'''pred = predict_model(best_model, data=test)
pred.head()'''

## Hyperparameter Tuning

### Grid Search

In [None]:
'''# define classifier
classifiers = {
    "KNN" : KNeighborsClassifier(),
    "LogisticRegression" : LogisticRegression(random_state=42),
    "RandomForest" : RandomForestClassifier(random_state=42),
    "LGBM" : LGBMClassifier(random_state=42),
    "SVM" : SVC(random_state=42)
}

# define grid
KNN_grid = {'n_neighbors': [3, 5, 7, 9],
            'p': [1, 2]}

LR_grid = {'penalty': ['l1','l2'],
           'C': [0.25, 0.5, 0.75, 1, 1.25],
           'max_iter': [50, 100, 150]}

RF_grid = {'n_estimators': [50, 100, 150, 200],
        'max_depth': [6, 8, 10, 12]}

LGBM_grid = {'n_estimators': [50, 100, 150, 200],
        'max_depth': [6, 8, 10, 12],
        'learning_rate': [0.05, 0.1, 0.15]}

SVM_grid = [{'svc__C': [0.01, 0.1, 1.0, 10.0],
             'svc__kernel': ['linear']},
            {'svc__C': [0.01, 0.1, 1.0, 10.0],
             'svc__gamma': [0.01, 0.1, 1.0, 10.0],
             'svc__kernel': ['rbf']}]

grid = {
    "KNN" : KNN_grid,
    "LogisticRegression" : LR_grid,
    "RandomForest" : RF_grid,
    "LGBM" : LGBM_grid,
    "SVM" : SVM_grid
}'''

In [None]:
'''i=0
clf_best_params = classifiers.copy()
scores = pd.DataFrame({
                    'Classifer':classifiers.keys(),
                    'Train accuracy' : np.zeros(len(classifiers)),
                    'Validation accuracy': np.zeros(len(classifiers)),
                    'Training time': np.zeros(len(classifiers))
                    })

for key, classifier in classifiers.items():
    start = time.time()
    clf = GridSearchCV(estimator=classifier, param_grid=grid[key], n_jobs=-1, cv=None)

    clf.fit(X_train, y_train)
    scores.iloc[i,1]=clf.score(X_train, y_train)
    scores.iloc[i,2]=clf.score(X_val, y_val)
    clf_best_params[key]=clf.best_params_
    
    stop = time.time()
    scores.iloc[i,3]=np.round((stop - start)/60, 2)
    
    print('Model:', key)
    print('Training time (mins):', scores.iloc[i,3])
    print('')
    i+=1'''

### Random Search

In [None]:
'''# define classifier
classifiers = {
    "KNN" : KNeighborsClassifier(),
    "LogisticRegression" : LogisticRegression(random_state=42),
    "RandomForest" : RandomForestClassifier(random_state=42),
    "LGBM" : LGBMClassifier(random_state=42),
    "SVM" : SVC(random_state=42)
}

# define grid
KNN_grid = {'n_neighbors': randint(3,9),
            'p': randint(1,2)}

LR_grid = {'penalty': ['l1','l2'],
           'C': uniform(0.25, 1.25),
           'max_iter': randint(50,150)}

RF_grid = {'n_estimators': randint(50,200),
        'max_depth': randint(4,12)}

LGBM_grid = {'n_estimators': randint(50,200),
        'max_depth': randint(4,12),
        'learning_rate': uniform(0.05, 0.15)}

SVM_grid = [{'svc__C': uniform(0.1, 10.0),
             'svc__kernel': ['linear']},
            {'svc__C': uniform(0.1, 10.0),
             'svc__gamma': uniform(0.1, 10.0),
             'svc__kernel': ['rbf']}]

grid = {
    "KNN" : KNN_grid,
    "LogisticRegression" : LR_grid,
    "RandomForest" : RF_grid,
    "LGBM" : LGBM_grid,
    "SVM" : SVM_grid
}'''

In [None]:
'''i=0
clf_best_params = classifiers.copy()
scores = pd.DataFrame({
                    'Classifer':classifiers.keys(),
                    'Train accuracy' : np.zeros(len(classifiers)),
                    'Validation accuracy': np.zeros(len(classifiers)),
                    'Training time': np.zeros(len(classifiers))
                    })

for key, classifier in classifiers.items():
    start = time.time()
    clf = RandomizedSearchCV(estimator=classifier, param_distributions=grid[key], n_jobs=-1, n_iter=100)

    clf.fit(X_train, y_train)
    scores.iloc[i,1]=clf.score(X_train, y_train)
    scores.iloc[i,2]=clf.score(X_val, y_val)
    clf_best_params[key]=clf.best_params_
    
    stop = time.time()
    scores.iloc[i,3]=np.round((stop - start)/60, 2)
    
    print('Model:', key)
    print('Training time (mins):', scores.iloc[i,3])
    print('')
    i+=1'''

In [None]:
# clf_best_params

In [None]:
# scores

## Multiple Modeling

In [None]:
'''classifiers = [
    LogisticRegression(),
    LGBMClassifier(random_state=42),
    SVC(random_state=42)
]'''

In [None]:
'''FOLDS = 3
val_preds = []
test_preds = []

for classifier in classifiers:
    print(classifier)
    model = classifier.fit(X_train, y_train)
    val_preds.append(model.predict(X_val))
    test_preds.append(model.predict(X_test))
    
    splitter = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state=42)
    scores = cross_validate(classifier, X_train, y_train, return_train_score = True, cv=splitter)
    print("\t", np.mean(scores['train_score']), np.mean(scores['test_score']), "\n")'''

## Logistic Regression

In [None]:
'''lr = LogisticRegression()

lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)

lr.fit(X_pca_train, y_train)
y_pca_pred = lr.predict(X_pca_val)'''

In [None]:
'''splitter = StratifiedKFold(n_splits = 3, shuffle = True, random_state=42)
scores = cross_validate(lr, X_train, y_train, return_train_score = True, cv=splitter)
scores_pca = cross_validate(lr, X_pca_train, y_train, return_train_score = True, cv=splitter)

print("Exsisting Data :", np.mean(scores['train_score']), np.mean(scores['test_score']))
print("PCA Data :", np.mean(scores_pca['train_score']), np.mean(scores_pca['test_score']))'''

-> PCA 후에 교차검증 정확도가 더 낮아지는 것으로 보아, 해당 데이터의 주성분들은 모델의 성능 향상 효과가 없음.

In [None]:
'''pd.DataFrame(confusion_matrix(y_val, y_pred),
                index = [["actual", "actual"], ["N", "P"]],
                columns = [["pred", "pred"], ["N", "P"]])'''

In [None]:
'''fig, ax = plt.subplots(figsize=(7,7))

fpr, tpr, _ = roc_curve(y_val, y_pred)
ax.plot(fpr, tpr, color='r', lw=2)
ax.plot([0, 1], [0, 1], color="navy", lw=1, linestyle="--")
plt.gca().set_aspect('equal')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel("FPR", size=12)
ax.set_ylabel("TPR", size=12)
ax.set_title("ROC Curve", size=15)

plt.show()

print("AUC Score:", roc_auc_score(y_val, y_pred))'''

## LightGBM

In [None]:
'''lgb = LGBMClassifier(random_state=42)
lgb_pca = LGBMClassifier(random_state=42)

lgb.fit(X_train, y_train)
y_pred = lgb.predict(X_val)

lgb_pca.fit(X_pca_train, y_train)
y_pca_pred = lgb_pca.predict(X_pca_val)'''

In [None]:
'''splitter = StratifiedKFold(n_splits = 3, shuffle = True, random_state=42)
scores = cross_validate(lgb, X_train, y_train, return_train_score = True, cv=splitter)
scores_pca = cross_validate(lgb_pca, X_pca_train, y_train, return_train_score = True, cv=splitter)

print("Exsisting Data :", np.mean(scores['train_score']), np.mean(scores['test_score']))
print("PCA Data :", np.mean(scores_pca['train_score']), np.mean(scores_pca['test_score']))'''

-> PCA 후에 교차검증 정확도가 더 낮아지는 것으로 보아, 해당 데이터의 주성분들은 모델의 성능 향상 효과가 없음.

In [None]:
'''print("Acc. :", accuracy_score(y_val, y_pred))
print("Prec. :", precision_score(y_val, y_pred))
print('Recall :', recall_score(y_val, y_pred))
print('f1. :', f1_score(y_val, y_pred))'''

In [None]:
'''fig, ax = plt.subplots(figsize=(7,7))

fpr, tpr, _ = roc_curve(y_val, y_pred)
ax.plot(fpr, tpr, color='r', lw=2)
ax.plot([0, 1], [0, 1], color="navy", lw=1, linestyle="--")
plt.gca().set_aspect('equal')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel("FPR", size=12)
ax.set_ylabel("TPR", size=12)
ax.set_title("ROC Curve", size=15)

plt.show()

print("AUC Score:", roc_auc_score(y_val, y_pred))'''

# LSTM
- https://www.kaggle.com/code/ryanbarretto/lstm-baseline

In [None]:
'''# Existing Data
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42)
print("Train data\t:", X_train.shape, y_train.shape)
print("Validation data\t:", X_val.shape, y_val.shape)'''

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from IPython.display import display
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

In [None]:
# Turn the row-per-step frames into (n_sequences, N_STEPS, n_features) tensors.
N_STEPS = 60  # rows (time steps) that make up one sequence

groups = train["sequence"]
labels = train_labels["state"]

# Labels are taken from `train_labels` by position, so the sequences in `train`
# must appear in the same order, N_STEPS consecutive rows each.
assert np.array_equal(groups.iloc[::N_STEPS].values, train_labels["sequence"].values), \
    "train rows are not aligned with train_labels"

train = train.drop(["sequence", "subject", "step", 'state'], axis=1).values
test = test.drop(["sequence", "subject", "step"], axis=1).values
assert train.shape[1] == test.shape[1], "train and test have different feature counts"

# The feature count is read from the data instead of the hard-coded 92, so the
# reshape keeps working when features are added or removed upstream.
train = train.reshape(-1, N_STEPS, train.shape[1])
test = test.reshape(-1, N_STEPS, test.shape[1])
assert len(train) == len(labels), "number of sequences does not match number of labels"

In [None]:
def BuildNN(n_steps=60, n_features=92):
    """Build and compile the stacked LSTM + Conv1D binary classifier.

    Architecture: four LSTM layers that each return the full sequence, three
    Conv1D layers with max-pooling in between, global max-pooling, and a dense
    head that ends in a single sigmoid unit.

    Args:
        n_steps: Time steps per input sequence. Defaults to 60 (the previously
            hard-coded value). The unpadded convolutions and poolings need at
            least 22 steps.
        n_features: Features per time step. Defaults to 92 (the previously
            hard-coded value).

    Returns:
        A compiled ``keras`` Sequential model (Adam optimizer, binary
        cross-entropy loss, AUC metric).
    """
    # Optional TPU scope kept disabled:
    # with tpu_strategy.scope():
    model = keras.models.Sequential([
        keras.layers.Input(shape=(n_steps, n_features)),
        keras.layers.LSTM(500, return_sequences=True),
        keras.layers.LSTM(400, return_sequences=True),
        keras.layers.LSTM(300, return_sequences=True),
        keras.layers.LSTM(200, return_sequences=True),
        keras.layers.Conv1D(32, 7),
        keras.layers.MaxPooling1D(),
        keras.layers.Conv1D(64, 3),
        keras.layers.MaxPooling1D(),
        keras.layers.Conv1D(128, 3),
        keras.layers.GlobalMaxPooling1D(),
        keras.layers.Dense(150, activation="relu"),
        keras.layers.Dense(50, activation="swish"),
        keras.layers.Dense(1, activation="sigmoid")
    ])

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=[keras.metrics.AUC()])
    return model

In [None]:
'''tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)'''

In [None]:
N_SPLITS = 5  # number of CV folds; also used to average the fold scores

cv_score = 0
test_preds = []
kf = GroupKFold(n_splits=N_SPLITS)
# NOTE(review): the groups are the unique sequence ids, i.e. one group per
# sample, so this split only guarantees that a sequence is not in both train
# and validation. It does not keep a subject's sequences together - confirm
# whether grouping by subject was intended.
for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(train, labels, groups.unique())):

    print("*"*15, f"Fold {fold_idx+1}", "*"*15)

    X_train, X_valid = train[train_idx], train[valid_idx]
    y_train, y_valid = labels.iloc[train_idx].values, labels.iloc[valid_idx].values

    model = BuildNN()
    # Early stopping restores the weights of the best epoch before predicting.
    model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=100, batch_size=256,
              callbacks=[keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)])

    fold_auc = roc_auc_score(y_valid, model.predict(X_valid).squeeze())
    print(f"Fold {fold_idx+1} AUC: {fold_auc:.5f}")
    cv_score += fold_auc

    # Keep this fold's test predictions for the final averaged ensemble.
    test_preds.append(model.predict(test).squeeze())

# Mean validation AUC over all folds.
print(cv_score/N_SPLITS)

## Permutation Importance

In [None]:
'''result = permutation_importance(lgb, X_val, y_val, n_repeats=10, random_state=42, n_jobs=-1)
sorted_idx = result.importances_mean.argsort()'''

In [None]:
'''importance = pd.DataFrame({"Feature" : X_val.columns[sorted_idx], 
                           "Importance" : result.importances_mean[sorted_idx]})\
                        .sort_values("Importance", ascending=False).reset_index(drop=True)
importance.style.background_gradient(cmap="RdBu", vmin=-0.1, vmax=0.1)'''

In [None]:
'''fig, ax = plt.subplots(figsize=(12,len(importance)*0.3))
sns.barplot(x = "Importance", y = "Feature", data=importance)
ax.set_title("Permutation Importance", pad = 10, size = 20)
# ax.set_xlim(-0.001, 0.13)
ax.set_xlabel("Importance", fontsize=15)
ax.set_ylabel("Feature", fontsize=15)
fig.tight_layout()
plt.show()'''

In [None]:
'''importance'''

# Step 7. Submission

In [None]:
# Ensemble: average the per-fold test predictions. The divisor is the number
# of collected predictions rather than a literal 5, so it stays correct when
# the number of folds changes (and fails loudly if no predictions exist).
submission["state"] = sum(test_preds) / len(test_preds)
submission.to_csv("submission.csv", index=False)
submission

In [None]:
'''lgb_test= lgb.predict(test)
pred = pd.DataFrame({"sequence":test['sequence'],
                     "state":lgb_test.tolist()})
pred['state'].value_counts()'''

In [None]:
'''pred_fin = pred.groupby('sequence').mean().reset_index()
pred_fin['state'].unique()'''

In [None]:
'''submission['state'] = pred_fin['state']
submission'''

In [None]:
'''submission.to_csv('submission.csv', index=False)'''