In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold,KFold,train_test_split,GroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
from datetime import datetime

In [None]:
# Read the four competition CSVs; the tuple order below matches the names being assigned.
train, train_label, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2022/train.csv',
        '../input/tabular-playground-series-apr-2022/train_labels.csv',
        '../input/tabular-playground-series-apr-2022/test.csv',
        '../input/tabular-playground-series-apr-2022/sample_submission.csv',
    )
)

In [None]:
# Preview the raw training frame read from train.csv.
train

In [None]:
# Preview the label frame read from train_labels.csv.
train_label

In [None]:
# Preview the raw test frame read from test.csv.
test

# Feature engineering

In [None]:
# Two-digit suffixes of the raw sensor columns (sensor_00 ... sensor_12).
sensor = [f"{i:02d}" for i in range(13)]

# Raw per-step columns that are removed once the per-sequence aggregates exist.
drop_columes = [f"sensor_{i}" for i in sensor] + ["step"]


def feature_engineer(df):
    """Collapse per-step sensor readings into one feature row per sequence.

    For every ``(sequence, subject)`` pair and every sensor listed in the
    module-level ``sensor`` list, the mean, standard deviation, skewness,
    maximum and minimum of that sensor are computed and stored in columns
    named ``sensor_XX_mean``, ``sensor_XX_std``, ``sensor_XX_skew``,
    ``sensor_XX_max`` and ``sensor_XX_min`` (in that order, sensor by sensor).

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame containing ``sequence``, ``subject`` and every
        column named in the module-level ``drop_columes`` list.

    Returns
    -------
    pandas.DataFrame
        One row per ``(sequence, subject)`` pair, in order of first
        appearance in ``df`` and carrying the index label of that first
        row. The columns in ``drop_columes`` are removed; the aggregate
        columns are appended after the remaining original columns.
        ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If a grouping key, a sensor column or a column in ``drop_columes``
        is missing from ``df``.

    Notes
    -----
    Rows are selected by key, not by position, so the result does not
    depend on every sequence having the same number of steps or on the
    steps of a sequence being stored contiguously.
    """
    group_keys = ['sequence', 'subject']
    sensor_columns = [f"sensor_{i}" for i in sensor]

    # Single grouped pass over the raw frame; sort=False keeps the groups in
    # order of first appearance instead of sorting them by key.
    stats = df.groupby(group_keys, sort=False)[sensor_columns].agg(
        ['mean', 'std', 'skew', 'max', 'min']
    )
    # Flatten the (column, statistic) MultiIndex into "sensor_XX_stat" names.
    stats.columns = [f"{column}_{stat}" for column, stat in stats.columns]

    # Keep the first row of each (sequence, subject) pair as the carrier row
    # for the aggregates; its original index label is preserved.
    first_rows = df.drop(drop_columes, axis=1).drop_duplicates(subset=group_keys)

    # Left join on the key columns against the grouped index: row order and
    # index of `first_rows` are kept, aggregate columns are appended.
    return first_rows.join(stats, on=group_keys)

In [None]:
# Build the per-sequence feature tables for the training and test frames.
df_train, df_test = map(feature_engineer, (train, test))

In [None]:
# Inspect the engineered training features returned by feature_engineer.
df_train

In [None]:
# Inspect the engineered test features returned by feature_engineer.
df_test

In [None]:
# Model inputs: the engineered features without the identifier columns,
# re-indexed 0..n-1 so positional fold indices line up with the labels.
X_train = df_train.drop(["sequence", "subject"], axis=1).reset_index(drop=True)
X_test  = df_test.drop(["sequence", "subject"], axis=1).reset_index(drop=True)

# Look the labels up by sequence id instead of trusting that train_label is
# stored in the same row order as the feature table. The left merge keeps one
# row per feature row, in feature order, with a fresh 0..n-1 index.
y_train = (
    df_train[["sequence"]]
    .merge(train_label, on="sequence", how="left", validate="many_to_one")
    .drop(["sequence"], axis=1)
)

# Fail loudly if features and labels cannot be paired one-to-one.
assert len(y_train) == len(X_train), "feature/label row counts differ"
assert not y_train.isna().any().any(), "some sequences have no label in train_label"

In [None]:
# Inspect the training feature matrix (identifier columns removed).
X_train

In [None]:
# Inspect the training targets (train_label without its "sequence" column).
y_train

In [None]:
# Inspect the test feature matrix (identifier columns removed).
X_test

# Model

In [None]:
# LightGBM hyper-parameters shared by every fold.
paras = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'max_depth': 8,
    # NOTE(review): per the LightGBM parameter docs, bagging_fraction only
    # takes effect when bagging_freq is non-zero; none is set here, so confirm
    # whether row bagging was actually intended.
    'bagging_fraction': 0.8,
    'feature_fraction': 0.8,
    'metric': 'auc'
}

# NOTE(review): the split below ignores `subject`. If one subject can
# contribute several sequences, GroupKFold on subject may give a more honest
# validation score -- confirm against the data.
kf = KFold(n_splits = 5, shuffle = True, random_state = 70)

# One trained booster per fold; averaged at prediction time.
models = []
for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train), start=1):
    print(f'--------fold:{fold}--------')
    tr_x, va_x = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]
    data_train = lgb.Dataset(tr_x, tr_y)
    data_val = lgb.Dataset(va_x, va_y)

    # Metric history of the current fold (overwritten on every iteration).
    lgb_results = {}
    model = lgb.train(
        params = paras,
        train_set = data_train,
        valid_sets = [data_val, data_train],
        valid_names = ['eval', 'train'],
        num_boost_round = 1000,
        # Early stopping, history recording and periodic logging are passed
        # as callbacks: the early_stopping_rounds / evals_result /
        # verbose_eval keyword arguments are no longer accepted by
        # lgb.train in LightGBM 4.x.
        callbacks = [
            lgb.early_stopping(stopping_rounds=50),
            lgb.record_evaluation(lgb_results),
            lgb.log_evaluation(period=100),
        ],
    )
    models.append(model)

# Prediction

In [None]:
# Average the per-model outputs on the test features into one vector.
prediction = np.zeros(len(X_test))
for booster in models:
    # Accumulate in place, one model at a time.
    np.add(prediction, booster.predict(X_test), out=prediction)
prediction = prediction / len(models)

In [None]:
# Inspect the prediction array averaged over the trained models.
prediction

In [None]:
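# Optional hard-label conversion, left disabled so that the raw averaged
# predictions are what gets written to the submission file:
# prediction = np.where(prediction > 0.5, 1, 0)
# prediction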

In [None]:
# Load the sample submission, overwrite its `state` column with the averaged
# predictions, and write the result next to the notebook.
submission = (
    pd.read_csv('../input/tabular-playground-series-apr-2022/sample_submission.csv')
    .assign(state=prediction)
)
submission.to_csv('submission.csv', index=False)