# Introduction
Thanks for looking.
This notebook walks through a simple LightGBM implementation, aimed at beginners in machine learning.

# Module Load

In [None]:
import numpy as np
import pandas as pd
import warnings
import time
warnings.simplefilter('ignore')
import math
from statistics import mean
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Loading Data
Loading data by using read_csv in pandas.

In [None]:
# Read the three competition CSVs from the shared Kaggle input directory.
INPUT_DIR = '../input/tabular-playground-series-apr-2022'
train_df = pd.read_csv(INPUT_DIR + '/train.csv')
test_df = pd.read_csv(INPUT_DIR + '/test.csv')
train_label_df = pd.read_csv(INPUT_DIR + '/train_labels.csv')

In [None]:
# Confirm the training data: print (rows, columns), then preview the first 70 rows,
# which runs past the 60 steps of the first sequence.
print(train_df.shape)
train_df.iloc[:70]

In [None]:
# Confirm the label table: print (rows, columns), then preview the first 10 rows.
print(train_label_df.shape)
train_label_df.iloc[:10]

In [None]:
# Confirm the test data: print (rows, columns), then preview the first 10 rows.
print(test_df.shape)
test_df.iloc[:10]

## Data Summary
***
**train.csv**... Training set. Consists of about 26,000 sequences, each a 60-second recording from 13 biometric sensors of an experimental participant (a participant may appear in several sequences).  
*  sequence - ID for each sequence
*  subject - ID of the subject who participated in the experiment
*  step - Time step of recording, 1 second interval
*  sensor_00 - sensor_12 - Value of each of the 13 sensors at that time step  

**train_labels.csv**... Class label for each sequence
*  sequence - ID of each sequence
*  state - The state of the subject at each sequence. This is the label we are predicting  

**test.csv**... Data used to predict the state of each of the 12,218 test sequences (733,080 rows divided by 60 steps).
***
There are 60 seconds of data for each sequence, and the state (0 or 1) of that sequence is predicted from the sensor values.

## Exploratory Data Analysis
First, check the summary statistics of each sensor column using `describe`.

In [None]:
# Summary statistics restricted to the sensor columns (sensor_00 through sensor_12).
sensor_block = train_df.loc[:, 'sensor_00':'sensor_12']
sensor_block.describe()

## Preprocessing
Combine data to process test and training data in batches

In [None]:
# Stack train and test rows so feature engineering runs once over both.
# ntrain is the number of raw training rows (time steps); it is used later to split
# the combined data back apart.
ntrain = train_df.shape[0]
# ignore_index gives every row a unique label. Without it the test rows repeat
# train_df's labels, and index-aligned column assignments on the combined frame only
# work as long as both sides happen to carry an identical index.
all_data = pd.concat((train_df, test_df), ignore_index=True)
print(all_data.shape)
all_data.head()

In [None]:
# Print the column dtypes and non-null counts of the combined train+test frame.
all_data.info()

Add features

In [None]:
# Per-time-step feature engineering on the combined train+test frame.
# Only the raw sensor columns ('sensor_' followed by digits) are used as inputs, so
# re-running this cell does not stack new features on top of already-derived columns.
features = [col for col in all_data.columns
            if col.startswith('sensor_') and col[len('sensor_'):].isdigit()]

# Fill any gaps in the raw data once, up front, instead of rescanning the whole frame
# on every loop iteration; the resulting values are the same.
all_data = all_data.fillna(0)

for feature in features:
    seq_values = all_data.groupby('sequence')[feature]
    # Previous / next value inside the same sequence. The first / last step of a
    # sequence has no neighbour and gets 0.
    all_data[feature + '_lag1'] = seq_values.shift(1).fillna(0)
    all_data[feature + '_back_lag1'] = seq_values.shift(-1).fillna(0)
    all_data[feature + '_diff1'] = all_data[feature] - all_data[feature + '_lag1']
    # Trailing-window statistics computed per sequence, so a window never reaches
    # into the previous sequence. reset_index drops the 'sequence' level that the
    # grouped rolling result adds, leaving the original row labels for alignment.
    for window in [3, 6, 12]:
        rolled = seq_values.rolling(window=window, min_periods=1)
        prefix = feature + '_roll_' + str(window)
        all_data[prefix + '_mean'] = rolled.mean().reset_index(level=0, drop=True)
        all_data[prefix + '_std'] = rolled.std().reset_index(level=0, drop=True)
        all_data[prefix + '_sum'] = rolled.sum().reset_index(level=0, drop=True)

# Experimental features: products of selected sensor pairs plus their one-step lag.
all_data['sens_00_06'] = all_data['sensor_00'] * all_data['sensor_06']
all_data['sens_03_07'] = all_data['sensor_03'] * all_data['sensor_07']
all_data['sens_03_11'] = all_data['sensor_03'] * all_data['sensor_11']
for feature in ['sens_00_06', 'sens_03_07', 'sens_03_11']:
    all_data[feature + '_lag1'] = all_data.groupby('sequence')[feature].shift(1)

# Zero the NaNs that remain: rolling std over a single observation and the first-step
# lag of the product features.
all_data = all_data.fillna(0)


all_data.head()

Consolidate information

In [None]:
# Consolidate the per-step frame into one row per sequence.
features = all_data.columns[3:]
print(features)
feature_cols = list(features)

# (column suffix, aggregation) pairs applied at both the sequence and subject level.
summary_stats = [
    ('_mean', lambda grouped: grouped.mean()),
    ('_std', lambda grouped: grouped.std()),
    ('_max', lambda grouped: grouped.max()),
    ('_min', lambda grouped: grouped.min()),
    ('_sum', lambda grouped: grouped.sum()),
    ('_medi', lambda grouped: grouped.median()),
    ('_quantile1', lambda grouped: grouped.quantile(0.25)),
    ('_quantile3', lambda grouped: grouped.quantile(0.75)),
]

by_sequence = all_data.groupby('sequence')
# Subject id of every sequence; needed to attach subject-level statistics to the
# right rows.
# assumes each sequence belongs to exactly one subject -- TODO confirm
sequence_subject = by_sequence['subject'].first()

# The first two columns are the per-sequence means of 'subject' and 'step'; they are
# kept in that position because later cells skip them positionally (columns[2:]).
summary_parts = [by_sequence[['subject', 'step']].mean()]

# Per-sequence statistics, named '<feature>_<stat>'.
sequence_features = by_sequence[feature_cols]
for suffix, aggregate in summary_stats:
    summary_parts.append(aggregate(sequence_features).add_suffix(suffix))

# Per-subject statistics, named 'subject_<feature>_<stat>'. Each table is indexed by
# subject id, so it is looked up through the sequence's own subject and re-labelled
# with the sequence id. Concatenating the subject-indexed table directly would line up
# subject k with sequence k, which are unrelated ids.
subject_features = all_data.groupby('subject')[feature_cols]
for suffix, aggregate in summary_stats:
    subject_table = aggregate(subject_features).add_prefix('subject_').add_suffix(suffix)
    per_sequence = subject_table.reindex(sequence_subject.to_numpy())
    per_sequence.index = sequence_subject.index
    summary_parts.append(per_sequence)

# One concat for all parts (they share the same sequence index), then zero any NaN
# left by the aggregations.
all_data_summ = pd.concat(summary_parts, axis=1).fillna(0)
all_data_summ.head()

In [None]:
# Print the dtype summary of the per-sequence feature table built above.
all_data_summ.info()

Split the combined summary back into train_df and test_df

In [None]:
# Split the per-sequence summary back into training and test rows.
# all_data_summ is indexed by sequence id and training sequences are the ones that
# have a label, so select by id. This avoids assuming that every sequence has exactly
# 60 steps and that training rows come first.
is_train = all_data_summ.index.isin(train_label_df['sequence'])
train_df = all_data_summ[is_train]
test_df = all_data_summ[~is_train]
assert len(train_df) == len(train_label_df), 'labelled sequences missing from the summary table'
print(train_df.shape, test_df.shape)
train_df.head()

# Training
Now that feature engineering is done, create and train the model

## Model Partitioning
Split `train_df` into training data (used to train the model) and test data (used to verify generalization performance of the model)

In [None]:
# Hold out 20% of the labelled sequences to check generalization.
test_size = 0.20
# The first two columns are the per-sequence means of 'subject' and 'step'; skip them.
features = train_df.columns[2:]
# Look the labels up by sequence id (train_df's index) rather than trusting that the
# label file and train_df share the same row order.
labels = train_label_df.set_index('sequence')['state'].reindex(train_df.index)
assert labels.notna().all(), 'some training sequences have no label'
X, y = train_df[features].values, labels.values
# Fixed seed (the same one the StratifiedKFold cell uses) so the hold-out set, and the
# scores reported on it, are reproducible across runs.
# NOTE(review): sequences of one subject can fall on both sides of this split; a
# subject-grouped split may give a more honest estimate -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, stratify=y, random_state=1)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

## Hyperparameter tuning
Furthermore, the training data is split into training data and validation data, and hyperparameter tuning is performed using the validation data.
This process is time consuming, so skip this cell when implementing without tuning.

In [None]:
# Disabled by default: hyperparameter search with Optuna's LightGBM integration.
# NOTE: when uncommented, this cell rebinds X_train / y_train to the 70% sub-split
# created below, so the k-fold cell that follows would train on that smaller set.
# import optuna.integration.lightgbm as lgb   # tuning model
# X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.30, random_state=1998, stratify=y_train)
# # Create dataset for LightGBM
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_valid = lgb.Dataset(X_valid, y_valid)
# # setting parameters
# params = {
#     # binary classification problem
#     'objective': 'binary',
#     # Aim to maximize AUC
#     'metric': 'auc',
#     # Only fatal messages are logged
#     'verbosity': -1,
#     # number of leaves
#     'num_leaves': 50,
#     # learning rate
#     'learning_rate': 0.05,
#     # feature fraction
#     'feature_fraction': 0.9,
#     # bagging fraction
#     'bagging_fraction': 0.408242911006906,
#     # bagging frequency
#     'bagging_freq': 5,
# }

# # Model creation from training data
# gbm = lgb.train(params, lgb_train, valid_sets=lgb_valid,
#                 verbose_eval=50, # Learning result output every 50 iter
#                 num_boost_round=10000, # max iteration
#                 early_stopping_rounds=100
#                )
# best_params = gbm.params
# print(best_params)
# # Predict on the hold-out set with the best iteration
# y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# print(y_pred)

Learning without Tuning.  
Here, k-fold cross validation is performed to create a model with better generalization performance

In [None]:
import lightgbm as lgb   # without tuning
# Fixed LightGBM parameters: binary objective, AUC as the validation metric.
# NOTE(review): the fractional values look like the output of an earlier tuning run --
# confirm their origin.
params = {'objective': 'binary', 
          'metric': 'auc', 
          'verbosity': -1, 
          'num_leaves': 45, 
          'learning_rate': 0.05, 
          'feature_fraction': 1.0, 
          'bagging_fraction': 0.7464532049351305, 
          'bagging_freq': 2, 
          'feature_pre_filter': False, 
          'lambda_l1': 0.004190355150239527, 
          'lambda_l2': 1.5273917248709095e-08, 
          'min_child_samples': 25}

# Stratified 10-fold split of the training part (number of splits and seed are fixed).
kfold = StratifiedKFold(n_splits=10,
                        random_state=1, shuffle=True).split(X_train, y_train)
scores = []   # validation AUC of each fold
models = []   # one fitted booster per fold
for k, (train_idx, valid_idx) in enumerate(kfold):
    X_trainset_lgb = lgb.Dataset(X_train[train_idx], y_train[train_idx])
    X_validset_lgb = lgb.Dataset(X_train[valid_idx], y_train[valid_idx])

    # Early stopping and periodic logging are passed as callbacks: the
    # early_stopping_rounds / verbose_eval keyword arguments of lgb.train were removed
    # in LightGBM 4.0.
    gbm = lgb.train(params, X_trainset_lgb, valid_sets=[X_validset_lgb],
                    num_boost_round=10000,
                    callbacks=[lgb.early_stopping(stopping_rounds=100),
                               lgb.log_evaluation(period=50)])
    # Score the fold's validation rows at the early-stopped iteration.
    pred_t = gbm.predict(X_train[valid_idx], num_iteration=gbm.best_iteration)
    score = roc_auc_score(y_train[valid_idx], pred_t)
    scores.append(score)
    print('Fold: %2d, AUC: %.3f' % (k+1, score))
    models.append(gbm)
# The averaged quantity is the per-fold AUC, so it is labelled as such.
print('\nCV AUC: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Create a function for predict, make predictions with k models, and average the predictions.

In [None]:
def predict(models, X_test):
    """Average the predictions of several fitted models.

    Parameters
    ----------
    models : sequence
        Fitted models. Each must provide ``predict(X, num_iteration=...)`` and a
        ``best_iteration`` attribute, which is passed as ``num_iteration``.
    X_test : array-like
        Samples to score; must support ``len()``.

    Returns
    -------
    numpy.ndarray
        One value per sample: the mean of the individual model predictions.

    Raises
    ------
    ValueError
        If ``models`` is empty. Averaging over zero models would otherwise return
        NaN for every sample.
    """
    if len(models) == 0:
        raise ValueError('predict() needs at least one model')

    # One column per model, one row per sample.
    y_pred = np.zeros((len(X_test), len(models)))

    for fold_, model in enumerate(models):
        y_pred[:, fold_] = model.predict(X_test, num_iteration=model.best_iteration)
    return y_pred.mean(axis=1)
# Ensemble prediction for the hold-out rows.
y_pred = predict(models=models, X_test=X_test)

Evaluate the models on the hold-out data using the ROC curve and AUC

In [None]:
# ROC curve and AUC of the averaged predictions on the hold-out rows.
roc = roc_curve(y_test, y_pred)
fpr, tpr, thresholds = roc
print("roc", roc_auc_score(y_test, y_pred))
fig, ax = plt.subplots()
ax.plot(fpr, tpr, marker='o')
ax.set_xlabel('FPR: False positive rate')
ax.set_ylabel('TPR: True positive rate')
ax.grid()

In [None]:
# ROC curve and AUC on the training rows, for comparison with the hold-out result.
# Each fold model was fitted on part of X_train, so this score is expected to be
# optimistic.
y_pred_train = predict(models, X_train)
roc = roc_curve(y_train, y_pred_train)
print("roc", roc_auc_score(y_train, y_pred_train))
fpr, tpr, thresholds = roc
plt.plot(fpr, tpr, marker='o')
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.grid()

# Submit Prediction
Create data for submission

In [None]:
# Score the competition test sequences with the fold ensemble, then look at the
# distribution of the predicted values.
X_submit = test_df[features].to_numpy()
y_submit = predict(models, X_submit)
print(y_submit.shape)
fig, ax = plt.subplots()
ax.hist(y_submit, bins=30, density=True)
plt.show()

In [None]:
# Load the sample submission, which serves as the template for the output file.
sample_submission_path = '../input/tabular-playground-series-apr-2022/sample_submission.csv'
submission_df = pd.read_csv(sample_submission_path)
print(submission_df.shape)
submission_df.head()

In [None]:
# Assign the prediction array itself rather than a DataFrame wrapper: a wrapped value
# is aligned on the index, so a length mismatch would silently leave NaN rows.
# assumes the template rows are in the same sequence order as test_df -- TODO confirm
assert len(y_submit) == len(submission_df), 'prediction count does not match the submission template'
submission_df['state'] = y_submit
submission_df.head()

In [None]:
# Write the submission file with a header row and without the DataFrame index.
submission_df.to_csv("submission.csv", header=True, index=False)