In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

import xgboost as xgb
from lightgbm import LGBMClassifier

In [None]:
# Competition inputs: per-timestep sensor readings, and the target labels.
train_csv_path = '../input/tabular-playground-series-apr-2022/train.csv'
train_labels_csv_path = '../input/tabular-playground-series-apr-2022/train_labels.csv'

train_data = pd.read_csv(train_csv_path)
train_labels = pd.read_csv(train_labels_csv_path)

In [None]:
# Collapse each sequence's readings into per-sensor summary statistics.
# A single groupby/agg pass replaces 65 separate groupby calls and avoids
# growing the frame one column at a time.
sensor_columns = [f"sensor_{sensor:02d}" for sensor in range(13)]
summary_stats = ['max', 'min', 'mean', 'std', 'median']

train_data_stats = train_data.groupby('sequence')[sensor_columns].agg(summary_stats)

# agg() returns (sensor, statistic) MultiIndex columns, ordered sensor by sensor
# and, within a sensor, in the order of `summary_stats`. Flatten them to the
# '<sensor>_<statistic>' names used by the rest of the notebook.
train_data_stats.columns = [
    f'{sensor_name}_{stat}' for sensor_name, stat in train_data_stats.columns
]

In [None]:
# Feature matrix (one row per sequence) and the target column.
# NOTE(review): X is indexed by 'sequence' while y keeps the labels file's own
# index, and the split below pairs rows by position — this assumes train_labels
# is ordered by sequence with exactly one row per sequence. TODO confirm.
X, y = train_data_stats, train_labels['state']

In [None]:
# Hold out part of the sequences for validation (train_test_split's default
# size). The split is seeded so that Restart & Run All reproduces the same
# partition and therefore the same validation metrics; 69 matches the
# random_state used in the XGBoost parameters.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=69)

In [None]:
# Hyper-parameters for the XGBoost classifier.
params = dict(
    # boosting schedule and tree shape
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.1,
    # row / column subsampling
    subsample=0.95,
    colsample_bytree=0.60,
    # regularisation
    reg_lambda=1.50,
    reg_alpha=6.10,
    gamma=1.40,
    # reproducibility and training setup
    random_state=69,
    objective='binary:logistic',
    tree_method='hist',
)

In [None]:
# Fit the XGBoost classifier, tracking AUC on the hold-out split and stopping
# early once it has not improved for 128 rounds.
# NOTE(review): newer XGBoost releases expect `eval_metric` and
# `early_stopping_rounds` on the estimator constructor rather than in fit() —
# confirm against the installed version before upgrading.
xgb_class = xgb.XGBClassifier(**params)
xgb_class.fit(X_train, y_train, eval_set = [(X_val, y_val)], eval_metric = ['auc'], early_stopping_rounds = 128)

# Evaluate the model trained in THIS cell. The metrics used to be computed with
# `lgb`, which is only created in a later cell: on a fresh kernel that raised
# NameError, and on a stale kernel it reported LightGBM's numbers here.
print('Train set score:', xgb_class.score(X_train, y_train))
print('Test set score:', xgb_class.score(X_val, y_val))

# cross_val_score refits clones of the estimator on folds of the hold-out split.
print('Crossvalidation score:', cross_val_score(xgb_class, X_val, y_val, scoring="roc_auc", cv=5).mean())

# Column 1 of predict_proba is the probability of the positive class.
y_pred_proba = xgb_class.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred_proba)

print('AUC Test score:', auc)

In [None]:
#y_pred_proba = xgb_class.predict_proba(X_val)[::, 1]
#auc = roc_auc_score(y_val, y_pred_proba)
#print(auc)

In [None]:
# Hyper-parameters for the LightGBM classifier (rebinds `params`).
params = dict(
    max_bin=60,
    num_leaves=15,
    max_depth=7,
    objective='binary',
    metric='auc',
)

In [None]:
# Train the LightGBM classifier with the current `params`.
lgb = LGBMClassifier(**params)
lgb.fit(X_train, y_train)

# Estimator `score` on the training split and on the hold-out split.
train_score = lgb.score(X_train, y_train)
val_score = lgb.score(X_val, y_val)
print('Train set score:', train_score)
print('Test set score:', val_score)

# 5-fold ROC AUC, computed by refitting clones on folds of the hold-out split.
cv_auc_per_fold = cross_val_score(lgb, X_val, y_val, scoring="roc_auc", cv=5)
print('CV AUC score:', cv_auc_per_fold.mean())

# ROC AUC of the fitted model, from the positive-class probability (column 1).
y_pred_proba = lgb.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred_proba)

print('AUC validation score:', auc)

In [None]:
# Sensor readings for the sequences to be scored.
test_csv_path = '../input/tabular-playground-series-apr-2022/test.csv'
test_data = pd.read_csv(test_csv_path)

In [None]:
# Build the per-sequence summary features for the test set: same sensors, same
# statistics and same column order as the training features.
# A single groupby/agg pass replaces 65 separate groupby calls and avoids
# growing the frame one column at a time.
sensor_columns = [f"sensor_{sensor:02d}" for sensor in range(13)]
summary_stats = ['max', 'min', 'mean', 'std', 'median']

test_data_stats = test_data.groupby('sequence')[sensor_columns].agg(summary_stats)

# agg() returns (sensor, statistic) MultiIndex columns, ordered sensor by sensor
# and, within a sensor, in the order of `summary_stats`. Flatten them to the
# '<sensor>_<statistic>' names the models were trained on.
test_data_stats.columns = [
    f'{sensor_name}_{stat}' for sensor_name, stat in test_data_stats.columns
]

In [None]:
X_test = test_data_stats

# Class probabilities from the XGBoost model; the submission cell reads column 1.
# Alternative kept for reference (LightGBM `predict` gives class labels, not a
# two-column probability array, so the submission cell would need adapting):
#y_pred = lgb.predict(X_test)
y_pred = xgb_class.predict_proba(X_test)

In [None]:
# One row per test sequence (index taken from the feature frame), with the
# second predict_proba column as the predicted 'state'.
submission = pd.DataFrame({'state': y_pred[:, 1]}, index=X_test.index)

submission

In [None]:
# Write the submission file; the frame's index is written as the first column.
submission_path = 'twelveth_submission.csv'
submission.to_csv(submission_path)