# Importing Libraries and Loading datasets

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the competition CSVs: the training rows, their labels, the test rows,
# and the sample submission that is later filled in with predictions.
train = pd.read_csv("../input/tabular-playground-series-apr-2022/train.csv")
train_labels = pd.read_csv("../input/tabular-playground-series-apr-2022/train_labels.csv")
test = pd.read_csv("../input/tabular-playground-series-apr-2022/test.csv")
sub = pd.read_csv("../input/tabular-playground-series-apr-2022/sample_submission.csv")

# Explore Data

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the label frame.
train_labels.head()

In [None]:
# Summary statistics for the columns of the training frame.
train.describe()

In [None]:
# List the column names of the training frame.
print("Columns: \n{0}".format(list(train.columns)))

# Basic Data Check

In [None]:
# Report (rows, columns) for both frames.
print('Train data shape:', train.shape)
print('Test data shape:', test.shape)

## Missing values

In [None]:
# Count missing values per column and report only the columns that have any.
# `isna().sum()` keeps one count per column, so the `> 0` mask below selects
# columns; an empty Series is printed when nothing is missing.
missing_values_train = train.isna().sum()
print('Missing values in train data: {0}'.format(missing_values_train[missing_values_train > 0]))

missing_values_test = test.isna().sum()
print('Missing values in test data: {0}'.format(missing_values_test[missing_values_test > 0]))

## Duplicates

In [None]:
# Count fully duplicated rows (rows identical to an earlier row in every column).
duplicates_train = train.duplicated().sum()
print('Duplicates in train data: {0}'.format(duplicates_train))

duplicates_test = test.duplicated().sum()
print('Duplicates in test data: {0}'.format(duplicates_test))

# Feature Engineering

In [None]:
def aggregate(df, aggregation_cols, prefix):
    """Build per-group summary statistics for every sensor column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sequence`` and ``subject`` columns. Every column from
        position 3 onwards is treated as a sensor and aggregated.
    aggregation_cols : list of str
        Columns to group by (e.g. ``['sequence', 'subject']`` or
        ``['subject']``).
    prefix : str
        Separator placed between the sensor name and the statistic name in
        the output column names; also prepended to ``'size'`` for the group
        size column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``(sequence, subject)`` pair, in order of first
        appearance, holding the group size followed by, for each sensor, its
        mean, max, min, mean absolute deviation, sum and median within the
        group.
    """
    grouped = df.groupby(aggregation_cols)
    group_keys = [df[col] for col in aggregation_cols]
    sizes = grouped.size().reset_index(name=str(prefix) + 'size')

    # Deduplicate the keys up front so each merge below works on one row per
    # (sequence, subject) pair instead of one row per original observation.
    result = df[['sequence', 'subject']].drop_duplicates().reset_index(drop=True)
    for sensor in df.columns.tolist()[3:]:
        stats = grouped[sensor].agg(['mean', 'max', 'min', 'sum', 'median'])
        # The 'mad' aggregation alias was removed in pandas 2.0, so compute
        # the mean absolute deviation around the group mean explicitly.
        deviation = (df[sensor] - grouped[sensor].transform('mean')).abs()
        stats.insert(3, 'mad', deviation.groupby(group_keys).mean())
        stats.columns = [prefix.join((sensor, stat)) for stat in stats.columns]
        stats = pd.merge(sizes, stats, how='left', on=aggregation_cols)
        # No `on=`: joins on every shared column (the keys plus the size
        # column once it is present in `result`).
        result = pd.merge(result, stats, how='left')
    return result

def create_features(df):
    """Combine per-sequence and per-subject aggregates into one feature frame.

    Calls ``aggregate`` twice -- grouped by ``(sequence, subject)`` with the
    ``"_"`` separator and grouped by ``subject`` with ``"_subject_"`` -- and
    left-joins the subject-level result onto the sequence-level one using
    ``sequence`` and ``subject`` as keys.
    """
    merge_keys = ['sequence', 'subject']
    per_sequence = aggregate(df, merge_keys, "_")
    per_subject = aggregate(df, ['subject'], "_subject_")
    return pd.merge(per_sequence, per_subject, how='left', on=merge_keys)

# Build the aggregated feature frames for both splits with the same pipeline.
train_data = create_features(train)
test_data = create_features(test)

# Preview the engineered training features.
train_data.head()

# Reduce memory usage

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the columns of ``df`` in place to narrower dtypes.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are cast to float16 or float32 when their range fits, otherwise left as
    they are. Every other column is converted to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable ints, ...) are not np.dtype
        # instances and take the non-numeric branch.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind in ('i', 'u'):
            candidates, limits = (np.int8, np.int16, np.int32, np.int64), np.iinfo
        elif kind == 'f':
            # NOTE(review): the float16 choice is based on range only; it
            # keeps roughly three significant decimal digits. Confirm that
            # precision is acceptable for downstream consumers.
            candidates, limits = (np.float16, np.float32), np.finfo
        else:
            df[col] = df[col].astype('category')
            continue

        c_min, c_max = df[col].min(), df[col].max()
        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            if bounds.min <= c_min and c_max <= bounds.max:
                # Cast only when it actually shrinks the column, so an
                # already-narrow column is never widened.
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
# Shrink both feature frames in place; the returned frame is not needed here.
reduce_mem_usage(train_data)
reduce_mem_usage(test_data);

# Modelling

In [None]:
# Features are every column of the engineered frame; the target is the
# `state` column of the labels file.
# NOTE(review): this assumes train_data rows are in the same order as
# train_labels rows -- confirm, since no explicit join on `sequence` is done.
X = train_data.copy()
y = train_labels["state"].copy()

# Fit the scaler on the training features only, then apply the same
# transformation to the test features.
sc = StandardScaler()
X = sc.fit_transform(X)
test_X = sc.transform(test_data)

# Number of cross-validation folds.
N_SPLITS = 10

In [None]:
# Credits to https://www.kaggle.com/code/tyrionlannisterlzy/xgboost-dnn-ensemble-lb-0-978/notebook?scriptVersionId=93228361
# Hyperparameters unpacked into the XGBClassifier constructor below.
params = {'n_estimators': 8192,
          'max_depth': 7,
          'learning_rate': 0.1,
          'subsample': 0.96,
          'colsample_bytree': 0.80,
          'reg_lambda': 1.50,
          'reg_alpha': 6.10,
          'gamma': 1.40,
          'random_state': 1,
          'objective': 'binary:logistic'}

In [None]:
# Stratified K-fold cross-validation: fit on each training split, score the
# held-out split by ROC AUC, and collect test-set probabilities per fold.
scores = []
y_probs = []
cv = StratifiedKFold(n_splits=N_SPLITS, random_state=1, shuffle=True)
# The same estimator object is refit on every fold, so after the loop `model`
# holds the fit from the last fold only.
model = XGBClassifier(**params, use_label_encoder=False)
for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    # `test_idx` here is the held-out validation split of the training data.
    train_X, val_X = X[train_idx], X[test_idx]
    train_y, val_y = y.iloc[train_idx], y.iloc[test_idx]

    # Fit model
    # NOTE(review): passing eval_metric / early_stopping_rounds to fit() and
    # use_label_encoder to the constructor matches older xgboost releases;
    # confirm against the installed version before upgrading.
    model.fit(train_X, train_y,
              eval_set=[(val_X, val_y)], eval_metric=['auc', 'logloss'],
              early_stopping_rounds=64, verbose=False)

    # Score with positive-class probabilities: ROC AUC ranks samples by score,
    # so hard 0/1 predictions would discard the ranking information.
    predictions = model.predict_proba(val_X)[:, 1]

    # Get AUC
    auc = roc_auc_score(val_y, predictions)
    print("Fold: %d  \t\t AUC:  %f" %(fold + 1, auc))

    scores.append(auc)
    y_probs.append(model.predict_proba(test_X))
# The collected scores are AUC values, so label the summary accordingly.
print("Mean AUC score:", np.array(scores).mean())

## Submission

In [None]:
# Average the per-fold test probabilities and keep column 1 of the
# predict_proba output as the submitted `state` score.
sub["state"] = np.mean(y_probs, axis=0)[:, 1]
sub.to_csv("submission.csv", index=False)
# Display the submission frame.
sub

# Feature Importances

In [None]:
# Pair each engineered feature name with the importance reported by `model`.
# `model` is refit on every CV fold, so these values come from the last fold's fit.
df = pd.DataFrame({'features': train_data.columns,
                   'importance': model.feature_importances_})

# Horizontal bar chart, most important feature first.
plt.figure(figsize=(16, 24))
sns.barplot(x='importance', y="features", data=df,
            order=df.sort_values('importance', ascending=False).features)
plt.title('Feature Importances')
plt.show()