In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test tables from the read-only Kaggle input directory.
DATA_DIR = '/kaggle/input/tabular-playground-series-may-2022'
train_df, test_df = (pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test'))

# Brief investigation of the dataset

There are no NaN values anywhere in the data

Only f_27 is special: its dtype is object (str)

Each sequence is 10 letters long

In [None]:
# Non-null count and dtype of every training column, shown side by side as a tuple.
train_df.count(), train_df.dtypes

In [None]:
# Number of training rows for each distinct target value.
train_df.target.value_counts()

In [None]:
# Names of the feature columns f_00 ... f_30 (index zero-padded to two digits).
fs = ['f_%02d' % idx for idx in range(31)]
# Target plus every listed feature except the string column f_27.
num_df = train_df[['target', *fs]].drop(columns=['f_27'])

In [None]:
# One 100-bin histogram per column of num_df, filling a 6x6 grid in row-major order.
fig, axs = plt.subplots(6, 6, figsize=(24, 20))
flat_axes = axs.ravel()
for pos, col in enumerate(num_df.columns):
    num_df[col].plot(kind='hist', bins=100, ax=flat_axes[pos], title=f'{col}')
fig.tight_layout()

In [None]:
# Distinct values taken by f_18, f_29 and f_30, shown as a tuple of arrays.
tuple(num_df[col].unique() for col in ('f_18', 'f_29', 'f_30'))

In [None]:
# Pairwise correlation matrix of the target and the numeric features.
num_df.corr()

# Looking into the whole sequence

Kind of obvious that certain sequences are more likely to have target 0/1

Although the number of duplicated sequences in the data is not that high, they may still be useful for very fine optimization of accuracy.

The full sequence information may be useful through some means of feature engineering

In [None]:
# Preview the raw f_27 strings, then report what share of them is distinct.
display(train_df['f_27'])
# The labels say "%", so scale the unique/total ratio by 100 instead of printing a bare fraction.
# nunique(dropna=False) counts the same values as len(unique()).
print('% of unique sequence in train', 100 * train_df['f_27'].nunique(dropna=False) / len(train_df['f_27']))
print('% of unique sequence in test', 100 * test_df['f_27'].nunique(dropna=False) / len(test_df['f_27']))

In [None]:
# Occurrence count of every f_27 sequence under each target value.
# After the transpose, rows are sequences and columns are the target values; missing pairs count as 0.
f27_target_seq_counts = (
    train_df[['target', 'f_27']]
    .groupby(['target', 'f_27'])
    .size()
    .unstack(fill_value=0)
    .T
)

# The same table ranked by descending count within target 0 and within target 1.
most_occur_in_0, most_occur_in_1 = (
    f27_target_seq_counts.sort_values(by=[tgt], ascending=False) for tgt in (0, 1)
)

# One area chart per ranking, limited to its first 5000 sequences.
fig, axs = plt.subplots(2, figsize=(20,10))
for tgt, ranked, ax in zip((0, 1), (most_occur_in_0, most_occur_in_1), axs):
    ranked.head(5000).plot(kind='area', stacked=False, ax=ax, title=f'Top 5000 occuring duplicated sequences for target {tgt}')
fig.tight_layout()

In [None]:
# Lookup table with one row per sequence and its occurrence count under target 0 and target 1,
# in the order of the target-0 ranking.
f27_map = most_occur_in_0.reset_index().set_axis(['f_27', 'f_27_tar0', 'f_27_tar1'], axis=1)
f27_map

If we look into the test set, only very few rows have the same duplicated sequences as the train set

In [None]:
# Flag each test-set sequence that also occurs in the train set.
# NOTE(review): f27_target_seq_counts.index holds every train sequence, including those seen
# only once - confirm whether "duplicated" should restrict it to repeated sequences.
seen_in_train = test_df['f_27'].isin(f27_target_seq_counts.index)

# The labels below promise the "top 100" sequences per target, so cut each ranked table to its
# first 100 rows. Matching against the full index would count every train sequence and make
# both lines print the same number.
top100_target0 = most_occur_in_0.head(100).index
top100_target1 = most_occur_in_1.head(100).index

print('Among all sequences in test set')
# Scaled by 100 so the value matches the "%" in the label.
print('% of same duplicated sequence as in train set:', 100 * seen_in_train.sum() / len(test_df))
print('# of same top 100 duplicated sequence as in train set, for target 0:', test_df['f_27'].isin(top100_target0).sum())
print('# of same top 100 duplicated sequence as in train set, for target 1:', test_df['f_27'].isin(top100_target1).sum())

# Examining the sequence characters by position

In [None]:
# Splitting the sequences into single letters.
# The regex matches the empty string between characters, so the split is expected to yield one
# column per letter plus an empty column at each end; iloc[:, 1:-1] drops the two outer columns
# and keeps the original integer column labels.
# A raw string is used because "\s" is not a valid escape in a normal string literal, and
# .copy() makes each result an independent frame that later cells can add columns to safely.
f27_split = train_df['f_27'].str.split(pat=r"\s*", expand=True).iloc[:, 1:-1].copy()
f27_split_test = test_df['f_27'].str.split(pat=r"\s*", expand=True).iloc[:, 1:-1].copy()

Distribution of letters in different positions of sequences

In [None]:
# Letter frequencies at each of the first 10 columns of the split train sequences,
# one bar chart per position on a 5x2 grid (row-major).
fig, axs = plt.subplots(5, 2, figsize=(10, 20))
for pos, ax in enumerate(axs.ravel()):
    letter_counts = f27_split.iloc[:, pos].value_counts()
    letter_counts.plot(kind='bar', ax=ax, title=f'sequence position {pos}')
fig.tight_layout()

In [None]:
# Same per-position letter frequency charts as for the train set, here for the split test sequences.
fig, axs = plt.subplots(5, 2, figsize=(10, 20))
for pos, ax in enumerate(axs.flat):
    letter_counts = f27_split_test.iloc[:, pos].value_counts()
    letter_counts.plot(kind='bar', ax=ax, title=f'sequence position {pos}')
fig.tight_layout()

In [None]:
# Attach the label to the per-letter frame so letters can be grouped by target below.
f27_split['target'] = train_df.target

We now check if the letter in certain position will have biases towards target 0/1

For example,

In position 0, A may have slight bias towards 1 and B may have slight bias towards 0

In position 7, a very interesting inverse pattern of 0/1 occurs, whereas the distribution of letters above only shows a uniform distribution

In [None]:
# For every sequence position, count each letter separately per target value and draw the
# counts as grouped bars (one subplot per position).
fig, axs = plt.subplots(10, figsize=(10,30))
for i, ax in enumerate(axs):
    letter_col = i + 1  # position i is stored under column label i+1 in f27_split
    target_seqpos_counts = (
        f27_split[['target', letter_col]]
        .groupby(['target', letter_col])
        .size()
        .unstack(fill_value=0)
        .T
    )
    target_seqpos_counts.plot(kind='bar', ax=ax, title=f'sequence position {i}')
fig.tight_layout()

We first try to use the position and letter information for prediction only

# Feature engineering

In [None]:
# Alternative kept for reference: ordinal-encode the letters instead of keeping them as strings.
# from sklearn.preprocessing import OrdinalEncoder
# enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
# f27_encoded = enc.fit_transform(f27_split.drop(['target'], axis=1)).astype(int)
# f27_encoded_test = enc.transform(f27_split_test).astype(int)

pos_cols = [f'f_27_pos{i}' for i in range(10)]

# Per-position letter columns for the train rows.
# f27_split may already carry the 'target' column added for the plots above; drop it so that
# concatenating with train_df does not produce two columns named 'target'.
# set_axis returns a renamed frame instead of assigning `.columns` on an object that may share
# its data with f27_split, so this cell leaves f27_split untouched and can be re-run.
f27_split_df = f27_split.drop(columns=['target'], errors='ignore').set_axis(pos_cols, axis=1)
full_df = pd.concat([train_df, f27_split_df], axis=1)

# Same for the test rows (no target column there).
f27_split_test_df = f27_split_test.set_axis(pos_cols, axis=1)
full_test_df = pd.concat([test_df, f27_split_test_df], axis=1)

In [None]:
def aggregate_features(df):
    """Return a copy of ``df`` with engineered interaction and f_27 features appended.

    The input frame is not modified. It must contain the numeric columns f_00, f_01, f_02,
    f_05, f_21, f_22, f_26 and the string column f_27.

    Added columns, in this order:
      i_02_21, i_05_22, i_00_01_26 -- ternary indicators (-1 / 0 / +1) telling whether a sum
          of features falls below a lower threshold, in between, or above an upper threshold.
      f_27_unique_len  -- number of distinct characters in the f_27 string.
      f_27_unique_char -- those distinct characters, sorted and joined into one string.
    """
    # High-information numerical features
    # See https://www.kaggle.com/code/ambrosm/tpsmay22-advanced-keras
    sum_02_21 = df['f_21'] + df['f_02']
    sum_05_22 = df['f_22'] + df['f_05']
    sum_00_01_26 = df['f_00'] + df['f_01'] + df['f_26']

    def ternary(values, lower, upper):
        # +1 above `upper`, -1 below `lower`, 0 otherwise.
        return (values > upper).astype(int) - (values < lower).astype(int)

    # assign() works on a copy, so the caller's frame keeps its original columns.
    return df.assign(
        i_02_21=ternary(sum_02_21, -5.3, 5.2),
        i_05_22=ternary(sum_05_22, -5.4, 5.1),
        i_00_01_26=ternary(sum_00_01_26, -5.0, 5.0),
        # f_27 char sequence features
        f_27_unique_len=df['f_27'].apply(lambda seq: len(set(seq))),
        f_27_unique_char=df['f_27'].apply(lambda seq: ''.join(sorted(set(seq)))),
    )

# Add the engineered columns to the train frame first, then to the test frame.
full_df, full_test_df = (aggregate_features(frame) for frame in (full_df, full_test_df))

In [None]:
# Show each raw sequence next to its sorted distinct letters and its per-position letters.
seq_preview_cols = ['f_27', 'f_27_unique_char', *(f'f_27_pos{i}' for i in range(10))]
full_df[seq_preview_cols]

In [None]:
# Columns to cast to pandas 'category' dtype: the raw sequence, its distinct-letter string
# and the ten per-position letters ...
cat_cols = ['f_27', "f_27_unique_char"]
cat_cols.extend(f'f_27_pos{i}' for i in range(10))
# ... plus the int type features and the engineered indicator columns.
cat_cols.extend(f'f_{i:02d}' for i in range(7, 19))
cat_cols.extend(['f_29', 'f_30', 'i_02_21', 'i_05_22', 'i_00_01_26'])

# Training matrix: everything except the row id and the label.
X_train = full_df.drop(columns=['id', 'target'])
X_train[cat_cols] = X_train[cat_cols].astype('category')
y_train = train_df['target']

# Test matrix: same columns, built from the test frame (which only needs the id removed).
X_test = full_test_df.drop(columns=['id'])
X_test[cat_cols] = X_test[cat_cols].astype('category')

In [None]:
# Inspect the resulting column dtypes of the training matrix.
X_train.dtypes

# LGBM

In [None]:
import lightgbm as lgb

# Wrap the training matrix and labels in a LightGBM Dataset for the training cells below.
train_data = lgb.Dataset(data=X_train, label=y_train)
# param = {'objective': 'cross_entropy', 'metric': ['auc', 'binary_logloss']}
# num_round = 100
# hist = lgb.cv(param, train_data, num_round, nfold=3)
# hist['auc-mean'][-1]

In [None]:
import optuna
import sklearn.metrics
from sklearn.model_selection import train_test_split

def objective(trial):
    """Optuna objective: mean cross-validated AUC of a LightGBM model.

    Samples one hyperparameter set from ``trial``, runs 3-fold ``lgb.cv`` on the
    module-level ``X_train`` / ``y_train`` and returns the mean AUC of the last
    boosting round. A pruning callback reports the running AUC to Optuna so that
    unpromising trials can be stopped early.

    Raises:
        KeyError: if the cross-validation history contains no mean-AUC entry.
    """
    param = {
        "objective": "cross_entropy",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "num_iterations": trial.suggest_int("num_iterations", 20, 200),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 30),
    }

    # A fresh Dataset per trial, so nothing constructed under one trial's parameters is reused.
    train_data = lgb.Dataset(X_train, label=y_train)
    # Add a callback for pruning.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
    hist = lgb.cv(param, train_data, nfold=3, callbacks=[pruning_callback])

    # Older LightGBM releases name the entry 'auc-mean'; 4.x prefixes it ('valid auc-mean').
    # Accept either spelling instead of hard-coding one.
    auc_key = next((key for key in hist if key.endswith("auc-mean")), None)
    if auc_key is None:
        raise KeyError(f"no mean AUC in lgb.cv results; available keys: {list(hist)}")
    return hist[auc_key][-1]

In [None]:
# study = optuna.create_study(
#     pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="maximize"
# )
# study.optimize(objective, n_trials=100)

# print("Number of finished trials: {}".format(len(study.trials)))
# print("Best trial:")
# trial = study.best_trial

# print("  Value: {}".format(trial.value))
# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

In [None]:
# optuna.visualization.plot_param_importances(study)

In [None]:
# params = trial.params

# Tuned by Optuna beforehand (Value: 0.993050884974973)
params = {
    # The search above only returns the sampled hyperparameters; the fixed objective it was
    # run with has to be restated here. Without it LightGBM falls back to its default
    # regression objective and the booster no longer predicts probabilities.
    'objective': 'cross_entropy',
    'num_leaves': 241,
    'max_depth': 10,
    'feature_fraction': 0.9751915273276082,
    'bagging_fraction': 0.9183132982296186,
    'bagging_freq': 5,
    'num_iterations': 193,
    'min_child_samples': 11,
}

# Fit the final booster on the full training Dataset.
clf = lgb.train(params, train_data)

In [None]:
# Score every test row with the trained model.
y_pred = clf.predict(X_test)

In [None]:
# Build the submission as a new frame (id + prediction). assign() returns an independent
# DataFrame, so no column is written onto a slice of test_df (avoids SettingWithCopyWarning).
submission = test_df[['id']].assign(target=y_pred)
# Floor negative predictions at 0.
submission.loc[submission['target'] < 0, 'target'] = 0
submission.to_csv('submission.csv', index=False)

In [None]:
# Read the written file back to check its contents.
pd.read_csv('submission.csv')