EDA reference: https://www.kaggle.com/subinium/tps-oct-simple-eda
LightGBM reference: https://www.kaggle.com/ezietsman/simple-python-lightgbm-example

In [None]:
#Import Library
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns


import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [None]:
# The data has many columns, so relax the display limits up front to keep EDA output readable.
# matplotlib: sharper figures, and hide the top/right axes spines
mpl.rcParams.update({
    'figure.dpi': 200,
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# pandas: never truncate rows or columns when a frame is displayed
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

In [None]:
# Load the three competition files in one pass: training data, test data,
# and the sample submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-oct-2021/train.csv',
        '../input/tabular-playground-series-oct-2021/test.csv',
        '../input/tabular-playground-series-oct-2021/sample_submission.csv',
    )
)

In [None]:
# (rows, columns) of the training and test frames, one per line
print(train.shape, test.shape, sep='\n')

# Preview the first rows of the training data (rendered as the cell output)
train.head()

In [None]:
# Print pandas' concise summary of the training frame
train.info()

In [None]:
# Find the integer-typed columns and print their names on a single line,
# each followed by a space.
int_columns = [name for name, dtype in train.dtypes.items() if 'int' in str(dtype)]
print(''.join(f'{name} ' for name in int_columns), end='')

In [None]:
# Summary statistics for columns f0..f284, one row per feature
feature_summary = train.loc[:, 'f0':'f284'].describe().T

# Highlight the table: bars for the mean, colour gradients for std and median
styled_summary = feature_summary.style.bar(subset=['mean'], color='#205ff2')
styled_summary = styled_summary.background_gradient(subset=['std'], cmap='Reds')
styled_summary = styled_summary.background_gradient(subset=['50%'], cmap='coolwarm')
styled_summary

In [None]:
# Number of rows for each distinct value of the target column
train['target'].value_counts()

In [None]:
# Feature Distribution
# If you have too much data, it's a good idea to sample and visualize the approximate distribution first.
np.random.seed(2110)
train = train.sample(10000)
test = test.sample(10000)

In [None]:
fig, axes = plt.subplots(11,11,figsize=(12, 12))
axes = axes.flatten()

for idx, ax in enumerate(axes):
    sns.kdeplot(data=train, x=f'f{idx}', 
                fill=True, 
                ax=ax)
    sns.kdeplot(data=test, x=f'f{idx}', 
                fill=True, 
                ax=ax)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    ax.set_title(f'f{idx}', loc='right', weight='bold', fontsize=10)

fig.supxlabel('Average by class (by feature f0-f120)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(11,11,figsize=(12, 12))
axes = axes.flatten()

for idx, ax in enumerate(axes, 121):
    sns.kdeplot(data=train, x=f'f{idx}', 
                fill=True, 
                ax=ax)
    sns.kdeplot(data=test, x=f'f{idx}', 
                fill=True, 
                ax=ax)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    ax.set_title(f'f{idx}', loc='right', weight='bold', fontsize=10)

fig.supxlabel('Average by class (by feature f121-f241)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
# Binary Feature
# Per-column mean over the f242..f284 slice of `train`
binary_mean = train.loc[:,'f242':'f284'].mean()

fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# The bars carry a label so that the legend below has an entry to show;
# calling ax.legend() with no labelled artists only emits a warning.
ax.bar(binary_mean.index, binary_mean, linewidth=0.2, edgecolor='black', alpha=1, color='#244747',
       label='mean')

ax.set_ylim(0, 1)
# Label every 4th bar; the tick range follows the number of plotted columns
ax.set_xticks(range(0, len(binary_mean), 4))
ax.margins(0.01)
ax.grid(axis='y', linestyle='--', linewidth=0.2, zorder=5)
ax.set_title('Mean of binary features', loc='center', fontweight='bold')
ax.legend()
plt.show()

In [None]:
# Split the frame into a label vector and a feature matrix
y = train['target'].to_numpy()
# Drop the identifier and the label in place so `train` holds only feature columns
train.drop(columns=['id', 'target'], inplace=True)
x = train.to_numpy()

In [None]:
# Create training and validation sets.
# 20% of the rows are held out, stratified on the label; `x` / `y` are rebound
# to the training part and `x_test` / `y_test` hold the validation part.

x, x_test, y, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Create the LightGBM data containers
# Positional indices of the columns whose name contains 'cat'.
# NOTE(review): the feature columns referenced elsewhere in this notebook are
# named f0..f284, so this list is presumably empty - confirm.
categorical_features = [c for c, col in enumerate(train.columns) if 'cat' in col]
train_data = lightgbm.Dataset(x, label=y, categorical_feature=categorical_features)
# The validation container is tied to the training container explicitly, the
# documented way to keep validation data aligned with the training data.
test_data = lightgbm.Dataset(x_test, label=y_test, reference=train_data)

In [None]:
# Train the model
# Binary classifier scored by AUC on the validation container.
# 'application' was an alias of 'objective' carrying the same value; only
# 'objective' is kept so LightGBM does not warn about a duplicated parameter.
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

# Early stopping is passed as a callback: the `early_stopping_rounds=` keyword
# of lightgbm.train() was removed in LightGBM 4.0, while the callback form is
# accepted by both older and newer releases.
model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=[test_data],
                       num_boost_round=5000,
                       callbacks=[lightgbm.early_stopping(stopping_rounds=100)])

In [None]:
# Create a submission
# The test file is read again from disk and scored in full.

submission = pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')
# pop() removes the id column from the frame and hands it back in one step
ids = submission.pop('id').values

# Everything left in the frame is passed to the model as the feature matrix
x = submission.to_numpy()
y = model.predict(x)

output = pd.DataFrame(dict(id=ids, target=y))
output.to_csv("submission.csv", index=False)