In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import lightgbm as lgb
import optuna.integration.lightgbm as lgbo

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
mmscaler = MinMaxScaler(feature_range=(0, 1), copy=True)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error # 平均絶対誤差
#from sklearn.metrics import mean_squared_error # 平均二乗誤差
#from sklearn.metrics import mean_squared_log_error # 対数平均二乗誤差
from sklearn.metrics import r2_score # 決定係数
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import missingno as msno
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Widen pandas' display limits so wide/long frames are shown more fully,
# and print floats with a fixed five decimal places.
def _five_decimals(number):
    """Format a float with exactly five digits after the decimal point."""
    return '%.5f' % number


for option, setting in (
    ('display.max_rows', 150),
    ('display.max_columns', 600),
    ('display.max_colwidth', None),
    ('display.float_format', _five_decimals),
):
    pd.set_option(option, setting)

# 1. Import data

In [None]:
# Load the three competition files: the training set, the test set,
# and the submission template that is filled with predictions later on.
train = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/test.csv")
sample_submission = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv")

In [None]:
# Preview the first rows of the submission template.
sample_submission.head()

In [None]:
# Column dtypes and non-null counts of the submission template.
sample_submission.info()

In [None]:
# Summary statistics of the submission template's numeric columns.
sample_submission.describe()

In [None]:
# Display the raw training frame.
train

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Summary statistics of the training frame's numeric columns.
train.describe()

In [None]:
# Display the raw test frame.
test

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Summary statistics of the test frame's numeric columns.
test.describe()

train、testともに、
f_27だけがアルファベットが連続する文字列になっています。<br>
f_07からf_18までは0以上の整数値で、f_29は0または1または2、f_30は0または1の値になっているようです。<br>
ほかの項目はマイナスの値を含む数値のようです。<br>
In both train and test, f_27 is the only column that holds a string of letters.<br>
f_07 through f_18 appear to be non-negative integers, f_29 takes the values 0, 1, or 2, and f_30 takes the values 0 or 1.<br>
The remaining columns appear to be real-valued and include negative values.

# 2. EDA

In [None]:
# Palette of named matplotlib colors intended for plots.
# NOTE(review): "sandybrown" appears twice (positions 2 and 10) — kept as-is; confirm whether intentional.
colors = [
    "lightcoral", "sandybrown", "darkorange",
    "mediumseagreen", "lightseagreen", "cornflowerblue",
    "mediumpurple", "palevioletred", "lightskyblue",
    "sandybrown", "yellowgreen", "indianred",
    "lightsteelblue", "mediumorchid", "deepskyblue",
]

In [None]:

# train: one 100-bin histogram per feature f_00 ... f_30 on an 8x4 grid.
# f_27 is skipped, which leaves its grid slot empty.
figure = plt.figure(figsize=(16, 18))
for slot, feat_name in enumerate((f'f_{k:02d}' for k in range(31)), start=1):
    if feat_name == 'f_27':
        continue
    axis = figure.add_subplot(8, 4, slot)
    axis.hist(train[feat_name], bins=100)
    axis.set_title(feat_name)
figure.tight_layout(h_pad=1.0, w_pad=0.8)
plt.show()


In [None]:

# test: one 100-bin histogram per feature f_00 ... f_30 on an 8x4 grid.
# f_27 is skipped, which leaves its grid slot empty.
figure = plt.figure(figsize=(16, 18))
for slot, feat_name in enumerate((f'f_{k:02d}' for k in range(31)), start=1):
    if feat_name == 'f_27':
        continue
    axis = figure.add_subplot(8, 4, slot)
    axis.hist(test[feat_name], bins=100)
    axis.set_title(feat_name)
figure.tight_layout(h_pad=1.0, w_pad=0.8)
plt.show()


整数値の項目以外は概ね正規分布しているようです。<br>
Apart from the integer-valued columns, the features appear to be roughly normally distributed.

In [None]:

# Heatmap(train)
# Pairwise correlation of the numeric columns of train, rounded to 2 decimals.
# train still contains the string column f_27 here; recent pandas versions raise
# on non-numeric columns in DataFrame.corr() instead of silently dropping them,
# so the numeric columns are selected explicitly (works on old and new pandas).
corr = train.select_dtypes(include='number').corr().round(2)
plt.figure(figsize=(20,10))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, square=False, annot=True, cmap='coolwarm')
plt.show()


ターゲットと強い相関がある特徴量はないようですが、特徴量同士で相関のあるものはあります。<br>
It seems that there are no features that are strongly correlated with the target, but there are some that are correlated with each other.

In [None]:

# Heatmap(test)
# Pairwise correlation of the numeric columns of test, rounded to 2 decimals.
# test still contains the string column f_27 here; recent pandas versions raise
# on non-numeric columns in DataFrame.corr() instead of silently dropping them,
# so the numeric columns are selected explicitly (works on old and new pandas).
corr = test.select_dtypes(include='number').corr().round(2)
plt.figure(figsize=(20,10))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, square=False, annot=True, cmap='coolwarm')
plt.show()


特徴量同士で相関のあるものはあります。<br>
Some of the features are correlated with each other.

In [None]:
# Concat train and test (train rows first) under a fresh 0..n-1 index so that
# preprocessing can be applied to both at once.
# NOTE(review): presumably test has no 'target' column, so its rows get NaN there — confirm.
all_data = pd.concat([train,test],ignore_index=True)

In [None]:

# Per-feature summary statistics (one row per feature) of the combined data,
# with the mean drawn as a bar and std / median shaded by magnitude.
feature_stats = all_data.drop(columns=['id', 'target']).describe().T
styled_stats = feature_stats.style.bar(subset=['mean'], color=px.colors.qualitative.G10[0])
styled_stats = styled_stats.background_gradient(subset=['std'], cmap='Greens')
styled_stats = styled_stats.background_gradient(subset=['50%'], cmap='BuGn')
styled_stats


# 3. Preprocessing

f_27の特徴量は10文字のアルファベットです。<br>
1文字ずつに分けて、ラベルエンコード（数値に変更）します。<br>
f_27 is a 10-letter string. <br>
Split it into individual characters and label-encode each one (convert it to a number).<br>

In [None]:
%%time
# Divide f_27 into characters: column f_27_<k> holds the k-th character (1-based)
# of every f_27 string.
#
# tmp_all is the same object as all_data (no copy is made), so the new columns
# also appear on all_data; later cells rely on that.
#
# The vectorised .str[i] accessor replaces a Python loop over every row with
# chained Series[j][i] indexing, which is far slower on a frame of this size.
# Difference: a string shorter than 10 characters now yields NaN for the missing
# positions instead of raising IndexError.
tmp_all = all_data
for i in range(10):
    tmp_all['f_27_' + str(i + 1)] = tmp_all['f_27'].str[i]
tmp_all

In [None]:

%%time
# Calculate the appearance rate of each letter A-Z over the ten per-position
# columns f_27_1 ... f_27_10.
#
# charAry maps every letter 'A'..'Z' (in alphabetical order, letters that never
# occur included with rate 0.0) to count / total number of characters.
#
# The previous version kept 26 hand-written counters and hard-coded both the
# number of rows scanned (160000) and the denominator (1600000), i.e. it only
# sampled the first 160,000 rows. The sizes are now derived from the data, so
# every row contributes and the cell stays correct if the data size changes.
letters = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
position_cols = ['f_27_' + str(i + 1) for i in range(10)]

letter_counts = dict.fromkeys(letters, 0)
for col in position_cols:
    # value_counts() does the per-column counting in vectorised code.
    for letter, count in tmp_all[col].value_counts().items():
        # Anything outside A-Z is ignored, as before.
        if letter in letter_counts:
            letter_counts[letter] += int(count)

# Every character position of every row counts towards the denominator.
total_chars = len(tmp_all) * len(position_cols)
charAry = {
    letter: (letter_counts[letter] / total_chars if total_chars else 0.0)
    for letter in letters
}
charAry


In [None]:

# Sort by appearance rate, highest first; yields a list of (letter, rate) tuples.
def _rate_of(letter_and_rate):
    """Return the rate component of a (letter, rate) pair."""
    return letter_and_rate[1]


sortedAry = sorted(charAry.items(), key=_rate_of, reverse=True)
sortedAry


In [None]:
# Disabled experiment: the code below sits inside a string literal, so it does
# not execute (the cell only evaluates the string). It would have numbered the
# letters by descending appearance rate.
'''
# 出現率の高い順に採番
# Numbering in descending order of appearance rate

newAry = {}
for i in range(len(sortedAry)):
    key = sortedAry[i][0]
    newAry[key] = i
newAry
'''

In [None]:
# Disabled experiments: both snippets below sit inside string literals, so
# neither executes. The first would replace each letter by its appearance rate,
# the second by its rank in descending appearance rate.
'''
%%time
# 出現率をラベルとして使用
# Use appearance rate as a label
for i in range(10):
    tmpArray = []
    for j in range(1600000):
        txt = tmp_all['f_27_' + str(i + 1)][j]
        tmpArray.append(charAry[txt])
    tmp_all['f_27_' + str(i + 1)] = tmpArray
all_data = tmp_all
'''
'''
# 出現率の高い順に採番（B:0,A:1,C:2,D:3･･･）
# Label encode in descending order of appearance rate（B:0,A:1,C:2,D:3･･･）
for i in range(10):
    tmpArray = []
    for j in range(1600000):
        txt = tmp_all['f_27_' + str(i + 1)][j]
        tmpArray.append(newAry[txt])
    tmp_all['f_27_' + str(i + 1)] = tmpArray
all_data = tmp_all
all_data
'''

In [None]:

# Label-encode the per-position letter columns f_27_1 ... f_27_10.
# The encoder is fitted on the full alphabet A-Z rather than on the data, so the
# letter -> integer mapping is the same for every column.
le = LabelEncoder()
labels = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
le.fit(labels)

for position in range(1, 11):
    column = 'f_27_' + str(position)
    all_data[column] = le.transform(tmp_all[column])

all_data


In [None]:
# Check the distribution for each of f_27_1 to f_27_10 (5x2 grid, 100 bins each).
figure = plt.figure(figsize=(16, 18))
for slot in range(1, 11):
    feat_name = f'f_27_{slot}'
    axis = figure.add_subplot(5, 2, slot)
    axis.hist(all_data[feat_name], bins=100)
    axis.set_title(feat_name)
figure.tight_layout(h_pad=1.0, w_pad=0.8)
plt.show()

上位ランカーの方々の投稿を見ると、f_00、f_01、f_02、f_05、f_26が特殊な特徴量のようで、次のように処理すると良い結果がでるようです。<br>
この処理をする前のベストスコアは0.97924でした。<br>
Looking at the submissions of the top rankers, f_00, f_01, f_02, f_05, f_26 seem to be special features, and it seems that good results can be obtained by processing as follows. <br>
The best score before this process was 0.97924.

In [None]:

def feature_engineering(df):
    """Add four engineered columns to ``df`` in place and return the same frame.

    Columns added (in this order):
      * ``unique_characters`` - number of distinct characters in ``f_27``.
      * ``i_02_21``    - +1 if f_21 + f_02 > 5.2, -1 if < -5.3, else 0.
      * ``i_05_22``    - +1 if f_22 + f_05 > 5.1, -1 if < -5.4, else 0.
      * ``i_00_01_26`` - +1 if f_00 + f_01 + f_26 > 5.0, -1 if < -5.0, else 0.

    The thresholds are taken over from other competitors' notebooks; the
    original author notes that the reason they work is not known.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain f_27 (strings) and f_00, f_01, f_02, f_05, f_21, f_22, f_26.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in, with the new columns attached.
    """
    def _band(total, upper, lower):
        # Three-way indicator: +1 above `upper`, -1 below `lower`, 0 in between.
        return (total > upper).astype(int) - (total < lower).astype(int)

    # How many unique characters are in f_27
    df["unique_characters"] = df.f_27.apply(lambda text: len(set(text)))

    df['i_02_21'] = _band(df.f_21 + df.f_02, 5.2, -5.3)
    df['i_05_22'] = _band(df.f_22 + df.f_05, 5.1, -5.4)
    df['i_00_01_26'] = _band(df.f_00 + df.f_01 + df.f_26, 5.0, -5.0)

    return df

# Attach the engineered columns to the combined train+test frame.
all_data = feature_engineering(all_data)

# Disabled feature: how often each full f_27 string occurs in the combined data.
#all_data['value_frequency'] = all_data['f_27'].map(all_data['f_27'].value_counts() / len(all_data))

all_data


In [None]:
# Scaling

#all_data['f_27_le'] = preprocessing.minmax_scale(all_data['f_27_le'])


In [None]:
# Split the combined frame back into train and test by row position.
# all_data was built as train followed by test, so the first len(train) rows are
# the training rows. Using len(train) instead of train's index labels keeps the
# split correct even if train's index is not the default 0..n-1 range.
n_train = len(train)
df_train = all_data.iloc[:n_train].drop(columns=["f_27"])
# The test rows carry no usable target, so that column is dropped as well.
df_test = all_data.iloc[n_train:].drop(columns=["f_27", "target"])

In [None]:
# Display the preprocessed training frame.
df_train

In [None]:

# Heatmap(df_train): pairwise correlations of the preprocessed training columns
# (everything except 'id'), rounded to 2 decimals.
train_without_id = df_train.drop(columns=['id'])
corr = train_without_id.corr().round(2)
plt.figure(figsize=(20, 10))
sns.heatmap(corr, cmap='coolwarm', annot=True, square=False, center=0, vmin=-1, vmax=1)
plt.show()


In [None]:
# Display the preprocessed test frame.
df_test

In [None]:

# Heatmap(df_test): pairwise correlations of the preprocessed test columns
# (everything except 'id'), rounded to 2 decimals.
test_without_id = df_test.drop(columns=['id'])
corr = test_without_id.corr().round(2)
plt.figure(figsize=(20, 10))
sns.heatmap(corr, cmap='coolwarm', annot=True, square=False, center=0, vmin=-1, vmax=1)
plt.show()


# 4. Modeling

In [None]:
# Separate the label from the model inputs: `value` is the target column,
# `X` is every remaining column except the row identifier.
value = df_train['target']
X = df_train.drop(columns=['id', 'target'])

In [None]:
# Disabled Optuna hyper-parameter search: the code sits inside a string literal,
# so it does not execute.
# NOTE(review): the tuned values hard-coded in the training cell presumably came
# from a run of this search — confirm.
'''
# optuna

opt_params = {
    "objective":"binary",
    "metric":"auc"#binary_logloss
}

X_train,X_test,y_train,y_test = train_test_split(
    X,
    value,
    test_size=0.2
)
    
reg_train = lgb.Dataset(
    X_train,
    y_train
)

reg_eval = lgb.Dataset(
    X_test,
    y_test,
    reference=reg_train
)

opt=lgbo.train(
    opt_params,
    reg_train,
    valid_sets = reg_eval,
    verbose_eval=False,
    num_boost_round = 50000,
    early_stopping_rounds = 300
)

opt.params
'''

In [None]:
%%time
# Hold out 20% of the labelled rows as a validation set (fixed seed, so the
# split is reproducible); it is used for early stopping and the reported AUC.
X_train, X_test, t_train, t_test = train_test_split(X, value, test_size=0.2, random_state=0)

lgb_train = lgb.Dataset(X_train, t_train)
lgb_eval = lgb.Dataset(X_test, t_test, reference=lgb_train)

# Trailing "#<value>" comments give the library default for that parameter.
params = {
    # NOTE(review): 'task' is documented as a CLI-only option and is presumably
    # ignored by lgb.train — kept unchanged; confirm before removing.
    'task': 'prediction',
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,#0.1
    'num_iterations': 50000,#100  (upper bound; early stopping ends training sooner)
    'feature_pre_filter': False,
#    'max_depth': -1,
#    'min_data_in_leaf': 20,

    # Hard-coded tuned values (previously read from opt.params).
    'lambda_l1': 1.2937466617272022e-07,
    'lambda_l2': 0.31902396621525436,
    'num_leaves': 49,
    'feature_fraction': 0.9799999999999999,
    'bagging_fraction': 0.8925736551622421,
    'bagging_freq': 2,
    'min_child_samples': 20,

#    'lambda_l1': opt.params['lambda_l1'],
#    'lambda_l2': opt.params['lambda_l2'],
#    'num_leaves': opt.params['num_leaves'],
#    'feature_fraction': opt.params['feature_fraction'],
#    'bagging_fraction': opt.params['bagging_fraction'],
#    'bagging_freq': opt.params['bagging_freq'],
#    'min_child_samples': opt.params['min_child_samples'],

    'verbosity': -1
}

# The early_stopping_rounds= and verbose_eval= keyword arguments of lgb.train
# were deprecated in LightGBM 3.3 and removed in 4.0 (passing them raises
# TypeError there). The equivalent callbacks work on 3.3+ and 4.x alike:
#   early_stopping(500)  - stop once validation AUC has not improved for 500 rounds
#   log_evaluation(100)  - print the validation metric every 100 rounds
model = lgb.train(
    params,
    train_set=lgb_train,
    valid_sets=[lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=500),#100
        lgb.log_evaluation(period=100),
    ],
)

***Result***<br>
<metric=auc><br>
All default : 0.996637<br>
learning_rate=0.05 : 0.996762<br>
learning_rate=0.03、early_stopping_rounds=200 : 0.996858<br>
learning_rate=0.03、early_stopping_rounds=300 : 0.996869<br>
learning_rate=0.03、early_stopping_rounds=500 : 0.996889<br>
learning_rate=0.03、early_stopping_rounds=500、lambda_l1=0.8、lambda_l2=0.8 : 0.996894<br>
learning_rate=0.03、early_stopping_rounds=500、lambda_l1=0.8、lambda_l2=0.8、num_leaves=35 : 0.996953<br>
using optuna : 0.99711<br>
<br>
<br>


# 5. Prediction

In [None]:
%%time
# Score the competition test rows and write the predictions into the
# submission frame.
# Note: X_test is re-bound here; in the training cell the same name referred to
# the validation split.
X_test = df_test.drop(columns=['id'])
test_predictions = model.predict(X_test)
sample_submission['target'] = test_predictions
sample_submission

# 6. Make submission file

In [None]:
# Write the submission file to the working directory, without the index column.
sample_submission.to_csv('submission.csv', index=False)