In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import numpy as np
import optuna
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization
from keras.utils import np_utils
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [None]:
# Load the three competition CSV files (training rows, test rows and the
# sample submission) from the Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-may-2022/test.csv')
sample_submission = pd.read_csv('../input/tabular-playground-series-may-2022/sample_submission.csv')

In [None]:
# Show the training DataFrame as the cell output.
train

In [None]:
# Show the test DataFrame as the cell output.
test

In [None]:
# Summary statistics of the training DataFrame.
train.describe()

In [None]:
# One boolean per column: does the column contain at least one missing value?
train.isnull().any(axis=0)

In [None]:
# Distinct values that appear in the target column.
list(train['target'].unique())

In [None]:
# Data type of every training column.
train.dtypes

In [None]:
# isdigit: use it to tell whether a value is numeric or a (non-numeric) string.
# Casts f_27 to str and flags the entries made up of digits only.
train['f_27'].astype('str').str.isdigit()

In [None]:
# Training rows whose target equals 0.
train.loc[train['target'].eq(0)]

In [None]:
# Show the training DataFrame again (it has not been modified since it was loaded).
train

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Number of distinct values in each training column.
train.nunique()

In [None]:
# Count the rows whose f_20 value already occurred in an earlier row.
train.duplicated(subset=['f_20']).sum()

In [None]:
# Number of distinct strings in the f_27 column.
train['f_27'].nunique()

In [None]:
# Show the training DataFrame once more before plotting.
train

In [None]:
import matplotlib.pyplot as plt
!pip install japanize-matplotlib
import japanize_matplotlib
import seaborn as sns

# One box plot per numeric training feature f_00 .. f_30.
# f_27 is skipped (it is the string column handled separately).
for col_no in range(31):
    if col_no == 27:
        continue

    # The first ten features are drawn on a narrower x-range than the rest.
    x_low, x_high = (-10, 16) if col_no < 10 else (-20, 20)

    plt.figure(figsize=(10, 4))
    plt.xlim(x_low, x_high)
    # Marker style used for the outlier points.
    flierprops = {
        'marker': 'o',
        'markerfacecolor': 'purple',
        'markersize': 6,
        'linestyle': 'none',
        'markeredgecolor': 'black',
    }
    # Column names are zero-padded to two digits: f_00, f_01, ..., f_30.
    sns.boxplot(x=train[f'f_{col_no:02d}'], flierprops=flierprops)

In [None]:
# Training rows whose f_17 value is greater than 13.
train.query(' f_17 >13 ')

In [None]:
# One box plot per numeric test feature f_00 .. f_30, drawn with the same
# axis ranges as the training plots so the two sets can be compared by eye.
# f_27 is skipped (it is the string column handled separately).
for col_no in range(31):
    if col_no == 27:
        continue

    # The first ten features are drawn on a narrower x-range than the rest.
    x_low, x_high = (-10, 16) if col_no < 10 else (-20, 20)

    plt.figure(figsize=(10, 4))
    plt.xlim(x_low, x_high)
    # Marker style used for the outlier points.
    flierprops = {
        'marker': 'o',
        'markerfacecolor': 'purple',
        'markersize': 6,
        'linestyle': 'none',
        'markeredgecolor': 'black',
    }
    # Column names are zero-padded to two digits: f_00, f_01, ..., f_30.
    sns.boxplot(x=test[f'f_{col_no:02d}'], flierprops=flierprops)

In [None]:
# Heat-map of every training column except f_27.
# NOTE(review): this draws the raw cell values of all rows; if a correlation
# heat-map was intended, `.corr()` would have to be applied first - confirm.
sns.heatmap(train.loc[:, train.columns != 'f_27'])

In [None]:
from tqdm import tqdm
# Add the engineered columns to both frames in place.
for df in [train, test]:
    # One integer column per character position 0..9 of f_27, encoded as the
    # offset from 'A' ('A' -> 0, 'B' -> 1, ...).
    # Assumes every f_27 value is a string with at least 10 characters.
    for i in tqdm(range(10)):
        df[f'f_27_{i}'] = df.f_27.str.get(i).apply(ord) - ord('A')

    # Number of distinct characters in each f_27 string.
    df["unique_characters"] = df.f_27.apply(lambda s: len(set(s)))

    # Three-valued interaction indicators: +1 when the feature sum is above the
    # upper threshold, -1 when it is below the lower threshold, 0 otherwise.
    df['i_02_21'] = (df.f_21 + df.f_02 > 5.2).astype(int) - (df.f_21 + df.f_02 < -5.3).astype(int)
    df['i_05_22'] = (df.f_22 + df.f_05 > 5.1).astype(int) - (df.f_22 + df.f_05 < -5.4).astype(int)
    i_00_01_26 = df.f_00 + df.f_01 + df.f_26
    df['i_00_01_26'] = (i_00_01_26 > 5.0).astype(int) - (i_00_01_26 < -5.0).astype(int)

In [None]:
# Remove the raw string column f_27 from both frames.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
train.drop(columns='f_27', inplace=True, errors='ignore')
test.drop(columns='f_27', inplace=True, errors='ignore')

# Feature list = every column that remains in the test frame.
# NOTE(review): this includes any identifier column present in test - confirm
# that it is meant to be treated as a model feature.
features = test.columns.to_list()
len(features)

In [None]:
from sklearn.preprocessing import QuantileTransformer
# Quantile transformer mapping each feature to a normal distribution.
# It is fitted on the training rows only and then applied to both frames.
# NOTE(review): every column listed in `features` is transformed, including
# any identifier column it may contain - confirm that is intended.
qt = QuantileTransformer(
    n_quantiles=1000,
    output_distribution='normal',
    random_state=42,
)
qt.fit(train[features])

for frame in (train, test):
    frame[features] = qt.transform(frame[features])

In [None]:
# Feature matrix: every training column except the label.
X = train.drop(columns=['target'])
# Label vector.
y = train['target']

In [None]:
# Hold out 20% of the rows as a test split (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Training of the 5-fold CV models

# [Block 1: initialisation]
# Trained fold models are collected here.
models = []

# Predictions on the training rows, one column per round, next to the labels.
df_train_preds = pd.DataFrame({'y_train': y_train})

# Labels and predictions for the validation rows of every round.
df_eval_preds = pd.DataFrame({'y_eval': [], 'y_eval_pred': []})

# Predictions on the hold-out rows, one column per round, next to the labels.
# The index is reset so that it runs 0, 1, 2, ...
df_test_preds = pd.DataFrame({'y_test': y_test}).reset_index(drop=True)

# AUC per round for the train / eval / test rows.
df_auc = pd.DataFrame({'train': [], 'eval': [], 'test': []})

# Round counter.
round_no = 0

In [None]:
# [Block 2: model training]
# Positional row numbers of the training split (0 .. len(y_train) - 1)
row_no_list = list(range(len(y_train)))

# 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible
K_fold = KFold(n_splits=5, shuffle=True,  random_state=42)

count = 0
# One pass per fold. The first fold (count == 0) is used only for the Optuna
# hyper-parameter search; a model is trained and scored on each later fold.
for train_cv_no, eval_cv_no in K_fold.split(row_no_list, y_train):
    # KFold yields positions, so the rows are selected with iloc
    X_train_cv = X_train.iloc[train_cv_no, :]
    y_train_cv = pd.Series(y_train).iloc[train_cv_no]
    X_eval_cv = X_train.iloc[eval_cv_no, :]
    y_eval_cv = pd.Series(y_train).iloc[eval_cv_no]

    # LightGBM datasets of this fold: training rows and validation rows
    lgb_train = lgb.Dataset(X_train_cv, y_train_cv,
                            free_raw_data=False)
    lgb_eval = lgb.Dataset(X_eval_cv, y_eval_cv, reference=lgb_train,
                           free_raw_data=False)

    if count == 0:
        def objective(trial):
            """Train one LightGBM model for an Optuna trial.

            Regularisation, sampling and leaf-size parameters are suggested by
            the trial; depth, leaf count, learning rate and seed are fixed.

            Returns:
                ROC AUC of the model on this fold's validation rows.
            """
            params = {
                'objective': 'binary',
                'metric': 'auc',
                'verbosity': -1,
                'boosting_type': 'gbdt',
                'max_depth': 8,
                'lambda_l1': trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
                'lambda_l2': trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
                'feature_fraction': trial.suggest_float("feature_fraction", 0.4, 1.0),
                'bagging_fraction': trial.suggest_float("bagging_fraction", 0.4, 1.0),
                'bagging_freq': trial.suggest_int("bagging_freq", 1, 7),
                'min_child_samples': trial.suggest_int("min_child_samples", 5, 100),
                'learning_rate': 0.05,
                'num_leaves': int(0.7 * 2 ** 8),
                'seed': 42
                }

            # NOTE(review): early_stopping_rounds / verbose_eval are keyword
            # arguments of older LightGBM releases; newer releases expect
            # callbacks instead - confirm against the installed version.
            model = lgb.train(params,
                              train_set=lgb_train,
                              num_boost_round=1000,
                              valid_names=['train', 'valid'],    # names shown in the training log
                              valid_sets=[lgb_train, lgb_eval],
                              early_stopping_rounds=3,
                              verbose_eval=1)

            # Score on the validation fold, not on the hold-out split: tuning
            # against X_test would leak it into the test AUC reported below.
            y_pred = model.predict(X_eval_cv, num_iteration=model.best_iteration)
            auc = roc_auc_score(y_eval_cv, y_pred)

            return auc

        # The objective is an AUC, so higher is better.
        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=30)

        print('Number of finished trials:', len(study.trials))
        print('Best trial:', study.best_trial.params)

        # Tuned values plus the settings that were held fixed during the
        # search. learning_rate is listed explicitly so that the final fits
        # use the same rate as the trials.
        best_params = study.best_trial.params
        x = {'objective': 'binary',
             'metric': 'auc',
             'boosting_type': 'gbdt',
             'max_depth': 8,
             'num_leaves': int(0.7 * 2 ** 8),
             'learning_rate': 0.05,
             "verbosity": -1,
             'seed': 42
             }

        best_params.update(x)
        count += 1

    else:
        # Training
        evaluation_results = {}                              # receives the metric history
        model = lgb.train(best_params,                       # tuned parameters from the search
                          lgb_train,                         # training dataset
                          num_boost_round=50000,             # upper bound on boosting rounds
                          valid_names=['train', 'valid'],    # names shown in the training log
                          valid_sets=[lgb_train, lgb_eval],  # datasets evaluated every round
                          evals_result=evaluation_results,   # store the metric history
                          early_stopping_rounds=20           # early stopping patience
                         )

        # Keep the trained model
        models.append(model)


        # [Block 3: prediction]
        y_train_pred = model.predict(X_train_cv, num_iteration=model.best_iteration)
        y_eval_pred = model.predict(X_eval_cv, num_iteration=model.best_iteration)
        y_test_pred = model.predict(X_test, num_iteration=model.best_iteration)

        # Store the training-row predictions. df_train_preds is indexed by the
        # original row labels, so the fold frame must carry those labels (not
        # the fold positions) for the join to line up.
        df_train_cv_pred = pd.DataFrame({round_no: y_train_pred},
                                         index=X_train_cv.index)
        df_train_preds = df_train_preds.join(df_train_cv_pred, how='left')

        # Store the validation-row labels and predictions
        df_eval_pred = pd.DataFrame({'y_eval': y_eval_cv,
                                     'y_eval_pred': y_eval_pred})
        df_eval_preds = pd.concat([df_eval_preds, df_eval_pred])

        # Store the hold-out predictions (both frames use a 0..n-1 index)
        df_test_cv_pred = pd.DataFrame({round_no: y_test_pred})
        df_test_preds = pd.concat([df_test_preds, df_test_cv_pred], axis=1)

        # AUC on the training, validation and hold-out rows
        train_auc= roc_auc_score(y_train_cv, y_train_pred)
        eval_auc = roc_auc_score(y_eval_cv, y_eval_pred)
        test_auc = roc_auc_score(y_test, y_test_pred)

        # Show the scores of this round
        print(' auc train: %.5f, eval: %.5f, test: %.5f'
              % (train_auc, eval_auc, test_auc))

        # Record the scores of this round
        df_auc_cv =pd.DataFrame({'train': [train_auc],
                                  'eval': [eval_auc],
                                  'test': [test_auc]},
                                  index=[round_no])
        df_auc = pd.concat([df_auc, df_auc_cv])

        # Advance the counters
        round_no += 1
        count += 1

# Mean AUC over the recorded rounds
auc_ave = df_auc.mean().to_numpy()
# Show the averages
print('Average:')
print(' auc train: %.5f, eval: %.5f, test: %.5f'
      % (auc_ave[0], auc_ave[1], auc_ave[2]))

In [None]:
# Disabled alternative: the triple-quoted string below holds a plain 5-fold
# training loop with hard-coded parameters. It is a bare string literal, so
# none of the code inside it is executed.
"""
# 【ブロック２：　モデルの学習】
# 学習データの数だけの数列（0行から最終行まで連番）
row_no_list = list(range(len(y_train)))

# KFoldクラスをインスタンス化（これを使って5分割する）
K_fold = KFold(n_splits=5, shuffle=True,  random_state=42)

params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': 1.2123045275041023e-08,
        'lambda_l2': 5.594370568805385e-08, 
        'feature_fraction': 0.8190408267839706,
        'bagging_fraction': 0.9,
        'bagging_freq': 1,
        'min_child_samples': 50,
        'learning_rate': 0.05,
        'max_depth': 8,
        'num_leaves': int(0.7 * 2 ** 8),
        'seed': 42,
        'num_threads': 8
        }   

# KFoldクラスで分割した回数だけ実行（ここでは5回）
for train_cv_no, eval_cv_no in K_fold.split(row_no_list, y_train):
    # ilocで取り出す行を指定
    X_train_cv = X_train.iloc[train_cv_no, :]
    y_train_cv = pd.Series(y_train).iloc[train_cv_no]
    X_eval_cv = X_train.iloc[eval_cv_no, :]
    y_eval_cv = pd.Series(y_train).iloc[eval_cv_no]
    
    # 学習用
    lgb_train = lgb.Dataset(X_train_cv, y_train_cv,
                               free_raw_data=False)
    # 検証用
    lgb_eval = lgb.Dataset(X_eval_cv, y_eval_cv, reference=lgb_train,
                               free_raw_data=False)

    # 学習
    evaluation_results = {}                             # 学習の経過を保存する箱
    model = lgb.train(params,                      # 上記で設定したパラメータ
                     lgb_train,                         # 使用するデータセット
                     num_boost_round=50000,              # 学習の回数
                     valid_names=['train', 'valid'],    # 学習経過で表示する名称
                     valid_sets=[lgb_train, lgb_eval],  # モデル検証のデータセット
                     evals_result=evaluation_results,   # 学習の経過を保存
                     early_stopping_rounds=20          # アーリーストッピング
                    )                    # 学習の経過の非表示

    # 学習が終わったモデルをリストに保存
    models.append(model) 


    # 【ブロック３：　モデルで予測】
    # 学習したモデルで予測
    y_train_pred = model.predict(X_train_cv, num_iteration=model.best_iteration)
    y_eval_pred = model.predict(X_eval_cv, num_iteration=model.best_iteration)
    y_test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # 学習用データでの予測値をデータフレームに保存
    df_train_cv_pred = pd.DataFrame({round_no: y_train_pred},
                                     index=train_cv_no)
    df_train_preds = df_train_preds.join(df_train_cv_pred, how='left')

    # 検証用データでの予測値をデータフレームに保存
    df_eval_pred = pd.DataFrame({'y_eval': y_eval_cv,
                                 'y_eval_pred': y_eval_pred})
    df_eval_preds = df_eval_preds.append(df_eval_pred)    

    # テストデータでの予測値をデータフレームに保存
    df_test_cv_pred = pd.DataFrame({round_no: y_test_pred})
    df_test_preds = pd.concat([df_test_preds, df_test_cv_pred], axis=1)

    #  auc を計算
    train_auc= roc_auc_score(y_train_cv, y_train_pred)
    eval_auc = roc_auc_score(y_eval_cv, y_eval_pred)
    test_auc = roc_auc_score(y_test, y_test_pred)

    # スコアを表示
    print(' auc train: %.5f, eval: %.5f9, test: %.5f' 
          % (train_auc, eval_auc, test_auc))

    # aucの保存
    df_auc_cv =pd.DataFrame({'train': [train_auc],
                              'eval': [eval_auc],
                              'test': [test_auc]},
                              index=[round_no])
    df_auc = df_auc.append(df_auc_cv)

    # ラウンド数のカウンタを更新
    round_no += 1

# 保存したaucの平均値   
auc_ave = df_auc.mean().to_numpy()
# 平均値を表示
print('Average:')
print(' auc train: %.5f, eval: %.5f, test: %.5f'
      % (auc_ave[0], auc_ave[1], auc_ave[2]))
"""

In [None]:
# Score `model` (the booster left over from the last CV round) on every
# training row. X contains rows the model was fitted on, so this AUC is
# largely an in-sample figure.
y_pred = model.predict(X)
roc_auc_score(y_true=y, y_score=y_pred)

In [None]:
from lightgbm import plot_importance
import matplotlib.pyplot as plt
!pip install japanize-matplotlib
import japanize_matplotlib

def plot_features(booster, figsize):
    """Draw LightGBM's feature-importance chart on a new figure.

    Args:
        booster: model passed on to ``plot_importance``.
        figsize: ``(width, height)`` of the figure, passed to ``plt.subplots``.

    Returns:
        Whatever ``plot_importance`` returns for the created axes.
    """
    _, axes = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    return plot_importance(booster=booster, ax=axes)

# Feature importances of `model` (the booster from the last CV round) on a 10 x 14 figure.
plot_features(model, (10,14))

In [None]:
# Plot the metric history stored in evaluation_results, the dict that was
# passed as evals_result to the most recent lgb.train call.
ax = lgb.plot_metric(evaluation_results, figsize=(10, 5))

In [None]:
# Predict the competition test rows with `model` (the booster left over from
# the last CV round).
prediction = model.predict(test)

# Pair the predictions with the ids of the sample submission.
# Assumes sample_submission lists the rows in the same order as test - TODO confirm.
submission = pd.DataFrame({
    "id": sample_submission['id'],
    "target": prediction
})

submission.to_csv('submission.csv', index=False)

# Preview goes last: only the final expression of a cell is rendered, so a
# head() call placed before to_csv was silently discarded.
submission.head(10)

In [None]:
# Show the submission DataFrame as the cell output.
submission