In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import lightgbm as lgb
import optuna.integration.lightgbm as lgbo

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
mmscaler = MinMaxScaler(feature_range=(0, 1), copy=True)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error # 平均絶対誤差
#from sklearn.metrics import mean_squared_error # 平均二乗誤差
#from sklearn.metrics import mean_squared_log_error # 対数平均二乗誤差
from sklearn.metrics import r2_score # 決定係数
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import missingno as msno
import plotly.express as px

import json
from collections import OrderedDict

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Pandas display configuration: show more rows/columns, never truncate cell
# contents, and render floats with five decimal places.
pd.options.display.max_rows = 150
pd.options.display.max_columns = 600
pd.options.display.max_colwidth = None
pd.options.display.float_format = lambda number: '%.5f' % number

# 1. Process
(1) Tune the model parameters using only the rows that contain no missing values.<br>
(2) For each column that contains missing values, train a model that uses that column as the target variable, one column at a time.<br>(F_1_n: 15 columns, F_3_n: 25 columns, F_4_n: 15 columns; 55 columns in total)<br>
(3) For each such column, collect the rows in which it is missing, pass them to the model created in (2), and fill the missing values in "data" with the predictions.<br>

# 2. Import data

In [None]:
# Load the competition files: the submission template (one "row-col" id per
# value to fill in) and the data table that contains the missing values.
sample_submission = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2022/sample_submission.csv")
data = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2022/data.csv")

In [None]:
# Display the submission template as the cell output.
sample_submission

In [None]:
# Display the raw data table as the cell output.
data

In [None]:
# Column dtypes and non-null counts of the raw data.
data.info()

There are no missing values in F_2_n.<br>
F_2_nには欠損値がありません。

# 3. EDA

In [None]:

# Visualize the distribution of missing values across the dataset
# (search for missing data)

msno.matrix(df=data, figsize=(10,6), color=(0,.3,.3))


In [None]:

# Check the distribution of each feature: summary statistics (row_id excluded),
# with the mean drawn as a bar and std / median shaded as colour gradients.
(
    data.drop(columns=['row_id'])
    .describe()
    .T
    .style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='BuGn')
)


In [None]:

# Visualization of the distribution of each feature:
# one histogram per feature on a 16 x 5 grid (15 + 25 + 25 + 15 = 80 panels).

figure = plt.figure(figsize=(16, 18))

# Number of features in each group F_1 .. F_4.
group_sizes = {1: 15, 2: 25, 3: 25, 4: 15}
feature_names = [
    f'F_{group}_{k}'
    for group, size in group_sizes.items()
    for k in range(size)
]

for panel, feat_name in enumerate(feature_names, start=1):
    plt.subplot(16, 5, panel)
    plt.hist(data[feat_name], bins=100)
    plt.title(f'{feat_name}')

figure.tight_layout(h_pad=1.0, w_pad=0.8)
plt.show()


F_1_nとF_3_nとF_4_nは、ほぼ正規分布していますが、一部外れ値もありそうです。<br>
F_2_nは0以上の整数値をとっているようです。<br>
とりあえず、外れ値の置き換え等を行わずに処理を進めます。<br>
F_1_n, F_3_n, and F_4_n are almost normally distributed, but there are some outliers.<br>
F_2_n seems to take an integer value greater than or equal to 0.<br>
For the time being, proceed with processing without replacing outliers.<br>

In [None]:

# Visualize the correlation between features as a heatmap.
# Note: row_id is not dropped here, so it appears in the matrix as well.

corr = data.corr().round(2)
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(
    corr,
    vmin=-1,
    vmax=1,
    center=0,
    square=False,
    annot=True,
    cmap='coolwarm',
    ax=ax,
)
plt.show()


特徴量同士の相関関係はほとんどありませんが、F_2_n同士とF_4_n同士では若干の相関関係がみられます。<br>
There is almost no correlation between most features, but some correlation can be seen among the F_2_n features and among the F_4_n features.

# 4. Feature engineering

In [None]:
# Check the number of missing values for each feature (row_id excluded)

data.drop(columns='row_id').isnull().sum(axis=0)

In [None]:
# Check the number of missing values for each id (one count per row)

data.isnull().sum(axis=1)

複数の欠損値を持つ行が存在します<br>
There are rows with multiple missing values

In [None]:
# Fill in missing values with the column mean for every column except F_4_n.
# The F_4_n columns keep their NaNs.
for name in data.columns:
    if "F_4" in name:
        continue
    data[name] = data[name].fillna(data[name].mean())
data

In [None]:
# Column dtypes and non-null counts after the mean fill.
data.info()

In [None]:
# Disabled code kept as a bare string literal: it is not executed, the cell
# only evaluates (and echoes) the string below.
'''
# "data"から特徴量名を取り出す
# Extract feature name from "data"
colAry = []
for col in data.columns:
    if data[col].isna().sum() != 0:
        colAry.append(col)
colAry
'''

In [None]:
# Extract feature names from "data":
#   colAry_target -> the F_4 columns
#   colAry        -> the F_2 and F_4 columns
# The group is the token between the first and second underscore of the name.
allCols = data.drop(columns='row_id').columns
colAry_target = [name for name in allCols if name.split('_')[1] == '4']
colAry = [name for name in allCols if name.split('_')[1] in ('2', '4')]  # including F_2
colAry

In [None]:
# Working frame with the F_2 and F_4 columns only.
# An explicit copy is taken because new feature columns are added to f4data
# afterwards; they must not be written through a slice of "data".
f4data = data[colAry].copy()# including F_2
f4data

In [None]:

# Add the number of missing values in each row as a feature.
# NaN_cnt stays a one-column DataFrame (column label 0).

NaN_cnt = f4data.isna().sum(axis=1).to_frame()

f4data['NaN_cnt'] = NaN_cnt
f4data


In [None]:
# Number of rows for each per-row missing-value count (column 0 holds the count).
NaN_cnt.groupby([0]).size()

In [None]:

%%time

# Add the presence or absence of a missing value for each F_4 feature as a
# new 0/1 feature named "<feature>_NaN" (1 = the value is missing in that row).
# Computed with a vectorised isna() instead of a per-row Python loop with
# chained f4data[col][j] lookups.

for colName in colAry_target:
    f4data[colName + '_NaN'] = f4data[colName].isna().astype(int)
f4data


In [None]:
# Replace outliers
# Start from a copy of f4data so that f4data itself is left unchanged.

replacedData = f4data.copy()

def replace_outliers(df, stds, n_sigmas=3.5):
    """Clamp values lying further than ``n_sigmas * stds`` from the mean.

    The mean and both outlier masks are computed once, before anything is
    overwritten, so the upper and lower limits are centred on the same
    (original) mean. Previously the lower mask used a mean recomputed after
    the upper clamp, which could let low outliers slip through.

    Parameters
    ----------
    df : pandas.Series
        Values to clamp (a single column in this notebook). Modified in place.
        NaN entries are never selected by the masks and therefore stay NaN.
    stds : float
        Standard deviation used to size the allowed band.
    n_sigmas : float, default 3.5
        Half-width of the allowed band, in units of ``stds``.

    Returns
    -------
    pandas.Series
        The same object that was passed in, after clamping.
    """
    center = df.mean()
    outValue = stds * n_sigmas
    replaceValuePlus = center + outValue
    replaceValueMinus = center - outValue

    # Build both masks from the untouched data before assigning anything.
    too_high = (df - center) > outValue
    too_low = (center - df) > outValue

    df[too_high] = replaceValuePlus
    df[too_low] = replaceValueMinus
    return df

# Clamp the outliers of every F_4 target column. Each column is copied first,
# clamped using its own standard deviation, then written back.
for colName in colAry_target:
    column = replacedData[colName].copy()
    replacedData[colName] = replace_outliers(column, column.std())

replacedData

In [None]:
# Summary statistics before the outlier replacement.
f4data.describe()

In [None]:
# Summary statistics after the outlier replacement, for comparison.
replacedData.describe()

In [None]:
# Extract the rows that do not contain any missing value; they are used for
# parameter tuning and for training the models.

dataForStudy = replacedData.dropna(axis=0, how='any')
dataForStudy

# 5. Parameters

In [None]:

%%time
# Search LightGBM parameters with Optuna's LightGBM integration, one search per
# F_4 target column, and keep the resulting parameters in paramObj[<column>].

opt_rounds = 100  # boosting rounds per tuning trial
opt_stop = 100    # early-stopping patience per tuning trial

paramObj = {}
for colName in colAry_target:
    print('')
    print('***** Target : {} *****'.format(colName))

    # All remaining columns are features; the current column is the target.
    X = dataForStudy.drop(columns=[colName])
    value = dataForStudy[colName]

    # One seeded hold-out split. Previously a seeded split was computed and
    # then immediately overwritten by an unseeded one, so the tuning data
    # changed from run to run.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        value,
        test_size=0.2,
        random_state=0
    )

    opt_params = {
        "objective":"regression",
        "metric":"rmse"
    }

    reg_train = lgb.Dataset(
        X_train,
        y_train
    )

    reg_eval = lgb.Dataset(
        X_test,
        y_test,
        reference=reg_train
    )

    opt=lgbo.train(
        opt_params,
        reg_train,
        valid_sets = reg_eval,
        verbose_eval=False,
        num_boost_round = opt_rounds,
        early_stopping_rounds = opt_stop
    )

    paramObj[colName] = opt.params

    print(opt.params)


# 6. Modeling and prediction

In [None]:
%%time

# Train one LightGBM regressor per F_4 target on the complete rows, then use
# it to fill that target's missing entries in "data".
# NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword arguments
# are, as far as I know, no longer accepted by lightgbm.train from LightGBM 4.0
# on (callbacks replace them) - confirm against the installed version.

learning_rate = 0.2
num_iterations = 20000  # upper bound; early stopping ends training sooner
e_s_rounds = 100
n_targets = len(colAry_target)

for i, colName in enumerate(colAry_target):
    tuned = paramObj[colName]
    print('')
    print('***** Target : {} *****'.format(colName))
    print('----- Params --------------------------------------------------------------')
    print(tuned)
    print('----- Result --------------------------------------------------------------')
    print('Count : {} / {}'.format(i + 1, n_targets))

    X = dataForStudy.drop(columns=[colName])
    value = dataForStudy[colName]

    X_train, X_test, t_train, t_test = train_test_split(X, value, test_size=0.2, random_state=0)

    lgb_train = lgb.Dataset(X_train, t_train)
    lgb_eval = lgb.Dataset(X_test, t_test, reference=lgb_train)

    # Fixed settings plus the values found by the Optuna search for this target.
    params = {
        'task': 'train',
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'learning_rate': learning_rate,
        'lambda_l1': tuned['lambda_l1'],
        'lambda_l2': tuned['lambda_l2'],
        'num_leaves': tuned['num_leaves'],
        'feature_fraction': tuned['feature_fraction'],
        'bagging_fraction': tuned['bagging_fraction'],
        'bagging_freq': tuned['bagging_freq'],
        'min_child_samples': tuned['min_child_samples'],
        'num_iterations': num_iterations,
        'verbosity': -1
    }

    model = lgb.train(
        params,
        train_set=lgb_train,
        valid_sets=lgb_eval,
        early_stopping_rounds=e_s_rounds,
        verbose_eval=1000
    )

    # Rows in which the current target is missing.
    missing_mask = replacedData[colName].isnull()
    if not missing_mask.any():
        # Nothing to fill for this target.
        continue
    X_missing = replacedData.loc[missing_mask].drop(columns=[colName])

    # Write the predictions straight into "data" with a single .loc assignment.
    # The previous data[colName].update(...) was a chained in-place write,
    # which does not reach "data" when pandas Copy-on-Write is active.
    data.loc[X_missing.index, colName] = model.predict(X_missing)

In [None]:
%%time
# Fill the submission values. Every "row-col" id has the form
# "<row label>-<column name>"; the value is read from data[<column>][<row label>].
# Done per column with label-based .loc lookups instead of a per-row Python
# loop with chained assignment (sample_submission['value'][i] = ...), which is
# slow and is not guaranteed to write into the frame.
id_parts = sample_submission['row-col'].str.split('-', expand=True)
row_labels = id_parts[0].astype(int)
col_names = id_parts[1]

filled = pd.Series(np.nan, index=sample_submission.index, dtype='float64')
for col_name, rows in row_labels.groupby(col_names):
    # data.loc raises KeyError if a requested row label or column is absent.
    filled.loc[rows.index] = data.loc[rows.to_numpy(), col_name].to_numpy()

sample_submission['value'] = filled
sample_submission

# 7. Make submission file

In [None]:
# Write the filled-in submission to submission.csv (relative path), without the index column.
sample_submission.to_csv('submission.csv', index=False)