In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Gradient-boosting library used for the regression model.
import lightgbm as lgb
#import optuna.integration.lightgbm as lgb

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# NOTE(review): `le` is not referenced anywhere else in the visible notebook.
le = LabelEncoder()
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error # mean squared error
#from sklearn.metrics import mean_squared_log_error # mean squared logarithmic error
from sklearn.metrics import r2_score # coefficient of determination
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot theme to all following figures.
sns.set()

import missingno as msno
import plotly.express as px

import warnings
# Silences every warning for the rest of the session (including deprecation notices).
warnings.filterwarnings("ignore")

# 1. Import data

In [None]:
# Read the three competition CSV files: submission template, training table, test table.
sample_submission, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/train.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/test.csv",
    )
)

In [None]:
# Raise the pandas display limits so long / wide frames are shown more fully,
# and print floats with a fixed number of decimals.
def _five_decimals(x):
    """Format a number with exactly five digits after the decimal point."""
    return '%.5f' % x


for option_name, option_value in {
    'display.max_rows': 150,
    'display.max_columns': 500,
    'display.max_colwidth': None,
    'display.float_format': _five_decimals,
}.items():
    pd.set_option(option_name, option_value)

In [None]:
# Preview the first rows of the submission template.
sample_submission.head()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the submission template.
sample_submission.info()

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training table.
train.describe()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the test table.
test.describe()

# 2. EDA

In [None]:
# Colors to be used for plots
# Named matplotlib colors; passed as the bar colors of the loss-distribution chart.
# NOTE(review): "sandybrown" is listed twice (2nd and 10th entries) — confirm this is intended.
colors = ["lightcoral", "sandybrown", "darkorange", "mediumseagreen", "lightseagreen",
          "cornflowerblue", "mediumpurple", "palevioletred", "lightskyblue", "sandybrown",
          "yellowgreen", "indianred", "lightsteelblue", "mediumorchid", "deepskyblue"]

In [None]:

# Search for missing data
# Draw missingno's nullity matrix for the training table.

msno.matrix(df=train, figsize=(10,6), color=(0,.3,.3))


In [None]:

# Draw missingno's nullity matrix for the test table.
msno.matrix(df=test, figsize=(10,6), color=(0,.3,.3))


In [None]:

# Concat train and test
# Stack the training rows on top of the test rows with a fresh 0..n-1 index.
# NOTE(review): the name `all` shadows Python's built-in all(); it is kept here
# because the following cell reads this variable under that name.
all = pd.concat([train, test], ignore_index=True)

# Min-max scale the feature columns f0..f99 in a single call. minmax_scale
# operates column-wise on 2-D input, so this matches scaling each column on
# its own while avoiding a Python-level loop and per-column array copies.
feature_cols = [f"f{i}" for i in range(100)]
all[feature_cols] = preprocessing.minmax_scale(all[feature_cols].to_numpy())

# Display a preview only, rather than the whole combined frame.
all.head()


In [None]:

# Summary statistics of the combined frame without the id and target columns,
# transposed so that each described column becomes one row.
feature_summary = all.drop(columns=['id', 'loss']).describe()
feature_summary.transpose()


In [None]:

%%time
# Pairwise correlations between all training columns except the row id
# (the "loss" target stays in, so its row/column shows feature-target correlation).
corr = train.drop("id", axis=1).corr()

# The color scale is clipped to +/-0.05 and centered on zero.
fig_corr, ax_corr = plt.subplots(figsize=(20, 10))
sns.heatmap(
    corr,
    ax=ax_corr,
    cmap='coolwarm',
    center=0,
    vmin=-0.05,
    vmax=0.05,
    annot=False,
    square=False,
)
plt.show()


In [None]:
# Number of training rows for each distinct value of the target column "loss".
train["loss"].value_counts()

In [None]:

# Occurrences of each loss value, ordered by loss value. Computed once and
# reused for the bar positions, the bar heights and the percentage labels.
loss_counts = train["loss"].value_counts().sort_index()
# Share of the training rows per loss value, in percent.
loss_percent = loss_counts.values / (len(train) / 100)

fig, ax = plt.subplots(figsize=(16, 8))

bars = ax.bar(loss_counts.index,
              loss_counts.values,
              color=colors,
              edgecolor="black")
ax.set_title("Loss distribution", fontsize=20, pad=15)
ax.set_ylabel("Count", fontsize=14, labelpad=15)
ax.set_xlabel("Loss value", fontsize=14, labelpad=10)
# Annotate every bar with its percentage of the training set.
ax.bar_label(bars, [f"{x:2.2f}%" for x in loss_percent],
             padding=5, fontsize=10, rotation=90)
# Extra headroom on the y axis so the rotated labels are not clipped.
ax.margins(0.025, 0.12)
ax.grid(axis="y")

plt.show()


In [None]:

%%time
# Feature values of train and test stacked together; used only to derive one
# common histogram range per feature so both distributions share the same bins.
df = pd.concat([train.drop(["id", "loss"], axis=1), test.drop("id", axis=1)], axis=0)
columns = df.columns.values

cols = 5
# Ceiling division: exactly as many rows as needed. The previous
# `len(columns) // cols + 1` added a blank extra row whenever the number of
# features was a multiple of `cols`. At least one row is always created.
rows = max(1, (len(columns) + cols - 1) // cols)

# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(30,100), sharex=False, squeeze=False)

plt.subplots_adjust(hspace = 0.3)

# (frame, bar color, legend label) for the two overlaid histograms.
histogram_sources = (
    (train, "cornflowerblue", "Train"),
    (test, "darkorange", "Test"),
)

# axs.flat walks the grid row by row, matching the order of `columns`.
for i, panel in enumerate(axs.flat):
    if i >= len(columns):
        # Unused trailing panels in the last row are hidden.
        panel.set_visible(False)
        continue

    feature = columns[i]
    # Common value range for this feature, computed once for both histograms.
    value_range = (df[feature].min(), df[feature].max())

    for frame, bar_color, legend_label in histogram_sources:
        panel.hist(frame[feature].values,
                   range=value_range,
                   bins=40,
                   color=bar_color,
                   edgecolor="black",
                   alpha=0.7,
                   label=legend_label)

    panel.set_title(feature, fontsize=14, pad=5)
    panel.tick_params(axis="y", labelsize=13)
    panel.tick_params(axis="x", labelsize=13)
    panel.grid(axis="y")
    panel.legend(fontsize=13)

plt.show()


# 3. Modeling

In [None]:
# Feature matrix: every training column except the row id and the target.
X = train.drop(["id", "loss"], axis=1)
# Regression target.
value = train.loc[:, "loss"]

In [None]:
# Hold out 20% of the training rows as a validation split (fixed seed).
# NOTE: `X_test` / `t_test` here are the validation split; the prediction cell
# later rebinds `X_test` to the competition test features.
X_train, X_test, t_train, t_test = train_test_split(X, value, test_size=0.2, random_state=0)

lgb_train = lgb.Dataset(X_train, t_train)
# `reference` ties the validation Dataset to the training Dataset.
lgb_eval = lgb.Dataset(X_test, t_test, reference=lgb_train)

params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.1,
        'num_leaves': 3,
        'max_bin': 234,
        # Upper bound on boosting rounds; early stopping normally ends training sooner.
        'num_iterations': 20000,
        'verbosity': -1
}

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train()
# were removed in LightGBM 4.0. `log_evaluation` was named `print_evaluation`
# in older releases, so fall back to that name when it is missing.
_log_evaluation = getattr(lgb, "log_evaluation", None) or getattr(lgb, "print_evaluation")

model = lgb.train(
    params,
    train_set=lgb_train,
    valid_sets=[lgb_eval],
    callbacks=[
        # Stop when the validation metric has not improved for 100 rounds.
        lgb.early_stopping(stopping_rounds=100),
        # Log the validation metric every 100 rounds.
        _log_evaluation(period=100),
    ],
)


# Verification

# Score the held-out validation rows and report the RMSE.
pred = model.predict(X_test)

mse = mean_squared_error(t_test, pred) # mean squared error (MSE)
rmse = np.sqrt(mse) # square root of the MSE
print('RMSE : {}'.format(rmse))

# 4. Prediction

In [None]:
# Rebind X_test to the competition test features (row id removed) and store
# the model's predictions in the "loss" column of the submission template.
X_test = test.drop("id", axis=1)
test_loss_pred = model.predict(X_test)
sample_submission['loss'] = test_loss_pred
sample_submission

# 5. Make submission file

In [None]:
# Write the filled-in submission template to submission.csv without the index column.
sample_submission.to_csv('submission.csv', index=False)