In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Library import

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import warnings
# Install a global "ignore" filter so that no warning is shown for the rest of the session.
# NOTE(review): this is a blanket filter — it presumably also hides pandas/LightGBM
# warnings raised by the modelling cells further down; confirm that is intended.
warnings.filterwarnings('ignore')

# Data loading

In [None]:
# Read the competition data and the sample submission from the Kaggle input directory.
data_csv_path = '/kaggle/input/tabular-playground-series-jun-2022/data.csv'
sample_submission_csv_path = '/kaggle/input/tabular-playground-series-jun-2022/sample_submission.csv'

df = pd.read_csv(data_csv_path)
sub = pd.read_csv(sample_submission_csv_path)

# EDA

In [None]:
# Display the column labels of the loaded data.
df.columns

F_1: from 0 to 14   
F_2: from 0 to 24  
F_3: from 0 to 24  
F_4: from 0 to 14  

First, let's look at the number of missing values in each column.
As the plot shows, one group of columns has no missing values at all.

In [None]:
# Bar chart of the number of missing values per column.
missing_counts = df.isnull().sum()

fig, ax = plt.subplots()
ax.bar(missing_counts.index, missing_counts)
ax.set_ylabel('missing value counts')
plt.show()

In [None]:
# Count missing values per column, then show only the columns that have none.
isnull_df = df.isna().sum()
isnull_df.loc[isnull_df.eq(0)]

We found that the F_2 columns have no missing values.

In [None]:
# Split the column names into the four feature groups by the tag each name contains.
f1_columns, f2_columns, f3_columns, f4_columns = (
    [name for name in df.columns if group_tag in name]
    for group_tag in ('F_1', 'F_2', 'F_3', 'F_4')
)

Let's look at a histogram of each column.

In [None]:
# Draw one figure per feature group: a grid of 100-bin histograms, five panels per row.
data_groups = [f1_columns, f2_columns, f3_columns, f4_columns]
for n, g in enumerate(data_groups):
    if not g:
        # An empty group would produce a zero-height figure; there is nothing to plot.
        continue
    # Ceiling division: floor division under-counts the rows whenever the group size
    # is not a multiple of 5, and plt.subplot then raises on the overflow panels.
    x = -(-len(g) // 5)
    fig = plt.figure(figsize=(18,x*3))
    fig.suptitle(f'histgram of F_{n+1} ', fontsize =16)
    plt.subplots_adjust(wspace=0.4, hspace=0.3)
    for i, column in enumerate(g):
        plt.subplot(x, 5, i+1)
        # Missing values cannot be binned, so drop them explicitly before plotting.
        plt.hist(df[column].dropna(), bins =100)
        plt.ylabel('count')
        plt.xlabel(f'{column}')
    plt.show()

In [None]:
# Same histogram grid as before, but each column is first trimmed to the values lying
# between its `outerline_ratio` and `1 - outerline_ratio` quantiles (both inclusive).
data_groups = [f1_columns, f2_columns, f3_columns, f4_columns]
outerline_ratio = 0.005
for n, g in enumerate(data_groups):
    if not g:
        # An empty group would produce a zero-height figure; there is nothing to plot.
        continue
    # Ceiling division: floor division under-counts the rows whenever the group size
    # is not a multiple of 5, and plt.subplot then raises on the overflow panels.
    x = -(-len(g) // 5)
    fig = plt.figure(figsize=(18,x*3))
    fig.suptitle(f'histgram of F_{n+1} without 0.5% outlier', fontsize =16)
    plt.subplots_adjust(wspace=0.4, hspace=0.3)
    for i, column in enumerate(g):
        values = df[column]
        tempup = values.quantile(1-outerline_ratio)
        tempdown = values.quantile(outerline_ratio)
        # Filter just this column rather than copying the whole frame once per column.
        # between() is inclusive on both ends and evaluates to False for NaN, which
        # matches the original `<=` / `>=` comparison.
        temp = values[values.between(tempdown, tempup)]
        plt.subplot(x, 5, i+1)
        plt.hist(temp, bins =100)
        plt.ylabel('count')
        plt.xlabel(f'{column}')
    plt.show()

There are a few outliers.   
I tried removing the most extreme 0.5% of values at each tail, but the score did not improve.   
Therefore, I do not remove outliers.

In [None]:
# Impute the F_1 columns one at a time: fit a LightGBM regressor on the rows where the
# column is present (using every other column as a feature), score it on a 30% holdout,
# then predict the rows where the column is missing.
params = {
    "objective" : "regression",
    "metric" : "rmse"}
f1_column_results = []      # [column, holdout RMSE] for every fitted column
f1_prediction_frames = []   # one 'row-col'/'value' frame per column, concatenated once below
for column in f1_columns:
    is_missing = df[column].isnull()
    # drop() builds new frames, so nothing is deleted from (or assigned into) a slice of df.
    temp_test = df[is_missing].drop(columns=[column, 'row_id'])
    temp_train = df[~is_missing].drop(columns='row_id')
    row_ids = df.loc[is_missing, 'row_id']

    y = temp_train[column]
    X = temp_train.drop(columns=column)

    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=100)
    train_set = lgb.Dataset(train_X, train_y)
    valid_set = lgb.Dataset(test_X, test_y)
    print(f'\n\n {column}_calulate')
    model = lgb.train(
        params = params,
        train_set = train_set,
        valid_sets = [train_set, valid_set],
        num_boost_round = 4000,
        # Use the holdout set that is already being evaluated: stop once its RMSE has
        # not improved for 100 rounds instead of always running all 4000 rounds.
        callbacks = [lgb.early_stopping(stopping_rounds=100)])

    pred_test = model.predict(test_X)
    rmse = np.sqrt(mean_squared_error(test_y, pred_test))
    f1_column_results.append([column, rmse])

    pred = model.predict(temp_test)
    f1_prediction_frames.append(pd.DataFrame({
        'row-col': (row_ids.astype(str) + f'-{column}').to_numpy(),
        'value': pred}))

# Concatenate once after the loop: growing a frame with concat on every iteration is
# quadratic, and seeding it with an empty object-dtype frame makes the result dtype
# depend on how pandas treats empty inputs.
if f1_prediction_frames:
    submission_f1_df = pd.concat(f1_prediction_frames, ignore_index=True)
else:
    submission_f1_df = pd.DataFrame(columns=['row-col', 'value'])
submission_f1_df

In [None]:
# Display the [column, holdout RMSE] pairs collected while fitting the F_1 models.
f1_column_results

In [None]:
# Impute the F_3 columns one at a time: fit a LightGBM regressor on the rows where the
# column is present (using every other column as a feature), score it on a 30% holdout,
# then predict the rows where the column is missing.
params = {
    "objective" : "regression",
    "metric" : "rmse"}
f3_column_results = []      # [column, holdout RMSE] for every fitted column
f3_prediction_frames = []   # one 'row-col'/'value' frame per column, concatenated once below
for column in f3_columns:
    is_missing = df[column].isnull()
    # drop() builds new frames, so nothing is deleted from (or assigned into) a slice of df.
    temp_test = df[is_missing].drop(columns=[column, 'row_id'])
    temp_train = df[~is_missing].drop(columns='row_id')
    row_ids = df.loc[is_missing, 'row_id']

    y = temp_train[column]
    X = temp_train.drop(columns=column)

    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=100)
    train_set = lgb.Dataset(train_X, train_y)
    valid_set = lgb.Dataset(test_X, test_y)
    print(f'\n\n {column}_calulate')
    model = lgb.train(
        params = params,
        train_set = train_set,
        valid_sets = [train_set, valid_set],
        num_boost_round = 4000,
        # Use the holdout set that is already being evaluated: stop once its RMSE has
        # not improved for 100 rounds instead of always running all 4000 rounds.
        callbacks = [lgb.early_stopping(stopping_rounds=100)])

    pred_test = model.predict(test_X)
    rmse = np.sqrt(mean_squared_error(test_y, pred_test))
    f3_column_results.append([column, rmse])

    pred = model.predict(temp_test)
    f3_prediction_frames.append(pd.DataFrame({
        'row-col': (row_ids.astype(str) + f'-{column}').to_numpy(),
        'value': pred}))

# Concatenate once after the loop: growing a frame with concat on every iteration is
# quadratic, and seeding it with an empty object-dtype frame makes the result dtype
# depend on how pandas treats empty inputs.
if f3_prediction_frames:
    submission_f3_df = pd.concat(f3_prediction_frames, ignore_index=True)
else:
    submission_f3_df = pd.DataFrame(columns=['row-col', 'value'])
submission_f3_df

In [None]:
# Display the [column, holdout RMSE] pairs collected while fitting the F_3 models.
f3_column_results

In [None]:
# Impute the F_4 columns one at a time: fit a LightGBM regressor on the rows where the
# column is present (using every other column as a feature), score it on a 30% holdout,
# then predict the rows where the column is missing.
params = {
    "objective" : "regression",
    "metric" : "rmse"}
f4_column_results = []      # [column, holdout RMSE] for every fitted column
f4_prediction_frames = []   # one 'row-col'/'value' frame per column, concatenated once below
for column in f4_columns:
    is_missing = df[column].isnull()
    # drop() builds new frames, so nothing is deleted from (or assigned into) a slice of df.
    temp_test = df[is_missing].drop(columns=[column, 'row_id'])
    temp_train = df[~is_missing].drop(columns='row_id')
    row_ids = df.loc[is_missing, 'row_id']

    y = temp_train[column]
    X = temp_train.drop(columns=column)

    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=100)
    train_set = lgb.Dataset(train_X, train_y)
    valid_set = lgb.Dataset(test_X, test_y)
    print(f'\n\n {column}_calulate')
    model = lgb.train(
        params = params,
        train_set = train_set,
        valid_sets = [train_set, valid_set],
        num_boost_round = 4000,
        # Use the holdout set that is already being evaluated: stop once its RMSE has
        # not improved for 100 rounds instead of always running all 4000 rounds.
        callbacks = [lgb.early_stopping(stopping_rounds=100)])

    pred_test = model.predict(test_X)
    rmse = np.sqrt(mean_squared_error(test_y, pred_test))
    f4_column_results.append([column, rmse])

    pred = model.predict(temp_test)
    f4_prediction_frames.append(pd.DataFrame({
        'row-col': (row_ids.astype(str) + f'-{column}').to_numpy(),
        'value': pred}))

# Concatenate once after the loop: growing a frame with concat on every iteration is
# quadratic, and seeding it with an empty object-dtype frame makes the result dtype
# depend on how pandas treats empty inputs.
if f4_prediction_frames:
    submission_f4_df = pd.concat(f4_prediction_frames, ignore_index=True)
else:
    submission_f4_df = pd.DataFrame(columns=['row-col', 'value'])
submission_f4_df

In [None]:
# Display the [column, holdout RMSE] pairs collected while fitting the F_4 models.
f4_column_results

In [None]:
# Stack the predictions of the three imputed feature groups into a single frame,
# keeping only the columns the three frames have in common.
group_submissions = [submission_f1_df, submission_f3_df, submission_f4_df]
submission_all_df = pd.concat(group_submissions, join='inner')
submission_all_df

In [None]:
# Remove the 'value' column of the sample submission so that the merge in the next
# cell supplies the only 'value' column. drop(..., errors='ignore') returns a new
# frame and is a no-op when the column is already gone, so re-running this cell no
# longer raises KeyError the way `del sub['value']` did.
sub = sub.drop(columns='value', errors='ignore')

In [None]:
# Left-join the predictions onto the sample submission so the output keeps the
# sample's rows and row order.
submission = sub.merge(submission_all_df, how='left', on='row-col')

In [None]:
# Write the submission file to the Kaggle working directory, without the index column.
submission_csv_path = "/kaggle/working/submission6.csv"
submission.to_csv(submission_csv_path, index=False)

In [None]:
# Gather the per-group result lists (F_1, F_3, F_4 — in that order) into one list.
column_results = [f1_column_results, f3_column_results, f4_column_results]

In [None]:
# Display the collected per-group results (one inner list per feature group).
column_results

# Next
F_4_0, F_4_1, F_4_4, F_4_5, F_4_6, F_4_7, F_4_12   
Since the results for these columns are clearly inaccurate, something needs to be done.