In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from tqdm import tqdm
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import xgboost as xgb
import lightgbm as lgb

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))
import warnings
warnings.filterwarnings('ignore')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory that holds the competition files
path = r'../input/tabular-playground-series-jun-2022'

# Main data table, indexed by its 'row_id' column
df = pd.read_csv(f'{path}/data.csv', index_col='row_id')
df.head()

In [None]:
# Submission template, indexed by its 'row-col' column
df_sample = pd.read_csv(f'{path}/sample_submission.csv', index_col='row-col')
df_sample.head()

In [None]:
# Check the percentage of missing values in each feature column.
# A single vectorised pass over the frame replaces the per-column Python loop;
# the result is kept as a plain list, one entry per column in df.columns order.
missing_value_percent = (df.isnull().sum() / df.shape[0] * 100).tolist()

fig, ax = plt.subplots(figsize=(20, 4))
ax.bar(x=df.columns, height=missing_value_percent)
# Rotate the tick labels that bar() already created instead of overwriting them
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel('feature')
ax.set_ylabel('missing values (%)')
ax.set_title('Share of missing values per feature')

# plt.show() renders the figure with the notebook's inline backend;
# Figure.show() is meant for GUI backends and warns otherwise.
plt.show()

### Not all features contain missing values; the F_2_xx series has none.
### For the features that do contain missing values, the NaN share is around 1.75%.

### Some of these vectors are centered around zero, ranging over either (-5, 5) or (-10, 10);
### some contain outliers that push the distribution either upwards (0, 5) or downwards (-10, 0);
### the F_2_xx series is positive only (0, 10).

### Some feature vectors, such as those in F_2_xx and F_4_xx, are correlated with other vectors in the same series,
### but the others are not.

In [None]:
# Group the column names into the four series by their 'F_#' prefix.
# Columns that match none of the four prefixes are left out of every list.

df_f_1_cols = [name for name in df.columns if name.startswith('F_1')]
df_f_2_cols = [name for name in df.columns if name.startswith('F_2')]
df_f_3_cols = [name for name in df.columns if name.startswith('F_3')]
df_f_4_cols = [name for name in df.columns if name.startswith('F_4')]

In [None]:
# take a closer look at F_2_xx series

df_f_2 = df[df_f_2_cols]

# Compute the correlation matrix once; it was previously computed twice
corr_f_2 = df_f_2.corr()
# Boolean mask hiding the upper triangle (diagonal included). Building it from
# ones instead of the correlation values keeps a cell masked even when its
# correlation is exactly zero.
mask = np.triu(np.ones_like(corr_f_2, dtype=bool))

fig, ax3 = plt.subplots(figsize=(20, 18))
# Draw explicitly on the axes created above
sns.heatmap(corr_f_2, mask=mask, cmap='coolwarm', linewidths=0.5, annot=True, fmt='.2f', ax=ax3)

In [None]:
# Box plot of every F_2_xx column
df_f_2.plot.box(figsize=(16, 10))

In [None]:
# take a closer look at F_4_xx series

df_f_4 = df[df_f_4_cols]

# Compute the correlation matrix once; it was previously computed twice
corr_f_4 = df_f_4.corr()
# Boolean mask hiding the upper triangle (diagonal included). Building it from
# ones instead of the correlation values keeps a cell masked even when its
# correlation is exactly zero.
mask = np.triu(np.ones_like(corr_f_4, dtype=bool))

fig, ax3 = plt.subplots(figsize=(16, 12))
# Draw explicitly on the axes created above
sns.heatmap(corr_f_4, mask=mask, cmap='coolwarm', linewidths=0.5, annot=True, fmt='.2f', ax=ax3)

In [None]:
# Box plot of every F_4_xx column
df_f_4.plot.box(figsize=(12, 8))

In [None]:
# Build the F_1_xx and F_3_xx sub-frames as well
df_f_1, df_f_3 = df[df_f_1_cols], df[df_f_3_cols]

In [None]:
# Box plot of every F_1_xx column
df_f_1.plot.box(figsize=(12, 8))

In [None]:
# Box plot of every F_3_xx column
df_f_3.plot.box(figsize=(20, 8))

In [None]:
# As the entire F_2 series contains no missing values,
# it will be used as training data to impute the other vectors.

# here are a few ideas:
# Strategy # 1: use F_2 to do lgb regression to impute other vectors, F_1, 3, and 4 >> 1.41652
# Strategy # 2: mean imputation for F_1 and F_3, leave F_2 alone, 


# imputation technique can be lightgbm regression as it is swift
# other candidates: xgboost, or neural networks

In [None]:
# Strategy # 1
# Use the F_2 columns as predictors and fit one LightGBM regressor per target
# column in the F_1, F_3 and F_4 series. Each model is trained on the rows where
# the target is present and used to fill the rows where it is missing.

df_imputation = df.copy()

# Predictor matrix shared by every model (same columns and row order as df)
f_2_features = df[df_f_2_cols]
target_cols = df_f_1_cols + df_f_3_cols + df_f_4_cols

# Iterate over the flat column list so the progress bar advances per column
for col in tqdm(target_cols):
    is_missing = df[col].isna()

    # Nothing to impute: skip, so the model is never asked to predict on zero rows
    if not is_missing.any():
        continue

    model = lgb.LGBMRegressor()
    model.fit(f_2_features.loc[~is_missing], df.loc[~is_missing, col])

    # Write the predictions straight into the missing cells; observed values
    # in df_imputation stay untouched.
    df_imputation.loc[is_missing, col] = model.predict(f_2_features.loc[is_missing])

In [None]:
# Strategy # 2 (TODO: not implemented yet)




In [None]:
# Fill the submission with the imputed values.
# Every df_sample index entry is split on '-' into a row label (cast to int)
# and a column label, exactly as the original per-row loop did, but the lookup
# is done in one vectorised step instead of one .loc assignment per entry.
sample_keys = df_sample.index.to_series().str.split('-', expand=True)
row_pos = df_imputation.index.get_indexer(sample_keys[0].astype(int))
col_pos = df_imputation.columns.get_indexer(sample_keys[1])

# get_indexer reports unknown labels as -1, which would silently pick the last
# row/column when used as a position; fail loudly instead, as .loc would have.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('sample submission references a row or column that is not in df_imputation')

df_sample['value'] = df_imputation.to_numpy()[row_pos, col_pos]

df_sample.to_csv('submission.csv')

In [None]:
# keep updating