In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from lightgbm import LGBMRegressor

from tqdm import tqdm

# Notebook-wide pandas display settings: never truncate the column list when a
# frame is rendered, and format floats with two decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Load the competition files. The feature table is indexed by its `row_id`
# column; the sample submission is indexed by its `row-col` key column.
data_df = pd.read_csv(
    '../input/tabular-playground-series-jun-2022/data.csv',
    index_col='row_id',
)
submission = pd.read_csv(
    '../input/tabular-playground-series-jun-2022/sample_submission.csv',
    index_col='row-col',
)

# Report the (rows, columns) shape, then preview the first rows as cell output.
print(data_df.shape)
data_df.head()

In [None]:
# Print pandas' structural summary of the feature table (per-column dtype and
# non-null count) to see which columns contain missing values.
data_df.info()

In [None]:
# Descriptive statistics, transposed so each feature is a row; the Styler
# draws an in-cell bar per value so magnitudes can be compared at a glance.
summary_stats = data_df.describe().T
display(summary_stats.style.bar())

In [None]:
# Partition the feature names by storage dtype, keeping the frame's column
# order: float64 columns in one list, int64 columns in the other.
float_cols = data_df.select_dtypes(include=['float64']).columns.tolist()
int_cols = data_df.select_dtypes(include=['int64']).columns.tolist()

print(f'Number of all columns: {len(data_df.columns)}\nNumber of float columns: {len(float_cols)}\nNumber of int columns: {len(int_cols)}')

In [None]:
# Density plot for every float column, laid out on a 5-wide grid.
# The number of rows is derived from the column count rather than hard-coded,
# so no column is silently dropped if the feature set changes.
n_grid_cols = 5
n_grid_rows = max(1, -(-len(float_cols) // n_grid_cols))  # ceiling division
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(14, 14), squeeze=False)

# Wrapping the list itself (not a zip) lets tqdm know the total and show progress.
for i, col in enumerate(tqdm(float_cols)):
    num_row, num_col = divmod(i, n_grid_cols)
    # `fill=` is the current name of the deprecated `shade=` argument.
    sns.kdeplot(data=data_df, x=col, fill=True, ax=ax[num_row, num_col])

# Hide any panels left over when the column count is not a multiple of 5.
for unused_ax in ax.flat[len(float_cols):]:
    unused_ax.set_visible(False)

# Lay out only after drawing so the axis labels added above are accounted for.
fig.tight_layout()

In [None]:
# Value-count bar chart for every integer column, laid out on a 5-wide grid.
# The number of rows is derived from the column count rather than hard-coded,
# so no column is silently dropped if the feature set changes.
n_grid_cols = 5
n_grid_rows = max(1, -(-len(int_cols) // n_grid_cols))  # ceiling division
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(14, 14), squeeze=False)

# Wrapping the list itself (not a zip) lets tqdm know the total and show progress.
for i, col in enumerate(tqdm(int_cols)):
    num_row, num_col = divmod(i, n_grid_cols)
    sns.countplot(data=data_df, x=col, ax=ax[num_row, num_col])

# Hide any panels left over when the column count is not a multiple of 5.
for unused_ax in ax.flat[len(int_cols):]:
    unused_ax.set_visible(False)

# Lay out only after drawing so the axis labels added above are accounted for.
fig.tight_layout()

In [None]:
# Pairwise correlation of all features, drawn as one large annotated heatmap.
corr_matrix = data_df.corr()
colormap = plt.cm.PuRd

_, ax = plt.subplots(figsize=(24, 24))

# Styling gathered in one place: two-decimal black annotations, a shrunken
# colour bar, and a colour scale pinned to the full [-1, 1] correlation range.
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    square=False,
    cmap=colormap,
    annot_kws={"size": 7, 'color': 'black'},
    cbar_kws={"shrink": .4},
    vmin=-1,
    vmax=1,
)
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)

In [None]:
# Fill missing values: median for the integer-typed columns, mean for the
# float columns.
# NOTE(review): columns stored as int64 cannot hold NaN, so the median imputer
# presumably has nothing to fill and mainly casts those columns to float — confirm.
skew_imputer = SimpleImputer(strategy='median')
norm_imputer = SimpleImputer(strategy='mean')

# fit_transform returns a bare array, so the row labels must be re-attached
# explicitly. Without `index=` the frames would get a positional 0..n-1 index
# and any later label-based lookup by row_id would only be correct by coincidence.
skew_cols = pd.DataFrame(
    skew_imputer.fit_transform(data_df[int_cols]),
    index=data_df.index,
    columns=int_cols,
)
norm_cols = pd.DataFrame(
    norm_imputer.fit_transform(data_df[float_cols]),
    index=data_df.index,
    columns=float_cols,
)

# Recombine side by side (integer-typed columns first, then float columns).
data = pd.concat([skew_cols, norm_cols], axis=1)

# Sanity check: total count of remaining missing cells, expected to print 0.
print(data.isnull().sum().sum())
data.head()

In [None]:
# Fill the submission with imputed values. Each submission key has the form
# '<row_id>-<column name>'; split on the first dash only, so the column name
# is taken whole.
key_parts = [key.split('-', 1) for key in submission.index]
row_ids = [int(row_id) for row_id, _col_name in key_parts]
col_names = [col_name for _row_id, col_name in key_parts]

# Translate labels to integer positions once, then gather every requested
# cell in a single array lookup instead of one `.loc` read/write per key.
row_pos = data.index.get_indexer(row_ids)
col_pos = data.columns.get_indexer(col_names)

# get_indexer marks unknown labels with -1, which would otherwise silently
# select the last row/column; fail loudly as a label lookup would.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('submission references a row_id or column that is missing from `data`')

submission['value'] = data.to_numpy()[row_pos, col_pos]

submission.to_csv('submission.csv')