In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip install -U kaleido
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import missingno as msno

In [None]:
# Load the competition table; the `row_id` column becomes the DataFrame index.
data = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2022/data.csv',
    index_col='row_id',
)

# 0. View Data

In [None]:
# Preview the first five rows (5 is the pandas default for head()).
data.head(n=5)

In [None]:
# Reproducible 1% random subset of the rows (seed 1), put back into index order.
# The plotting cells further down draw from this subset rather than the full table.
data_one_percent = (
    data
    .sample(frac=0.01, random_state=1)
    .sort_index()
)

In [None]:
# 16 x 5 grid: one scatter panel per feature, titled with the column name.
fig = make_subplots(rows=16, cols=5, subplot_titles=data_one_percent.columns)

for i in range(80):
    # Fill the grid row by row: divmod gives the zero-based (row, col) of panel i.
    grid_row, grid_col = divmod(i, 5)
    panel = go.Scatter(x=data_one_percent.index, y=data_one_percent.iloc[:, i])
    fig.add_trace(panel, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    title_text="Data Visualization",
    title_x=0.5,
    width=1000,
    height=2000,
    showlegend=False,
)
fig.show()

# 1. Continuous and Discrete Data

In [None]:
# Number of distinct values per column, as a one-column frame named `N_unique`.
Nuni = data.nunique().rename('N_unique').to_frame()

In [None]:
# One bar per column; bar height is that column's distinct-value count.
fig = px.bar(data_frame=Nuni, y='N_unique')
fig.show()

In [None]:
# Columns with more than 1e5 distinct values are treated as continuous.
is_continuous = Nuni['N_unique'] > 1e5
cont_cols = Nuni.index[is_continuous].tolist()
print('Continues columns are:')
print(cont_cols)

In [None]:
# Smallest distinct-value count among the continuous columns.
# Series.min() is used instead of builtin min() over `.values.squeeze()`:
# with a single selected column squeeze() yields a 0-d array, which builtin
# min() cannot iterate over.
cont_min = Nuni.loc[cont_cols, 'N_unique'].min()
print(f'The minmum no. of unique values of continues columns is {cont_min}.')

In [None]:
# Columns with at most 1e5 distinct values are treated as discrete.
is_discrete = Nuni['N_unique'] <= 1e5
disc_cols = Nuni.index[is_discrete].tolist()
print('Discrete columns are:')
print(disc_cols)

In [None]:
# Largest distinct-value count among the discrete columns.
# Series.max() is used instead of builtin max() over `.values.squeeze()`:
# with a single selected column squeeze() yields a 0-d array, which builtin
# max() cannot iterate over.
disc_max = Nuni.loc[disc_cols, 'N_unique'].max()
print(f'The maximum no. of unique values of discrete columns is {disc_max}.')

In [None]:
# Distribution of every feature on the 1% sample, using only rows with no missing value.
# Continuous columns get a density curve (first trace of a distplot with histogram and
# rug switched off); all other columns get a plain histogram.
data_no_nan = data_one_percent.dropna()
fig = make_subplots(rows=16, cols=5, subplot_titles=data_one_percent.columns)
for i in range(80):
    grid_row, grid_col = divmod(i, 5)
    col_name = data_one_percent.columns[i]
    col_values = data_no_nan.iloc[:, i]
    if col_name in cont_cols:
        density = ff.create_distplot([col_values.values], [col_name], show_hist=False, show_rug=False)
        trace = density.data[0]
    else:
        trace = go.Histogram(x=col_values)
    fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    title_text="Data Distribution",
    title_x=0.5,
    width=1000,
    height=2000,
    showlegend=False,
    bargap=0.2,
)
fig.show()

## Insight
According to the number of unique values, F_1_x, F_3_x, and F_4_x are continuous variables, while F_2_x are discrete variables.

# 2. Data Description

In [None]:
# Summary statistics of the continuous columns, one row per feature after transposing.
cont_describe = data[cont_cols].describe().T
cont_describe.head(n=5)

In [None]:
# Four stacked panels (mean, std, max, min), each plotted across the continuous features.
fig = make_subplots(rows=4, cols=1, vertical_spacing=0.17)

for panel_row, stat in enumerate(['mean', 'std', 'max', 'min'], start=1):
    line = go.Scatter(x=cont_describe.index, y=cont_describe[stat], name=stat)
    fig.add_trace(line, row=panel_row, col=1)

fig.update_layout(
    title_text="Continuous Variables Description",
    title_x=0.5,
    width=1000,
    height=700,
)
fig.show()

In [None]:
# Summary statistics of the discrete columns, one row per feature after transposing.
disc_describe = data[disc_cols].describe().T
disc_describe.head(n=5)

In [None]:
# Four stacked panels (mean, std, max, min), each plotted across the discrete features.
# Every panel reads from `disc_describe`. The max and min panels previously read
# `cont_describe`, which paired continuous-feature values with discrete-feature labels.
fig = make_subplots(rows=4, cols=1, vertical_spacing=0.1)

for panel_row, stat in enumerate(['mean', 'std', 'max', 'min'], start=1):
    fig.add_trace(
        go.Scatter(x=disc_describe.index, y=disc_describe[stat], name=stat),
        row=panel_row,
        col=1,
    )

fig.update_layout(title_text="Discrete Variables Description", title_x=0.5, width=1000, height=700)
fig.show()

## Insight:
* Continuous variables F_1_x and F_3_x generally have a mean of 0, a standard deviation of 1, a maximum of 5, and a minimum of -5.
* Continuous variables F_4_x have a non-zero mean and a standard deviation different from 1.
* Discrete variables F_2_x have a non-zero mean and a standard deviation different from 1.
* Except for F_2_7, F_2_12, and F_2_13, the other features have a maximum of 5 and a minimum of -5.

# 3. Missing Data Summary

In [None]:
# Draw missingno's matrix plot for the full table to show where values are missing.
msno.matrix(data)

In [None]:
# Missing-value count per continuous column, as a one-column frame named `N_nan`.
cont_nan = data[cont_cols].isna().sum().rename('N_nan').to_frame()

In [None]:
# Missing-value count for each continuous feature.
fig = px.line(
    data_frame=cont_nan,
    y="N_nan",
    title='No. of Missing Values for Continuous Variables',
)
fig.show()

In [None]:
# Missing-value count per discrete column, as a one-column frame named `N_nan`.
disc_nan = data[disc_cols].isna().sum().rename('N_nan').to_frame()

In [None]:
# Missing-value count for each discrete feature.
fig = px.line(
    data_frame=disc_nan,
    y="N_nan",
    title='No. of Missing Values for Discrete Variables',
)
fig.show()

In [None]:
# Missing-value count per row, over the continuous columns only.
# Summing along axis=1 gives the same per-row counts as transposing first and
# summing per column, without building a transposed copy of the whole frame.
cont_nan_T = data[cont_cols].isnull().sum(axis=1).to_frame('N_nan')

In [None]:
# Histogram of how many values are missing in each row.
fig = px.histogram(
    data_frame=cont_nan_T,
    x="N_nan",
    title='No. of Missing Values in Each Row',
)
fig.show()

In [None]:
# Largest number of missing values found in any single row.
max_missing_per_row = cont_nan_T['N_nan'].max()
print('The number of the missing values in each row at most is', max_missing_per_row)

## Insight
* Only continuous variables have missing values!!!
* One row may have more than 1 missing value. If we would like to predict the missing values from other columns by a regression model, it should be noticed that the regressors of the model may contain missing values, unless the regressors are discrete variables.

# 4. Correlation & Autocorrelation Analysis

In [None]:
# Pairwise Pearson correlation between all columns (Pearson is the DataFrame.corr default).
r_matrix = data.corr(method='pearson')

In [None]:
# Square heatmap of the correlation matrix, each cell annotated to two decimals.
heatmap_side = 2000
fig = px.imshow(r_matrix, text_auto=".2f", width=heatmap_side, height=heatmap_side)
fig.update_layout(title_text="Pearson's Correlation Heatmap", title_x=0.5)
fig.show()

In [None]:
# Optional: save the current figure as a static PNG (left disabled).
# NOTE(review): presumably needs the `kaleido` package, whose install line is also
# commented out in the imports cell -- confirm before enabling.
# fig.write_image("fig1.png")

In [None]:
# Lag-1 plot for each of the 80 features, laid out on a 16 x 5 grid.
fig = plt.figure(figsize=(15, 20), dpi=80)
for i in range(80):
    ax = fig.add_subplot(16, 5, i + 1)
    ax.title.set_text(data.columns[i])
    # Draw onto this subplot explicitly rather than relying on the current axes.
    pd.plotting.lag_plot(data.iloc[:, i], lag=1, ax=ax)
plt.suptitle("Autocorrelation of Data")
plt.show()

## Insight
* Variables F_2_x have low correlation with one another. However, as F_2_x have no missing values, this correlation will not help us predict the missing values.
* Variables F_4_x have moderate correlation with one another. This correlation may help us build a model to predict the missing values in F_4_x.
* No autocorrelation is found in this dataset, which implies the data is not time-series data and we cannot use the neighbour data to predict the missing values.

# 5. Imputation
References:
* MARTYNOV ANDREY's [notebook](https://www.kaggle.com/code/martynovandrey/tps-jun-22-splitted-dataset-24x-faster)
* Carl McBride Ellis's [discussion](https://www.kaggle.com/competitions/tabular-playground-series-jun-2022/discussion/328358)
* LIAM MORGAN's [notebook](https://www.kaggle.com/code/lmorgan95/missforest-the-best-imputation-algorithm/)
* sklearn: [Imputation of missing values](https://scikit-learn.org/stable/modules/impute.html)


## Split data
Code from MARTYNOV ANDREY's [notebook](https://www.kaggle.com/code/martynovandrey/tps-jun-22-splitted-dataset-24x-faster)

In [None]:
# Group the column names by the number between the underscores (the G in "F_G_n").
# Slot 0 is left empty so the list index matches the group number 1..4.
col_split = [[] for _ in range(5)]
for feature in data.columns:
    group = feature.split('_')[1]
    if group in ('1', '2', '3', '4'):
        col_split[int(group)].append(feature)

# One sub-frame per group, again with an unused placeholder at index 0.
data_split = [[]] + [data[col_split[group]] for group in (1, 2, 3, 4)]

# sk-learn Imputer
* Use `SimpleImputer` for features F_1_* and F_3_*, where the missing values are imputed with the column means.
* Use `IterativeImputer` for features F_4_*, where the features are correlated with one another. The detailed algorithm for `IterativeImputer` can be found in LIAM MORGAN's [notebook](https://www.kaggle.com/code/lmorgan95/missforest-the-best-imputation-algorithm/).

In [None]:
from sklearn.impute import SimpleImputer

# Mean-impute feature groups 1 and 3, each column with its own mean.
# The result is wrapped in a new DataFrame (same index and columns) rather than
# written back with `data_split[i][:] = ...`: that form assigns into a frame that
# was sliced out of `data`, which pandas reports as chained assignment
# (SettingWithCopyWarning).
for i in [1, 3]:
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    group_frame = data_split[i]
    data_split[i] = pd.DataFrame(
        imp.fit_transform(group_frame),
        index=group_frame.index,
        columns=group_frame.columns,
    )

In [None]:
# IterativeImputer is experimental in scikit-learn; this enabling import must come first.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# Impute feature group 4 with IterativeImputer.
# The result is wrapped in a new DataFrame (same index and columns) rather than
# written back with `data_split[4][:] = ...`: that form assigns into a frame that
# was sliced out of `data`, which pandas reports as chained assignment
# (SettingWithCopyWarning).
imp = IterativeImputer(max_iter=10, random_state=0)
group4_frame = data_split[4]
data_split[4] = pd.DataFrame(
    imp.fit_transform(group4_frame),
    index=group4_frame.index,
    columns=group4_frame.columns,
)

## Submission

In [None]:
from tqdm import tqdm

# Sample submission, indexed by its "row-col" key column.
submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2022/sample_submission.csv',
    index_col='row-col',
)
# Stitch the four feature groups back together side by side (slot 0 is a placeholder).
d = pd.concat(data_split[1:], axis=1)

In [None]:
# Fill `value` for every submission key in one vectorized lookup, replacing the
# per-key Python loop that did one .loc read and one .loc write per key.
# Each key is "<row_id>-<column name>"; as before, the first piece is the integer
# row label and the second piece is the column label.
key_parts = submission.index.to_series().str.split('-')
row_labels = key_parts.str[0].astype(int)
col_labels = key_parts.str[1]

# Translate labels into positions in `d`; get_indexer returns -1 for unknown labels.
row_pos = d.index.get_indexer(row_labels)
col_pos = d.columns.get_indexer(col_labels)

# Still fail with a KeyError on an unknown key, as the per-key .loc lookup did.
not_found = (row_pos < 0) | (col_pos < 0)
if not_found.any():
    raise KeyError(
        f'{int(not_found.sum())} submission keys are not present in the imputed data, '
        f'e.g. {submission.index[not_found][0]!r}'
    )

# Paired (row, column) positional indexing picks exactly one cell per key.
submission['value'] = d.to_numpy()[row_pos, col_pos]

submission.to_csv('submission.csv')