In [None]:
import re
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "Paired" colour palette as the default for the plots that follow.
sns.set_palette("Paired")

In [None]:
# Directory that holds the competition CSV files. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = '/kaggle/input/tabular-playground-series-feb-2022'

# Load the training set, the test set and the sample submission.
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Is there at least one missing value anywhere in the training frame?
# First reduce each column to "has a null", then reduce across columns.
df_train.isna().any().any()

## Target Categories

In [None]:
# Independent copy of the label column, so helper columns can be added to it
# later without modifying df_train.
df_train_target = df_train.loc[:, ['target']].copy()

# Everything between the first and the last column of df_train.
# NOTE(review): presumably the first column is a row id and the last one is
# the target, leaving only the feature columns here — confirm against the CSV.
df_train_raw = df_train.iloc[:, 1:-1]

In [None]:
# One stacked panel per distinct target label; the shared x-axis keeps the
# panels directly comparable.
n_panels = df_train_target.target.nunique()
fig, axes = plt.subplots(n_panels, 1, figsize=(20, 15), sharex=True)

for panel, label in zip(axes, df_train_target.target.unique()):
    # Rows belonging to this target, restricted to the columns between the
    # first and the last one, then averaged per column.
    rows_for_label = df_train[df_train.target == label]
    column_means = rows_for_label.iloc[:, 1:-1].mean(axis=0)

    # Histogram (with KDE overlay) of the per-column means for this target.
    hist = sns.histplot(column_means, kde=True, label=label, legend=True, ax=panel)
    hist.legend()

axes[0].set_title("The average distribution for each target (column-wise)")
axes[-1].set_xlabel("column-wise average")
fig.tight_layout()

## Distribution of the two parts of the targets as prefix and suffix 

In [None]:
# Split every label on '_' exactly once and reuse the result for both new
# columns, instead of splitting twice and indexing with a Python-level lambda.
target_parts = df_train_target.target.str.split('_')

# First and second token of the split label. The vectorised `.str[i]` accessor
# yields NaN for a label that has no i-th token, rather than raising an
# IndexError from inside `apply`.
df_train_target['target_prefix'] = target_parts.str[0]
df_train_target['target_suffix'] = target_parts.str[1]

In [None]:
# One stacked panel per distinct target prefix; the shared x-axis keeps the
# panels directly comparable.
fig, axes = plt.subplots(df_train_target.target_prefix.nunique(), 1, figsize=(20, 15), sharex=True)

for ax, prefix in zip(axes, df_train_target.target_prefix.unique()):
    # df_train_target is derived from df_train, so the boolean mask
    # lines up with df_train row by row.
    rows_for_prefix = df_train[df_train_target.target_prefix == prefix]

    # Per-column mean over the columns between the first and the last one.
    column_means = rows_for_prefix.iloc[:, 1:-1].mean(axis=0)

    p1 = sns.histplot(column_means, kde=True, label=prefix, legend=True, ax=ax)
    p1.legend()

axes[0].set_title("The average of distribution for each target_prefix (column-wise)")
# Same x-axis label as the per-target and per-suffix figures, which plot the
# same quantity.
axes[-1].set_xlabel("column-wise average")
fig.tight_layout()

In [None]:
# One stacked panel per distinct target suffix; the shared x-axis keeps the
# panels directly comparable.
n_panels = df_train_target.target_suffix.nunique()
fig, axes = plt.subplots(n_panels, 1, figsize=(20, 15), sharex=True)

for panel, suffix in zip(axes, df_train_target.target_suffix.unique()):
    # Rows whose label ends in this suffix, restricted to the columns between
    # the first and the last one, then averaged per column.
    rows_for_suffix = df_train[df_train_target.target_suffix == suffix]
    column_means = rows_for_suffix.iloc[:, 1:-1].mean(axis=0)

    # Histogram (with KDE overlay) of the per-column means for this suffix.
    hist = sns.histplot(column_means, kde=True, label=suffix, legend=True, ax=panel)
    hist.legend()

axes[0].set_title("The average of distribution for each target_suffix (column-wise)")
axes[-1].set_xlabel("column-wise average")
fig.tight_layout()

## Column Analysis

In [None]:
# Length of every feature-column name, computed once and reused below.
column_lengths = [len(c) for c in df_train_raw.columns]

# Feature names grouped by name length; these lists are reused by later cells.
columns_len_9 = [c for c, n in zip(df_train_raw.columns, column_lengths) if n == 9]
columns_len_8 = [c for c, n in zip(df_train_raw.columns, column_lengths) if n == 8]

fig, ax = plt.subplots(1, 1, figsize=(12, 5))

# Draw on the axes created above explicitly instead of relying on
# matplotlib's implicit "current axes".
sns.countplot(x=column_lengths, ax=ax)
ax.set(title="Frequency of column lengths", xlabel="column lengths")

# Annotate each bar with its count.
ax.bar_label(ax.containers[0], fmt='%d', label_type='edge')
fig.tight_layout()

## Sum of molecules

In [None]:
# Feature names are expected to start with A<n>T<n>G<n>C<n>, where each <n> is
# a non-negative integer. Compile the pattern once and read the four counts
# from its capture groups.
dna_pattern = re.compile(r"^A([0-9]+)T([0-9]+)G([0-9]+)C([0-9]+)")

# Per-letter list of counts, one entry per processed column name.
dna_list = {'a': [], 't': [], 'g': [], 'c': []}

for col in columns_len_8 + columns_len_9:
    parsed = dna_pattern.match(col)
    if parsed is None:
        # Fail with a message naming the offending column, rather than an
        # opaque tuple-unpacking error.
        raise ValueError(f"Column name {col!r} does not match the A#T#G#C# pattern")

    # Capture groups come back in A, T, G, C order, matching the keys below.
    for key, count in zip(('a', 't', 'g', 'c'), parsed.groups()):
        dna_list[key].append(int(count))

In [None]:
# Check that, for every parsed column name, the four letter counts add up to 10.
pd.DataFrame(dna_list).sum(axis=1).eq(10).all()

In [None]:
# 2x2 grid: one horizontal bar chart per entry in dna_list.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

for panel, (letter, counts) in zip(axes.ravel(), dna_list.items()):
    # How often each count value occurs for this letter across the column names.
    count_frame = pd.DataFrame({letter.upper(): counts})
    frequencies = count_frame.value_counts()
    frequencies.plot.barh(ax=panel)

fig.suptitle("Molecule frequencies")
fig.tight_layout()

## Correlations of the variables

In [None]:
# Pearson correlation between the feature columns, estimated on a random
# subset of 10,000 rows. The fixed random_state makes the sample, and so
# the figure, reproducible across "Restart & Run All".
correlation = df_train_raw.sample(10000, random_state=42).corr(method='pearson')

# Hide the upper triangle (diagonal included): the matrix is symmetric, so it
# would only repeat the lower triangle.
mask = np.triu(np.ones_like(correlation, dtype=bool))

fig, ax = plt.subplots(figsize=(15, 10))
# Draw on the axes created above explicitly instead of relying on
# matplotlib's implicit "current axes".
sns.heatmap(correlation, mask=mask, center=0, linewidths=1, ax=ax)

ax.set(title='Pearson correlation of 10k samples regardless of the target distribution')
ax.set_xticklabels(ax.get_xticklabels(), rotation=80)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
fig.tight_layout()

## Relations of target distribution

In [None]:
# Mean of every feature column per target: one row per feature, one column
# per target label. The aggregation is given by name ('mean') because
# recent pandas versions deprecate passing the NumPy callable `np.mean`.
df_pivot_targets = pd.pivot_table(
    df_train,
    columns=['target'],
    values=df_train_raw.columns,
    aggfunc='mean',
)
df_pivot_targets.head()

In [None]:
# Pearson correlation between the columns of the pivot table.
correlation = df_pivot_targets.corr(method='pearson')

# Boolean mask covering the upper triangle (diagonal included), so only the
# lower triangle of the symmetric matrix is drawn.
upper_triangle = np.triu(np.ones(correlation.shape, dtype=bool))

n_variables = len(df_train_raw.columns)

fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(
    correlation,
    mask=upper_triangle,
    center=0,
    linewidths=1,
    fmt='.2f',
    annot=True,
    ax=ax,
)
ax.set_title('Average correlation of target over the ' + str(n_variables) + ' variables')
ax.set_xticklabels(ax.get_xticklabels(), rotation=80)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
fig.tight_layout()

The following chart is admittedly difficult to interpret; however, we can at least see the general trends and infer which variables could play a role in finding the right target. For example, to eliminate more variables (i.e. columns), we can look only at the columns where the value is less than `-0.0005` or greater than `0.0005`. This filter lets us ignore a few variables that share the same trend.

In [None]:
# One panel per column of df_pivot_targets, each with its own x-axis.
n_panels = len(df_pivot_targets.columns)
fig, axes = plt.subplots(n_panels, 1, figsize=(30, 30), sharex=False)

for panel, column_name in zip(axes.ravel(), df_pivot_targets.columns):
    # Zero reference line.
    panel.axhline(y=0, color='black')

    # Keep only the entries lying outside the band [-0.0005, 0.0005].
    column_values = df_pivot_targets[column_name]
    outside_band = (column_values < -0.0005) | (column_values > 0.0005)

    trend = sns.lineplot(data=column_values[outside_band], ax=panel)
    # Rotate the tick labels so the index labels stay readable.
    plt.setp(trend.get_xticklabels(), rotation=90)

fig.suptitle("Trends of variables in the range of (> 0.0005) or (< -0.0005)")
fig.tight_layout()

Thanks!