# Table of Contents
* [Import and Data Preparation](#import)
* [Target](#target)
* [Visualize Features](#features)
* [Correlation of Features](#correlation)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [None]:
# never truncate the column list when a data frame is displayed
pd.options.display.max_columns = None

In [None]:
# list the files in the input data directory
# (the leading "!" is an IPython shell escape, so this line only runs inside a notebook/IPython session)
!ls -l '../input/tabular-playground-series-feb-2022/'

<a id='import'></a>
# Import and Data Preparation

In [None]:
# import data (train set, test set and sample submission) and report the load time
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted
t1 = time.perf_counter()
df_train = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv')
df_sub = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')
t2 = time.perf_counter()
print('Elapsed time [s]:', np.round(t2-t1,4))

In [None]:
# preview the first ten rows of the training set
df_train.iloc[:10]

In [None]:
# report (rows, columns) of the train and the test set
for label, frame in (('Train Set:', df_train), ('Test Set :', df_test)):
    print(label, frame.shape)

In [None]:
# structure / missing values
# verbose=True prints the full per-column summary instead of a shortened one;
# show_counts=True includes the non-null count per column, which is what
# reveals missing values
df_train.info(verbose=True, show_counts=True)

### => No missing values, nice!

In [None]:
# basic descriptive statistics of the training set columns
# (as the last expression of the cell, the resulting table is displayed)
df_train.describe()

<a id='target'></a>
# Target

In [None]:
# target - class frequencies, printed as a table and drawn as a bar chart
target_counts = df_train['target'].value_counts()
print(target_counts)
target_counts.plot(kind='bar')
plt.grid()
plt.show()

### => Target is well balanced.

In [None]:
# encode the target labels as integer category codes in a new column
target_as_category = df_train['target'].astype('category')
df_train['target_num'] = target_as_category.cat.codes

<a id='features'></a>
# Visualize Features

In [None]:
# feature columns = all columns except the row id and the two target columns
non_feature_cols = ['row_id','target','target_num']
features = df_train.columns.drop(non_feature_cols).tolist()

### Plot all feature distributions:

In [None]:
# histogram of every feature, laid out on a grid with a fixed number of columns
n_cols = 4
# derive the row count from the number of features (ceiling division) instead
# of hard-coding it; keep at least one row so the grid is always valid
n_rows = max(1, (len(features) + n_cols - 1) // n_cols)
# squeeze=False keeps axs a 2D array even for a single row;
# the height factor 300/72 reproduces the original (16, 300) figure for 72 rows
fig, axs = plt.subplots(n_rows, n_cols, figsize=(16, n_rows * 300 / 72), squeeze=False)
for current_ax, f in zip(axs.flat, features):
    current_ax.hist(df_train[f], bins=100)
    current_ax.set_title(f)
    current_ax.grid()

# hide the trailing axes of the last row that have no feature assigned
for unused_ax in axs.ravel()[len(features):]:
    unused_ax.set_visible(False)
# explicit show, consistent with the other plotting cells
plt.show()

### Visualize features via boxplots:

In [None]:
# boxplots of the first 100 features
first_chunk = features[:100]
df_train.boxplot(column=first_chunk, figsize=(16,6))
plt.title('Boxplot of features - part 1')
plt.xticks(rotation=90)
plt.show()

In [None]:
# boxplots of the features at positions 100..199
second_chunk = features[100:200]
df_train.boxplot(column=second_chunk, figsize=(16,6))
plt.title('Boxplot of features - part 2')
plt.xticks(rotation=90)
plt.show()

In [None]:
# boxplots of all remaining features (position 200 onwards)
# the open-ended slice avoids hard-coding the total number of features, so no
# feature is silently left out if the list is longer than 286 entries
df_train.boxplot(column=features[200:], figsize=(16,6))
plt.xticks(rotation=90)
plt.title('Boxplot of features - part 3')
plt.show()

### Visualize data points as lines colored by target:

In [None]:
# palette of ten named colors; the plotting cells index it by target_num
colors = [
    'red',
    'blue',
    'green',
    'orange',
    'grey',
    'cyan',
    'magenta',
    'brown',
    'darkgreen',
    'lightblue',
]

In [None]:
# draw the first 30 rows as lines across all features, colored by target code
plt.figure(figsize=(16,5))
for row_idx in range(30):
    line_color = colors[df_train.target_num[row_idx]]
    plt.plot(df_train.loc[row_idx, features], color=line_color)

plt.xticks(rotation=90)
plt.show()

In [None]:
# same line plot restricted to the first 25 features, for the first 50 rows
first_features = features[:25]
plt.figure(figsize=(16,5))
for row_idx in range(50):
    line_color = colors[df_train.target_num[row_idx]]
    plt.plot(df_train.loc[row_idx, first_features], color=line_color)

plt.xticks(rotation=90)
plt.grid()
plt.show()

In [None]:
# first 25 features again, now for the first 2000 rows
first_features = features[:25]
plt.figure(figsize=(16,5))
for row_idx in range(2000):
    line_color = colors[df_train.target_num[row_idx]]
    plt.plot(df_train.loc[row_idx, first_features], color=line_color)

plt.xticks(rotation=90)
plt.grid()
plt.show()

<a id='correlation'></a>
# Correlation of Features

In [None]:
# pairwise Pearson correlation between all feature columns
feature_frame = df_train[features]
corr_pearson = feature_frame.corr(method='pearson')

In [None]:
# heatmap of the full correlation matrix on a fixed [-1, +1] color scale
heatmap_style = dict(annot=False, cmap='RdYlGn', vmin=-1, vmax=+1)
plt.figure(figsize=(15,15))
sns.heatmap(corr_pearson, **heatmap_style)
plt.title('Pearson Correlation')
plt.show()

In [None]:
# zoom 1: correlations among the first 50 features
heatmap_style = dict(annot=False, cmap='RdYlGn', vmin=-1, vmax=+1)
corr_window = corr_pearson.iloc[:50, :50]
plt.figure(figsize=(14,13))
sns.heatmap(corr_window, **heatmap_style)
plt.title('Pearson Correlation')
plt.show()

In [None]:
# zoom 2: correlations among the features at positions 50..99
heatmap_style = dict(annot=False, cmap='RdYlGn', vmin=-1, vmax=+1)
corr_window = corr_pearson.iloc[50:100, 50:100]
plt.figure(figsize=(14,13))
sns.heatmap(corr_window, **heatmap_style)
plt.title('Pearson Correlation')
plt.show()

In [None]:
# scatter plot of one feature pair chosen as an example of strong positive
# correlation; the coefficient is shown in the title
x_vals = df_train['A1T2G6C1']
y_vals = df_train['A1T2G7C0']
cc = np.corrcoef(x_vals, y_vals)[0,1]
plt.scatter(x_vals, y_vals, alpha=0.05)
plt.title('Correlation:' + str(np.round(cc,5)))
plt.grid()
plt.show()

In [None]:
# zoom 3: features at positions 200..249 (rows) against 50..99 (columns)
heatmap_style = dict(annot=False, cmap='RdYlGn', vmin=-1, vmax=+1)
corr_window = corr_pearson.iloc[200:250, 50:100]
plt.figure(figsize=(14,13))
sns.heatmap(corr_window, **heatmap_style)
plt.title('Pearson Correlation')
plt.show()

In [None]:
# scatter plot of one feature pair chosen as an example of strong negative
# correlation; the coefficient is shown in the title
x_vals = df_train['A4T4G1C1']
y_vals = df_train['A1T2G4C3']
cc = np.corrcoef(x_vals, y_vals)[0,1]
plt.scatter(x_vals, y_vals, alpha=0.05)
plt.title('Correlation:' + str(np.round(cc,5)))
plt.grid()
plt.show()

#### Let's extract the highest correlations in a systematic way:

In [None]:
# Rearrange the correlation matrix into a long table with one row per feature
# pair: columns 'x' (row feature), 'y' (column feature) and 'corr'.
n_features = len(features)

# Index pairs (i, j) with i > j, i.e. the strict lower triangle of the matrix.
# The matrix is symmetric, so this keeps every unordered pair exactly once and
# drops the self-correlations on the diagonal. The pairs come in row-major
# order: (1,0), (2,0), (2,1), ...
idx_i, idx_j = np.tril_indices(n_features, k=-1)

# Pick names and values for all pairs at once instead of filling the table
# cell by cell in a Python double loop (n_features**2 iterations with several
# .loc writes each).
feature_names = np.asarray(features, dtype=object)
corr_values = corr_pearson.to_numpy()

corr_stats = pd.DataFrame({
    'x': feature_names[idx_i],
    'y': feature_names[idx_j],
    'corr': corr_values[idx_i, idx_j],
})

# sort by correlation (descending) and renumber the rows
corr_stats = corr_stats.sort_values(by=['corr'], ascending=False)
corr_stats = corr_stats.reset_index(drop=True)

#### Top 10 feature correlations:

In [None]:
# the ten largest correlations (the table is sorted in descending order)
corr_stats.iloc[:10]

In [None]:
# the ten smallest correlations (end of the descending-sorted table)
corr_stats.iloc[-10:]

#### Distribution of correlations:

In [None]:
# line plot of the correlation values in table order
corr_column = corr_stats['corr']
plt.figure(figsize=(8,5))
plt.plot(corr_column)
plt.grid()
plt.title('Feature Correlations - Sorted')
plt.show()

In [None]:
# histogram of the correlation values using 50 bins
corr_column = corr_stats['corr']
plt.figure(figsize=(8,5))
plt.hist(corr_column, bins=50)
plt.grid()
plt.title('Feature Correlations - Histogram')
plt.show()

#### Export results:

In [None]:
# write the pair table and the full correlation matrix to CSV files
# in the current working directory
exports = (
    (corr_stats, 'corr_stats.csv'),
    (corr_pearson, 'corr_pearson.csv'),
)
for frame, filename in exports:
    frame.to_csv(filename)

### See also https://www.kaggle.com/docxian/tpg-2022-feb-eda-gbm-starter for a first modeling approach.