# Visual Exploration of the Data - Table of Contents
* [Import and first EDA](#import_eda)
* [Pick an example investment](#pick_ex)
* [Compare two investments](#compare)
* [Check some features](#feat)

In [None]:
# packages

# basics
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# stats
import scipy.stats
from fitter import Fitter, get_common_distributions, get_distributions

In [None]:
# single color shared by the plots in this notebook
my_color = 'darkcyan'

# percentiles passed to the describe() calls
my_percs = [
    0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99,
]

<a id='import_eda'></a>
# Import and first EDA

#### We load only a (column-wise) subset due to the massive size of the data set. The full training data additionally contains row_id (the concatenation of time_id and investment_id) and 300 numerical features (f_0..f_299):

In [None]:
# Load a minimal version of the data: only the id columns and the target.
# Even this column subset takes a few minutes to read.
my_cols = ['time_id', 'investment_id', 'target']

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the file is being read.
t1 = time.perf_counter()
df = pd.read_csv('../input/ubiquant-market-prediction/train.csv', usecols=my_cols)
t2 = time.perf_counter()
print('Elapsed time [s]:', np.round(t2-t1,2))

In [None]:
# first glance: display the first rows of the loaded frame
df.head()

In [None]:
# dimension: (number of rows, number of columns) of the loaded frame
df.shape

#### We have more than 3 million rows here...

In [None]:
# how many rows exist for each time_id
df['time_id'].value_counts()

In [None]:
# how many rows exist for each investment_id
df['investment_id'].value_counts()

#### We have investments that are only sparsely available... Let's try to visualize (black: not available): 

In [None]:
# "Availability" matrix: one row per time_id, one column per investment_id,
# each cell counting the rows found for that combination.
gaps = pd.crosstab(index=df['time_id'], columns=df['investment_id'])

# tall figure so the many time_ids stay distinguishable; no colorbar
plt.figure(figsize=(16, 30))
sns.heatmap(data=gaps, cbar=False)
plt.show()

In [None]:
# export the "availability matrix" to a CSV file in the working directory
gaps.to_csv('gaps.csv')

In [None]:
# select all rows of one "sparse" investment as an example
my_id = 232
df_sparse = df.loc[df['investment_id'] == my_id]
df_sparse

In [None]:
# target of the sparse example over time; the gaps show up as empty stretches
plt.figure(figsize=(14, 4))
plt.scatter(x=df_sparse['time_id'], y=df_sparse['target'], color=my_color, alpha=0.25)
plt.title(f'Investment id={my_id}')
plt.xlabel('time_id')
plt.ylabel('target')
plt.grid()
plt.show()

### Target

In [None]:
# histogram of the target over all investments
plt.figure(figsize=(10,4))
df.target.plot(kind='hist', bins=100, color=my_color)
plt.title('Target - All Investments')
plt.grid()
# render explicitly, like the other plot cells, so the figure is also shown
# when the code runs outside an inline notebook backend
plt.show()

In [None]:
# horizontal boxplot of the target over all investments
plt.figure(figsize=(10, 4))
plt.boxplot(df['target'], vert=False)
plt.grid()
plt.title('Target - All Investments')
plt.show()

In [None]:
# summary statistics of the target, using the percentiles listed in my_percs
df['target'].describe(percentiles=my_percs)

In [None]:
# average target of each investment, drawn as a line over investment_id
plt.figure(figsize=(12, 4))
df.groupby('investment_id')['target'].mean().plot(alpha=0.5, color=my_color)
plt.grid()
plt.title('Mean by investment')
plt.show()

In [None]:
# inspect the investment behind the extremely high mean value
print('Outlier mean =', df.loc[df['investment_id'] == 85, 'target'].mean())
df.loc[df['investment_id'] == 85]

In [None]:
# standard deviation of the target per investment, drawn over investment_id
plt.figure(figsize=(12, 4))
df.groupby('investment_id')['target'].std().plot(alpha=0.5, color=my_color)
plt.grid()
plt.title('Stdev by investment')
plt.show()

In [None]:
# inspect the investment behind the stdev=0 outlier
print('Outlier stdev =', df.loc[df['investment_id'] == 1415, 'target'].std())
df.loc[df['investment_id'] == 1415]

<a id='pick_ex'></a>
# Pick an example investment

In [None]:
# example investment analysed in the following cells
my_id = 2140
df_ex = df.loc[df['investment_id'] == my_id]

In [None]:
# target of the example investment over time: markers plus a connecting line
plt.figure(figsize=(14, 4))
plt.scatter(df_ex['time_id'], df_ex['target'], color=my_color, alpha=0.25)
plt.plot(df_ex['time_id'], df_ex['target'], color=my_color, alpha=0.5)
plt.title(f'Investment id={my_id}')
plt.xlabel('time_id')
plt.ylabel('target')
plt.grid()
plt.show()

In [None]:
# zoom in: keep only the rows at positions ta..tb-1 of the example investment
ta = 0
tb = 200
df_ex_zoom = df_ex.iloc[ta:tb]  # positional slice, taken once and reused below

plt.figure(figsize=(14, 4))
plt.scatter(df_ex_zoom['time_id'], df_ex_zoom['target'], color=my_color, alpha=0.25)
plt.plot(df_ex_zoom['time_id'], df_ex_zoom['target'], color=my_color, alpha=0.5)
plt.title(f'Investment id={my_id} - Subset')
plt.xlabel('time_id')
plt.ylabel('target')
plt.grid()
plt.show()

In [None]:
# histogram of the target for the example investment
plt.figure(figsize=(10,4))
df_ex.target.plot(kind='hist', bins=50, color=my_color)
plt.title('Target - Investment id=' + str(my_id))
plt.grid()
# render explicitly, like the other plot cells, so the figure is also shown
# when the code runs outside an inline notebook backend
plt.show()

In [None]:
# horizontal boxplot of the target for the example investment
plt.figure(figsize=(10, 4))
plt.boxplot(df_ex['target'], vert=False)
plt.grid()
plt.title(f'Target - Investment id={my_id}')
plt.show()

In [None]:
# summary statistics of the example investment's target (percentiles from my_percs)
df_ex['target'].describe(percentiles=my_percs)

In [None]:
# try to fit a few distribution types to the target of the example investment
# for the full list of available distributions use "get_distributions()"
dist_fits = Fitter(df_ex.target, distributions=['lognorm','norm','beta','t'])
dist_fits.fit()
# the figure is opened right before summary() - presumably summary() draws
# into the current figure; TODO confirm against the fitter documentation
plt.figure(figsize=(12,5))
dist_fits.summary()

In [None]:
# check for autocorrelations
# plt.acorr() applies no detrending by default, i.e. it does NOT subtract the
# mean. A non-zero mean would add a constant offset to every lag, so centre
# the series first to obtain proper autocorrelation coefficients.
target_centered = df_ex.target - df_ex.target.mean()
plt.figure(figsize=(10,5))
plt.acorr(target_centered, maxlags=20, color=my_color)
plt.title('Autocorrelations of Target - Investment id=' + str(my_id))
plt.grid()
plt.show()

#### We observe a moderate correlation at lag 1, the others seem to be negligible...

<a id='compare'></a>
# Compare two investments

In [None]:
# two investments selected for comparison; they were chosen to share the SAME time grid
df_1 = df.loc[df['investment_id'] == 2385]
df_2 = df.loc[df['investment_id'] == 1062]

In [None]:
# scatter plot of the two targets against each other
# Pair the observations explicitly on time_id instead of relying on row
# position: for completely aligned series the result is unchanged, while for
# series with differing time grids only the common time_ids are compared
# (positional pairing would silently mix different points in time or fail on
# unequal lengths).
df_pair = df_1.merge(df_2, on='time_id', suffixes=('_1', '_2'))

plt.figure(figsize=(7,6))
# df_1 holds investment 2385 (x axis), df_2 holds investment 1062 (y axis)
plt.scatter(df_pair['target_1'], df_pair['target_2'], alpha=0.25, color=my_color)
plt.title('Investment 1062 vs 2385')
plt.xlabel('Investment 2385 - target')
plt.ylabel('Investment 1062 - target')
plt.grid()
plt.show()

In [None]:
# correlation of the two investments
# Pair the targets on time_id so that each value is correlated with the value
# of the other investment at the same point in time; for completely aligned
# series this equals the purely positional pairing.
df_pair = df_1.merge(df_2, on='time_id', suffixes=('_1', '_2'))
scipy.stats.pearsonr(df_pair['target_1'], df_pair['target_2'])

<a id='feat'></a>
# Check some features

In [None]:
# import a subset of features (the first ten feature columns)
my_feats = ['f_0','f_1','f_2','f_3','f_4',
            'f_5','f_6','f_7','f_8','f_9']

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the file is being read.
t1 = time.perf_counter()
df_f = pd.read_csv('../input/ubiquant-market-prediction/train.csv', usecols=my_feats)
t2 = time.perf_counter()
print('Elapsed time [s]:', np.round(t2-t1,2))

In [None]:
# basic stats of the loaded feature columns, including the percentiles in my_percs
df_f.describe(percentiles=my_percs)

### Feature distributions:

In [None]:
# one figure per feature: histogram on top, horizontal boxplot underneath
for f in my_feats:
    fig, (ax_hist, ax_box) = plt.subplots(nrows=2, ncols=1, figsize=(12, 7))

    # upper panel: distribution of the feature values
    ax_hist.hist(df_f[f], bins=50, color=my_color)
    ax_hist.set_title(f'Feature {f}')
    ax_hist.grid()

    # lower panel: boxplot, deliberately left without a title
    ax_box.boxplot(df_f[f], vert=False)
    ax_box.set_title('')
    ax_box.grid()

    plt.show()

### Correlation:

In [None]:
# correlation matrices of the features: Pearson (linear) and Spearman (rank)
corr_pearson, corr_spearman = (
    df_f.corr(method=corr_kind) for corr_kind in ('pearson', 'spearman')
)

In [None]:
# both correlation matrices stacked vertically on a fixed [-1, +1] color scale
plt.figure(figsize=(10, 16))

# upper panel: Pearson
ax1 = plt.subplot(2, 1, 1)
sns.heatmap(corr_pearson, vmin=-1, vmax=+1, cmap='RdYlGn', annot=True, ax=ax1)
ax1.set_title('Pearson Correlation')

# lower panel: Spearman, sharing the x axis with the upper panel
ax2 = plt.subplot(2, 1, 2, sharex=ax1)
sns.heatmap(corr_spearman, vmin=-1, vmax=+1, cmap='RdYlGn', annot=True, ax=ax2)
ax2.set_title('Spearman Correlation')

plt.show()

In [None]:
# example of correlated features:
plt.scatter(df_f.f_2, df_f.f_3, alpha=0.2, color=my_color)
plt.xlabel('f_2')
plt.ylabel('f_3')
plt.title('Feature f_3 vs f_2')

# add regression line (least-squares fit of degree 1: yy ~ mm*xx + bb)
xx = df_f.f_2
yy = df_f.f_3
mm,bb = np.polyfit(xx,yy,1)
# A straight line is fully determined by its two end points, so draw it from
# min(xx) to max(xx) instead of through every (unsorted) sample; the visible
# line is the same, without building a polyline with one vertex per row.
x_ends = np.array([xx.min(), xx.max()])
plt.plot(x_ends, mm*x_ends + bb, c='darkblue')
plt.grid()
plt.show()

### Development of features over time:

In [None]:
# every feature plotted against time_id, one figure per feature
# (df and df_f are read from the same file, so their rows are paired by position)
for f in my_feats:
    plt.figure(figsize=(14, 4))
    plt.scatter(df['time_id'], df_f[f], color=my_color, alpha=0.1)
    plt.title(f'Feature {f}')
    plt.xlabel('time')
    plt.ylabel('feature')
    plt.grid()
    plt.show()

In [None]:
# feature f_1 against time_id, with the x axis restricted to the range 0..30
plt.figure(figsize=(14, 4))
plt.scatter(df['time_id'], df_f['f_1'], color=my_color, alpha=0.125)
plt.title('Feature f_1 - Zoom')
plt.xlabel('time')
plt.ylabel('feature')
plt.xlim(0, 30)
plt.grid()
plt.show()

#### We can see that the features do not only depend on time but are also different depending on the specific investment.

### Plot Target vs. Features:

In [None]:
# scatter the target against every feature; the title reports the Pearson
# correlation coefficient rounded to four decimals
for f in my_feats:
    c = np.round(scipy.stats.pearsonr(df_f[f], df['target'])[0], 4)
    plt.scatter(df_f[f], df['target'], alpha=0.1, color=my_color)
    plt.title(f'Target vs {f}; corr = {c!s}')
    plt.grid()
    plt.show()