In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
import matplotlib as mpl
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
warnings.filterwarnings("ignore")


In [None]:
# Render matplotlib figures inline, directly in the notebook output.
%matplotlib inline

## WIP, work in progress

Using @slawekbiel Feather dataset: https://www.kaggle.com/slawekbiel/ubiquant-trainfeather-32-bit

In [None]:
%%time
train_data = pd.read_feather('../input/ubiquant-trainfeather-32-bit/train32.feather')

In [None]:
# Global matplotlib font settings for every figure in this notebook.
# 'normal' is not a font family matplotlib knows about (it is a style/weight
# value), so it only triggered a "font family not found" fallback; the generic
# 'sans-serif' family is used instead.
font = {'family': 'sans-serif',
        'size': 15}
mpl.rc('font', **font)

number of unique values per time ids:

# **Assuming targets are the returns from the market, check what proportion are positive vs negative vs 0**

In [None]:
# Count rows whose target is exactly 0.
# Recorded observation: target is 0 only 1736 times out of 3141410.
# Series.sum() is vectorized; the builtin sum() would iterate row by row in Python.
(train_data['target'] == 0).sum()

In [None]:
# Binary flag: True where the target is strictly positive, False otherwise
# (zero and negative targets both map to False).
train_data['binary_target'] = train_data['target'].gt(0)

In [None]:
# Horizontal bar chart of how many rows have a positive vs non-positive target.
fig, ax = plt.subplots(figsize=(20, 7))
sns.countplot(data=train_data, y='binary_target', ax=ax)

# Plain tick formatting keeps the counts from being shown in scientific notation.
ax.ticklabel_format(axis='x', style='plain')

ax.set_ylabel("Target Positive > 0", fontsize=20)
ax.set_xlabel("Count", fontsize=20)
ax.set_title("Positive Target vs Negative Target", fontsize=20);

In [None]:
# Percentage of rows per binary_target value, rounded to whole percent.
(train_data['binary_target'].value_counts() * 100 / len(train_data)).round()

### Insights

* Target is rarely exactly 0, which makes sense: a return is almost always either positive or negative
* Target is negative for 56% of investment-time_id pairs and positive for 44% of them

In [None]:
# Report how many distinct investment IDs appear in the training data.
n_investments = train_data['investment_id'].unique().size
print(f"Number of unique investment IDs: {n_investments}")

# **Plot 4 investment targets over time**

**take 4 investments, that have maximum number of time_ids**

In [None]:
# Plot target against time_id for the four investments with the most rows.
fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(15, 15))

# value_counts() scans the whole frame, so compute the ranking once
# instead of once per loop iteration.
top_ids = train_data.investment_id.value_counts().index[:4]

for cur_ax, inv_id in zip(axs, top_ids):
    train_data.loc[train_data.investment_id == inv_id].plot(x='time_id',
                                                            y='target',
                                                            ax=cur_ax)
    cur_ax.set_title(f"Investment_ID_{inv_id}", fontsize=20)
    cur_ax.set_ylabel("target", fontsize=20)
    cur_ax.set_xlabel("time_id", fontsize=20)
plt.tight_layout()

### Insights
* Returns/target is volatile, ranging from -4 to 4

# how are the targets for these 4 investments correlated

In [None]:
# Correlation between the targets of the four investments with the most rows.
top4_ids = train_data.investment_id.value_counts().index[:4]

# Per-investment target series under their original names. They are not used
# by the heatmap below; they are retained so the names stay defined.
tgt1, tgt2, tgt3, tgt4 = (
    train_data.loc[train_data.investment_id == inv_id, 'target'] for inv_id in top4_ids
)

# Align the series on time_id before correlating. Stacking the raw series
# (with a hard-coded truncation to make the lengths match) pairs observations
# by position, which compares different time_ids whenever an investment has
# gaps, and fails outright if the lengths change.
targets_by_time = (
    train_data.loc[train_data.investment_id.isin(top4_ids),
                   ['time_id', 'investment_id', 'target']]
    .pivot_table(index='time_id', columns='investment_id', values='target')
    .reindex(columns=top4_ids)
)

# DataFrame.corr() uses only the time_ids present for both investments of a pair.
target_corr = targets_by_time.corr()

fig, ax = plt.subplots(figsize=(10, 10))

sns.heatmap(target_corr,
            cbar_kws={"orientation": "horizontal"},
            cmap="flare",
            annot=True,
            linewidths=2,
            linecolor='yellow',
            ax=ax)
ax.set_title("Target correlation (aligned on time_id)", fontsize=20);

### Insights
* Returns/targets for these 4 investment IDs are positively correlated, with a maximum correlation of 0.33

# Check distribution of targets

In [None]:
# Distribution of the target for the four investments with the most rows.
fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(15, 15))

# Rank investments once; value_counts() is loop-invariant.
top_ids = train_data.investment_id.value_counts().index[:4]

for cur_ax, inv_id in zip(axs, top_ids):
    sns.distplot(train_data.loc[train_data.investment_id == inv_id, 'target'], ax=cur_ax)
    cur_ax.set_title(f"Investment_ID_{inv_id}", fontsize=20)
    # The x axis carries the target values and the y axis the estimated
    # density, so label them accordingly (not "time_id" / "target").
    cur_ax.set_xlabel("target", fontsize=20)
    cur_ax.set_ylabel("density", fontsize=20)
plt.tight_layout()

### Insights
* Nothing specific

# Check auto correlation of targets

In [None]:
# Cast target to 64-bit float before the autocorrelation plots.
# NOTE(review): presumably needed because the Feather file stores 32-bit floats - confirm.
train_data['target'] = train_data['target'].astype(np.float64)

In [None]:
# Autocorrelation (lags 0-12) of the target for the four investments with the most rows.
fig, ax = plt.subplots(figsize=(15, 15), nrows=2, ncols=2)

# Column-major flattening: panels fill the left column top to bottom, then the right column.
axis_list = ax.ravel(order='F')

# Rank investments once; value_counts() is loop-invariant.
top_ids = train_data.investment_id.value_counts().index[:4]

for cur_ax, inv_id in zip(axis_list, top_ids):
    df = train_data.loc[train_data.investment_id == inv_id, ['target', 'time_id']].set_index('time_id')
    plot_acf(df, lags=12, title=f"Auto Correlation Investment ID : {inv_id}", ax=cur_ax)
plt.tight_layout()

### Insights
* No autocorrelation is observed, except at lag 1 (about 0.2)

# how is the investment value of 100K doing over time period

**returns are calculated as amount += amount* target/100**

In [None]:
def total_returns(retrun_per):
    """Apply one period's percentage return to the running investment value.

    Reads and updates the module-level ``initial_investment``: the value grows
    by ``retrun_per`` percent of itself. The caller must set
    ``initial_investment`` before the first call.

    Parameters
    ----------
    retrun_per : float
        Return for the period, expressed in percent (e.g. 5 means +5%).

    Returns
    -------
    The updated investment value truncated toward zero with ``np.trunc``.
    The stored global keeps the untruncated value.
    """
    global initial_investment
    period_gain = initial_investment * (retrun_per / 100)
    initial_investment = initial_investment + period_gain
    return np.trunc(initial_investment)

In [None]:
# Simulate how a 100K position evolves in each of the 16 investments with the
# most rows, compounding every period: amount += amount * target / 100.
fig, ax = plt.subplots(figsize=(15, 15), nrows=8, ncols=2)

# Row-major flattening: left-to-right, then top-to-bottom.
axis_list = ax.ravel()

# Rank investments once; value_counts() is loop-invariant.
top_ids = train_data.investment_id.value_counts().index[:16]

initial_investment = 100000

for cur_ax, inv_id in zip(axis_list, top_ids):
    df = train_data.loc[train_data.investment_id == inv_id, ['target', 'time_id']].set_index('time_id')
    # Compounding is a running product of (1 + target/100); computing it with
    # cumprod avoids mutating a global from inside Series.apply.
    growth = (1 + df['target'].astype('float64') / 100).cumprod()
    returns = np.trunc(initial_investment * growth)
    sns.lineplot(x=df.index, y=returns, ax=cur_ax)
    # Label each panel so the investment can be identified, and name the
    # y axis for what is plotted (position value, not the raw target).
    cur_ax.set_title(f"Investment_ID_{inv_id}")
    cur_ax.set_ylabel("investment value")
plt.tight_layout()

### Insights
* Assuming the returns calculation is correct, only 5 of the 16 investments appreciate relative to the initial investment