In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from scipy.stats import probplot, pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

## Ubiquant Market Prediction

## 1. Introduction

This competition's objective is predicting the value of an obfuscated metric which is relevant for making trading decisions. The dataset contains anonymized features extracted from thousands of investments and an anonymized target. There are 3579 unique investments in the training set, but investments in the training set don't necessarily appear in the public or private test set. There are also 1211 unique time IDs in the training set. Not every investment appears in every time ID.

There are 303 columns in training and test set after removing row_id. row_id can be removed safely since it is combination of time_id and investment_id columns, and it doesn't contain any additional information. The mentioned columns are:

* `time_id`: Unique ID of the time bucket
* `investment_id`: Unique ID of the investment
* `target`: Anonymized target
* `f_0` - `f_299`: Anonymized features

In [None]:
# Column name -> dtype map. Explicit 32-bit floats and 16-bit unsigned ints are
# requested instead of letting pandas infer the dtypes.
train_dtypes = {
    **{f'f_{i}': np.float32 for i in range(300)},
    'investment_id': np.uint16,
    'time_id': np.uint16,
    'target': np.float32,
}

# Only the columns listed in the dtype map are read; anything else in the CSV
# (e.g. row_id) is skipped.
df_train = pd.read_csv(
    '../input/ubiquant-market-prediction/train.csv',
    usecols=list(train_dtypes),
    dtype=train_dtypes,
)
memory_usage_mb = df_train.memory_usage().sum() / 1024 ** 2
print(f'Training Set Shape: {df_train.shape} - Memory Usage: {memory_usage_mb:.2f} MB')

As the training set is quite large, it takes more than 8 minutes to read it. Training set can be written as a pickle file so it can be read faster later.

In [None]:
# Write the parsed training frame to train.pkl (relative to the working
# directory) so it can be reloaded later without parsing the CSV again.
df_train.to_pickle('train.pkl')

## 2. Evaluation

Submissions are scored on the mean of Pearson correlation coefficient for each time ID. Pearson correlation coefficient (Pearson's r) is a measure of linear correlation between two sets of values. It can be denoted as

$\huge \text{r} = \frac{n\sum{xy}-(\sum{x})(\sum{y})}{\sqrt{ [n \sum{x^2}-(\sum{x})^2 ][n \sum{y^2}-(\sum{y})^2 ]}}$

* $r$ = Pearson Correlation Coefficient
* $n$ = Number of samples
* $x$ = First set of values
* $y$ = Second set of values

Mean of Pearson correlation coefficient across time IDs can be denoted as

$\huge \text{Mean r} = \frac{1}{T} \sum_{i=1}^{T} r_{t_i}$

* $T$ = Number of time IDs
* $r_{t_i}$ = ith time ID's Pearson correlation coefficient


The fastest way of calculating the mean Pearson correlation coefficient is utilizing the `groupby` method of `pandas.DataFrame`; however, it is not very flexible. The implementations below require the predictions column to be named _predictions_ and they don't output the Pearson correlation coefficient of every time ID separately.

In [None]:
def pearson_correlation_coefficient(df):
    """
    Compute Pearson's r between the `target` and `predictions` columns.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame with numeric `target` and `predictions` columns.
        Any other columns are ignored.

    Returns
    -------
    float
        Pearson correlation coefficient of the two columns
        (NaN when the coefficient is undefined, e.g. a constant column).
    """

    # Correlate only the two columns of interest instead of building the full
    # pairwise correlation matrix of every column in df and reading one cell.
    return df['target'].corr(df['predictions'])

def mean_pearson_correlation_coefficient(df):
    """
    Average the per-time-ID Pearson correlation between target and predictions.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame with `time_id`, `target` and `predictions` columns.

    Returns
    -------
    float
        Mean of the Pearson correlation coefficients computed separately
        within each `time_id` group. Undefined (NaN) group scores are skipped
        by pandas' default mean.
    """

    # Select the value columns on the groupby itself so the applied function
    # never receives the grouping column (applying over grouping columns is
    # deprecated in recent pandas releases and triggers a warning).
    scores_per_time_id = df.groupby('time_id')[['target', 'predictions']].apply(pearson_correlation_coefficient)
    return scores_per_time_id.mean()

Pearson correlation coefficient is the ratio between the covariance of two sets of values and the product of their standard deviations. That means the Pearson correlation coefficient doesn't change when the predictions of a time ID are shifted by a constant or scaled by a positive factor. Therefore, the actual values of predictions don't matter, and the metric is similar to a ranking metric in that sense.

In [None]:
# Score random predictions twice: as drawn, and again after adding a large
# constant to every prediction.
df_train['predictions'] = np.random.rand(len(df_train))
for shift in (None, 999999):
    if shift is not None:
        df_train['predictions'] += shift
    score = mean_pearson_correlation_coefficient(df_train)
    predictions_mean = df_train['predictions'].mean()
    predictions_std = df_train['predictions'].std()
    print(f'Pearson correlation coefficient: {score:.6f} - (Predictions mean: {predictions_mean:.4f} std: {predictions_std:.4f})')

# Remove the temporary column so the frame only holds the loaded columns again.
df_train.drop(columns=['predictions'], inplace=True)

## 3. Target

Target is anonymized and defined as an obfuscated metric relevant for making trading decisions. Target is centered around its mean and follows a very symmetrical, normal-like distribution, which is strong evidence of standardization. Target has very long tails on both ends, so distribution truncation or trimming can be quite useful for dealing with outliers.

In [None]:
def visualize_target(df, target):
    """
    Print summary statistics of a column and plot its distribution.

    The left panel is a kernel density estimate with the mean and median
    marked; the right panel is the probability plot drawn by
    `scipy.stats.probplot`.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame that contains the column to describe
    target: str
        Name of the column to describe
    """

    values = df[target]
    mean, median = values.mean(), values.median()

    # Header followed by a dashed rule of the same width
    underline = '-' * len(target)
    print(f'{target}\n{underline}')

    print(f'Mean: {mean:.4f}  -  Median: {median:.4f}  -  Std: {values.std():.4f}')
    q25, q50, q75 = (values.quantile(q) for q in (0.25, 0.5, 0.75))
    print(f'Min: {values.min():.4f}  -  25%: {q25:.4f}  -  50%: {q50:.4f}  -  75%: {q75:.4f}  -  Max: {values.max():.4f}')
    print(f'Skew: {values.skew():.4f}  -  Kurtosis: {values.kurtosis():.4f}')
    missing_count = int(values.isnull().sum())
    total_count = len(df)
    missing_percentage = missing_count * 100 / total_count
    print(f'Missing Values: {missing_count}/{total_count} ({missing_percentage:.4f}%)')

    _, (density_ax, probability_ax) = plt.subplots(ncols=2, figsize=(24, 6), dpi=100)

    sns.kdeplot(values, label=target, fill=True, ax=density_ax)
    density_ax.axvline(mean, label='Mean', color='r', linewidth=2, linestyle='--')
    density_ax.axvline(median, label='Median', color='b', linewidth=2, linestyle='--')
    density_ax.legend(prop={'size': 15})
    probplot(values, plot=probability_ax)

    # Shared cosmetics: tick sizes, blank axis labels and a per-panel title
    panels = (
        (density_ax, f'{target} Distribution'),
        (probability_ax, f'{target} Probability Plot'),
    )
    for ax, title in panels:
        ax.tick_params(axis='x', labelsize=12.5)
        ax.tick_params(axis='y', labelsize=12.5)
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.set_title(title, fontsize=15, pad=12)

    plt.show()

# Print the summary statistics and draw the distribution plots of the training target.
visualize_target(df_train, 'target')

## 4. Time IDs

time_id column is the ID code for the time the data was gathered. Time IDs are in chronological order, but the real time gap between time IDs is not constant and will be shorter for the final private test set than in the training set. As mentioned before, there are 1211 unique time IDs in the training set and there will be fewer in the private test set.

In [None]:
def visualize_time_ids(df, column):
    """
    Print summary statistics of a column and plot it against time_id.

    The left panel is a kernel density estimate with the mean and median
    marked; the right panel plots the column with `time_id` on the x-axis.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame that contains a `time_id` column and the column to describe
    column: str
        Name of the column to describe
    """

    values = df[column]
    mean, median = values.mean(), values.median()

    # Header followed by a dashed rule of the same width
    underline = '-' * len(column)
    print(f'{column}\n{underline}')
    print(f'Mean: {mean:.4f}  -  Median: {median:.4f}  -  Std: {values.std():.4f}')
    q25, q50, q75 = (values.quantile(q) for q in (0.25, 0.5, 0.75))
    print(f'Min: {values.min():.4f}  -  25%: {q25:.4f}  -  50%: {q50:.4f}  -  75%: {q75:.4f}  -  Max: {values.max():.4f}')
    print(f'Skew: {values.skew():.4f}  -  Kurtosis: {values.kurtosis():.4f}')
    missing_count = int(values.isnull().sum())
    total_count = len(df)
    missing_percentage = missing_count * 100 / total_count
    print(f'Missing Values: {missing_count}/{total_count} ({missing_percentage:.4f}%)')

    _, (density_ax, timeline_ax) = plt.subplots(ncols=2, figsize=(24, 6), dpi=100)

    sns.kdeplot(values, label=column, fill=True, ax=density_ax)
    density_ax.axvline(mean, label='Mean', color='r', linewidth=2, linestyle='--')
    density_ax.axvline(median, label='Median', color='b', linewidth=2, linestyle='--')
    density_ax.legend(prop={'size': 15})
    # Indexing by time_id puts the time IDs on the x-axis
    timeline_ax.plot(df.set_index('time_id')[column], label=column)

    for ax in (density_ax, timeline_ax):
        ax.tick_params(axis='x', labelsize=12.5)
        ax.tick_params(axis='y', labelsize=12.5)
        ax.set_ylabel('')
    density_ax.set_xlabel('')
    timeline_ax.set_xlabel('time_id', fontsize=12.5)
    density_ax.set_title(f'{column} Distribution', fontsize=15, pad=12)
    timeline_ax.set_title(f'{column} as a Function of Time', fontsize=15, pad=12)

    plt.show()


The number of samples per time ID varies considerably because not every investment appears in every time ID. Time IDs with fewer than 2000 samples look like outliers. More than 75% of time IDs have more than 2000 samples. All of the time IDs with fewer than 2000 samples are observed between time_id 350 and 550. There are a couple of outliers between time_id 1100 and 1200 as well.

In [None]:
# Number of non-null target values in every time ID
df = (
    df_train.groupby('time_id')['target']
    .count()
    .rename('sample_counts_in_time_ids')
    .reset_index()
)
visualize_time_ids(df, column='sample_counts_in_time_ids')

Target means in time IDs are centered around 0 and they are quite balanced even though there are some outliers, but outliers look very natural. Time IDs with low number of samples match time IDs with high target mean. Target might be correlated with number of samples but it is hard to tell which causes which. Very high and very low target mean values are also observed in the same period (between time_id 350 and 550).

In [None]:
# Mean target value in every time ID
df = (
    df_train.groupby('time_id')['target']
    .mean()
    .rename('target_means_in_time_ids')
    .reset_index()
)
visualize_time_ids(df, column='target_means_in_time_ids')

Target standard deviations in time IDs are centered around 0.9 and they are quite balanced as well, except for one of them. One of the time IDs has a very low target standard deviation (0.45) and it skews the distribution to the left. That outlier doesn't look very natural and it is probably an anomaly. Target standard deviation outliers are also observed in the same period but they are not affected as much as target means and sample counts.

In [None]:
# Target standard deviation in every time ID
df = (
    df_train.groupby('time_id')['target']
    .std()
    .rename('target_stds_in_time_ids')
    .reset_index()
)
visualize_time_ids(df, column='target_stds_in_time_ids')

## 5. Investments

investment_id column is the ID code for an investment. There are 3579 unique investments in training set and private test set will include new unseen investments. Investments appear only once in time IDs, so samples are time_id-investment_id combinations. In that case, visualization of sample counts as a function of time, is identical with unique number of investments in time IDs.

In [None]:
def visualize_investment_ids(df, column):
    """
    Print summary statistics of a column and plot it against investment_id.

    The left panel is a kernel density estimate with the mean and median
    marked; the right panel plots the column with `investment_id` on the x-axis.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame that contains an `investment_id` column and the column to describe
    column: str
        Name of the column to describe
    """

    values = df[column]
    mean, median = values.mean(), values.median()

    # Header followed by a dashed rule of the same width
    underline = '-' * len(column)
    print(f'{column}\n{underline}')
    print(f'Mean: {mean:.4f}  -  Median: {median:.4f}  -  Std: {values.std():.4f}')
    q25, q50, q75 = (values.quantile(q) for q in (0.25, 0.5, 0.75))
    print(f'Min: {values.min():.4f}  -  25%: {q25:.4f}  -  50%: {q50:.4f}  -  75%: {q75:.4f}  -  Max: {values.max():.4f}')
    print(f'Skew: {values.skew():.4f}  -  Kurtosis: {values.kurtosis():.4f}')
    missing_count = int(values.isnull().sum())
    total_count = len(df)
    missing_percentage = missing_count * 100 / total_count
    print(f'Missing Values: {missing_count}/{total_count} ({missing_percentage:.4f}%)')

    _, (density_ax, investment_ax) = plt.subplots(ncols=2, figsize=(24, 6), dpi=100)

    sns.kdeplot(values, label=column, fill=True, ax=density_ax)
    density_ax.axvline(mean, label='Mean', color='r', linewidth=2, linestyle='--')
    density_ax.axvline(median, label='Median', color='b', linewidth=2, linestyle='--')
    density_ax.legend(prop={'size': 15})
    # Indexing by investment_id puts the investment IDs on the x-axis
    investment_ax.plot(df.set_index('investment_id')[column], label=column)

    for ax in (density_ax, investment_ax):
        ax.tick_params(axis='x', labelsize=12.5)
        ax.tick_params(axis='y', labelsize=12.5)
        ax.set_ylabel('')
    density_ax.set_xlabel('')
    investment_ax.set_xlabel('investment_id', fontsize=12.5)
    density_ax.set_title(f'{column} Distribution', fontsize=15, pad=12)
    investment_ax.set_title(f'{column} as a Function of Investment', fontsize=15, pad=12)

    plt.show()


The number of samples per investment varies considerably, as expected. This is related to the period between time IDs 350 and 550, during which most of the investments are not observed. The number of samples per investment forms a bimodal distribution. Investments with a low number of samples are centered around a sample count of 400, and investments with a high number of samples are centered around a sample count of 1100.

In [None]:
# Number of non-null target values of every investment
df = (
    df_train.groupby('investment_id')['target']
    .count()
    .rename('sample_counts_in_investment_ids')
    .reset_index()
)
visualize_investment_ids(df, column='sample_counts_in_investment_ids')

Target means in investments are centered around 0 just like target means in time IDs. It has a very symmetrical and balanced distribution with some small peaks. One investment has a very high target mean (0.795) which could be an outlier. That investment might have higher number of samples between time IDs 350 and 550 because target means are higher at that period. 

In [None]:
# Mean target value of every investment
df = (
    df_train.groupby('investment_id')['target']
    .mean()
    .rename('target_means_in_investment_ids')
    .reset_index()
)
visualize_investment_ids(df, column='target_means_in_investment_ids')

Target standard deviations in investments are more stable than target means except one investment. That investment has 0 target standard deviation and it has only two samples. That investment's low target standard deviation is related to its low sample count so it might not be an outlier.

In [None]:
# Target standard deviation of every investment
df = (
    df_train.groupby('investment_id')['target']
    .std()
    .rename('target_stds_in_investment_ids')
    .reset_index()
)
visualize_investment_ids(df, column='target_stds_in_investment_ids')

## 6. Features

There are 300 anonymized continuous features in dataset and they are named from f_0 to f_299. All of the feature distributions, target interactions, feature means and standard deviations along time and investment axis are visualized. There are little summaries of statistical properties displayed before the visualizations.

In [None]:
def visualize_feature(df, column):
    """
    Print summary statistics of a feature and plot it from several angles.

    Six panels are drawn, all computed from `df`:
    distribution (kernel density estimate with mean and median marked),
    feature vs target scatter plot, feature means and standard deviations
    per `time_id`, and feature means and standard deviations per
    `investment_id`.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame that contains `time_id`, `investment_id`, `target` and the feature column
    column: str
        Name of the feature column to describe
    """

    values = df[column]
    mean, median = values.mean(), values.median()

    print(f'{column}\n{"-" * len(column)}')
    print(f'Mean: {mean:.4f}  -  Median: {median:.4f}  -  Std: {values.std():.4f}')
    print(f'Min: {values.min():.4f}  -  25%: {values.quantile(0.25):.4f}  -  50%: {values.quantile(0.5):.4f}  -  75%: {values.quantile(0.75):.4f}  -  Max: {values.max():.4f}')
    print(f'Skew: {values.skew():.4f}  -  Kurtosis: {values.kurtosis():.4f}')
    missing_count = int(values.isnull().sum())
    total_count = df.shape[0]
    print(f'Missing Values: {missing_count}/{total_count} ({missing_count * 100 / total_count:.4f}%)')

    fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(24, 22), dpi=100)

    for ax in axes.flat:
        ax.tick_params(axis='x', labelsize=12.5)
        ax.tick_params(axis='y', labelsize=12.5)

    # Row 0: distribution and relationship with the target
    sns.kdeplot(values, label=column, fill=True, ax=axes[0][0])
    axes[0][0].axvline(mean, label='Mean', color='r', linewidth=2, linestyle='--')
    axes[0][0].axvline(median, label='Median', color='b', linewidth=2, linestyle='--')
    axes[0][0].legend(prop={'size': 15})
    sns.scatterplot(x=values, y=df['target'], ax=axes[0][1])

    # seaborn labels the axes itself, so override them after plotting
    axes[0][0].set_xlabel('')
    axes[0][0].set_ylabel('')
    axes[0][1].set_xlabel(column, fontsize=12.5)
    axes[0][1].set_ylabel('target', fontsize=12.5)

    # Rows 1 and 2: group-wise means (left) and standard deviations (right).
    # The statistics are taken from the frame passed in, not from a global,
    # and the group-indexed Series are plotted directly.
    for row, group_column in enumerate(('time_id', 'investment_id'), start=1):
        grouped = df.groupby(group_column)[column]
        axes[row][0].plot(grouped.mean(), label=f'{column}_means_in_{group_column}s')
        axes[row][1].plot(grouped.std(), label=f'{column}_stds_in_{group_column}s')
        for ax in axes[row]:
            ax.set_xlabel(group_column, fontsize=12.5)
            ax.set_ylabel(column, fontsize=12.5)

    axes[0][0].set_title(f'{column} Distribution', fontsize=15, pad=12)
    axes[0][1].set_title(f'{column} vs Target', fontsize=15, pad=12)
    axes[1][0].set_title(f'{column} Means as a Function of Time', fontsize=15, pad=12)
    axes[1][1].set_title(f'{column} Stds as a Function of Time', fontsize=15, pad=12)
    axes[2][0].set_title(f'{column} Means as a Function of Investment', fontsize=15, pad=12)
    axes[2][1].set_title(f'{column} Stds as a Function of Investment', fontsize=15, pad=12)

    plt.show()


All of the features are zero-centered and they have standard deviation of one since they are standardized during the anonymization process. Most of the features have symmetrical normal distributions but some of them have very extreme outliers which are skewing their distributions.

Feature means and standard deviations vary between different time IDs and investments. It looks like feature means and standard deviations are dependent on time. They make sharp transitions in some periods. Feature standard deviations are more likely to make sharp transitions in different periods; however, feature mean outliers are observed in the same period most of the time. Feature means and standard deviations per investment look randomly distributed among investments because they are related to those investments' time IDs.

In [None]:
# Print the summary and draw the panels for every feature, f_0 through f_299
feature_columns = [f'f_{i}' for i in range(300)]
for feature_column in feature_columns:
    visualize_feature(df_train, feature_column)

## 7. Time Series

Training set is already in time series format as it is sorted by time_id and investment_id columns. Time series of an investment can be selected by directly indexing the investment_id. However, it might be risky to use time series models for predicting target for a couple of reasons. First, there are lots of missing time IDs in different investments. Second, the gap between time IDs is not constant and will be shorter in the final private test set. Third, some of the investments have very few samples. Those things make it hard to utilize the time property of the data.

In [None]:
def visualize_time_series(df, investment_id):
    """
    Plot the target and all 300 features of one investment along the time axis.

    The target is drawn as a thick labelled line; every feature (`f_0` to
    `f_299`) is drawn as a faint unlabelled line on the same axes.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame that contains `investment_id`, `time_id`, `target` and `f_0` - `f_299` columns
    investment_id: int
        ID of the investment whose rows are plotted
    """

    # Index by time_id once, up front, so all 301 plotted series share it
    # instead of re-indexing the frame for every single line.
    df_investment = df.loc[df['investment_id'] == investment_id].set_index('time_id')

    fig, ax = plt.subplots(figsize=(24, 8), dpi=100)
    ax.plot(df_investment['target'], label='target', linewidth=3)
    for i in range(300):
        # Low alpha keeps the 300 overlapping feature lines readable
        ax.plot(df_investment[f'f_{i}'], alpha=0.05)

    ax.set_xlabel('time_id', fontsize=12.5)
    ax.set_ylabel('Features', fontsize=12.5)
    ax.tick_params(axis='x', labelsize=12.5)
    ax.tick_params(axis='y', labelsize=12.5)
    ax.set_title(f'Investment {investment_id} - Features and Target Along Time Axis', fontsize=15, pad=12.5)
    ax.legend(prop={'size': 15})

    plt.show()


In [None]:
# Plot target and features over time for the investment with investment_id 0.
visualize_time_series(df_train, 0)

## 8. Feature Sequences

Even though there is no order in features, feature sequences can show some insights about target. Samples with the lowest and highest target values are visualized below. Their feature sequences are quite different from each other. Some of the peaks and pits occur in almost the same places, but those occurrences don't necessarily suggest that features are sequential. It might be related to their time IDs being really close to each other since features were mostly dependent on time IDs.

In [None]:
def visualize_feature_sequence(df, idx):
    """
    Plot the 300 feature values of a single row as a sequence.

    The x-axis is the feature number (0 for `f_0` up to 299 for `f_299`) and
    the y-axis is the feature value. The title reports the row's investment
    ID, time ID and target.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame that contains `investment_id`, `time_id`, `target` and `f_0` - `f_299` columns
    idx: label
        Index label of the row to plot (looked up with `df.loc`)
    """

    # Dropping the index replaces the feature names with positions 0..299
    sample = df.loc[idx, [f'f_{i}' for i in range(300)]].reset_index(drop=True)
    target = df.loc[idx, 'target']
    investment_id = df.loc[idx, 'investment_id']
    time_id = df.loc[idx, 'time_id']

    fig, ax = plt.subplots(figsize=(24, 8), dpi=100)
    # The line needs a label: a legend built without any labelled artist is
    # empty and makes matplotlib emit a warning.
    ax.plot(sample, linewidth=3, label='Feature values')

    ax.set_xlabel('Features (f_#)', fontsize=12.5)
    ax.set_ylabel('Values', fontsize=12.5)
    ax.tick_params(axis='x', labelsize=12.5)
    ax.tick_params(axis='y', labelsize=12.5)
    ax.set_title(f'Investment {investment_id} - Time ID {time_id} - Target {target:.6f}', fontsize=15, pad=12.5)
    ax.legend(prop={'size': 15})

    plt.show()


In [None]:
# Plot the feature sequence of the row labelled 1621688.
# NOTE(review): presumably one of the extreme-target samples (lowest or highest)
# mentioned in the accompanying text — verify which one against the data.
visualize_feature_sequence(df_train, 1621688)

In [None]:
# Plot the feature sequence of the row labelled 1639094.
# NOTE(review): presumably one of the extreme-target samples (lowest or highest)
# mentioned in the accompanying text — verify which one against the data.
visualize_feature_sequence(df_train, 1639094)