In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import datatable as dt
from tqdm import tqdm
import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

import plotly.express as px

from scipy import stats
from scipy.stats import norm, skew

import gc
# Start from the ggplot theme; the rcParams overrides below are layered on top of it.
plt.style.use('ggplot')

# Twelve-step palette shared by the plots in this notebook
# (index 0 is the yellow accent, index -1 the darkest shade).
cust_color = [
    '#fdc029', '#f7c14c', '#f0c268', '#dfc498',
    '#d4c5af', '#c6c6c6', '#a6a6a8', '#86868a',
    '#68686d', '#4b4c52', '#303138', '#171820',
]

# Global matplotlib defaults, applied in a single update.
plt.rcParams.update({
    # figure
    'figure.figsize': (18, 14),
    'figure.dpi': 300,
    'figure.frameon': False,
    # grid
    'axes.grid': True,
    'grid.color': cust_color[3],
    'grid.alpha': 0.5,
    'grid.linestyle': '--',
    # text
    'font.family': 'monospace',
    # axes frame: all four spines hidden
    'axes.edgecolor': 'black',
    'axes.spines.left': False,
    'axes.spines.bottom': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.linewidth': 1.5,
})

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the CSV with datatable and convert the result to a pandas DataFrame.
TRAIN_CSV = '../input/ubiquant-market-prediction/train.csv'
train = dt.fread(TRAIN_CSV).to_pandas()

In [None]:
# Preview the first five rows (the cell output).
train.head(5)

In [None]:
# Number of rows in the training frame.
n_train_rows = len(train)
print(f'Train number of instance : {n_train_rows}')

In [None]:
# Count missing cells per column, then add the per-column counts together.
missing_per_column = train.isna().sum()
total_missing = missing_per_column.sum()
print(f'Train missing value count : {total_missing}')

In [None]:
# Distinct investment_id values in the training frame.
n_investments = train['investment_id'].nunique()
print(f'Train number of unique investments : {n_investments}')

In [None]:
# Distinct time_id values in the training frame.
# The label previously said "unique investments", copy-pasted from the cell above,
# although the value printed is the time_id count.
n_time_ids = train['time_id'].nunique()
print(f'Train number of unique time_ids : {n_time_ids}')

In [None]:
# Rows per investment_id: counts the non-null time_id entries in each investment group.
time_count = train.groupby('investment_id')['time_id'].count()

fig, ax = plt.subplots(figsize = (12, 9))
sns.histplot(time_count, color = cust_color[-1], kde = True, ax = ax)
ax.set_title("Number of time_id's per Investment Distribution")

In [None]:
# Draw a reproducible 5% random sample of the rows; the remaining analysis runs on it.
SAMPLE_FRACTION = 0.05
SAMPLE_SEED = 42
sample_train = train.sample(frac = SAMPLE_FRACTION, random_state = SAMPLE_SEED)

In [None]:
from statsmodels.stats.weightstats import ztest

# Two-sample z-test comparing the target mean of the full data with that of the sample.
# `value` is the difference in means assumed under the null hypothesis, so it must be 0:
# passing the observed difference (as before) forces the statistic to ~0 whatever the data.
t, p = ztest(train.target, x2 = sample_train.target, value = 0)
print(f'z-statistic : {t:.4f}, p-value : {p:.4f}')

# Difference of the two means expressed in units of the full-data standard deviation
# (this is the cell output).
(np.nanmean(sample_train.target) - np.nanmean(train.target)) / train.target.std()

In [None]:
# Drop the reference to the full training frame (the cells below only use `sample_train`)
# and trigger a garbage-collection pass. Re-running this cell without re-loading `train`
# raises NameError because the name no longer exists.
del train
gc.collect()

In [None]:
# Names of the 300 feature columns: f_0 ... f_299.
features = [f'f_{i}' for i in range(300)]

# Downcast all feature columns in one pass rather than with 300 separate column assignments.
# float32 is used instead of float16: half precision keeps only ~3 significant decimal
# digits and tops out at 65504, which is too coarse for the std / skew / correlation
# statistics computed on these columns further down.
sample_train = sample_train.astype({f: 'float32' for f in features})

In [None]:
def plot_dist3(df, feature, title):
    """Draw a three-panel distribution summary for one column.

    Layout (2 x 3 grid): histogram with KDE and fitted normal curve (top-left, two
    columns wide), normal probability plot (bottom-left, two columns wide) and a
    box plot occupying the full right-hand column.

    Parameters
    ----------
    df : DataFrame holding the column to plot.
    feature : name of the column in ``df``.
    title : text used as the figure's super-title.
    """
    values = df.loc[:, feature]
    mean_value = values.mean()

    fig = plt.figure(constrained_layout = True)
    layout = gridspec.GridSpec(ncols = 3, nrows = 2, figure = fig)

    # --- Histogram + KDE + fitted normal -------------------------------------------
    hist_ax = fig.add_subplot(layout[0, :2])
    hist_ax.set_title('Histogram')

    hist_style = {
        'rwidth' : 0.85,
        'edgecolor' : 'black',
        'linewidth' : .5,
        'alpha' : 0.8,
    }
    sns.distplot(values,
                 hist = True,
                 kde = True,
                 fit = norm,
                 hist_kws = hist_style,
                 ax = hist_ax,
                 color = cust_color[0])

    # Dashed vertical marker at the mean, plus a text box near the top of the panel.
    hist_ax.axvline(mean_value, color = 'Green', linestyle = 'dashed', linewidth = 3)
    _, y_top = hist_ax.get_ylim()
    hist_ax.text(mean_value * 2,
                 y_top * 0.95,
                 'Mean : {:.2f}'.format(mean_value),
                 color = 'Green',
                 fontsize = '12',
                 bbox = dict(boxstyle = 'round', facecolor = 'red', alpha = 0.5))
    hist_ax.legend(labels = ['Actual', 'Normal'])
    hist_ax.xaxis.set_major_locator(MaxNLocator(nbins = 12))

    # --- Probability plot ----------------------------------------------------------
    prob_ax = fig.add_subplot(layout[1, :2])
    prob_ax.set_title('Probability Plot')

    stats.probplot(values, plot = prob_ax)
    # The first line drawn by probplot holds the sample points; restyle its markers.
    sample_points = prob_ax.get_lines()[0]
    sample_points.set_markerfacecolor('#e74c3c')
    sample_points.set_markersize(12.0)
    prob_ax.xaxis.set_major_locator(MaxNLocator(nbins = 16))

    # --- Box plot ------------------------------------------------------------------
    box_ax = fig.add_subplot(layout[:, 2])
    box_ax.set_title('Box Plot')

    sns.boxplot(y = feature, data = df, ax = box_ax, color = cust_color[0])
    box_ax.yaxis.set_major_locator(MaxNLocator(nbins = 24))

    fig.suptitle(f'{title}', fontsize = 24, fontname = 'monospace', weight = 'bold')

In [None]:
# Distribution summary of the prediction target on the sampled rows.
plot_dist3(df = sample_train, feature = 'target', title = 'Target Distribution')

In [None]:
# Columns from position 4 onward are treated as the feature block
# (presumably the first four are id/target columns - verify against train.head()).
feature_frame = sample_train.iloc[:, 4:]

# 20 features with the largest standard deviation.
features_std = feature_frame.std().sort_values(ascending = False)
f_std = sample_train[features_std.index[:20].tolist()]

# 20 features with the largest absolute skewness.
# The absolute value has to be taken BEFORE sorting; sorting the signed values first
# (as before) ranked only the most positive skews, so strongly negative ones were missed.
features_skew = feature_frame.apply(skew).abs().sort_values(ascending = False)
skewed = sample_train[features_skew.index[:20].tolist()]

In [None]:
def feat_dist(df, cols, rows = 3, columns = 3, title = None):
    """Plot the KDE of several columns, each overlaid with a fitted normal curve.

    One panel is drawn per entry of ``cols`` on a ``rows`` x ``columns`` grid; columns
    and axes are paired with ``zip``, so surplus columns are ignored and surplus axes
    stay empty.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    cols : iterable of column names in ``df``.
    rows, columns : grid shape of the figure.
    title : optional super-title; nothing is drawn when it is ``None``.
    """
    # squeeze=False always returns a 2-D array of axes, so flatten() also works for 1x1.
    fig, axes = plt.subplots(rows, columns, figsize = (30, 25),
                             constrained_layout = True, squeeze = False)
    axes = axes.flatten()

    for col, ax in zip(cols, axes):
        # distplot is used because it can overlay a fitted distribution via `fit`.
        sns.distplot(df[col],
                     ax = ax,
                     fit = norm,
                     hist = False,
                     color = cust_color[3],
                     kde_kws = {'linewidth' : 3})

        mu, sigma = norm.fit(df[col])
        # Raw string: \mu and \sigma are mathtext commands, not Python escape sequences.
        ax.set_title(r'Dist of {0} Norm Fit : $\mu=${1:.2g}, $\sigma=${2:.2f}'.format(col, mu, sigma),
                     weight = 'bold')
        ax.legend(labels = [f'{col}', 'Normal Dist'])

    # Set once, after the loop, and only when a title was actually given.
    if title is not None:
        fig.suptitle(f'{title}', fontsize = 24, weight = 'bold')

In [None]:
# Distributions of the 20 features selected into `f_std`, on a 5 x 4 grid.
high_std_cols = list(f_std.columns)
feat_dist(df = sample_train, cols = high_std_cols, rows = 5, columns = 4,
          title = 'Distribution Of High Std Features')

In [None]:
# Distributions of the 20 features selected into `skewed`, on a 5 x 4 grid.
skewed_cols = list(skewed.columns)
feat_dist(df = sample_train, cols = skewed_cols, rows = 5, columns = 4,
          title = 'Distribution of Skewed Features')

In [None]:
# Per-column mean of every column from position 4 onward, computed in one vectorised
# call instead of a Python loop, then averaged (ignoring NaN) into a single number.
means = sample_train.iloc[:, 4:].mean().tolist()

# The message previously had no separator between the label and the value.
print(f'Mean of Feature Means : {np.nanmean(means)}')

In [None]:
# Correlation of every numeric column with the target.
# `target` is dropped by name: the previous `.iloc[:-1]` removed the LAST column
# (a feature) instead, and relied on `.iloc[1:]` after sorting to hide the target's
# self-correlation. Non-numeric columns are excluded before calling corrwith.
corr = (sample_train.select_dtypes(include = 'number')
                    .drop(columns = 'target')
                    .corrwith(sample_train['target'])
                    .to_frame())
corr['Abs Corr'] = corr[0].abs()
sorted_corr = corr.sort_values('Abs Corr', ascending = False)['Abs Corr']

# Keep only columns whose absolute correlation reaches the 0.04 display threshold.
# Filtering the Series itself keeps the mask and the data on the same index.
strong_corr = sorted_corr[sorted_corr >= .04].to_frame()

fig, ax = plt.subplots(figsize = (12, 8))
sns.heatmap(strong_corr, cmap = 'coolwarm', annot = True, vmin = -1, vmax = 1, ax = ax)
ax.set_title('Feature Corr With Target')
plt.show()

In [None]:
# Pairwise correlation matrix of all columns from position 4 onward.
corr_train = sample_train.iloc[:, 4:].corr()

# Hierarchically clustered heatmap of that matrix, using correlation distance
# to order the rows and columns.
clustermap_options = dict(metric = 'correlation', cmap = 'Reds', figsize = (20, 20))
sns.clustermap(corr_train, **clustermap_options)

plt.suptitle('Correlations Between Features', fontsize = 24, weight = 'bold')
plt.show()

In [None]:
corr = corr_train.abs()

# Keep only the strict upper triangle so that each unordered pair (a, b) appears exactly
# once and the diagonal is excluded. The previous approach sorted the full matrix and
# took every second row, which is only correct if the two mirrored entries of a pair
# end up adjacent after sorting - not guaranteed when different pairs tie.
upper_triangle = np.triu(np.ones(corr.shape, dtype = bool), k = 1)
corrs = corr.where(upper_triangle).stack().dropna()

# Ten most strongly correlated feature pairs, strongest first.
pair = (corrs.sort_values(ascending = False)
             .rename_axis(['feature_a', 'feature_b'])
             .reset_index(name = 'correlation')
             .head(10))
pair

In [None]:
# Joint regression plot of the first (most correlated) pair in `pair`.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
top_pair = pair.iloc[0]
sns.jointplot(x = sample_train[top_pair['feature_a']],
              y = sample_train[top_pair['feature_b']],
              kind = 'reg', color = cust_color[0], height = 8,
              joint_kws = {'scatter_kws' : dict(alpha = 0.5, edgecolor = 'r', linewidth = 0.5)})
plt.show()

In [None]:
def hex_plot(df, rows = 3, columns = 3, title = None, pairs = None):
    """Draw one log-scaled hexbin panel per feature pair.

    Parameters
    ----------
    df : DataFrame holding the feature columns to plot.
    rows, columns : grid shape of the figure.
    title : optional super-title; nothing is drawn when it is ``None``.
    pairs : table with ``feature_a`` and ``feature_b`` columns naming the pairs.
        Defaults to the notebook-level ``pair`` table, which is what the function
        always used before this parameter existed.
    """
    if pairs is None:
        pairs = pair

    # squeeze=False always returns a 2-D array of axes, so flatten() also works for 1x1.
    fig, axes = plt.subplots(rows, columns, figsize = (30, 25),
                             constrained_layout = True, squeeze = False)
    axes = axes.flatten()

    # zip stops at the shorter of (axes, pairs), so a grid with more panels than pairs
    # leaves the extra panels empty instead of raising IndexError.
    for ax, feat_a, feat_b in zip(axes, pairs['feature_a'], pairs['feature_b']):
        # Plot from the `df` argument; the global sample_train was used before,
        # silently ignoring whatever frame the caller passed in.
        ax.hexbin(df[feat_a], df[feat_b], gridsize = 100, cmap = 'Reds', bins = 'log')
        ax.set_xlabel(feat_a)
        ax.set_ylabel(feat_b)

    if title is not None:
        fig.suptitle(f'{title}', fontsize = 24, weight = 'bold')

In [None]:
# Hexbin panels for the correlated feature pairs, on a 5 x 2 grid.
hex_plot(df = sample_train, rows = 5, columns = 2,
         title = 'Highly Correlated Features')

In [None]:
# Columns from position 4 onward are used as the PCA inputs.
features = sample_train.iloc[:, 4:].columns.tolist()

# Standardise, then fit a full PCA (all components kept).
pipe = Pipeline([('scaler', StandardScaler()), ('pca', PCA())])
pipe.fit(sample_train[features])
pca_samples = pipe.transform(sample_train[features])

explained = pipe.named_steps['pca'].explained_variance_ratio_
n_components = len(explained)

fig, ax = plt.subplots(figsize = (14, 5))

# Cumulative curve; x runs 0..n-1 so it lines up with the bar positions drawn below.
ax.plot(range(n_components), explained.cumsum(), linestyle = '--', drawstyle = 'steps-mid',
        color = cust_color[-1], label = 'Cumulative Explained Variance', linewidth = 1.5)

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x = np.arange(1, n_components + 1), y = explained, alpha = 0.85, color = cust_color[0],
            label = 'Individual Explained Variance', edgecolor = 'black', saturation = 2,
            linewidth = 0.5, ax = ax)

ax.set_ylabel('Explained Variance Ratio', fontsize = 14, fontname = 'monospace', weight = 'semibold')
ax.set_xlabel('Number of Principal Components', fontsize = 14, fontname = 'monospace', weight = 'semibold')
ax.set_title('Explained Variance', fontsize = 20, fontname = 'monospace', weight = 'bold')
plt.xticks(fontsize = 8, rotation = 90)
ax.legend(fontsize = 13)
# Show only the first 100 bar positions; the y range covers the full 0..1 ratio scale.
ax.axis([0, 99, 0, 1])

In [None]:
# Loadings (rows of components_) of the first three principal components, one row per PC.
pca_step = pipe.named_steps['pca']
loadings = pd.DataFrame(pca_step.components_[:3], columns = features)

# Shared symmetric y-limit: 1% above the largest absolute loading across the rows.
maxPC = 1.01 * loadings.loc[0:5, :].abs().max().max()

fig, axes = plt.subplots(3, 1, figsize = (12, 9))
for pc_idx, ax in enumerate(axes):
    pc_loadings = loadings.loc[pc_idx, :]
    # Positive loadings take the first palette colour, the rest take the last one.
    colors = [cust_color[0] if weight > 0 else cust_color[-1] for weight in pc_loadings]
    sns.barplot(x = pc_loadings.index, y = pc_loadings, ax = ax, palette = colors)
    ax.axhline(color = '#888888')
    ax.set_ylabel(f'PC{pc_idx+1}')
    ax.set_ylim(-maxPC, maxPC)
    ax.xaxis.set_tick_params(labelsize = 3, rotation = 90)

plt.suptitle('Component Loadings')
plt.tight_layout()

In [None]:
# Fit one scaler + KMeans pipeline per candidate k (1..7) and record the inertia of each.
k_values = list(range(1, 8))
kmeans_per_k = [
    Pipeline([
        ('scaler', StandardScaler()),
        ('km', KMeans(n_clusters = k, random_state = 42, max_iter = 100, n_init = 5, tol = 1e-4)),
    ]).fit(sample_train[features])
    for k in k_values
]
inertias = [model.named_steps['km'].inertia_ for model in kmeans_per_k]

fig, ax = plt.subplots(figsize = (6, 3))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.lineplot(x = k_values, y = inertias, color = cust_color[0], linewidth = 1.5, ax = ax)
ax.set_xlabel('k', fontsize = 15)
ax.set_ylabel('Inertia', fontsize = 15)

ax.set_title('Inertias and n_clusters', fontname = 'monospace', weight = 'bold')
plt.show()

In [None]:
# Final clustering with k = 4. fit_predict both fits the pipeline and returns the labels,
# so the separate .fit() that preceded it (a second, identical full fit) is dropped.
kmeans = Pipeline([('scaler', StandardScaler()), ('km', KMeans(n_clusters = 4, random_state = 42, max_iter = 100, tol = 1e-4))])
clusters = kmeans.fit_predict(sample_train[features])

# String labels, so the plotting calls below treat the cluster id as a category.
clusters = [str(number) for number in clusters]

In [None]:
# Project the standardised features onto the first two principal components.
pipe = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components = 2))])
pipe.fit(sample_train[features])
pca_samples = pipe.transform(sample_train[features])

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x = pca_samples[:, 0], y = pca_samples[:, 1], hue = clusters)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('Clusters on Reduced Dimension')
plt.show()

In [None]:
# Centroid coordinates of the fitted KMeans step. The step sits after the scaler in the
# pipeline, so the coordinates are expressed in the scaler's output space.
km_step = kmeans.named_steps['km']
centers = pd.DataFrame(km_step.cluster_centers_, columns = features)

fig, axes = plt.subplots(4, 1, figsize = (12, 12))
for cluster_idx, ax in enumerate(axes):
    center = centers.loc[cluster_idx, :]
    # Per-panel symmetric y-limit: 1% above this centroid's largest absolute coordinate.
    maxPC = 1.01 * center.abs().max()
    # Positive coordinates take the first palette colour, the rest take the last one.
    colors = [cust_color[0] if coord > 0 else cust_color[-1] for coord in center]
    ax.axhline(color = '#888888')
    sns.barplot(x = center.index, y = center, ax = ax, palette = colors)
    ax.set_ylabel(f'Cluster {cluster_idx}')
    ax.set_ylim(-maxPC, maxPC)
    ax.xaxis.set_tick_params(labelsize = 3, rotation = 90)

plt.suptitle('Centroid Coordinates')
plt.tight_layout()

In [None]:
# Four-component PCA on the standardised features.
pipe = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components = 4))])
pipe.fit(sample_train[features])
pca_samples = pipe.transform(sample_train[features])

pca_step = pipe.named_steps['pca']
total_var = pca_step.explained_variance_ratio_.sum() * 100

# Axis labels keyed by the dimension index as a string, each annotated with that
# component's share of explained variance; the colour legend is titled 'Cluster'.
labels = {}
for i, var in enumerate(pca_step.explained_variance_ratio_ * 100):
    labels[str(i)] = f'PC {i+1} ({var : .1f}%)'
labels['color'] = 'Cluster'

# Pairwise scatter matrix of the four components, coloured by KMeans cluster label.
scatter_matrix_options = dict(
    color = clusters,
    dimensions = range(4),
    labels = labels,
    title = f'Total Explained Variance : {total_var : .2f}% by Clusters',
    opacity = 0.5,
)
fig = px.scatter_matrix(pca_samples, **scatter_matrix_options)
fig.update_traces(diagonal_visible = False)
fig.show()

In [None]:
# Order the sampled rows by time_id, then accumulate the target within each investment_id
# in that row order.
sample_train = sample_train.sort_values(by = 'time_id')
sample_train['target_cumsum'] = sample_train.groupby('investment_id')['target'].cumsum()

In [None]:
fig, ax = plt.subplots(3, 1, figsize = (12, 12))

# Aggregate once per time_id (each of these was previously computed twice per plot).
by_time = sample_train.groupby('time_id')
obs_count = by_time['investment_id'].nunique()
mean_target = by_time['target'].mean()
mean_cumsum = by_time['target_cumsum'].mean()

# x and y are passed by keyword throughout: seaborn >= 0.12 no longer accepts them
# positionally.
sns.lineplot(x = obs_count.index, y = obs_count, color = cust_color[-1], ax = ax[0])
ax[0].set_ylabel('Observation Count')
ax[0].set_title('Number of Observations by Time')

# Scatter of the per-time mean with a second-order polynomial fit and no confidence band.
trend_panels = [
    (ax[1], mean_target, 'Mean Target', 'Target Values By Time'),
    (ax[2], mean_cumsum, 'Mean Cumulative Target', 'Cumulative Target'),
]
for panel, series, y_label, panel_title in trend_panels:
    sns.regplot(x = series.index, y = series, color = cust_color[0],
                scatter_kws = dict(alpha = 0.5, edgecolor = 'r', linewidth = 0.5),
                line_kws = dict(color = cust_color[-1]), ax = panel, order = 2, ci = None)
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)

plt.tight_layout()

In [None]:
# Attach to every row the mean target of its time_id, then summarise that column's
# distribution.
time_mean_target = sample_train.groupby('time_id')['target'].transform('mean')
sample_train['time_target_mean'] = time_mean_target
plot_dist3(df = sample_train, feature = 'time_target_mean', title = 'Mean Target by Time')