### The objective of this notebook is to see whether there are relationships between investment IDs, using their historical targets through time
### We plot the results

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from mpl_toolkits import mplot3d
from scipy import stats
from pathlib import Path
import pickle
import math
import time
import umap
from sklearn.cluster import KMeans
import ipympl

In [None]:
%%time
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
train.info()
train.head()

#### Pivot the target so that rows are investment IDs and columns are time IDs (missing combinations are filled with 0)

In [None]:
# Wide table of targets: one row per investment_id, one column per time_id.
# The three columns are cast to float32 before pivoting, and (investment_id, time_id)
# pairs with no observation are filled with 0.
inv_piv = (
    train[['investment_id', 'target', 'time_id']]
    .astype(np.float32)
    .pivot(index='investment_id', columns='time_id', values='target')
    .fillna(0)
)
# Column labels (the time_ids) as a plain list, used to select the matrix columns later.
invcols = inv_piv.columns.to_list()

#### We use umap to reduce dimensionality and kmeans to group investment ids. Then we plot the results.

#### There are some relationships, but this may be due to the distinction between frequent and infrequent investment IDs appearing in the dataset

In [None]:
%matplotlib inline
pipe = Pipeline([('umap', umap.UMAP(n_components=3, min_dist=0, n_neighbors=10, random_state=21))])

pipe.fit(inv_piv[invcols])

kmeans = KMeans(n_clusters=5)
kmeans.fit(pipe['umap'].embedding_)


# Create the figure
fig = plt.figure(figsize=(8,8))
ax = plt.axes(projection='3d')

ax.scatter3D(pipe['umap'].embedding_[:, 0], pipe['umap'].embedding_[:, 1], 
           pipe['umap'].embedding_[:, 2], c = kmeans.labels_, s=0.5)
plt.show()

# Create the figure
fig = plt.figure(figsize=(8,8))
ax = plt.axes(projection='3d')

ax.scatter3D(pipe['umap'].embedding_[:, 1], pipe['umap'].embedding_[:, 0], 
           pipe['umap'].embedding_[:, 2], c = kmeans.labels_, s=0.5)
plt.show()

# Create the figure
fig = plt.figure(figsize=(8,8))
ax = plt.axes(projection='3d')

ax.scatter3D(pipe['umap'].embedding_[:, 2], pipe['umap'].embedding_[:, 1], 
           pipe['umap'].embedding_[:, 0], c = kmeans.labels_, s=0.5)
plt.show()

#### The cell below makes the plot interactive using Plotly Express

In [None]:
import plotly.express as px

# Refit UMAP (with n_neighbors=30 in this cell) and k-means, then draw the 3D embedding
# as an interactive Plotly scatter coloured by cluster.
pipe = Pipeline([('umap', umap.UMAP(n_components=3, min_dist=0, n_neighbors=30, random_state=21))])

pipe.fit(inv_piv[invcols])
emb_3d = pipe['umap'].embedding_  # fitted embedding; 3 columns because n_components=3

# Seed k-means so the cluster assignments (and colours) are reproducible across runs.
kmeans = KMeans(n_clusters=5, random_state=21)
kmeans.fit(emb_3d)

# Cluster labels are arbitrary IDs, so pass them as strings to get one discrete colour per
# cluster instead of a continuous colour scale. They are deliberately not mapped to marker
# size: size=labels gives cluster 0 a marker size of 0, which makes its points invisible.
fig = px.scatter_3d(x=emb_3d[:, 0], y=emb_3d[:, 1], z=emb_3d[:, 2],
                    color=kmeans.labels_.astype(str), labels={'color': 'cluster'},
                    opacity=0.7)
fig.update_traces(marker=dict(size=3))

# tight layout (last expression, so the figure is rendered as the cell output)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))