In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the competition training file into a DataFrame and preview its first rows.
train_csv_path = '../input/g-research-crypto-forecasting/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Display pandas' default summary statistics for the raw training frame.
df.describe()

In [None]:
# plot close values as time series for all the assets
# The asset IDs are taken from the data itself (not a hard-coded range) so that
# every asset present in `df` gets a panel, and the subplot grid is sized to fit
# exactly that many panels.
n_cols = 2
n_assets = df["Asset_ID"].nunique()
n_rows = (n_assets + n_cols - 1) // n_cols  # ceiling division

f = plt.figure(figsize=(15,30))

# groupby yields the groups in ascending Asset_ID order, so df does not have to
# be re-filtered once per asset.
for panel, (asset_id, asset_rows) in enumerate(df.groupby("Asset_ID"), start=1):
    coin_df = asset_rows.set_index("timestamp")
    ax = f.add_subplot(n_rows, n_cols, panel)
    # Draw on the axes object directly instead of relying on pyplot's implicit
    # "current axes" state.
    ax.plot(coin_df['Close'], label=asset_id)
    ax.legend()
    ax.set_xlabel('Time')

f.tight_layout()
plt.show()

In [None]:
# Build `coins_df`: one column per asset (Asset_ID 0-9), indexed by timestamp,
# holding min-max normalised period-over-period returns of the Close price.

# Collect the per-asset Close columns first and concatenate once; growing a
# DataFrame with pd.concat inside the loop re-copies all previous data on every
# iteration.
close_frames = []
for asset_id in range(10):
    coin_df = df[df["Asset_ID"]==asset_id].set_index("timestamp")[['Close']].rename(columns={'Close':asset_id})
    close_frames.append(coin_df)

# If the assets do not share identical timestamps, the outer-joined index is not
# guaranteed to come out in chronological order, so sort it before computing
# row-to-row changes.
coins_df = pd.concat(close_frames, axis=1).sort_index()
del close_frames  # the per-asset frames are no longer needed; release the memory

coins_df = coins_df.pct_change() #get returns
# Min-max scale each column independently into the [0, 1] range.
coins_df = (coins_df-coins_df.min())/ (coins_df.max() - coins_df.min())
# Keep only rows where every asset has a value (assignment instead of inplace
# mutation, so the cell rebuilds coins_df from df on every run).
coins_df = coins_df.dropna() #just ignoring the liberties I'm taking here

In [None]:
# induce a sudden spike anomaly
ANOMALY_TIMESTAMP = 1555080360  # row label (timestamp) that receives the spike
ANOMALY_ASSET_ID = 2            # column label (asset) that receives the spike
ANOMALY_VALUE = 0.7             # value written into that cell

# Assigning through .loc with a label that does not exist silently appends a new
# row/column filled with NaN instead of failing, which would leave the intended
# cell untouched. Fail loudly if the target cell is missing.
if ANOMALY_TIMESTAMP not in coins_df.index or ANOMALY_ASSET_ID not in coins_df.columns:
    raise KeyError(
        f"cannot inject anomaly: ({ANOMALY_TIMESTAMP}, {ANOMALY_ASSET_ID}) is not in coins_df"
    )
coins_df.loc[ANOMALY_TIMESTAMP, ANOMALY_ASSET_ID] = ANOMALY_VALUE

In [None]:
# get a sense of how noisy this data is by colour coding the values
cm = sns.light_palette("green", as_cmap=True)
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0; an
# explicit format string shows the same 4 decimal places on old and new pandas.
coins_df.head(20).style.background_gradient(cmap=cm,axis=None).format("{:.4f}")

In [None]:
# Draw every column of coins_df on one wide, semi-transparent line chart.
fig, ax = plt.subplots(figsize=(15,5))
coins_df.plot(ax=ax, alpha=0.5);

In [None]:
from sklearn.decomposition import PCA
from numpy.testing import assert_array_almost_equal

# Work on the first 10,000 rows only: the full dataset needs too much memory.
X_train = coins_df.iloc[:10000]

# Fit a 3-component PCA and project the training rows onto those components.
pca = PCA(n_components=3).fit(X_train)
X_train_pca = pca.transform(X_train)

# Map the 3-component scores back into the original column space.
X_projected = pca.inverse_transform(X_train_pca)


In [None]:
# Wrap the reconstruction in a DataFrame labelled exactly like X_train (rows and
# columns), so element-wise comparisons against X_train align explicitly.
X_projected = pd.DataFrame(X_projected, index=X_train.index, columns=X_train.columns)
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0; an
# explicit format string shows the same 4 decimal places on old and new pandas.
X_projected.head(20).style.background_gradient(cmap=cm,axis=None).format("{:.4f}")

In [None]:
# Colour-coded difference between the input rows and their PCA reconstruction.
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0; an
# explicit format string shows the same 4 decimal places on old and new pandas.
(X_train - X_projected).head(20).style.background_gradient(cmap=cm,axis=None).format("{:.4f}")

In [None]:
# Line plot of the first 20 input rows, one line per column.
X_train.iloc[:20].plot();

In [None]:
# Line plot of the first 20 rows of (input - PCA reconstruction), one line per column.
reconstruction_error = X_train - X_projected
reconstruction_error.head(20).plot();