### Support the notebook if you like it. Upvote is FREE :)

# Imports

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Read data

In [None]:
# Load the smaller competition tables; each variable is named after the CSV it is read from.
# `assets` holds the per-asset metadata that later cells plot and merge into the training data.
assets = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')
sup_train = pd.read_csv('../input/g-research-crypto-forecasting/supplemental_train.csv')
sample = pd.read_csv('../input/g-research-crypto-forecasting/example_sample_submission.csv')
ex_test = pd.read_csv('../input/g-research-crypto-forecasting/example_test.csv')

In [None]:
# Main training table; the EDA cells below all start from this frame.
train = pd.read_csv('../input/g-research-crypto-forecasting/train.csv')

# EDA

In [None]:
# Preview the first rows to see which columns the training data provides.
train.head()

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape

### We have 24 Million records with 10 columns

In [None]:
# Number of distinct timestamp values present in the training frame.
train['timestamp'].nunique()

### We have a total of 1.9 million unique timestamps

### Let's explore the assets CSV file

In [None]:
# Display the full asset table read from asset_details.csv.
assets

### Let's check how the weights are distributed

In [None]:
# Bar chart of the weight of each asset, heaviest first.
# Keep the axes handle and draw on it explicitly instead of relying on the implicit
# "current axes"; the trailing semicolon suppresses the textual repr of the last call.
fig, ax = plt.subplots(figsize=(20,10))
sns.barplot(x='Asset_Name', y='Weight', data=assets.sort_values(by=['Weight'],ascending=False), ax=ax)
ax.set_title('Weight per asset');

### Let's look at the records available for a single timestamp

In [None]:
# All rows that share one timestamp value. Read as epoch seconds (the unit used later
# when the column is converted with pd.to_datetime), 1514764860 is 2018-01-01 00:01:00 UTC.
train[train['timestamp'] == 1514764860]

Please note that data for a total of 14 assets (cryptocurrencies) is given.
For a particular timestamp, there is not necessarily a record for every asset.
In the table above, only 7 assets have a record, and the Target value for one of them is NaN.

# Cleaning

### Let's check for missing values

In [None]:
# Count of missing (NaN) entries in each column of the training frame.
train.isnull().sum()

### Join Asset tables for Analysis.

In [None]:
# Attach the asset metadata to every training row via Asset_ID.
# A left join keeps every row of `train`, whether or not a matching asset row exists.
df_train = train.merge(assets, on="Asset_ID", how="left")
df_train.head()

# Visualize

### Let's see the number of records per cryptocurrency

In [None]:
# New frame without any row that contains a missing value in any column;
# df_train itself is left untouched.
df = df_train.dropna()

In [None]:
# Number of complete (NaN-free) records available for each asset.
# Draw on an explicit axes and title the figure so it can be read on its own.
fig, ax = plt.subplots(figsize=(20,10))
sns.countplot(x='Asset_Name', data=df, ax=ax)
ax.set_title('Number of records per asset');

### Comparatively, there are fewer records for Maker, Dogecoin, IOTA & Monero

In [None]:
# Random peek at 20 rows; the fixed seed makes the displayed rows reproducible across re-runs.
df.sample(20, random_state=42)

In [None]:
# Correlation heatmap of the numeric columns only.
# `df` still carries the string column Asset_Name from the merge; recent pandas versions
# no longer drop non-numeric columns silently in DataFrame.corr() and raise instead,
# so the numeric columns are selected explicitly (this works on old and new pandas alike).
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(df.select_dtypes(include='number').corr(), ax=ax)
ax.set_title('Correlation between numeric columns');

### The correlation here is expected, as the Open, Close, High, Low and VWAP (volume-weighted average price) values should be very close within a timestamp. Please note that no significant correlation was found for the Target column.

In [None]:
def reduce_mem_usage(df, do_categoricals=False, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that holds their values.

    The frame is modified IN PLACE and also returned, so the returned object is
    the same object as the argument.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected one by one.
    do_categoricals : bool, default False
        When True, ``object`` columns are converted to the ``category`` dtype.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns may be stored as
        ``float16``. Half precision keeps only about three significant decimal
        digits, so pass False to make ``float32`` the smallest float type.

    Returns
    -------
    pandas.DataFrame
        The same frame, after downcasting.

    Notes
    -----
    * Signed integers are narrowed within int8..int64, unsigned integers within
      uint8..uint64, floats within float16/float32/float64.
    * Range checks are inclusive, so a value sitting exactly on a type limit
      (e.g. 127 or -128 for int8) still fits that type.
    * A column is never widened: a candidate is only used if it is strictly
      smaller than the current dtype.
    * Boolean, datetime, timedelta, categorical and other non-numeric or
      extension-typed columns are left untouched.
    * Start/end memory and the saving are printed to stdout.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = ((np.float16,) if use_float16 else ()) + (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            if do_categoricals:
                df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a meaningful numeric range to
        # shrink; min/max comparisons on bool, datetime or category data are either
        # wrong (bool -> float) or raise TypeError.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits_of = float_types, np.finfo
        elif col_type.kind == 'u':
            candidates, limits_of = unsigned_types, np.iinfo
        else:
            candidates, limits_of = signed_types, np.iinfo

        for candidate in candidates:
            # Candidates are ordered by size: once one is as wide as the current
            # dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # NaN or infinite extremes fail this test, which leaves the column as is.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a frame that reports zero bytes.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))
    return df

# reduce_mem_usage rewrites the columns of the frame it is given and returns that same
# frame, so `dtrain` and `df` refer to one object after this line.
dtrain = reduce_mem_usage(df)

In [None]:
# Derive calendar fields from the epoch-second timestamps.
# The vectorised .dt accessor yields the same date / time objects as calling
# d.date() / d.time() row by row, without a Python-level call per row.
dtrain['fulldate'] = pd.to_datetime(dtrain['timestamp'], unit='s')
dtrain['date'] = dtrain['fulldate'].dt.date
dtrain['time'] = dtrain['fulldate'].dt.time
dtrain.head()

In [None]:
# Random subset used for the per-asset line plots. The seed makes the figures reproducible,
# and capping n at the frame length avoids a ValueError when fewer than 10000 rows are available.
dtrain_sample = dtrain.sample(min(10000, len(dtrain)), random_state=42)

In [None]:
# Preview the first rows of the random subset drawn from dtrain.
dtrain_sample.head()

In [None]:
# One VWAP-by-date panel per asset, two panels per row.
# The grid is sized from the asset table rather than a hard-coded 7x2, so the cell keeps
# working if the number of assets changes.
asset_names = list(assets['Asset_Name'])
n_cols = 2
n_rows = max(1, -(-len(asset_names) // n_cols))  # ceiling division, at least one row
# squeeze=False guarantees a 2-D axes array even for a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 50), squeeze=False)
panels = axes.ravel()
for ax, asset in zip(panels, asset_names):
    df_crypt = dtrain_sample[dtrain_sample['Asset_Name'] == asset]
    sns.lineplot(x="date", y="VWAP", data=df_crypt, ax=ax)
    ax.set_title(asset)
# Hide the leftover panel when the number of assets does not fill the last row.
for ax in panels[len(asset_names):]:
    ax.set_visible(False)

In [None]:
# First and last calendar day with data for every asset.
# A single grouped min/max over the numeric timestamps replaces sorting each asset's rows
# twice just to read the first and the last value.
print("Available data period")
ts_range = dtrain.groupby('Asset_Name')['timestamp'].agg(['min', 'max'])
for asset in assets['Asset_Name']:
    if asset not in ts_range.index:
        # Positional indexing on an empty selection would raise IndexError here.
        print( "no data ------> {}".format(asset))
        continue
    first_day = pd.to_datetime(int(ts_range.at[asset, 'min']), unit='s').date()
    last_day = pd.to_datetime(int(ts_range.at[asset, 'max']), unit='s').date()
    print( "{} to {} ------> {}".format(first_day, last_day, asset ))

# References

1. [Let's Talk Validation: GroupTimeSeriesSplit](https://www.kaggle.com/yamqwe/let-s-talk-validation-grouptimeseriessplit/notebook#References)