# Load Data

In [None]:

import pandas as pd
import numpy as np

import plotly.express as px


In [None]:
# Read the competition's training table and preview its first rows.
df = pd.read_csv(
    filepath_or_buffer='../input/g-research-crypto-forecasting/train.csv',
)
df.head()

In [None]:
# Expose the label under the name 'y': append it as a new trailing column,
# then discard the original 'Target' column.
df = df.assign(y=df['Target']).drop(columns=['Target'])

In [None]:
# Read the per-asset metadata table and preview its first rows.
asset_details = pd.read_csv(
    filepath_or_buffer='../input/g-research-crypto-forecasting/asset_details.csv',
)
asset_details.head()

In [None]:
# Lookup table: Asset_ID -> Asset_Name (a later duplicate ID would win).
dic = asset_details.set_index('Asset_ID')['Asset_Name'].to_dict()

# Plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# One stacked panel per asset ID (0..13), each showing that asset's 'High'
# price over time, min-max scaled to [0, 1].
subplots=plt.subplots(14,1,figsize=(10,40))[1]
for asset in range(14):
    crop=df[df['Asset_ID']==asset]
    high=crop['High']
    high_min=np.min(high)
    high_max=np.max(high)
    # Parenthesise the numerator: without the parentheses, operator precedence
    # evaluates High - (min / (max - min)), which is not a normalisation.
    scaled_high=(high-high_min)/(high_max-high_min)
    subplots[asset].plot(pd.to_datetime(crop['timestamp'],unit='s').dt.date,scaled_high,label=dic[asset])
    subplots[asset].legend()

plt.show()
plt.close()

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Signed-integer columns are cast to the narrowest of int8 / int16 / int32
    whose range contains the column's min and max; float columns are cast to
    float16 or float32 on the same range test.  Every other column (object,
    category, datetime, bool, unsigned, extension dtypes, ...) is left as is,
    and a column is never cast to a dtype wider than the one it already has.

    The memory footprint before and after, and the relative saving, are
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  It is modified IN PLACE (columns are reassigned).
    allow_float16 : bool, default True
        When True (the historical behaviour) floats whose range fits are
        stored as float16.  The check is on range only: float16 keeps roughly
        3 significant decimal digits, so pass False to stop at float32 when
        precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, narrowest first.
    int_targets = (np.int8, np.int16, np.int32)
    float_targets = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        dtype = df[col].dtype
        # Only plain NumPy signed-int / float columns are downcast; comparing
        # the min/max of anything else against numeric limits is meaningless.
        if not isinstance(dtype, np.dtype):
            continue
        if dtype.kind == 'i':
            targets = [(t, np.iinfo(t)) for t in int_targets]
        elif dtype.kind == 'f':
            targets = [(t, np.finfo(t)) for t in float_targets]
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for target, limits in targets:
            # Candidates only get wider from here on; never upcast.
            if np.dtype(target).itemsize >= dtype.itemsize:
                break
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # so such a column keeps its dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Features

In [None]:
def upper_shadow(df):
    """Return ``High`` minus the larger of ``Close`` and ``Open``, row by row."""
    body_top = np.maximum(df['Close'], df['Open'])
    shadow = df['High'] - body_top
    return shadow

def lower_shadow(df):
    """Return the smaller of ``Close`` and ``Open`` minus ``Low``, row by row."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    shadow = body_bottom - df['Low']
    return shadow

def add_features(dataframe):
    """Return a copy of ``dataframe`` with calendar and candle features added.

    The input frame is not modified.  It must provide the columns
    ``timestamp`` (seconds since the Unix epoch), ``Open``, ``High``, ``Low``,
    ``Close``, ``Volume`` and ``Count``.

    Added columns, in order:
      * ``Day of Week`` - pandas ``dayofweek`` of the timestamp (Monday = 0)
      * ``weekend``     - 1 when ``Day of Week`` > 4, else 0
      * ``after``       - 1 when the timestamp is later than 2018-06-04
      * ``afterdoge``   - 1 when the timestamp is later than 2021-03-29
      * ``Month``       - calendar month (1-12)
      * ``upper_Shadow`` / ``lower_Shadow`` - see upper_shadow / lower_shadow
      * ``high_div_low`` - High / Low
      * ``trade``       - Close - Open
      * ``gtrade``      - trade / Count
      * ``shadow1`` / ``shadow3`` / ``shadow5`` - trade, upper_Shadow and
        lower_Shadow, each divided by Volume

    The ratio columns are plain divisions: a zero ``Count`` or ``Volume``
    yields inf or NaN in the corresponding row.
    """
    # Work on a copy so the caller's frame does not gain any columns.
    features = dataframe.copy()

    # Keep the parsed timestamps in a local Series instead of a temporary
    # column that would have to be dropped again before returning.
    stamps = pd.to_datetime(features['timestamp'], unit='s')

    features["Day of Week"] = stamps.dt.dayofweek
    features['weekend'] = np.where(features['Day of Week'] > 4, 1, 0)

    # NOTE(review): the notebook text ties these cut-offs to two Elon Musk
    # tweets; the dates are kept exactly as they were - confirm them.
    bitcoin_tweet_date = pd.Timestamp('2018-06-04')
    doge_tweet_date = pd.Timestamp('2021-03-29')
    features['after'] = (stamps > bitcoin_tweet_date).astype(int)
    features['afterdoge'] = (stamps > doge_tweet_date).astype(int)

    features['Month'] = stamps.dt.month
    features['upper_Shadow'] = upper_shadow(features)
    features['lower_Shadow'] = lower_shadow(features)
    features["high_div_low"] = features["High"] / features["Low"]
    features['trade'] = features['Close'] - features['Open']
    features['gtrade'] = features['trade'] / features['Count']
    features['shadow1'] = features['trade'] / features['Volume']
    features['shadow3'] = features['upper_Shadow'] / features['Volume']
    features['shadow5'] = features['lower_Shadow'] / features['Volume']

    return features
# Shrink the numeric dtypes first, then derive the engineered features.
df = reduce_mem_usage(df)
df = add_features(df)
df.head()


In [None]:
def draw_card(df1,df2=False):
    """Print one ``|column : value|`` line per numeric column of ``df1``.

    NaNs are replaced through ``np.nan_to_num`` before averaging.

    Parameters
    ----------
    df1 : mapping of column name -> values (e.g. a pandas DataFrame)
        Columns to summarise.
    df2 : same kind of mapping, or False / None
        Optional baseline.  For every column that ``df2`` also contains and
        that can be averaged there, the printed value is
        ``mean(df1[column]) - mean(df2[column])``; otherwise it is just
        ``mean(df1[column])``.

    Columns of ``df1`` whose mean cannot be computed (non-numeric data) are
    skipped silently.
    """
    has_baseline = df2 is not None and df2 is not False
    for column in list(df1.keys()):
        try:
            value = np.mean(np.nan_to_num(df1[column]))
        except (TypeError, ValueError):
            # Non-numeric column: nothing meaningful to report.
            continue

        if has_baseline and column in df2:
            try:
                value = value - np.mean(np.nan_to_num(df2[column]))
            except (TypeError, ValueError):
                # Baseline column is not numeric: report df1's mean alone.
                pass

        print(f'|{column} : {value}|')
                    
        

# Weekend

On weekends, trading activity tends to slow down, so the `Count` feature (the number of trades) should decrease. The price should decrease in turn, because crypto follows one simple rule: low demand means a low price, and high demand means a high price.

In [None]:
# Per asset: mean of every column on weekends minus the mean on weekdays,
# computed on rows without any missing value.
for asset in range(14):
    crop = df[df['Asset_ID'] == asset]
    print(dic[asset])
    crop = crop.dropna(how='any')

    weekend_rows = crop[crop['weekend'] == 1]
    weekday_rows = crop[crop['weekend'] == 0]
    draw_card(weekend_rows, weekday_rows)


# Two Elon Musk tweets

'You can now buy a Tesla with Bitcoin' - We all know what happened to the crypto price: it went to space.

'Doge to the moon' - Dogecoin's price increased by 30%.


Observe the differences


In [None]:
# (heading, asset ID, indicator column) for each before/after comparison.
for heading, asset, flag in (
    ('Bitcoin After Elon', 1, 'after'),
    ('-------\nDogecoin after Elon', 4, 'afterdoge'),
):
    print(heading)
    crop = df[df['Asset_ID'] == asset]
    print('|' + dic[asset] + '|')

    # Mean of every column after the date minus the mean before it.
    draw_card(crop[crop[flag] == 1], crop[crop[flag] == 0])


# Months

I am not quite sure why this happens, but I think it is mainly because natural disasters are more likely in certain months, which can affect crypto prices.

In [None]:
import calendar

# Mean of every column for each calendar month, across all assets.
for month in range(1, 13):
    # Label the card with the month itself; the loop value is a month number,
    # not an asset ID, so it must not be looked up in the asset-name table.
    print(calendar.month_name[month])

    draw_card(df[df['Month'] == month])


# Training

https://www.kaggle.com/swaralipibose/new-features-training-notebook-and-feature-eval