In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from collections import deque
import io
import os
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import max_error
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error,make_scorer
from xgboost import XGBRegressor as xgb
from sklearn.cluster import MeanShift, KMeans
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# df_asset supplies the Asset_ID / Asset_Name columns used to label features;
# test1 holds the contents of example_test.csv.
df_asset = pd.read_csv(
    '../input/g-research-crypto-forecasting/asset_details.csv'
)
test1 = pd.read_csv(
    '../input/g-research-crypto-forecasting/example_test.csv'
)

In [None]:
# Read the training CSV and keep only its last 10,000,000 rows.
df_train = (
    pd.read_csv('../input/g-research-crypto-forecasting/train.csv')
    .tail(10000000)
)


In [None]:
# Extend the training frame with the supplemental file.
# The previous ``df_train.append(...)`` call returned a new frame that was never
# assigned (so the supplemental rows were silently discarded), and
# ``DataFrame.append`` no longer exists in pandas 2.x. ``pd.concat`` works on
# every pandas version.
df_train=pd.concat(
    [df_train,pd.read_csv('../input/g-research-crypto-forecasting/supplemental_train.csv')]
)
# Keep one row per (timestamp, asset): the feature builder joins assets on
# timestamp, and duplicated keys would multiply rows in that join.
df_train=df_train.drop_duplicates(subset=['timestamp','Asset_ID'],keep='last')

In [None]:

def analysis(data1,name):
    """Add technical-indicator columns to one asset's OHLC frame and suffix every column.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Rows of a single asset. Must contain ``Open``, ``High``, ``Low`` and
        ``Close`` columns. The frame is modified IN PLACE (columns are added and
        then renamed) and is also returned.
    name : sequence of str
        Only ``name[0]`` is used: it is stripped, spaces become underscores, and
        the result is appended to every column name (``Close`` -> ``CloseBitcoin``).

    Returns
    -------
    pandas.DataFrame
        ``data1`` with the indicator columns added and all columns suffixed.

    Raises
    ------
    IndexError
        If ``name`` is empty.

    Notes
    -----
    * ``EMA4`` / ``EMA24`` / ``sma4h`` are *simple* trailing moving averages
      (uniform weights) despite the ``EMA`` names; the names are kept because
      downstream code selects columns by name.
    * Rows that do not yet have a full window get 0 for the moving averages and
      NaN for the rolling standard deviations.
    * ``%diffSTD`` is one standard deviation computed over the whole ``%diff``
      column and broadcast to every row.
    * Divisions by zero (e.g. ``High == Low``) produce inf/NaN; no cleanup is
      done here.
    """
    def moving_average(x,w):
        # Trailing simple moving average of width ``w``; the first ``w-1``
        # positions (no full window yet) are filled with zeros.
        n=len(x)
        if n<w:
            # No complete window at all. Previously np.convolve silently swapped
            # its operands here, producing meaningless values or a negative pad
            # length (ValueError); an all-warm-up series is all zeros.
            return(np.zeros(n))
        con=np.convolve(x, np.ones(w), 'valid') / w
        return(np.append(np.zeros(n-len(con)),con))

    name=name[0].strip().replace(' ','_')

    # Pre-create the feature columns so their order in the frame is fixed
    # regardless of the order in which they are filled below.
    feature_cols=['current_pos','%diff','diffhl','%upfromstart','EMA4','EMA24','MACD','MACDSignalLine','Histogram','sma4h','std','upboil','boilint','lowboil',
                  'current_boil_pos','%diffSTD','Log_Ret','Volatility','std24upfromstart','std4upfromstart','std24diffhl','std4diffhl']
    for col in feature_cols:
        data1[col]=np.nan

    npclose=np.array(data1['Close'])
    nphigh=np.array(data1['High'])
    nplow=np.array(data1['Low'])
    npopen=np.array(data1['Open'])

    # Single-candle shape features, all expressed in percent.
    data1.loc[:,'current_pos']=((npclose-nplow)/(nphigh-nplow))*100
    data1.loc[:,'%diff']=((npclose-npopen)/npopen)*100
    data1.loc[:,'diffhl']=((nphigh-nplow)/nplow)*100
    data1.loc[:,'%upfromstart']=((nphigh-npopen)/npopen)*100

    # Moving-average crossover features (slow average minus fast average).
    data1.loc[:,'EMA4']= moving_average(npclose,4)
    data1.loc[:,'EMA24']=moving_average(npclose,24)
    data1.loc[:,'MACD'] = np.array(data1['EMA24']) - np.array(data1['EMA4'])
    data1.loc[:,'MACDSignalLine'] = moving_average(data1['MACD'],4)
    data1.loc[:,'Histogram'] = np.array(data1['MACD']) - np.array(data1['MACDSignalLine'])

    # Band around the 24-row average: average +/- 2 rolling standard deviations.
    data1.loc[:,'sma4h'] = moving_average(npclose,24)
    data1.loc[:,'std'] = data1['Close'].rolling(window=24).std()
    data1.loc[:,'upboil'] = np.array(data1['sma4h']) + 2 * np.array(data1['std'])
    data1.loc[:,'lowboil'] = np.array(data1['sma4h']) - 2 * np.array(data1['std'])
    # Placeholder: no band-breach signal is computed, the column is constant 0.
    data1.loc[:,'boilint']=0
    data1.loc[:,'current_boil_pos']=((np.array(data1['Close'])-np.array(data1['lowboil']))/(np.array(data1['upboil'])-np.array(data1['lowboil'])))*100
    data1.loc[:,'%diffSTD']=np.array(data1['%diff']).std()

    # Log returns and their rolling volatility.
    data1.loc[:,'Log_Ret'] = np.log(data1['Close'] / data1['Close'].shift(1))
    data1.loc[:,'Volatility'] = data1['Log_Ret'].rolling(window=24).std() * np.sqrt(24)

    data1.loc[:,'std24upfromstart'] = data1['%upfromstart'].rolling(window=24).std()
    data1.loc[:,'std4upfromstart']= data1['%upfromstart'].rolling(window=4).std()

    data1.loc[:,'std24diffhl'] = data1['diffhl'].rolling(window=24).std()
    data1.loc[:,'std4diffhl'] = data1['diffhl'].rolling(window=4).std()

    # Suffix every column (inputs and features) with the asset name in one step.
    data1.columns=[col+name for col in data1.columns]
    return(data1)

In [None]:
def analysing(df_train_2,df_asset):
    """Build one wide, timestamp-indexed feature frame covering every asset.

    For each distinct ``Asset_ID`` in ``df_train_2`` the asset's rows are
    indexed by ``timestamp``, passed through ``analysis`` (which adds indicator
    columns and suffixes all columns with the asset name), and inner-joined on
    the index with the frames of the assets processed so far.

    Parameters
    ----------
    df_train_2 : pandas.DataFrame
        Long-format rows with at least ``Asset_ID`` and ``timestamp`` plus the
        columns ``analysis`` requires. A ``row_id`` column, if present, is
        dropped.
    df_asset : pandas.DataFrame
        Lookup table with ``Asset_ID`` and ``Asset_Name`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per timestamp that is present for *every* asset (inner join).
        An empty frame is returned when ``df_train_2`` has no rows.
    """
    # ``None`` marks "nothing accumulated yet". Testing ``df3.empty`` instead
    # would also fire after an inner join that legitimately produced zero rows
    # and would silently restart the accumulation, losing earlier assets.
    df3=None
    for i in df_train_2['Asset_ID'].unique():
        coin_name=df_asset['Asset_Name'].loc[df_asset['Asset_ID']==i].values
        coin_name_data=df_train_2[df_train_2['Asset_ID']==i].set_index('timestamp')
        # row_id is not always present; ignore its absence explicitly instead
        # of swallowing every exception.
        coin_name_data.drop(columns=['row_id'],errors='ignore',inplace=True)

        coin_name_data=analysis(coin_name_data,coin_name)

        if df3 is None:
            df3=coin_name_data
        else:
            df3=df3.merge(coin_name_data,how='inner',left_index=True, right_index=True)
        del coin_name_data

    if df3 is None:
        return(pd.DataFrame())
    return(df3)



In [None]:
def final_df_def(df):
    """Return the last 8000 rows of ``df`` after removing test timestamps.

    The timestamps to remove come from a module-level ``df_test`` frame
    (``df_test.timestamp``). When no usable ``df_test`` exists in the global
    namespace the function prints ``'ok'`` and simply returns the last 8000
    rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Timestamp-indexed feature frame. It is never modified.

    Returns
    -------
    pandas.DataFrame
        A frame independent of ``df`` holding at most 8000 trailing rows.
    """
    try:
        test_times=df_test.timestamp.unique()
    except (NameError,AttributeError):
        # df_test is a global that may not have been created yet.
        print('ok')
        return(df.tail(8000).copy())

    # Drop every test timestamp that is present. Timestamps absent from the
    # index are skipped instead of aborting the whole removal part-way.
    final_df=df.drop(index=test_times,errors='ignore')
    return(final_df.tail(8000))

In [None]:
def get_targets(df_asset):
    """Move each asset's ``Target<asset>`` column out of the global ``final_df``.

    Asset names are taken from ``df_asset['Asset_Name']`` with spaces replaced
    by underscores. For every name the matching ``'Target' + name`` column is
    removed from the module-level ``final_df`` (which is mutated) and stored in
    the returned dict under that name.

    Returns
    -------
    dict
        Maps asset name (underscored) to its target Series.
    """
    targets={}
    for raw_name in df_asset['Asset_Name'].unique():
        key=raw_name.replace(' ','_')
        # pop == read the column, then delete it from the frame.
        targets[key]=final_df.pop('Target'+key)
    return(targets)



In [None]:
def rmse(predict, actual):
    """Root-mean-square error between two equally shaped array-likes.

    Both arguments are converted to NumPy arrays; the result is the square
    root of the mean of the squared element-wise differences.
    """
    residual = np.array(predict) - np.array(actual)
    return np.sqrt((residual ** 2).mean())

# Wrap rmse as a scikit-learn scorer; greater_is_better=False marks it as a
# loss (lower is better) for model-selection utilities.
rmse_score = make_scorer(rmse, greater_is_better = False)

In [None]:
def model_training(cryp,data,targets,models,pca_dict,columns_dict,scaler_dict,best_params):
    """Fit the scaler -> PCA -> SVR pipeline for one asset and register its parts.

    Steps:
      1. Rank all columns of ``data`` by absolute Pearson correlation with the
         asset's target and keep the top 20 entries of that ranking (the
         target itself is part of the ranking and is removed afterwards).
      2. Use the first 95% of rows (no shuffling) as the training set.
      3. Standardise, reduce with PCA keeping 98% of the variance, fit an SVR
         with fixed hyper-parameters.

    Parameters
    ----------
    cryp : str
        Asset name; key into ``targets`` and into every output dict.
    data : pandas.DataFrame
        Feature frame (not modified; a copy is taken).
    targets : dict
        Maps asset name to its target Series, aligned with ``data``.
    models, pca_dict, columns_dict, scaler_dict, best_params : dict
        Updated in place under key ``cryp`` with the fitted SVR, the fitted
        PCA, the selected column index, the fitted scaler and the SVR
        parameters respectively.

    Returns
    -------
    tuple
        ``(models, pca_dict, columns_dict, scaler_dict, best_params)`` - the
        same dict objects that were passed in.
    """
    target_col='future'+cryp
    y=targets[cryp]

    X=data.copy()
    X[target_col]=y
    cordata=abs(X.corr(method='pearson')[target_col]).sort_values(ascending=False)
    # The target normally ranks first (correlation 1 with itself); it may be
    # missing from the top 20 when its correlation is NaN, hence errors='ignore'
    # rather than a blanket try/except.
    X=X[cordata.head(20).index].drop(columns=[target_col],errors='ignore')

    # Chronological split: only the training part is used; the held-out 5%
    # is not evaluated here.
    X_train, _, y_train, _ = train_test_split(X, y, train_size=0.95, test_size=0.05,shuffle=False )

    columns=X.columns
    pca = PCA(.98)
    scaler = StandardScaler()
    X_train=scaler.fit_transform(X_train)
    X_train=pca.fit_transform(X_train)
    X_train=pd.DataFrame(X_train)

    # Fixed hyper-parameters (no search is run).
    pp={'C': 1, 'gamma': 1e-8, 'kernel': 'rbf','epsilon':0.001}
    model=SVR()
    model.set_params(**pp)

    model=model.fit(X_train,y_train)
    models[cryp]=model
    pca_dict[cryp]=pca
    columns_dict[cryp]=columns
    scaler_dict[cryp]=scaler
    best_params[cryp]=pp

    return(models,pca_dict,columns_dict,scaler_dict,best_params)

In [None]:
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to smaller dtypes and return it.

    * Signed-integer columns become the smallest of int8/int16/int32/int64
      whose range strictly contains the column's min and max.
    * Float columns become float16 if their range fits, else float32, else
      float64. NOTE: float16/float32 keep only about 3 and 7 significant
      digits respectively, so values may be rounded.
    * Columns of any other dtype (object, bool, unsigned, datetime,
      extension dtypes, ...) are left untouched.
    * A ``Count`` column, if present, is converted to an integer dtype
      directly from its original values: int16 when they fit, otherwise int32
      or int64. It is left unchanged if it contains missing values or values
      outside the int64 range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for col in df.columns:
        if col == "Count":
            # Handled separately below from the original values, so it is
            # never rounded through float16 before the integer cast.
            continue

        col_type = df[col].dtype
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                info = np.iinfo(int_type)
                if c_min > info.min and c_max < info.max:
                    df[col] = df[col].astype(int_type)
                    break
        elif col_type.kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    if "Count" in df.columns:
        count = df["Count"]
        if count.empty:
            df["Count"] = count.astype(np.int16)
        elif count.notna().all():
            c_min = count.min()
            c_max = count.max()
            # int16 stays the preferred dtype; wider types are used only when
            # int16 would overflow and silently wrap the values.
            for int_type in (np.int16, np.int32, np.int64):
                info = np.iinfo(int_type)
                if info.min <= c_min and c_max <= info.max:
                    df["Count"] = count.astype(int_type)
                    break

    return df



In [None]:
# Per-asset registries filled by model_training, all keyed by the asset name
# with spaces replaced by underscores.
models={}
pca_dict={}
columns_dict={}
scaler_dict={}
best_params={}
# Shrink dtypes first, then build the wide per-timestamp feature frame.
df_train = reduce_mem_usage(df_train)
df=analysing(df_train,df_asset)
# Turn infinities into NaN, then replace every NaN with 0.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0,inplace=True)
# final_df must exist as a global before get_targets runs: get_targets reads
# the global final_df and removes the Target columns from it.
final_df=final_df_def(df)
targets=get_targets(df_asset)

# Fit one model per asset on the target-free feature frame.
for name in df_asset['Asset_Name'].unique():
    name=name.replace(' ','_')
    models,pca_dict,columns_dict,scaler_dict,best_params=model_training(name,final_df,targets,models,pca_dict,columns_dict,scaler_dict,best_params)
del final_df


In [None]:
%%time
import gresearch_crypto

# Number of most recent rows (all assets combined) kept in front of each new
# test batch when features are recomputed.
# NOTE(review): analysing() splits these rows by Asset_ID and analysis() uses
# 24-row windows per asset - confirm this total is large enough for every
# asset to have a full window.
HISTORY_ROWS=245

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Only the trailing HISTORY_ROWS rows are ever needed, so keep a small rolling
# buffer instead of appending every batch to the full training frame (which
# copied the whole frame on each iteration). df_train itself is left unchanged.
history=df_train.tail(HISTORY_ROWS)

for (df_test, df_pred) in iter_test:
    # NOTE(review): reduce_mem_usage may downcast this batch's float columns
    # to float16 - confirm the rounding is acceptable for the features.
    df_test=reduce_mem_usage(df_test)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.x.
    history=pd.concat([history.tail(HISTORY_ROWS),df_test])
    df1=analysing(history,df_asset)
    df1.replace([np.inf, -np.inf], 0, inplace=True)
    df1.fillna(0,inplace=True)

    # Feature row for the timestamp of this batch (may be empty if that
    # timestamp did not survive the inner join across assets).
    pred_time=df_test['timestamp'].iloc[0]
    predict_df=df1.loc[df1.index==pred_time]

    for _, row in df_test.iterrows():
        try:
            coin_id=row['Asset_ID']
            coin=df_asset['Asset_Name'].loc[df_asset['Asset_ID']==coin_id].values[0].replace(' ','_')
            model=models[coin].set_params(**best_params[coin])
            # Apply the asset's fitted transforms in the order used in training.
            predictdf=predict_df[columns_dict[coin]]
            predictdf=scaler_dict[coin].transform(predictdf)
            predictdf=pca_dict[coin].transform(predictdf)
            predictdf=pd.DataFrame(predictdf)
            prediction=model.predict(predictdf)
            prediction=round(prediction[0],16)
            df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = prediction
        except Exception:
            # Best-effort: any failure for this row (unknown asset, missing
            # feature row, ...) falls back to a neutral prediction of 0.
            df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0

    # astype returns a new Series; assign it so the cast actually takes effect.
    df_pred['Target'] = df_pred['Target'].fillna(0).astype('float64')
    env.predict(df_pred)

