# Ensembles

In [1]:
import time
import datetime
import bz2

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Data preparation and overall functions

In [2]:
df = pd.read_csv('data_ML_25_05.csv', index_col=['date'], parse_dates=['date'], dayfirst=True)
df.head()

Unnamed: 0_level_0,close,close_change,open,high,low,volume,bb_bbh,bb_bbl,bb_bbm,ATR_10,...,ROCI_40,ROCI_60,ROCI_120,Vortex_diff,Vortex_neg,Vortex_pos,ichimoku_a,ichimoku_b,ichimoku_bl,ichimoku_cl
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-19,1.15947,1,1.15614,1.16388,1.15514,141004,1.225656,1.149432,1.187544,0.009608,...,0.0,0.0,0.0,-0.412351,0.876489,0.464137,1.175315,1.18409,1.18409,1.16654
2015-06-08,1.128,1,1.11023,1.13068,1.10842,220017,1.139031,1.079038,1.109034,0.014026,...,5.179729,4.162819,-2.714171,0.011417,0.978159,0.989576,1.11383,1.09936,1.114305,1.113355
2015-06-09,1.12926,1,1.12802,1.13454,1.1214,224134,1.135281,1.081274,1.108277,0.013938,...,4.427676,5.162876,-2.189617,0.07101,0.967493,1.038503,1.11383,1.09936,1.114305,1.113355
2015-06-10,1.13099,1,1.12926,1.13865,1.126,246733,1.134962,1.081451,1.108207,0.013809,...,4.041175,5.017875,-2.624261,0.027184,0.995233,1.022417,1.113997,1.09936,1.114305,1.11369
2015-06-11,1.12458,0,1.13099,1.13315,1.1182,203152,1.136311,1.081135,1.108723,0.013923,...,3.500069,3.460077,-0.889245,-0.028576,1.015807,0.987231,1.114712,1.09936,1.114305,1.11512


In [3]:
def regression_results(y_true, y_pred):    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)    
    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [4]:
def rmse(actual, predict):
    predict = np.array(predict)
    actual = np.array(actual)
    distance = predict - actual
    square_distance = distance ** 2
    mean_square_distance = square_distance.mean()
    score = np.sqrt(mean_square_distance)
    return score

rmse_score = make_scorer(rmse, greater_is_better = False)

In [5]:
def plot_result(best_model, df): 
    future = pd.DataFrame(index=date_list)
    future['forecast'] = best_model.predict(X_test)

    plt.figure(figsize(15,7))
    df.close.plot()
    future['forecast'].plot(color='r')
    plt.ylabel('close_forecast_RF')
    pylab.show()

In [6]:
train_part = int(len(df.close)*0.8)
test_part = len(df.close) - train_part

In [7]:
X_train = df.copy(deep=True)
X_train.drop(['close_change', 'close'],axis=1, inplace= True)
X_train.drop(X_train.index[-test_part:],inplace= True)
X_train = StandardScaler().fit_transform(X_train)

In [8]:
X_test = df.copy(deep=True)
X_test.drop(['close_change', 'close'],axis=1, inplace= True)
X_test.drop(X_test.index[:-test_part],inplace= True)
X_test = StandardScaler().fit_transform(X_test)
X_test

array([[-0.60802169, -0.6392241 , -0.79573485, ..., -1.16783921,
        -1.18014729, -0.91020212],
       [-0.79778799, -0.77083106, -0.85024771, ..., -1.16783921,
        -1.18014729, -0.91020212],
       [-0.9143619 , -0.84542337, -1.27909684, ..., -1.16783921,
        -1.18014729, -0.88886163],
       ...,
       [ 1.06227773,  1.03651121,  1.07484747, ...,  0.77669348,
         0.96489259,  1.02099866],
       [ 1.07651577,  1.18344228,  1.1352952 , ...,  0.83989109,
         1.02562642,  1.08047959],
       [ 1.22423538,  1.20868197,  1.23816623, ...,  0.85339898,
         1.08902604,  1.09319308]])

In [9]:
y_train = df['close_change'][:-test_part]
y_test = df['close_change'][-test_part:]

In [10]:
tscv = TimeSeriesSplit(n_splits=20)

In [11]:
date_list = df.index[-test_part:].to_list()

In [12]:
def sell(last_deal, revenue, revenue_list, successful_deals, profit, last_price,df,i, train_part):
    last_deal = 1 #продажа
    revenue = revenue + (df['close'][train_part + i] - last_price)*1000
    revenue_list.append(revenue)
    if df['close'][train_part + i] > last_price: #если сделка была успешна
        successful_deals +=1
        profit = profit+(df['close'][train_part + i] - last_price)*1000
    last_price = df['close'][train_part + i]
    return last_deal, revenue, revenue_list, successful_deals, profit, last_price

In [13]:
def buy(last_deal, revenue, revenue_list, successful_deals, profit, last_price,df, i, train_part):
    last_deal = 0
    revenue = revenue + (last_price - df['close'][train_part + i]) *1000
    revenue_list.append(revenue)
    if df['close'][train_part + i] < last_price: #если сделка была успешна
        successful_deals +=1
        profit = profit + (last_price - df['close'][train_part + i]) *1000 
    last_price = df['close'][train_part + i]
    return last_deal, revenue, revenue_list, successful_deals, profit, last_price

In [14]:
def trade(df, y_pred, train_part, pred_limit=20):
    
    profit = 0
    revenue = 0
    successful_deals = 0
    deals_overall = 0
    actions_list = []
    revenue_list = []
    last_price = 0
    last_deal = 2
    
    for i in range(len(y_pred[:pred_limit])):
        pred = y_pred[i] 
        if last_deal == 2:  #смотрим на первое предсказание
            last_price = df['close_change'][train_part + i]
            if pred == 0:  #если ожидаем, что цена пойдет вверх
                last_deal = 1  #последняя сделка - покупка
            else: 
                last_deal = 0
        if (last_deal == 0 and pred == 0): #если есть чем торговать (последняя -покупка) и ожидаем уменьшения цены
            last_deal, revenue, revenue_list, successful_deals, profit, last_price = sell(last_deal, revenue, revenue_list, successful_deals, profit, last_price, df, i,train_part)
            actions_list.append(1)
            deals_overall +=1
            continue
        if (last_deal == 1 and pred == 1):
            last_deal, revenue, revenue_list, successful_deals, profit, last_price = buy(last_deal, 
                                                                                         revenue, revenue_list, successful_deals, profit, last_price, df, i, train_part)
            deals_overall +=1
            actions_list.append(0)
            continue
        actions_list.append(-1)
    return revenue, revenue_list, successful_deals, deals_overall, profit, actions_list

## Gradient boosting 

In [15]:
param_search_GB = {
    'loss' : ['lad', 'huber', 'quantile'],
    'learning_rate' : [0.5, 0.1, 0.05, 0.01, 0.005],
    'n_estimators': [5, 10, 25, 50, 80],
    'min_samples_split' : [i for i in range(3,16,4)],
    'min_samples_leaf' : [i for i in range(3,16,4)]
}

In [None]:
model_GB = GradientBoostingRegressor()
gsearch_GB = GridSearchCV(estimator=model_GB, cv=tscv, param_grid=param_search_GB, scoring = rmse_score)
gsearch_GB.fit(X_train, y_train)
best_score_GB = gsearch_GB.best_score_
best_model_GB = gsearch_GB.best_estimator_

In [None]:
y_true_GB = y_test.values
y_pred_GB = best_model.predict(X_test)

regression_results(y_test, y_pred)

In [None]:
plot_result(best_model_GB, df)

In [None]:
best_model_GB.get_params()

In [None]:
best_model_GB.score(X_train, y_train)

In [None]:
y_pred_binary_GB = [0]
for i in range(1, len(y_pred_GB)):
    if y_pred_GB[i] > y_pred[i - 1]:
        y_pred_binary_GB.append(1)
    else:       
        y_pred_binary_GB.append(0)

In [None]:
pred_limit = 200
revenue, revenue_list, successful_deals, deals_overall, profit, actions_list = trade(df, y_pred_binary_GB, train_part, pred_limit)

In [None]:
print("revenue", revenue,";\tprofit", profit)
print("successful deals", successful_deals, ";\toveral dealls:", deals_overall)
print("% succsess", successful_deals*100/deals_overall)
print("traiding days", len(y_pred[:pred_limit]))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, cross_val_score

In [None]:
param_search = { 
    'n_estimators': [5,10, 25, 50, 75, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [i for i in range(4,15,2)]
}

In [None]:
model = RandomForestRegressor()
gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring = rmse_score)
gsearch.fit(X_train, y_train)
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_

In [None]:
y_true = y_test.values
y_pred = best_model.predict(X_test)

regression_results(y_test, y_pred)

In [None]:
plot_result(best_model, df)

In [None]:
best_model.get_params()

In [None]:
best_model.score(X_train, y_train)

In [None]:
y_pred_binary = [0]
for i in range(1, len(y_pred)):
    if y_pred[i] > y_pred[i - 1]:
        y_pred_binary.append(1)
    else:       
        y_pred_binary.append(0)

In [None]:
pred_limit = 200
revenue, revenue_list, successful_deals, deals_overall, profit, actions_list = trade(df, y_pred_binary, train_part, pred_limit)

In [None]:
print("revenue", revenue,";\tprofit", profit)
print("successful deals", successful_deals, ";\toveral dealls:", deals_overall)
print("% succsess", successful_deals*100/deals_overall)
print("traiding days", len(y_pred[:pred_limit]))