# Regressions

In [1]:
import time
import datetime
import bz2
from dateutil.relativedelta import relativedelta, MO

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [2]:
df = pd.read_csv('data_ML_19_05.csv', index_col=['date'], parse_dates=['date'], dayfirst=True)
df.head()

Unnamed: 0_level_0,close,close_change,open,high,low,volume,bb_bbh,bb_bbl,bb_bbm,ATR_10,...,ROCI_40,ROCI_60,ROCI_120,Vortex_diff,Vortex_neg,Vortex_pos,ichimoku_a,ichimoku_b,ichimoku_bl,ichimoku_cl
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-19,1.15947,1,1.190209,1.192707,1.183859,125293.7,1.224872,1.182937,1.203905,0.005052,...,0.0,0.0,0.0,-0.292063,0.566607,0.274544,1.199552,1.202186,1.202186,1.196918
2015-06-08,1.128,1,1.109748,1.115922,1.103201,178572.05,1.148248,1.085201,1.116725,0.011961,...,3.059706,3.102878,-6.589708,0.134893,0.93098,1.065874,1.113431,1.09936,1.111859,1.115003
2015-06-09,1.12926,1,1.108914,1.115403,1.102083,189387.35,1.147392,1.085082,1.116237,0.012098,...,2.95245,3.084663,-6.40161,0.114822,0.941023,1.055845,1.113217,1.09936,1.112258,1.114176
2015-06-10,1.13099,1,1.108156,1.115101,1.10189,191876.6,1.146536,1.08473,1.115633,0.01221,...,2.897022,3.109756,-6.191681,0.093968,0.952233,1.046201,1.113011,1.09936,1.112656,1.113366
2015-06-11,1.12458,0,1.108085,1.115129,1.102208,190246.55,1.145869,1.084101,1.114985,0.012281,...,2.946539,3.079194,-5.817274,0.076014,0.961502,1.037516,1.112856,1.09936,1.113054,1.112659


## Data preparation

In [3]:
X_train = df.copy(deep=True)
#X_train
X_train.drop(['close_change', 'close'],axis=1, inplace= True)
X_train.drop(X_train.index[-400:],inplace= True)
X_train = StandardScaler().fit_transform(X_train)
X_train

array([[ 1.41635965,  1.39027923,  1.34742337, ...,  1.74874292,
         1.71077293,  1.56976115],
       [-0.48643319, -0.42530734, -0.55962951, ..., -0.78963243,
        -0.46987699, -0.37383032],
       [-0.50614433, -0.43757913, -0.58605134, ..., -0.78963243,
        -0.46026247, -0.39345243],
       ...,
       [-0.53125918, -0.58188498, -0.50300287, ..., -0.49789111,
        -0.43675427, -0.5003004 ],
       [-0.53967809, -0.59358929, -0.51704723, ..., -0.4971999 ,
        -0.44375543, -0.51232992],
       [-0.5555345 , -0.60902952, -0.53360963, ..., -0.49823055,
        -0.45390711, -0.52601438]])

In [4]:
y_train = df['close_change'][:-400]
y_train

date
2015-01-19    1
2015-06-08    1
2015-06-09    1
2015-06-10    1
2015-06-11    0
             ..
2020-02-02    0
2020-02-03    0
2020-02-04    0
2020-02-05    0
2020-02-06    0
Name: close_change, Length: 1461, dtype: int64

In [5]:
X_test = df.copy(deep=True)
#X_train
X_test.drop(['close_change', 'close'],axis=1, inplace= True)
X_test.drop(X_test.index[:-400],inplace= True)
X_test = StandardScaler().fit_transform(X_test)
X_test

array([[-1.16259172, -1.22774016, -1.13566591, ..., -1.05685593,
        -1.0299594 , -1.12274453],
       [-1.18511136, -1.2519492 , -1.15558455, ..., -1.05915421,
        -1.04065562, -1.1360943 ],
       [-1.20538443, -1.27213431, -1.17471072, ..., -1.06346348,
        -1.05327571, -1.14998963],
       ...,
       [ 1.06868804,  1.05873857,  1.07311823, ...,  0.88507358,
         0.90702724,  1.03927539],
       [ 1.07568695,  1.07354752,  1.08003621, ...,  0.88452247,
         0.92791488,  1.05253242],
       [ 1.09007359,  1.08754951,  1.09496448, ...,  0.884628  ,
         0.95092831,  1.06548394]])

In [6]:
y_test = df['close_change'][-400:].to_numpy()

## Logistic Regression 

In [7]:
def Log_regression(X_train,y_train,cv, C_list, solver_list):
    best_score = 0
    scores= []
    for C in C_list:
        for solver in solver_list:
            start_time = datetime.datetime.now()

            model = LogisticRegression(penalty='l2', C=C,solver=solver, max_iter=10000)
            scores.append(cross_val_score(estimator=model, X=X_train,y=y_train,cv=cv,scoring="roc_auc"))

            print("\nCoefficient C=",C, "slover=", solver)
            print("Time've passed", datetime.datetime.now() - start_time)
            print("Kross validation score=", scores[-1])
            print("Mean=", np.mean(scores[-1]))
            
            if np.mean(scores[-1]) > best_score:
                best_model = model
                best_score = np.mean(scores[-1])
    print("best result is: ", max(np.mean(scores,axis = 1)))
    return scores, best_model, best_score

In [8]:
C_list = [0.01, 0.1, 1, 3, 5 ,10, 15, 20, 50, 100, 500, 1000, 10000]
cv = KFold(n_splits=5, shuffle=False)#, random_state=42)

In [9]:
solver_list = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

In [10]:
scores_lr, best_model, best_scores = Log_regression(X_train,y_train,cv, C_list,solver_list)


Coefficient C= 0.01 slover= newton-cg
Time've passed 0:00:00.120000
Kross validation score= [0.50554986 0.49131151 0.49131151 0.44552365 0.47911953]
Mean= 0.4825632129041435

Coefficient C= 0.01 slover= lbfgs
Time've passed 0:00:00.060000
Kross validation score= [0.50550322 0.4914057  0.49126442 0.44571134 0.47902526]
Mean= 0.4825819876712467

Coefficient C= 0.01 slover= liblinear
Time've passed 0:00:00.030000
Kross validation score= [0.50522339 0.49154697 0.49234754 0.44406907 0.47996795]
Mean= 0.48263098498032414

Coefficient C= 0.01 slover= sag
Time've passed 0:00:00.070000
Kross validation score= [0.50554986 0.49135861 0.49131151 0.44552365 0.4790724 ]
Mean= 0.4825632044694788

Coefficient C= 0.01 slover= saga
Time've passed 0:00:00.100000
Kross validation score= [0.50559649 0.49135861 0.49131151 0.44552365 0.47911953]
Mean= 0.48258195880524835

Coefficient C= 0.1 slover= newton-cg
Time've passed 0:00:00.130000
Kross validation score= [0.50993377 0.5068519  0.47148575 0.45988176 0


Coefficient C= 100 slover= saga
Time've passed 0:00:18.009068
Kross validation score= [0.59528029 0.56844832 0.55064752 0.56695758 0.6441365 ]
Mean= 0.5850940413413216

Coefficient C= 500 slover= newton-cg
Time've passed 0:00:00.320001
Kross validation score= [0.61109038 0.60014128 0.58643749 0.61871246 0.67807315]
Mean= 0.6188909519277546

Coefficient C= 500 slover= lbfgs
Time've passed 0:00:02.160003
Kross validation score= [0.61151012 0.6        0.58690841 0.61833709 0.67797888]
Mean= 0.6189468994413041

Coefficient C= 500 slover= liblinear
Time've passed 0:00:00.280000
Kross validation score= [0.61109038 0.59995291 0.58629621 0.61866554 0.67807315]
Mean= 0.6188156386523882

Coefficient C= 500 slover= sag
Time've passed 0:00:12.740018
Kross validation score= [0.6017629  0.5857782  0.56873087 0.59862988 0.67175716]
Mean= 0.6053318009517843

Coefficient C= 500 slover= saga
Time've passed 0:00:23.693077
Kross validation score= [0.5998974  0.57786673 0.55818225 0.58108108 0.6599736 ]
M

In [18]:
#best_C = 1000

In [20]:
best_model.fit(X_train, y_train)

LogisticRegression(C=10000, max_iter=10000)

In [21]:
#model = LogisticRegression(C=best_C).fit(X_train, y_train)

result = best_model.predict_proba(X_test)
y_pred =  best_model.predict(X_test)

In [22]:
result_pd = pd.Series(result[:, 1])
result_pd.describe()

count    400.000000
mean       0.498226
std        0.172147
min        0.143973
25%        0.363284
50%        0.482537
75%        0.624173
max        0.927157
dtype: float64

In [23]:
best_model.score(X_train, y_train)

0.6146475017111568

In [24]:
print("accuracy", accuracy_score(y_test, y_pred)) 
print("precision", precision_score(y_test,y_pred))
print("recall", recall_score(y_test,y_pred))

accuracy 0.5675
precision 0.6348314606741573
recall 0.5113122171945701


In [25]:
profit = 0
revenue = 0
successful_deals = 0
deals_overall = 0
actions_list = []
revenue_list = []
last_price = 0
last_deal = 2
for i in range(len(y_pred)):
    pred = y_pred[i] 
    if last_deal == 2:  #смотрим на первое предсказание
        last_price = df['close_change'][1461 + i]
        if pred == 0:  #если ожидаем, что цена пойдет вверх
            last_deal = 1  #последняя сделка - покупка
        else: 
            last_deal = 0
    if (last_deal == 0 and pred == 0): #если есть чем торговать (последняя -покупка) и ожидаем уменьшения цены
        last_deal = 1 #продажа
        revenue = revenue + (df['close'][1461 + i] - last_price)*1000
        deals_overall +=1
        revenue_list.append(revenue)
        if df['close'][1461 + i] > last_price: #если сделка была успешна
            successful_deals +=1
            profit = profit+(df['close'][1461 + i] - last_price)*1000
        last_price = df['close_change'][1461 + i]
        actions_list.append(1)
        continue
    if (last_deal == 1 and pred == 1):
        last_deal = 0
        revenue = revenue + (last_price - df['close'][1461 + i]) *1000
        deals_overall +=1
        revenue_list.append(revenue)
        if df['close'][1461 + i] < last_price: #если сделка была успешна
            successful_deals +=1
            profit = profit + (last_price - df['close'][1461 + i]) *1000 
        last_price = df['close_change'][1461 + i]
        actions_list.append(0)
        continue
    actions_list.append(-1)
        
print("revenue", revenue)
print("successful deals", successful_deals, ";\toveral dealls:", deals_overall)
print("% succsess", successful_deals*100/deals_overall)
print("traiding days", len(y_pred))

revenue -12917.470000000003
successful deals 26 ;	overal dealls: 52
% succsess 50.0
traiding days 400


In [17]:
actions_list

[-1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 0,
 -1,
 1,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 0,
 1,
 0,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 1,
 -1,
 -1,
 0,
 -1,
 -1,
 1,
 0,
 -1,
 1,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,