# Simple Linear Regression with xarray and sklearn

In [14]:
import xarray as xr
import numpy as np

from qnt.data import ds, load_assets, load_data, write_output, restore_origin_data
from qnt.stepper import test_strategy
from qnt.stats import calc_stat, print_correlation
from qnt.graph import make_plot, make_plot_double, make_plot_filled

from sklearn.linear_model import LinearRegression

# Data

In [16]:
# Load two years of data with an explicit (time, field, asset) dimension order.
# The coords printed below show the time axis running from newest to oldest.
loaded_desc_ordered_xarray_dataarray = load_data(
    min_date="2017-01-01",
    max_date="2018-12-31",
    dims=("time", "field", "asset"),
)
loaded_desc_ordered_xarray_dataarray.coords

fetched chunk 1/4 0s
fetched chunk 2/4 0s
fetched chunk 3/4 0s
fetched chunk 4/4 1s
Data loaded 1s


Coordinates:
  * time     (time) datetime64[ns] 2018-12-31 2018-12-28 ... 2017-01-03
  * field    (field) object 'open' 'low' 'high' ... 'split_cumprod' 'is_liquid'
  * asset    (asset) <U12 'AMEX:ESL' 'AMEX:GDP' 'AMEX:ITG' ... 'US:VAC' 'US:WAB'

# Model

Let's take the most recent k_days=250 prices of each asset. We will normalize them by the most recent price and rate the price trend with the simplest Linear Regression model.

If we see an increasing price trend, then we decide that:
* the price will keep increasing over the next n_days=50 days
* the weight for the next n_days=50 days will be set to the sign of the regression slope (+1 for a rising trend; a falling trend gives -1 in the same way)

After n_days=50 days, let's review the previous k_days=250 days, build the regression again and update the weights.

In [17]:
# Strategy parameters.
n_days = 50    # number of steps between two regression fits
k_days = 250   # number of most recent prices used for each fit

# Regressor shared by every fit: the day index 0 .. k_days-1.
X = np.arange(k_days)

# Weights produced by the latest fit, keyed by asset; `step` reads and replaces it.
weights_dict = {}

def step(desc_ordered_assets_xarray_dataarray, debug=False):
    """Return the per-asset weights for the current backtest step.

    Every ``n_days`` steps (or on every call when ``debug`` is true) a linear
    regression is fitted per asset on its most recent ``k_days`` "open" prices,
    normalized by the most recent price, and the weight is set to the sign of
    the fitted slope. On all other steps the weights cached in the module-level
    ``weights_dict`` by the last fit are returned again.

    Parameters
    ----------
    desc_ordered_assets_xarray_dataarray
        Data indexed as (time, field, asset) with the time axis ordered from
        newest to oldest (the function slices and reverses it accordingly).
    debug : bool, optional
        When true, the regression is always recomputed regardless of the
        ``n_days`` schedule.

    Returns
    -------
    The weights as produced by ``from_dict_to_xarray_1d``.

    Notes
    -----
    The function keeps state between calls in the global ``weights_dict``;
    reset it before starting an independent run.
    NOTE(review): the liquidity helpers used here are defined outside this
    block; their names suggest they filter / validate on liquidity — confirm.
    """
    global weights_dict

    desc_filtered_assets_xarray_dataarray = filter_liquids_xarray_assets_dataarray(
        desc_ordered_assets_xarray_dataarray)
    assets_list = desc_filtered_assets_xarray_dataarray.coords['asset'].values

    # Number of rows along the first (time) axis seen so far.
    history_length = desc_filtered_assets_xarray_dataarray.shape[0]

    # Between two regression days reuse the weights of the last fit.
    if not debug and (history_length - k_days - 1) % n_days != 0:
        weights_xarray_dataarray = from_dict_to_xarray_1d(weights_dict)

        # exclude all assets that became non-liquid since the last regression
        return exclude_weights_xarray_dataarray_from_nonliquids(
            weights_xarray_dataarray, desc_ordered_assets_xarray_dataarray)

    # The slice below yields at most k_days rows; size the regressor to what is
    # actually available so a short history (e.g. a debug call) does not fail
    # in `fit` with mismatched sample counts. Loop-invariant, so built once.
    window_length = min(k_days, history_length)
    Xr = X[:window_length].reshape(-1, 1)

    # Build the new weights in a fresh dict so that assets from the previous
    # fit that may have become non-liquid are not carried over, and so the
    # global cache is only replaced once the whole fit has succeeded.
    fresh_weights = dict()

    for asset in assets_list:

        # for training we need ascending order: take the newest k_days rows
        # and reverse them ([k_days-1::-1])
        asc_last_k_prices = desc_filtered_assets_xarray_dataarray.loc[:, "open", asset][k_days-1::-1]
        most_recent_price = asc_last_k_prices[-1]

        # fill NaN values with 0, it isn't the best, but simple
        # NOTE(review): zeros standing in for missing prices pull the fitted
        # slope (e.g. a gap at the start looks like a rising trend) — consider
        # fitting only on the non-missing points.
        y_train = (asc_last_k_prices / most_recent_price).fillna(0)
        yr = y_train.values.reshape(-1, 1)

        model = LinearRegression()
        model.fit(Xr, yr)

        # with a 2-D target coef_ is 2-D as well; [0][0] is the single slope
        fresh_weights[asset] = np.sign(model.coef_[0][0])

    weights_dict = fresh_weights
    weights_xarray_dataarray = from_dict_to_xarray_1d(weights_dict)

    # Weights were fitted only for the assets returned by the liquidity filter,
    # so no exclusion pass is applied here; the call below is an optional
    # double check.
    check_weights_xarray_dataarray_for_nonliquids(weights_xarray_dataarray, desc_ordered_assets_xarray_dataarray)

    return weights_xarray_dataarray

## Test

In [18]:
# Force a single regression on the full loaded history and look at the last few weights.
# Note: with debug enabled `step` refits and stores the result in the global `weights_dict`.
debug_one_step_weights_xarray_dataarray = step(
    loaded_desc_ordered_xarray_dataarray,
    debug=True,
)
debug_one_step_weights_xarray_dataarray.to_pandas().tail()

asset
US:SLB    -1.0
US:TRGP    1.0
US:UBS     0.0
US:VAC     0.0
US:WAB     0.0
dtype: float64

# Backtest

In [19]:
# `step` returns whatever is cached in the global `weights_dict` on steps where
# it does not refit. The debug call above filled that cache with weights fitted
# on the newest data, so start the backtest from an empty cache to avoid
# trading on weights computed from the future.
weights_dict = dict()

output = test_strategy(
    loaded_desc_ordered_xarray_dataarray,
    step=step,
    init_data_length=k_days,
)

Testing started...
Testing progress: 51/252 6s
Testing progress: 107/252 11s
Testing progress: 151/252 16s
Testing progress: 201/252 23s
Testing progress: 243/252 28s
Testing complete 30.858868837356567s


## Stats and plots

In [20]:
# Compute the statistics of the backtest output and show the last rows.
stat = calc_stat(
    loaded_desc_ordered_xarray_dataarray,
    output,
    slippage_factor=0.05,
)
display(stat.to_pandas().tail())

field,equity,relative_return,volatility,underwater,max_drawdown,sharpe_ratio,mean_return,bias,instruments,avg_turnover
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-12-24,0.940608,-0.008408,0.09059,-0.121125,0.121125,-0.668481,-0.060558,0.372624,822.0,0.032589
2018-12-26,0.962162,0.022916,0.093358,-0.100985,0.121125,-0.41171,-0.038436,0.372624,822.0,0.032733
2018-12-27,0.965577,0.003549,0.093244,-0.097795,0.121125,-0.373546,-0.034831,0.372624,822.0,0.032712
2018-12-28,0.965213,-0.000376,0.093057,-0.098134,0.121125,-0.376756,-0.03506,0.124767,828.0,0.032694
2018-12-31,0.967987,0.002873,0.09292,-0.095543,0.121125,-0.345876,-0.032139,0.124767,828.0,0.034538


In [21]:
# Equity curve on a logarithmic axis.
make_plot_filled(
    stat.coords['time'].to_pandas(),
    stat.loc[:, 'equity'].values,
    color="blue",
    name="PnL (Equity)",
    type="log",
)

In [22]:
# Underwater chart, with the upper bound of the value axis pinned at 0.
make_plot_filled(
    stat.coords['time'].to_pandas(),
    stat.loc[:, 'underwater'].values,
    color="red",
    name="Underwater Chart",
    range_max=0,
)

In [23]:
# Skip the first rows of the Sharpe ratio series
# (NOTE(review): presumably the noisy start of the rolling window — confirm).
# The time axis must be sliced by the same amount, otherwise x and y differ in
# length and the plotted values no longer line up with their dates.
sharpe_skip_rows = 20
make_plot_filled(
    stat.coords['time'][sharpe_skip_rows:].to_pandas(),
    stat.loc[:, 'sharpe_ratio'].values[sharpe_skip_rows:],
    color="purple",
    name="Rolling SR",
)

In [24]:
# Bias over time.
make_plot_filled(
    stat.coords['time'].to_pandas(),
    stat.loc[:, 'bias'].values,
    color="gray",
    name="Bias",
)

In [25]:
# Report how the strategy output correlates with other systems (see printed summary).
print_correlation(
    output,
    loaded_desc_ordered_xarray_dataarray,
)


The number of systems with a larger Sharpe ratio and correlation larger than 0.8: 4
The max correlation value (with systems with a larger Sharpe ratio): 0.9528544272409256
Current sharpe ratio(3y): -0.3458761891708789



## Submit

In [26]:
# `step` keeps the weights of its last fit in the global `weights_dict` and
# returns them on steps where it does not refit. Clear what the previous
# backtest left behind so the submission run does not start from weights
# fitted on a different (later) period.
weights_dict = dict()

# Load the longer history used for the submission, run the strategy over it
# and write the resulting weights out.
data = load_data(min_date="2015-01-01", dims=("time", "field", "asset"))
output = test_strategy(data, step=step, init_data_length=k_days)
write_output(output)

fetched chunk 1/8 0s
fetched chunk 2/8 0s
fetched chunk 3/8 0s
fetched chunk 4/8 1s
fetched chunk 5/8 1s
fetched chunk 6/8 1s
fetched chunk 7/8 1s
fetched chunk 8/8 1s
Data loaded 1s
Testing started...
Testing progress: 51/875 6s
Testing progress: 101/875 11s
Testing progress: 151/875 18s
Testing progress: 201/875 24s
Testing progress: 247/875 29s
Testing progress: 275/875 34s
Testing progress: 302/875 39s
Testing progress: 340/875 44s
Testing progress: 360/875 50s
Testing progress: 389/875 55s
Testing progress: 407/875 60s
Testing progress: 434/875 65s
Testing progress: 451/875 70s
Testing progress: 476/875 75s
Testing progress: 501/875 82s
Testing progress: 524/875 87s
Testing progress: 546/875 92s
Testing progress: 560/875 97s
Testing progress: 582/875 103s
Testing progress: 601/875 109s
Testing progress: 621/875 114s
Testing progress: 641/875 119s
Testing progress: 653/875 125s
Testing progress: 672/875 130s
Testing progress: 690/875 135s
Testing progress: 701/875 140s
Testing prog