# Simple Linear Regression with pandas and sklearn

In [1]:
import xarray as xr
import pandas as pd
import numpy as np

from qnt.data import ds, load_assets, load_data, write_output, restore_origin_data
from qnt.stepper import test_strategy
from qnt.stats import calc_stat, print_correlation
from qnt.graph import make_plot, make_plot_double, make_plot_filled

from sklearn.linear_model import LinearRegression

# Data

In [3]:
loaded_desc_ordered_xarray_dataarray = load_data(min_date = "2017-01-01", max_date = "2018-12-31",
                                                 dims=('time', 'field', 'asset'))
loaded_desc_ordered_xarray_dataarray.coords

fetched chunk 1/4 5s
fetched chunk 2/4 10s
fetched chunk 3/4 15s
fetched chunk 4/4 18s
Data loaded 18s


Coordinates:
  * time     (time) datetime64[ns] 2018-12-31 2018-12-28 ... 2017-01-03
  * field    (field) object 'open' 'low' 'high' ... 'split_cumprod' 'is_liquid'
  * asset    (asset) <U12 'AMEX:ESL' 'AMEX:GDP' 'AMEX:ITG' ... 'US:VAC' 'US:WAB'

# Model

Let's check for first of k_days=250 prices items. We will normalize it on first price and will rate prices trands with the simpliest Linear Regression model.

If we'll see an increasing price trend, then we decide that:
* the next n_days=50 days the price will increase
* the weight for the next n_days=50 days will be equated to the tilt sign

After n_days=50 days, let's review the previous k_days=250 days, build the regression again and update the weights.

In [4]:
k_days = 250
X = np.arange(k_days)
n_days = 50


weights_dict = dict()

def step(desc_ordered_assets_xarray_dataarray, debug=False):
    global weights_dict
    
    filtered_assets_xarray_dataarray = filter_liquids_xarray_assets_dataarray(
                                                        desc_ordered_assets_xarray_dataarray)

    # convert to dict of pandas.DataFrame
    filtered_assets_dict_of_pandas_df = from_xarray_3d_to_dict_of_pandas_df(filtered_assets_xarray_dataarray)
    assets_list = list(filtered_assets_dict_of_pandas_df.keys())

    # every n_days:
    # get any key to check the days condition
    an_asset = assets_list[0]
    # check that every n_days condition isn't raised yet
    if not debug and (filtered_assets_dict_of_pandas_df[an_asset].shape[0] - k_days-1) % n_days == 0:

        # get xarray_dataarray representation
        weights_xarray_dataarray = from_dict_to_xarray_1d(weights_dict)

        # exclude all assets that became non-liquid since the last regression
        liquid_weights_xarray_dataarray = \
            exclude_weights_xarray_dataarray_from_nonliquids(weights_xarray_dataarray,
                                                             desc_ordered_assets_xarray_dataarray)

        return liquid_weights_xarray_dataarray

    # reset weights_dict
    # to prevent global var to keep past assets that may became non-liquid
    weights_dict = dict()

    for asset in assets_list:

        last_k_prices = filtered_assets_dict_of_pandas_df[asset]["open"].iloc[k_days-1::-1]
        last_k_prices_forward_filled = last_k_prices
        last_day_price = last_k_prices_forward_filled.iloc[-1:]  # AFAIK, [-1] doesn't work because of a bug in pandas

        y_train = (last_k_prices / last_day_price.values[0]).fillna(0)
        model = LinearRegression()

        Xr = X.reshape(-1,1)
        yr = y_train.values.reshape(-1, 1)

        model.fit(Xr, yr)
        weights_dict[asset] = np.sign(model.coef_[0][0])
       
    weights_xarray_dataarray = from_dict_to_xarray_1d(weights_dict)

    # if weights_dict is empty there is no need to exclude
    # weights_xarray_dataarray = \
    #     exclude_weights_xarray_dataarray_from_nonliquids(weights_xarray_dataarray,
    #     desc_ordered_assets_xarray_dataarray)

    # optional double check
    check_weights_xarray_dataarray_for_nonliquids(weights_xarray_dataarray, desc_ordered_assets_xarray_dataarray)

    return weights_xarray_dataarray

## Test

In [5]:
debug_one_step_weights_xarray_dataarray = step(loaded_desc_ordered_xarray_dataarray, True)
debug_one_step_weights_xarray_dataarray.to_pandas().tail()

asset
US:SLB    -1.0
US:TRGP    1.0
US:UBS     0.0
US:VAC     0.0
US:WAB     0.0
dtype: float64

# Backtest

In [6]:
output = test_strategy(loaded_desc_ordered_xarray_dataarray, step=step, init_data_length=k_days)

Testing started...
Testing progress: 6/252 6s
Testing progress: 11/252 11s
Testing progress: 16/252 16s
Testing progress: 21/252 21s
Testing progress: 26/252 27s
Testing progress: 31/252 32s
Testing progress: 36/252 37s
Testing progress: 41/252 43s
Testing progress: 46/252 48s
Testing progress: 51/252 53s
Testing progress: 56/252 59s
Testing progress: 61/252 65s
Testing progress: 66/252 70s
Testing progress: 71/252 76s
Testing progress: 76/252 82s
Testing progress: 81/252 87s
Testing progress: 86/252 93s
Testing progress: 91/252 99s
Testing progress: 96/252 105s
Testing progress: 101/252 110s
Testing progress: 106/252 116s
Testing progress: 111/252 122s
Testing progress: 116/252 128s
Testing progress: 121/252 134s
Testing progress: 126/252 140s
Testing progress: 131/252 146s
Testing progress: 136/252 152s
Testing progress: 141/252 158s
Testing progress: 146/252 164s
Testing progress: 151/252 170s
Testing progress: 156/252 176s
Testing progress: 161/252 182s
Testing progress: 166/252 18

## Stats and plots

In [7]:
stat = calc_stat(loaded_desc_ordered_xarray_dataarray, output, slippage_factor=0.05)
display(stat.to_pandas().tail())

field,equity,relative_return,volatility,underwater,max_drawdown,sharpe_ratio,mean_return,bias,instruments,avg_turnover
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-12-24,0.971135,-0.003947,0.081433,-0.081022,0.081022,-0.361527,-0.02944,0.16946,926.0,0.039357
2018-12-26,0.980921,0.010076,0.081912,-0.071762,0.081022,-0.236646,-0.019384,0.158287,926.0,0.039592
2018-12-27,0.982213,0.001317,0.081759,-0.070539,0.081022,-0.220158,-0.018,0.154562,926.0,0.039596
2018-12-28,0.982425,0.000216,0.081596,-0.070339,0.081022,-0.217105,-0.017715,0.154562,926.0,0.039562
2018-12-31,0.98487,0.002489,0.081473,-0.068025,0.081022,-0.186444,-0.01519,0.124767,926.0,0.03944


In [8]:
make_plot_filled(stat.coords['time'].to_pandas(), stat.loc[:, 'equity'].values,  color="blue", name="PnL (Equity)", type="log")

In [9]:
make_plot_filled(stat.coords['time'].to_pandas(), stat.loc[:, 'underwater'].values, color="red", name="Underwater Chart", range_max= 0)

In [9]:
make_plot_filled(stat.coords['time'].to_pandas(), stat.loc[:, 'sharpe_ratio'].values[20:], color="purple", name="Rolling SR")

In [10]:
make_plot_filled(stat.coords['time'].to_pandas(), stat.loc[:, 'bias'].values, color="gray", name="Bias")

In [12]:
print_correlation(output, loaded_desc_ordered_xarray_dataarray)


The number of systems with a larger Sharpe ratio and correlation larger than 0.8: 4
The max correlation value (with systems with a larger Sharpe ratio): 0.9068236356214682
Current sharpe ratio(3y): -0.1864435321218335



## Submit

In [13]:
data  = load_data(min_date="2015-01-01", dims=("time", "field", "asset"))
output = test_strategy(data, step=step, init_data_length=k_days)
write_output(output)

fetched chunk 1/8 5s
fetched chunk 2/8 14s
fetched chunk 3/8 27s
fetched chunk 4/8 31s
fetched chunk 5/8 37s
fetched chunk 6/8 40s
fetched chunk 7/8 43s
fetched chunk 8/8 45s
Data loaded 46s
Testing started...
Testing progress: 6/875 6s
Testing progress: 11/875 12s
Testing progress: 16/875 17s
Testing progress: 21/875 23s
Testing progress: 26/875 28s
Testing progress: 31/875 34s
Testing progress: 36/875 39s
Testing progress: 41/875 45s
Testing progress: 46/875 51s
Testing progress: 51/875 56s
Testing progress: 56/875 62s
Testing progress: 61/875 68s
Testing progress: 66/875 73s
Testing progress: 71/875 79s
Testing progress: 76/875 85s
Testing progress: 81/875 91s
Testing progress: 86/875 97s
Testing progress: 91/875 103s
Testing progress: 96/875 108s
Testing progress: 101/875 114s
Testing progress: 106/875 120s
Testing progress: 111/875 126s
Testing progress: 116/875 131s
Testing progress: 121/875 137s
Testing progress: 126/875 143s
Testing progress: 131/875 149s
Testing progress: 136/