# Simple Linear Regression with pandas and sklearn

In [None]:
import xarray as xr
import pandas as pd
import numpy as np

from qnt.data import ds, load_assets, load_data, write_output, restore_origin_data, \
    from_xarray_3d_to_dict_of_pandas_df, filter_liquids_xarray_assets_dataarray, from_dict_to_xarray_1d, \
    check_weights_xarray_dataarray_for_nonliquids, exclude_weights_xarray_dataarray_from_nonliquids
from qnt.stepper import test_strategy
from qnt.stats import calc_stat, print_correlation
from qnt.graph import make_plot, make_plot_double, make_plot_filled

from sklearn.linear_model import LinearRegression

# Data

In [None]:
# Load the data for the chosen date range with (time, field, asset) dimensions.
loaded_desc_ordered_xarray_dataarray = load_data(
    min_date="2016-06-01",
    max_date="2018-12-31",
    dims=('time', 'field', 'asset'),
)
# Last expression of the cell: show the coordinates of the loaded array.
loaded_desc_ordered_xarray_dataarray.coords

# Model

Let's take the first k_days=250 price items. We will normalize them by the first price and estimate the price trend with the simplest Linear Regression model.

If we see an increasing price trend, then we assume that:
* the price will keep increasing for the next n_days=50 days
* the weight for the next n_days=50 days is set to the sign of the regression slope

After n_days=50 days, let's review the previous k_days=250 days, build the regression again and update the weights.

In [None]:
# Number of price points fitted by each regression.
k_days = 250
# Period of the regression schedule checked inside step(...).
n_days = 50

# Regressor shared by all assets: 0, 1, ..., k_days - 1.
X = np.arange(k_days)

# Weights per asset; step(...) rebinds this module-level dict.
weights_dict = {}

# this function is optional, just for tests and to avoid forward-looking;
# later it will be used together with test_strategy(...)
def step(desc_ordered_assets_xarray_dataarray, debug=False):
    """Return per-asset weights equal to the sign of a linear price trend.

    For every liquid asset a linear regression is fitted to the first
    ``k_days`` rows of its "open" prices (taken in reversed row order and
    normalised by the price of row 0). The weight of the asset is the sign
    of the fitted slope. The weights are stored in the module-level
    ``weights_dict`` and re-used between refits; a refit happens when
    ``(history_length - k_days - 1)`` is a multiple of ``n_days``.

    Args:
        desc_ordered_assets_xarray_dataarray: price data accepted by
            ``filter_liquids_xarray_assets_dataarray``. Presumably ordered
            newest-first, as the name suggests -- confirm against the loader.
        debug: when true, always refit and ignore the cached weights.

    Returns:
        The weights converted by ``from_dict_to_xarray_1d``; on non-refit
        calls the cached weights after
        ``exclude_weights_xarray_dataarray_from_nonliquids``.
    """
    global weights_dict

    filtered_assets_xarray_dataarray = filter_liquids_xarray_assets_dataarray(
        desc_ordered_assets_xarray_dataarray)

    # convert to dict of pandas.DataFrame
    filtered_assets_dict_of_pandas_df = from_xarray_3d_to_dict_of_pandas_df(filtered_assets_xarray_dataarray)
    assets_list = list(filtered_assets_dict_of_pandas_df.keys())

    # any asset will do to read the length of the available history
    an_asset = assets_list[0]
    history_length = filtered_assets_dict_of_pandas_df[an_asset].shape[0]
    is_refit_day = (history_length - k_days - 1) % n_days == 0

    # Between two refits keep the previously computed weights.
    # The original test (`not debug and ... == 0`) was inverted: it re-used the
    # weights only on the refit day and refitted on every other day.
    # An empty cache cannot be re-used, so in that case fall through and fit.
    if not debug and not is_refit_day and weights_dict:

        # get xarray_dataarray representation
        weights_xarray_dataarray = from_dict_to_xarray_1d(weights_dict)

        # exclude all assets that became non-liquid since the last regression
        liquid_weights_xarray_dataarray = \
            exclude_weights_xarray_dataarray_from_nonliquids(weights_xarray_dataarray,
                                                             desc_ordered_assets_xarray_dataarray)

        return liquid_weights_xarray_dataarray

    # reset weights_dict
    # to prevent the global var from keeping past assets that may have become non-liquid
    weights_dict = dict()

    # the regressor is the same for every asset, so reshape it once
    Xr = X.reshape(-1, 1)

    for asset in assets_list:

        # first k_days rows in reversed order, i.e. row 0 ends up last
        last_k_prices = filtered_assets_dict_of_pandas_df[asset]["open"].iloc[k_days-1::-1]
        # fill gaps with the preceding known price instead of letting
        # fillna(0) below turn them into zero prices
        last_k_prices_forward_filled = last_k_prices.ffill()
        last_day_price = last_k_prices_forward_filled.iloc[-1:]  # AFAIK, [-1] doesn't work because of a bug in pandas

        # only leading gaps (nothing to fill from) can still be NaN here
        y_train = (last_k_prices_forward_filled / last_day_price.values[0]).fillna(0)
        yr = y_train.values.reshape(-1, 1)

        model = LinearRegression()
        model.fit(Xr, yr)
        weights_dict[asset] = np.sign(model.coef_[0][0])

    weights_xarray_dataarray = from_dict_to_xarray_1d(weights_dict)

    # The weights were built from liquid assets only, so no exclusion is
    # applied here; the call below is an optional double check.
    check_weights_xarray_dataarray_for_nonliquids(weights_xarray_dataarray, desc_ordered_assets_xarray_dataarray)

    return weights_xarray_dataarray

## Test

In [None]:
# Run a single step in debug mode on the whole loaded data set.
debug_one_step_weights_xarray_dataarray = step(
    loaded_desc_ordered_xarray_dataarray,
    debug=True,
)
# Last expression of the cell: show the tail of the resulting weights.
debug_one_step_weights_xarray_dataarray.to_pandas().tail()

# Backtest

In [None]:
# Run step(...) through test_strategy, starting with k_days of initial data.
output = test_strategy(
    loaded_desc_ordered_xarray_dataarray,
    step=step,
    init_data_length=k_days,
)

## Stats and plots

In [None]:
# Compute the statistics of the backtest output and show the last rows.
stat = calc_stat(
    loaded_desc_ordered_xarray_dataarray,
    output,
    slippage_factor=0.05,
)
display(stat.to_pandas().tail())

In [None]:
# Plot the 'equity' column of the statistics over time.
make_plot_filled(
    stat.coords['time'].to_pandas(),
    stat.loc[:, 'equity'].values,
    color="blue",
    name="PnL (Equity)",
    type="log",
)

In [None]:
# Plot the 'underwater' column of the statistics over time.
make_plot_filled(
    stat.coords['time'].to_pandas(),
    stat.loc[:, 'underwater'].values,
    color="red",
    name="Underwater Chart",
    range_max=0,
)

In [None]:
# Plot the 'sharpe_ratio' column without its first 20 points.
# The time axis is sliced the same way so that dates and values have equal
# length and stay aligned (originally only the values were sliced).
make_plot_filled(
    stat.coords['time'].to_pandas()[20:],
    stat.loc[:, 'sharpe_ratio'].values[20:],
    color="purple",
    name="Rolling SR",
)

In [None]:
# Plot the 'bias' column of the statistics over time.
make_plot_filled(
    stat.coords['time'].to_pandas(),
    stat.loc[:, 'bias'].values,
    color="gray",
    name="Bias",
)

In [None]:
# Print the correlation report for the backtest output.
print_correlation(
    output,
    loaded_desc_ordered_xarray_dataarray,
)

## Submit

In [None]:
# Hand the backtest output to qnt's write_output.
write_output(output)