# Model Construction

In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import ttest_1samp

## Data Preparation & Target Transformation

In [None]:
period = 4  # label horizon, counted in rows within each level-1 group

market_daily = pd.read_parquet('../input/kline-daily/market_daily.parquet')

# Within each level-1 group (presumably one instrument — confirm against the file):
# entry price is the next row's adjusted open, exit price is the adjusted close
# `period` rows after that entry row.
exit_close = market_daily['adjclose'].groupby(level=1).shift(-(period + 1))
entry_open = market_daily['adjopen'].groupby(level=1).shift(-1)

# Simple return over the holding window, pivoted so the inner index level becomes columns.
forward_return = (exit_close / entry_open - 1).dropna().unstack()

# Binary label: 1 for a strictly positive return, 0 otherwise; NaN cells stay NaN.
forward = forward_return.mask(forward_return <= 0., 0).mask(forward_return > 0.0, 1)
forward.head()

In [None]:
features_dir = '../input/features/'
# List the directory through `features_dir` (not a second copy of the literal) so the
# listing and the reads below can never point at different folders. The names are
# sorted because os.listdir returns entries in arbitrary order.
features = sorted(name for name in os.listdir(features_dir) if name.endswith('.parquet'))

datas = []
for feat in features:
    data = pd.read_parquet(os.path.join(features_dir, feat))

    # Row-wise median and median absolute deviation (MAD).
    data_med = data.median(axis=1)
    mad = data.subtract(data_med, axis=0).abs().median(axis=1)

    # Clip every row to median +/- 5 MAD, then standardise every row to zero mean, unit std.
    data = data.clip(data_med - 5 * mad, data_med + 5 * mad, axis=0)
    data = data.subtract(data.mean(axis=1), axis=0).divide(data.std(axis=1), axis=0)

    # Remaining NaNs (missing inputs, or 0/0 on rows with zero std) are set to 0.
    data = data.fillna(0)
    datas.append(data)

In [None]:
# One key per stacked frame: the feature file name without its '.parquet' suffix,
# plus 'target' for the label frame appended at the end.
block_keys = [feat[:-len('.parquet')] for feat in features] + ['target']

# Stack all frames vertically under their keys, then move the key to the inner
# index level and sort so rows are ordered by the original index first.
train_dataset = (
    pd.concat(datas + [forward], axis=0, keys=block_keys)
    .swaplevel()
    .sort_index()
)
train_dataset.head()

In [None]:
# Histogram of the 0/1 labels on a single sample date.
sample_day_labels = forward.loc["2010-01-11"]
sample_day_labels.hist()

## Model Training

In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tqdm import tqdm

The following function trains a model for a given day, using only a look-back window of data that ends before that day.

In [None]:
def fit_for_one_day(date: str, forward_period: int, training_period: int, random_state=None):
    """Fit a PCA + random-forest classifier on the look-back window preceding ``date``.

    The window is sliced from the global ``train_dataset`` by position in its
    level-0 index: it ends ``forward_period + 1`` positions before ``date`` and
    starts ``training_period`` positions before that end (both ends inclusive).

    Parameters
    ----------
    date : str
        Label looked up in level 0 of ``train_dataset.index``.
    forward_period : int
        Extra gap, in index positions, kept between the end of the window and ``date``.
    training_period : int
        Distance, in index positions, between the first and last label of the window.
    random_state : int, optional
        Seed passed to both ``PCA`` and ``RandomForestClassifier``. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(model, feature_columns)``. ``model`` is a fitted pipeline (PCA followed by
        the forest), so ``model.predict`` must be given the raw, untransformed feature
        values in the order of ``feature_columns``. ``(None, None)`` is returned when
        the window would start before the first available date, when no complete rows
        remain after dropping NaNs, or when there is no ``'target'`` column or no
        feature column to train on.
    """
    all_dates = train_dataset.index.levels[0]
    i = all_dates.get_loc(date)

    start_pos = i - forward_period - training_period - 1
    if start_pos < 0:
        # A negative position would wrap around to the END of the index and
        # train on the most recent dates, i.e. on data from the future.
        return None, None
    training_end = all_dates[i - forward_period - 1]
    training_start = all_dates[start_pos]

    # Reshape the window to one row per (level-0 label, original column) pair with one
    # column per feature key plus 'target'; rows with any missing value are dropped.
    csdata = train_dataset.loc[training_start:training_end].unstack().stack(level=0).dropna()
    if csdata.empty or 'target' not in csdata.columns:
        return None, None
    feature_columns = csdata.columns[csdata.columns != 'target']
    if feature_columns.size == 0:
        return None, None

    # Keep the PCA inside the returned estimator: a forest trained on PCA scores
    # cannot be applied to raw features, which is what callers pass to predict().
    model = make_pipeline(
        PCA(random_state=random_state),
        RandomForestClassifier(random_state=random_state),
    )
    # Fit on a plain array so predicting from `.values` later raises no feature-name warning.
    model.fit(csdata[feature_columns].to_numpy(), csdata['target'])
    return model, feature_columns

In [None]:
# Use the label horizon both as the gap before each fit date and as the window length.
forward_period = training_period = period

# Start at the first date whose look-back window lies fully inside the index.
first_trainable = forward_period + training_period + 1
dates = train_dataset.index.levels[0][first_trainable:]

# One fit result per date, in date order.
models = [fit_for_one_day(date, forward_period, training_period) for date in tqdm(dates)]

Construct a time series of models, one per date.

In [None]:
models = []
# Note: this rebinds `features`, which earlier held the feature file names.
features = []
# NOTE(review): `total_feature` is not defined anywhere in the visible notebook.
# It is used here as a frame whose index level 0 holds the dates — confirm it is
# created before this cell, otherwise a fresh "Restart & Run All" stops here.
forward_index_str = forward.index.intersection(total_feature.index.levels[0]).strftime('%Y%m%d')

for date in tqdm(forward_index_str):
    # fit_for_one_day has no defaults for its window arguments, so calling it with
    # the date alone raises TypeError; pass the settings defined earlier.
    model, feat = fit_for_one_day(date, forward_period, training_period)
    models.append(model)
    features.append(feat)

Use the model fitted on the previous rebalance date to predict the direction of the next period's forward return.

In [None]:
# Per-date accuracy (NaN until filled) and per-date predicted labels.
valuation = pd.Series(index=forward_index_str[1:], dtype='float32')
predictions = pd.DataFrame(index=forward_index_str, columns=total_feature.columns)

for i in range(len(forward_index_str) - 1):
    # The model fitted for position i is scored on the following date.
    date = forward_index_str[i + 1]
    model, feature_names = models[i], features[i]

    # fit_for_one_day returns None placeholders when it could not train;
    # skipping leaves this date's accuracy at its initial NaN.
    if model is None or feature_names is None:
        continue

    day_features = total_feature.loc[date]
    # Every feature the model was trained on must be available on this date.
    if not feature_names.isin(day_features.index).all():
        continue

    # Rows are the columns of `total_feature`; the label from `forward` is the last column.
    data = pd.concat([day_features.T.loc[:, feature_names], forward.loc[date]], axis=1).dropna()
    if data.empty:
        # Nothing left to score after dropping incomplete rows; predict() would raise.
        continue

    y_pred = pd.Series(model.predict(data.loc[:, feature_names].values), index=data.index)
    valuation.loc[date] = accuracy_score(data.iloc[:, -1].values, y_pred.values)
    predictions.loc[date] = y_pred

Visualization and evaluation

In [None]:
tick_step = 50  # label only every 50th bar so the x-axis stays readable

ax = valuation.plot.bar(figsize=(12, 6))
sparse_ticks = ax.get_xticks()[::tick_step]
sparse_labels = valuation.index[::tick_step]
ax.set_xticks(sparse_ticks)
# Assign to `_` so the list of Text objects is not echoed as cell output.
_ = ax.set_xticklabels(sparse_labels, rotation=45)

In [None]:
accuracy_mean = valuation.mean()
accuracy_std = valuation.std()
# Share of entries above the mean; the denominator counts every entry, NaN included.
share_above_mean = valuation[valuation > accuracy_mean].size / valuation.size

# Displayed as: mean, std, mean/std, share above mean, one-sample t-test against 0.5.
(
    accuracy_mean,
    accuracy_std,
    accuracy_mean / accuracy_std,
    share_above_mean,
    ttest_1samp(valuation.dropna(), 0.5),
)

Prediction Matrix

In [None]:
# Preview the first rows of the prediction table (one row per date string).
predictions.head()