In [None]:
import torch
import sklearn
import numpy as np
import os,json

from alphagen.data.expression import *
from alphagen.models.alpha_pool import AlphaPool
from alphagen.utils import reseed_everything
from alphagen_generic.operators import funcs as generic_funcs
from alphagen_generic.features import *
from gan.utils.data import get_data_by_year

# Utility functions

In [None]:
import pandas as pd
from tqdm import tqdm
def get_ml_data(data, n_lags=60):
    """Build a table of lagged one-step changes plus a forward-return label.

    Parameters
    ----------
    data : object
        Must expose ``df_bak`` (a DataFrame whose six columns are renamed
        positionally to open/close/high/low/volume/vwap), ``_dates``,
        ``max_backtrack_days`` and ``max_future_days``.
        NOTE(review): assumes ``df_bak`` is indexed by (datetime, instrument)
        and that its column order matches the names assigned here -- confirm.
    n_lags : int, default 60
        Number of lags to emit per field; lags ``0 .. n_lags - 1`` are produced.

    Returns
    -------
    pandas.DataFrame
        Columns ``'<field><lag>'`` for every field and lag, followed by
        ``'label'``, restricted to rows between
        ``_dates[max_backtrack_days]`` and ``_dates[-max_future_days]``
        (label-based slicing, so both ends are included).
    """
    fields = ['open', 'close', 'high', 'low', 'volume', 'vwap']
    df = data.df_bak.copy()
    df.columns = fields

    # Label: vwap 21 rows ahead relative to vwap 1 row ahead, per instrument.
    vwap_wide = df['vwap'].unstack()
    forward_return = (vwap_wide.shift(-21) / vwap_wide.shift(-1)) - 1
    df['label'] = forward_return.stack().reindex(df.index)

    # Row-over-row relative change of every field, in wide (per-instrument) form.
    wide = df[fields].unstack()
    change = wide / wide.shift(1) - 1

    # The original built the lag-0 frame but never appended it, so the most
    # recent change was silently missing from the feature set. Starting the
    # range at 0 keeps it.
    lagged = []
    for past in tqdm(range(n_lags)):
        cur = change.shift(past).stack().reindex(df.index)
        cur.columns = [f'{col}{past}' for col in cur.columns]
        lagged.append(cur)

    result = pd.concat(lagged + [df['label']], axis=1)

    start_date = data._dates[data.max_backtrack_days]
    # `_dates[-0]` would wrap around to the first date and yield an empty
    # window, so fall back to the last date when there is no future padding.
    if data.max_future_days:
        end_date = data._dates[-data.max_future_days]
    else:
        end_date = data._dates[-1]
    return result.loc[start_date:end_date]

def normalize_data(df_train, df_valid, df_test):
    """Standardise features with training statistics and clip everything to [-4, 4].

    The last column of each frame is treated as the label; all other columns
    are features.

    Steps
    -----
    1. Features of all three splits are z-scored with the *training* mean and
       standard deviation; missing results become 0.
    2. Labels have NaN / +inf / -inf replaced by 0.
    3. The training label only is z-scored within each ``'datetime'`` group
       and clipped to [-4, 4]. Validation and test labels are not re-scaled.
    4. Every column of every split (label included) is clipped to [-4, 4].

    Parameters
    ----------
    df_train, df_valid, df_test : pandas.DataFrame
        Frames whose index contains a level named ``'datetime'`` (required by
        the per-day label normalisation) and whose last column is the label.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)`` with the feature columns followed by a
        column named ``'label'``.
    """
    train_features = df_train.iloc[:, :-1]
    feature_mean = train_features.mean()
    # A constant training column has std == 0. Dividing by it yields +/-inf on
    # the other splits, which fillna() does not touch and clip() then pins to
    # +/-4. Mapping a zero std to NaN sends those columns to 0 instead.
    feature_std = train_features.std().replace(0, np.nan)

    def _scale(df):
        """Z-score the feature columns of `df` and attach a sanitised label."""
        scaled = ((df.iloc[:, :-1] - feature_mean) / feature_std).fillna(0)
        scaled['label'] = np.nan_to_num(df.iloc[:, -1], nan=0, posinf=0, neginf=0)
        return scaled

    df_train_norm = _scale(df_train)
    df_valid_norm = _scale(df_valid)
    df_test_norm = _scale(df_test)

    # Cross-sectional (per-day) standardisation of the training target.
    df_train_norm['label'] = (
        df_train_norm['label']
        .groupby('datetime')
        .transform(lambda x: (x - x.mean()) / x.std())
        .clip(-4, 4)
    )

    df_train_norm = df_train_norm.clip(-4, 4)
    df_valid_norm = df_valid_norm.clip(-4, 4)
    df_test_norm = df_test_norm.clip(-4, 4)

    return df_train_norm, df_valid_norm, df_test_norm

# Train LightGBM Model

In [None]:


import lightgbm as lgb
import pandas as pd

def train_lightgbm_model(df_train, df_valid, df_test):
    """Fit a LightGBM regressor with early stopping and predict on the test split.

    Parameters
    ----------
    df_train, df_valid, df_test : pandas.DataFrame
        Frames containing the feature columns and a ``'label'`` column.
        NaNs are replaced with 0 before training and prediction.

    Returns
    -------
    model : lightgbm.Booster
        The trained booster.
    pred : pandas.DataFrame
        Indexed like ``df_test``; first column (named ``0``) holds the
        predictions, second column is the original ``df_test['label']``.
    """
    def _features_and_label(df):
        """Split a NaN-filled copy of `df` into (features, label)."""
        filled = df.fillna(0)
        return filled.drop(columns=['label']), filled['label']

    X_train, y_train = _features_and_label(df_train)
    X_valid, y_valid = _features_and_label(df_valid)
    X_test, _ = _features_and_label(df_test)

    # Convert data to LightGBM Dataset format
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # Set hyperparameters for LightGBM model
    params = {
        'objective': 'regression',
        'metric': 'mse',
        'num_leaves': 210,
        'max_depth': 8,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }

    # Early stopping and periodic logging are supplied as callbacks: the
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments are no
    # longer accepted by `lgb.train` in LightGBM 4.x, while the callback form
    # works on both older and newer releases.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )

    # Predict on the test set and pair the predictions with the raw test labels.
    y_pred = model.predict(X_test)
    pred = pd.concat([pd.Series(y_pred, index=df_test.index), df_test['label']], axis=1)

    return model, pred

# Rolling experiment: for each instrument universe and each training end year,
# train on 2010..train_end, validate on train_end+1, and test on train_end+2.
for instruments in ['csi300','csi500']:
    for train_end in range(2016,2021):
        # NOTE(review): `target` is not defined in the visible cells; presumably
        # it is provided by one of the star imports at the top -- confirm.
        returned = get_data_by_year(
            train_start = 2010,train_end=train_end,valid_year=train_end+1,test_year =train_end+2,
            instruments=instruments, target=target,freq='day',)
        # Only the plain train/valid/test objects are used below; the
        # `*_withhead` variants and the last element are unpacked but unused.
        data,data_valid,data_valid_withhead,data_test,data_test_withhead,_ = returned
        df_train = get_ml_data(data)
        df_valid = get_ml_data(data_valid)
        df_test = get_ml_data(data_test)
        df_train, df_valid, df_test = normalize_data(df_train, df_valid, df_test)
        
        # Outputs go to out_ml/<instruments>_lgbm_<train_end>/ : the booster
        # (saved with a .pt extension) and the pickled test predictions.
        model_name = 'lgbm'
        name = f"{instruments}_{model_name}_{train_end}"
        os.makedirs(f"out_ml/{name}",exist_ok=True)
        model,pred = train_lightgbm_model(df_train, df_valid, df_test)
        model.save_model(f"out_ml/{name}/{model_name}.pt")
        pred.to_pickle(f"out_ml/{name}/pred.pkl")

# Train XGBoost Model

In [None]:

import xgboost as xgb
import pandas as pd

def train_xgboost_model(df_train, df_valid, df_test):
    """Fit an XGBoost regressor with early stopping and predict on the test split.

    Parameters
    ----------
    df_train, df_valid, df_test : pandas.DataFrame
        Frames containing the feature columns and a ``'label'`` column.
        NaNs are replaced with 0 before training and prediction.

    Returns
    -------
    model : xgboost.Booster
        The trained booster.
    pred : pandas.DataFrame
        Indexed like ``df_test``; first column (named ``0``) holds the
        predictions, second column is the original ``df_test['label']``.
    """
    def _features_and_label(df):
        """Split a NaN-filled copy of `df` into (features, label)."""
        filled = df.fillna(0)
        return filled.drop(columns=['label']), filled['label']

    X_train, y_train = _features_and_label(df_train)
    X_valid, y_valid = _features_and_label(df_valid)
    X_test, _ = _features_and_label(df_test)

    # Convert data to DMatrix format
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    # Booster hyperparameters only. Training-loop controls are arguments of
    # `xgb.train`, not booster parameters, so they are kept separate below.
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'colsample_bytree': 0.8879,
        'learning_rate': 0.2,
        'subsample': 0.8789,
        'lambda': 205.6999,
        'alpha': 580.9768,
        'max_depth': 8,
    }
    num_boost_round = 1000
    early_stopping_rounds = 100
    verbose_eval = 100

    # The original left `num_boost_round` inside `params`, where it has no
    # effect, so training silently used the library default of 10 rounds.
    # Passing it explicitly restores the intended budget of 1000 rounds.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval,
    )

    # Convert test data to DMatrix format
    dtest = xgb.DMatrix(X_test)

    # NOTE(review): whether predict() uses the best or the last iteration after
    # early stopping depends on the installed XGBoost version -- confirm.
    y_pred = model.predict(dtest)

    # Combine the predictions with the actual labels (prediction first).
    pred = pd.concat([pd.Series(y_pred, index=df_test.index), df_test['label']], axis=1)

    return model, pred



# Rolling experiment: for each instrument universe and each training end year,
# train on 2010..train_end, validate on train_end+1, and test on train_end+2.
for instruments in ['csi300','csi500']:
    for train_end in range(2016,2021):
        # NOTE(review): `target` is not defined in the visible cells; presumably
        # it is provided by one of the star imports at the top -- confirm.
        returned = get_data_by_year(
            train_start = 2010,train_end=train_end,valid_year=train_end+1,test_year =train_end+2,
            instruments=instruments, target=target,freq='day',)
        # Only the plain train/valid/test objects are used below; the
        # `*_withhead` variants and the last element are unpacked but unused.
        data,data_valid,data_valid_withhead,data_test,data_test_withhead,_ = returned
        df_train = get_ml_data(data)
        df_valid = get_ml_data(data_valid)
        df_test = get_ml_data(data_test)
        df_train, df_valid, df_test = normalize_data(df_train, df_valid, df_test)
        
        # Outputs go to out_ml/<instruments>_xgb_<train_end>/ : the booster
        # (saved with a .pt extension) and the pickled test predictions.
        model_name = 'xgb'
        name = f"{instruments}_{model_name}_{train_end}"
        os.makedirs(f"out_ml/{name}",exist_ok=True)
        model,pred = train_xgboost_model(df_train, df_valid, df_test)
        model.save_model(f"out_ml/{name}/{model_name}.pt")
        pred.to_pickle(f"out_ml/{name}/pred.pkl")

# Train MLP Model

In [None]:
import torch
from sklearn.metrics import mean_squared_error

import torch.nn as nn
import torch.optim as optim

def train_mlp_model(df_train, df_valid, df_test):
    """Train a small fully connected regressor and predict on the test split.

    The network is ``n_features -> 64 -> 32 -> 1`` with ReLU activations,
    optimised with Adam (lr=0.001) on an MSE loss for 10 epochs using
    shuffled mini-batches of 512 rows.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing the feature columns and a ``'label'`` column.
        NaNs are replaced with 0 before use.
    df_valid : pandas.DataFrame
        Accepted for signature parity with the other trainers; it is not used
        here because no validation-based stopping is performed.

    Returns
    -------
    model : torch.nn.Sequential
        The trained network (left on the device it was trained on).
    pred_df : pandas.DataFrame
        Indexed like ``df_test``; first column (named ``0``) holds the
        predictions, second column is the original ``df_test['label']``.
        This is the same ``[prediction, label]`` order returned by
        ``train_lightgbm_model`` and ``train_xgboost_model``; the original
        returned ``[label, prediction]``.
    """
    df_train_filled = df_train.fillna(0)
    df_test_filled = df_test.fillna(0)

    # Only the tensors that are actually consumed below are materialised.
    X_train = torch.tensor(df_train_filled.drop(columns=['label']).values, dtype=torch.float32)
    y_train = torch.tensor(df_train_filled['label'].values, dtype=torch.float32)
    X_test = torch.tensor(df_test_filled.drop(columns=['label']).values, dtype=torch.float32)

    # Define the MLP model
    model = nn.Sequential(
        nn.Linear(X_train.shape[1], 64),
        nn.ReLU(),
        nn.Linear(64, 32),
        nn.ReLU(),
        nn.Linear(32, 1)
    )

    # Define the loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Move the model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Train the model
    num_epochs = 10
    batch_size = 512
    n_rows = X_train.shape[0]
    model.train()
    for epoch in range(num_epochs):
        # Fresh permutation every epoch so batches differ between epochs.
        indices = torch.randperm(n_rows)
        X_train_shuffled = X_train[indices]
        y_train_shuffled = y_train[indices]

        for i in tqdm(range(0, n_rows, batch_size)):
            X_batch = X_train_shuffled[i:i+batch_size].to(device)
            y_batch = y_train_shuffled[i:i+batch_size].to(device)

            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs.flatten(), y_batch.flatten())

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Predict on the test set without tracking gradients.
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test.to(device)).detach().cpu().numpy().flatten()
    pred_df = pd.concat([pd.Series(test_outputs, index=df_test.index), df_test['label']], axis=1)
    torch.cuda.empty_cache()
    return model, pred_df

# Rolling experiment: for each instrument universe and each training end year,
# train on 2010..train_end, validate on train_end+1, and test on train_end+2.
for instruments in ['csi300','csi500']:
    for train_end in range(2016,2021):
        # NOTE(review): `target` is not defined in the visible cells; presumably
        # it is provided by one of the star imports at the top -- confirm.
        returned = get_data_by_year(
            train_start = 2010,train_end=train_end,valid_year=train_end+1,test_year =train_end+2,
            instruments=instruments, target=target,freq='day',)
        # Only the plain train/valid/test objects are used below; the
        # `*_withhead` variants and the last element are unpacked but unused.
        data,data_valid,data_valid_withhead,data_test,data_test_withhead,_ = returned
        df_train = get_ml_data(data)
        df_valid = get_ml_data(data_valid)
        df_test = get_ml_data(data_test)
        df_train, df_valid, df_test = normalize_data(df_train, df_valid, df_test)
        # The data is prepared once per (instruments, train_end); the MLP is
        # then trained five times, reseeding before each run.
        for seed in range(5):
            reseed_everything(seed)
            model_name = 'mlp'
            name = f"{instruments}_{model_name}_{train_end}"
            os.makedirs(f"out_ml/{name}",exist_ok=True)
            model,pred = train_mlp_model(df_train, df_valid, df_test)
            # The trained network itself is not persisted; only the per-seed
            # test predictions are written.
            # model.save_model(f"out_ml/{name}/model.pt")
            pred.to_pickle(f"out_ml/{name}/pred_{seed}.pkl")

# Show LightGBM Result

In [None]:
import pandas as pd
instruments = 'csi300'

# The LightGBM training cell writes its outputs under the model name 'lgbm'
# (out_ml/<instruments>_lgbm_<train_end>/pred.pkl); the original path used
# 'lgb' and therefore pointed at directories that are never created.
result = []
for year in range(2016,2021):
    result.append(pd.read_pickle(f'out_ml/{instruments}_lgbm_{year}/pred.pkl'))
# `axis` must be passed by keyword: pandas 2.x no longer accepts it positionally.
df = pd.concat(result, axis=0)

def get_final_metrics(df):
    """Summarise per-day correlations between the prediction column(s) and 'label'.

    For every ``'datetime'`` group the correlation matrix of `df` is computed,
    the ``'label'`` column of that matrix is kept, and its last entry
    (expected to be label-vs-label) is dropped. The remaining daily series are
    reduced to a mean and a mean/std ratio, once with the default correlation
    and once with Spearman rank correlation.

    Returns a dict with keys 'IC', 'ICIR', 'RankIC' and 'RankICIR'.
    """
    def _mean_and_ratio(daily_corr):
        """Return (column means, column means / column stds) of a daily table."""
        col_mean = daily_corr.mean(axis=0)
        col_std = daily_corr.std(axis=0)
        return col_mean, col_mean / col_std

    by_day = df.groupby('datetime')

    pearson_daily = by_day.corr()['label'].unstack()
    ic_s, icir_s = _mean_and_ratio(pearson_daily.iloc[:, :-1])

    spearman_daily = by_day.corr('spearman')['label'].unstack()
    ric_s, ricir_s = _mean_and_ratio(spearman_daily.iloc[:, :-1])

    return {
        'IC': ic_s.mean(),
        'ICIR': icir_s.mean(),
        'RankIC': ric_s.mean(),
        'RankICIR': ricir_s.mean(),
    }

# Print the metric summary for the concatenated LightGBM predictions held in `df`.
print('LightGBM Result:\n')
print(get_final_metrics(df))

# Show XGBoost Result

In [None]:
import pandas as pd
instruments = 'csi300'

# Collect the pickled XGBoost test predictions of every rolling run
# (out_ml/<instruments>_xgb_<train_end>/pred.pkl) and stack them row-wise.
result = []
for year in range(2016,2021):
    result.append(pd.read_pickle(f'out_ml/{instruments}_xgb_{year}/pred.pkl'))
# `axis` must be passed by keyword: pandas 2.x no longer accepts it positionally.
df = pd.concat(result, axis=0)

def get_final_metrics(df):
    """Compute IC / ICIR and their rank-based counterparts from a prediction frame.

    Per ``'datetime'`` group, the correlation of every column with ``'label'``
    is taken and the last column of the resulting table (expected to be
    label-vs-label) is discarded. Each remaining daily series is summarised by
    its mean and by mean divided by standard deviation; the values are then
    averaged over columns. This is done with the default correlation and with
    Spearman rank correlation.

    Returns a dict with keys 'IC', 'ICIR', 'RankIC' and 'RankICIR'.
    """
    grouped = df.groupby('datetime')

    # Default (Pearson) correlation per day.
    daily_ic = grouped.corr()['label'].unstack()
    daily_ic = daily_ic.iloc[:, :-1]
    mean_ic = daily_ic.mean(axis=0)
    ratio_ic = mean_ic / daily_ic.std(axis=0)

    # Spearman rank correlation per day.
    daily_rank_ic = grouped.corr('spearman')['label'].unstack()
    daily_rank_ic = daily_rank_ic.iloc[:, :-1]
    mean_rank_ic = daily_rank_ic.mean(axis=0)
    ratio_rank_ic = mean_rank_ic / daily_rank_ic.std(axis=0)

    metrics = {}
    metrics['IC'] = mean_ic.mean()
    metrics['ICIR'] = ratio_ic.mean()
    metrics['RankIC'] = mean_rank_ic.mean()
    metrics['RankICIR'] = ratio_rank_ic.mean()
    return metrics

# Print the metric summary for the concatenated XGBoost predictions held in `df`.
print('XGBoost Result:\n')
print(get_final_metrics(df))

# Show MLP Result

In [None]:
import pandas as pd
def get_final_metrics(df):
    """Summarise per-day correlations between the prediction column(s) and 'label'.

    For every ``'datetime'`` group the correlation of each column with
    ``'label'`` is computed and the last column of the resulting table
    (expected to be label-vs-label) is dropped. Returns a dict with the mean
    correlation ('IC'), mean/std ratio ('ICIR') and the Spearman-based
    equivalents ('RankIC', 'RankICIR').
    """
    ic_raw = df.groupby('datetime').corr()['label'].unstack().iloc[:,:-1]
    ic_s = ic_raw.mean(axis=0)
    ic_s_std = ic_raw.std(axis=0)
    icir_s = ic_s/ic_s_std

    ric_raw = df.groupby('datetime').corr('spearman')['label'].unstack().iloc[:,:-1]
    ric_s = ric_raw.mean(axis=0)
    ric_s_std = ric_raw.std(axis=0)
    ricir_s = ric_s/ric_s_std
    return {'IC':ic_s.mean(),'ICIR':icir_s.mean(),'RankIC':ric_s.mean(),'RankICIR':ricir_s.mean()}


# One metrics dict per seed. The universe is fixed to csi500 in this cell.
result_all = []
for seed in range(5):
    result = []
    for year in range(2016,2021):
        result.append(pd.read_pickle(f'out_ml/csi500_mlp_{year}/pred_{seed}.pkl'))
    # `axis` must be passed by keyword: pandas 2.x no longer accepts it positionally.
    df = pd.concat(result, axis=0)
    # Name the columns by content rather than by position: the stored frames
    # hold one 'label' column and one unnamed prediction column, and assigning
    # ['pred', 'label'] positionally mislabels them when 'label' comes first.
    df = df.rename(columns=lambda c: c if c == 'label' else 'pred')
    # get_final_metrics drops the last column, so 'label' must come last.
    df = df[['pred','label']]
    result_all.append(get_final_metrics(df))
print('MLP Result:\n')
print(result_all)