In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import joblib
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from data_loader import StockDataset
from model.GHATModel import GAT
from config import Config

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def build_adj():
    # connection = [
    # (1, 0),
    # (9, 0), (12, 0), 
    # (8, 9), (8, 12), (5, 9), (11, 12), 
    # (4, 5), (4, 8), (7, 8), (7, 11), (10, 11),
    # (3, 4), (3, 7), (6, 7), (6, 10), (2, 3), (2, 6)]
    
    # 无向图
    connection = [
        (1, 0), (0, 1),
        (9, 0), (12, 0), (0, 9), (0, 12),
        (8, 9), (8, 12), (5, 9), (11, 12), (9, 8), (12, 8), (9, 5), (12, 11),
        (4, 5), (4, 8), (7, 8), (7, 11), (10, 11), (5, 4), (8, 4), (8, 7), (11, 7), (11, 10),
        (3, 4), (3, 7), (6, 7), (6, 10), (2, 3), (2, 6), (4, 3), (7, 3), (7, 6), (10, 6), (3, 2), (6, 2)
        ]
    adj_matrix = torch.zeros(13, 13).float()
    for source, target in connection:
        adj_matrix[source][target] = 1
    return adj_matrix

large_market_cap_stocks = [
    "000951", "002841", "300133", "300343", "000998", "300433",
    "601021", "603197", "300166", "600026", "000998", "600171",
    "300917", "603087", "002309", "300451", "002549", "603466"
]

medium_market_cap_stocks = [
    "300540", "603359", "000046", "300263", "002679", "603053",
    "000403", "603306", "600970", "002703", "000931", "002186",
    "300633", "603195", "300133", "600360", "600729", "603777"
]

small_market_cap_stocks = [
    "300174", "603095", "000753", "600622", "002282", "002882",
    "300912", "603926", "002451", "002672", "000551", "300758",
    "001207", "300865", "002247", "002379", "300389", "300491"
]
market_cap = {'large': large_market_cap_stocks, 'medium': medium_market_cap_stocks, 'small': small_market_cap_stocks}

# 高流动股票代码列表（前六位）
high_flow_stocks = [
    "300133", "300343", "000046", "300263", "000753", "600622", 
    "300166", "600026", "600970", "002703", "002451", "002672", 
    "002309", "300451", "300133", "600360", "002247", "002379"
]

# 中流动股票代码列表（前六位）
medium_flow_stocks = [
    "000998", "300433", "002679", "603053", "002282", "002882", 
    "000998", "600171", "000931", "002186", "000551", "300758", 
    "002549", "603466", "600729", "603777", "300389", "300491"
]

# 低流动股票代码列表（前六位）
low_flow_stocks = [
    "000951", "002841", "300540", "603359", "300174", "603095", 
    "601021", "603197", "000403", "603306", "300912", "603926", 
    "300917", "603087", "300633", "603195", "001207", "300865"
]
flow_dict = {'high': high_flow_stocks, 'medium': medium_flow_stocks, 'low': low_flow_stocks}

# test

In [53]:
pred_dir = './pred/'
scaler_dir = './data/volume/0308/Scaler/'

for path in os.listdir(pred_dir):
    if path.endswith('.csv'):
        date_suffix = path[-10:-4]
        scaler_path = os.path.join(scaler_dir, f'{date_suffix}.m')
        
        stand = joblib.load(scaler_path)
        data = pd.read_csv(os.path.join(pred_dir, path))
        
        data.iloc[:, 0] = stand.transform(data.iloc[:, 0].values.reshape(-1, 1)).flatten().astype(float)
        data.iloc[:, 1] = stand.transform(data.iloc[:, 1].values.reshape(-1, 1)).flatten().astype(float)
        
        aps_value = np.abs(data.iloc[:, 0] - data.iloc[:, 1])
        mape = np.mean(aps_value/data.iloc[:, 1])
        # mean_difference = np.mean(np.abs(data.iloc[:, 0] - data.iloc[:, 1])/data.iloc[:, 1])
        print(f"{path} com is {mape}")

model_pred_300263.csv com is 0.935416413980514
model_pred_002882.csv com is 1.550863929888626
model_pred_002841.csv com is -1.0267474556341747
model_pred_002282.csv com is 3.756918435396213
model_pred_300174.csv com is -0.4021384301654137
model_pred_000998.csv com is -0.12983024485678749
model_pred_000951.csv com is -1.3434334328673543
model_pred_000046.csv com is -0.46476788085150367
model_pred_300133.csv com is 0.8758025108458504
model_pred_000753.csv com is 1.9249466546347918


# Analysis Tensorboard

save_models/saved_models-64-0.2-MSELoss/000951/scalar 558.3517 0.0579

save_models/saved_models-128-0.1-L1Loss/002882/scalar 0.6444 0.111  

save_models/saved_models-32-0.1-L1Loss/000951/scalar  0.3645  0.1776

save_models/saved_models-128-0.3-L1Loss/300263/scalar 5.4386  0.2057

save_models/saved_models-32-0.1-L1Loss/300174/scalar  0.6048  0.2104  

save_models/saved_models-128-0.1-L1Loss/000046/scalar  0.3891  0.215

save_models/saved_models-32-0.1-L1Loss/300133/scalar 0.4245 0.2317

save_models/saved_models-128-0.1-L1Loss/000753/scalar  1.1202  0.2362

save_models/saved_models-32-0.1-L1Loss/000753/scalar  0.6045 0.2512

save_models/saved_models-128-0.1-L1Loss/002841/scalar  0.4857  0.461

In [45]:
model_path = pd.read_excel('./model.xlsx')
path_dict = {}
for path in model_path['path']:
    for _ in os.listdir(path.split('/scalar')[0]):
        if 'train' in _ and '.tar' not in _:
            pt_path = os.path.join(path, _).replace('/scalar', '')
            pt_path = f"./{pt_path}"
            pred_path = f'./pred/model_pred_{_[:6]}.csv'
            data_path = f'./data/volume/0308/Input/{_[:6]}_3_3_inputs.npy'
            path_dict[_[:6]] = [pt_path, data_path, pred_path]

for key, value in path_dict.items():
    pt_path, data_path, pred_path = value[0], value[1], value[2]
    
    # load data
    data = np.load(data_path, allow_pickle= True)
    data = np.array([value[_] for item in data for value in item for _ in [0, 1, 2, 3, 4, 5, 6, 7, 8]], dtype= np.float32).reshape(data.shape[0], 13, 9)
    data = torch.from_numpy(data).to(device)

    # load model
    model = GAT(n_feat= len([0, 1, 2, 3, 4, 5, 6, 7, 8]), n_hid= 16, out_features= len([1]), 
                pred_length= 1, n_heads= 4)
    model = model.to(device= device)

    state_dict = torch.load(pt_path)
    model.load_state_dict(state_dict)
    model.eval()

    # model pred
    model_pred = model(data, build_adj())
    model_pred = model_pred.cpu().detach().numpy()

    # to csv
    pred_data = pd.read_csv(pred_path)
    pred_data[f'{key}-pred'] = model_pred
    pred_data.to_csv(f'./result/{key}.csv', index= None)

    # com model performance
    non_zero_mask = pred_data[f'{key}-true'] != 0
    mape = np.mean(np.abs((pred_data[f'{key}-true'][non_zero_mask]- pred_data[f'{key}-pred'][non_zero_mask])/ pred_data[f'{key}-true'][non_zero_mask]))
    # mape = np.mean(np.abs((pred_data[f'{key}-true']- pred_data[f'{key}-pred'])/ (pred_data[f'{key}-true']+ 1e-8)))
    mae = np.mean(pred_data[f'{key}-true']- pred_data[f'{key}-pred'])
    mse = np.mean((pred_data[f'{key}-true']- pred_data[f'{key}-pred'])** 2)

    print(f"{key} com MAE is {mae}, the MAPE is {mape}, the MSE is {mse}")

000951 com MAE is 0.17173053386621098, the MAPE is 2.518633466308861, the MSE is 0.5221717812411385
002882 com MAE is 0.12423068589359212, the MAPE is 2.1046718006878993, the MSE is 0.26982633313342996
300263 com MAE is 0.3505307760632711, the MAPE is 3.1935994183582874, the MSE is 0.6466603122536327
300174 com MAE is 0.2546607789525313, the MAPE is 2.1168768878390187, the MSE is 0.7743779097974093
000046 com MAE is 0.14571692178444937, the MAPE is 2.8024512486183615, the MSE is 0.46706784226470577
300133 com MAE is 0.4677060045788819, the MAPE is 6.006919753169601, the MSE is 0.8068875438268603
000753 com MAE is 0.3408899679655601, the MAPE is 4.5000158535369925, the MSE is 0.997200295958236
002841 com MAE is 0.3678779471036437, the MAPE is 4.670743773679899, the MSE is 0.7112576372651055
000998 com MAE is 0.11683616764812571, the MAPE is 2.3214763711409097, the MSE is 0.39026816087632676


| stock |  MAE   | MAPE | MSE  |
| :--:  | :--:   | :--: | :--: |
| 000046 | 0.067 | 2.784| 0.444|
| 002882 | 0.094 | 4.669| 0.710|
| 000951 | 0.194 | 2.633| 0.568|
| 300263 | 0.330 | 2.967| 0.609|
| 300174 | 0.229 | 2.031| 0.746|
| 300133 | 0.430 | 5.966| 0.785|
| 000753 | 0.357 | 4.565| 1.047|
| 002841 | 0.369 | 4.688| 0.713|
| 000998 | 0.091 | 2.411| 0.367|

In [46]:
result = './end_result/'
for data_path in os.listdir(result):
    pred_data = pd.read_csv(f'{result}{data_path}')
    key = data_path.split('.csv')[0]
    # com model performance
    non_zero_mask = pred_data[f'{key}-true'] != 0
    mape = np.mean(np.abs((pred_data[f'{key}-true'][non_zero_mask]- pred_data[f'{key}-pred'][non_zero_mask])/ (pred_data[f'{key}-true'][non_zero_mask])))
    mae = np.mean(pred_data[f'{key}-true']- pred_data[f'{key}-pred'])
    mse = np.mean((pred_data[f'{key}-true']- pred_data[f'{key}-pred'])** 2)

    print(f"{key} com MAE is {mae}, the MAPE is {mape}, the MSE is {mse}")

000753 com MAE is 0.3568296744179487, the MAPE is 4.565257975045892, the MSE is 1.0468893311222307
300263 com MAE is 0.3301482182794872, the MAPE is 2.966666300469162, the MSE is 0.6086557043995929
000998 com MAE is 0.0915205719877487, the MAPE is 2.4114674667919633, the MSE is 0.36674533100505274
000951 com MAE is 0.19498004116868423, the MAPE is 2.6327771803610047, the MSE is 0.5684222432400793
300133 com MAE is 0.4397617898415026, the MAPE is 5.966244515032575, the MSE is 0.7851925197273011
000046 com MAE is 0.06712500368747397, the MAPE is 2.7842108061709787, the MSE is 0.4440601307134777
002882 com MAE is 0.09368558017979427, the MAPE is 2.097611695986554, the MSE is 0.3116063670139982
002841 com MAE is 0.3670204628341969, the MAPE is 4.669078104983375, the MSE is 0.7103900892096832
300174 com MAE is 0.22933013198451282, the MAPE is 2.030887703447141, the MSE is 0.7463646253501821


In [7]:
path_dir = './data/0308/0308-data/'
path_list = os.listdir(path_dir)
for path in path_list:
    test = pd.read_csv(f'{path_dir}{path}')
    corr_value = test.iloc[:, 1:].corr()
    print(f'{path} the corr is {corr_value.iloc[0, 4]}')

300433_XSHE_25_daily.csv the corr is -0.05018412780538093
000951_XSHE_25_daily.csv the corr is 0.006499070456356722
000046_XSHE_25_daily.csv the corr is -0.025292199108096332
000998_XSHE_25_daily.csv the corr is -0.06417301369701807
002679_XSHE_25_daily.csv the corr is -0.04395794009055617
300263_XSHE_25_daily.csv the corr is -0.044277313292397036
300540_XSHE_25_daily.csv the corr is 0.01619292344657734
603053_XSHG_25_daily.csv the corr is -0.03826706361246143
600622_XSHG_25_daily.csv the corr is -0.02819947489846059
300174_XSHE_25_daily.csv the corr is -0.03089981921673612
002882_XSHE_25_daily.csv the corr is -0.024562945968328902
000753_XSHE_25_daily.csv the corr is -0.012347011070974035
603359_XSHG_25_daily.csv the corr is -0.007381870227684722
300133_XSHE_25_daily.csv the corr is -0.023778421675317782
002282_XSHE_25_daily.csv the corr is -0.015833455705401456
002841_XSHE_25_daily.csv the corr is -0.02189742569235223
300343_XSHE_25_daily.csv the corr is -0.026467016324222114
603095_

# Analysis

In [13]:
data_low = pd.read_csv('./data/0308/0308-data/000046_XSHE_25_daily.csv')
data_medium = pd.read_csv('./data/0308/0308-data/002882_XSHE_25_daily.csv')
data_heigh = pd.read_csv('./data/0308/0308-data/000998_XSHE_25_daily.csv')
data_analysis = {'low': data_low, 'medium': data_medium, 'heigh': data_heigh}

In [20]:
def plot_bbox(data_dict: dict):
    fig, axes = plt.subplots(nrows=1, ncols=len(data_dict), figsize=(12, 5))
    for i, (key, value) in enumerate(data_dict.items()):
        sns.boxplot(value['daily_volume'], ax= axes[i])
        axes[i].set_xlabel(f'{key}')
        axes[i].set_ylabel('daily_volume')
    # 调整子图之间的间距
    plt.tight_layout()
    # 显示图表
    plt.show()

def com_return(path_dir: str):
    path_list = os.listdir(path_dir)
    with tqdm(total=len(path_list)) as pbar:
        for path in path_list:
            file_path = os.path.join(path_dir, path)
            data = pd.read_excel(file_path)
            # data['收益率'] = data['收盘点位'].pct_change()
            data['日收益率'] = (data['收盘价'] - data['昨收价']) / data['昨收价'] * 100
            data = data.dropna()

            data.to_excel(f"./StockData/{path.split('.')[0]}.xlsx")
            pbar.update(1)

def com_stock_return(file_path_target: str, file_path_all: str):
    market_dict = {}
    for root, dirs, files in os.walk(file_path_target):
        for file in files:
            if file.endswith('.csv'):
                target_path = os.path.join(root, file)
                test1 = pd.read_csv(target_path)
                end, start = str(test1['time'].max())[:8], str(test1['time'].min())[:8]
                end, start = int(end), int(start)

                stock_info = target_path.split('_')[1][-6:]

                # get com return data path
                if os.path.exists(os.path.join(file_path_all, f"{stock_info}.SZ.xls")):
                    path_used = os.path.join(file_path_all, f"{stock_info}.SZ.xls")
                else:
                    path_used = os.path.join(file_path_all, f"{stock_info}.SH.xls")

                # open file
                try:
                    df = pd.read_excel(path_used)

                    # com daily_return and yead_return
                    daily_return = (df['收盘价'] - df['昨收价']) / df['昨收价'] * 100
                    year_return = daily_return.std()* np.sqrt(252)
                    # print(f"The {stock_info} Annualized Volatility {year_return}")
                    market_dict[stock_info] = year_return
                except Exception:
                    continue
    return market_dict

# plot_bbox(data_dict= data_analysis)
# com_return(path_dir= './stockData/allstock/')


market_return = com_stock_return(file_path_target= './data/raw_data/', file_path_all= './stockData/allstock/')

In [21]:
market_return

{'000403': 83.94171399855212,
 '600171': 49.431082769848935,
 '000551': 46.50375391336568,
 '002703': 246.06839649718341,
 '000998': 68.47825078391368,
 '002672': 41.42633479227145,
 '600026': 49.93037604196431,
 '000931': 711.7914954246801,
 '600970': 59.77701983835026,
 '603306': 49.6107296401739,
 '002186': 88.21341202517382,
 '300758': 66.66853128226039,
 '601021': 46.600327475973614,
 '300166': 58.26820108949775,
 '603197': 58.51819068581007,
 '603926': 48.08719028720553,
 '002451': 59.7939809524463,
 '300633': 58.063514324622176,
 '300389': 57.66784187527512,
 '600360': 64.38524516755488,
 '600729': 40.57379679498191,
 '603195': 87.85985838233813,
 '002247': 50.5248307861731,
 '002379': 50.15308049963824,
 '603466': 59.71602967430237,
 '300451': 68.05646239340707,
 '002309': 44.45894984085891,
 '300491': 66.12479336938773,
 '300133': 53.877745104515355,
 '603777': 59.42841792615807,
 '603087': 181.59597335111425,
 '002549': 50.55504593970418,
 '300433': 61.93785048792762,
 '00095