In [1]:
import pandas as pd
import numpy as np
import time
import math
import random
import torch 
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.utils.data as Data
from torch.autograd import Variable


import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('../task2/HS300V2_clean_data/clean_train.csv')
test = pd.read_csv('../task2/HS300V2_clean_data/clean_test.csv')

In [2]:
# timer function
def timeSince(since):
    now = time.time()
    s = now-since
    m = math.floor(s/60)
    s-= m*60
    return '%dm %ds' % (m,s)

In [3]:
class pearson_loss(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self,x,y):
        cov = torch.mean(x*y)-torch.mean(x)*torch.mean(y)
        var = torch.std(x)*torch.std(y)
        return -1*cov/var+1

In [300]:
class LSTM1(nn.Module):
    def __init__(self,num_classes,input_size,hidden_size,num_layers,seq_length):
        super(LSTM1,self).__init__()
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        
        self.lstm = nn.LSTM(input_size=input_size,hidden_size=hidden_size,
                           num_layers=num_layers,batch_first=True,dropout=0.5)
        self.fc_1 = nn.Linear(hidden_size,128)
        self.fc = nn.Linear(128,num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
    
    
    def forward(self,x):
        h_0 = Variable(torch.zeros(self.num_layers,x.size(0),self.hidden_size))
        c_0 = Variable(torch.zeros(self.num_layers,x.size(0),self.hidden_size))
        output,(hn,cn) = self.lstm(x,(h_0,c_0))
        hn = hn.view(-1,self.hidden_size)
        out = self.relu(hn)
        out = self.fc_1(out)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.fc(out)
        return out

In [301]:
def box(stock_code,num_epochs=100,learning_rate=1e-4,hidden_size=128):
    code = train[train['stock_code']==stock_code]
    X = np.array(code.iloc[:,3:])
    y = np.array(code.iloc[:,2]).reshape((code.shape[0],1))
    split = math.floor(code.shape[0]*0.92)
    X_train = X[:split,:]
    X_valid = X[split:,:]
    y_train = y[:split,:]
    y_valid = y[split:,:]
    
    y_train_tensors = Variable(torch.Tensor(y_train))
    y_valid_tensors = Variable(torch.Tensor(y_valid))
    

    X_train_tensors = torch.reshape(Variable(torch.Tensor(X_train)),(X_train.shape[0],1,X_train.shape[1]))
    X_valid_tensors = torch.reshape(Variable(torch.Tensor(X_valid)),(X_valid.shape[0],1,X_valid.shape[1]))
    
    input_size = train.shape[1]-3
    num_layers=1
    num_classes =1
    
    lstm1 = LSTM1(num_classes,input_size,hidden_size,num_layers,X_train_tensors.shape[1])
    
    criterion = pearson_loss()
    optimizer = torch.optim.Adam(lstm1.parameters(),lr=learning_rate)
    
    for epoch in range(num_epochs):
        outputs = lstm1.forward(X_train_tensors)
        optimizer.zero_grad()
        loss = criterion(outputs,y_train_tensors)
        loss.backward()
        optimizer.step()
    
    valid_predict = lstm1(X_valid_tensors).data.numpy()
    cov = np.mean(valid_predict*y_valid)-np.mean(valid_predict)*np.mean(y_valid)
    var = np.std(valid_predict)*np.std(y_valid)+1e-7
    corr = cov/var
    
    
    test_code = test[test['stock_code']==stock_code]
    test_X = np.array(test_code.iloc[:,2:])
    test_X_tensors = torch.reshape(Variable(torch.Tensor(test_X)),(test_X.shape[0],1,test_X.shape[1]))
    test_predict = lstm1(test_X_tensors).data.numpy()
    pred = []
    for i in range(test_code.shape[0]):
        pred.append([str(test_code.iloc[i,1]),str(test_code.iloc[i,0]),str(test_predict[i][0])])
    
    return corr,pred

In [302]:
train_stock_code = np.sort(list(set(train['stock_code'])))
test_stock_code = np.sort(list(set(test['stock_code'])))

In [303]:
# 找出历史数据超过50天的股票编号
select_code = []
num = []
for i in train_stock_code:
    temp = train[train['stock_code']==i]
    if temp.shape[0]>50:
        select_code.append(i)
        num.append(temp.shape[0])
select_code = np.array(select_code)

In [304]:
# 找出既在select_code又在test_stock_code的股票编号
final_code = []
for i in test_stock_code:
    if (select_code==i).any():
        final_code.append(i)

In [370]:
def tune_parameters(num_epoch,lr,hidden_size):
    temp = []
    for i in range(20):
        corr,pred = box(0,50,1e-4,128)
        temp.append(corr)
    return temp

In [371]:
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

In [372]:
lr = [1e-2,1e-3,1e-4,1e-5]
num_epoch = [50,100,150]
hidden_size = [32,64,128,256]

In [374]:
iterations = 15
choice=np.zeros((iterations,4))
for i in range(iterations):
    para = []
    para.append(randomChoice(num_epoch))
    para.append(randomChoice(lr))
    para.append(randomChoice(hidden_size))
    temp = tune_parameters(para[0],para[1],para[2])
    choice[i][0] = para[0]
    choice[i][1] = para[1]
    choice[i][2] = para[2]
    choice[i][3] = np.mean(temp)

In [379]:
idx = np.argmax(choice[:,3])

In [388]:
better = list([int(choice[idx,0]),choice[idx,1],int(choice[idx,2])])

In [390]:
corr,pred=[],[]
start = time.time()
count = 0
for i in final_code:
    temp1,temp2=box(i,better[0],better[1],better[2])
    corr.append(temp1)
    pred = pred+temp2
    count+=1
    if count%10 == 0:
        print('completed %.f%%,used time %s' % (count/len(final_code)*100,timeSince(start)))

completed 3%,used time 0m 15s
completed 7%,used time 0m 29s
completed 10%,used time 0m 41s
completed 14%,used time 0m 59s
completed 17%,used time 1m 12s
completed 20%,used time 1m 24s
completed 24%,used time 1m 39s
completed 27%,used time 1m 52s
completed 31%,used time 2m 3s
completed 34%,used time 2m 13s
completed 37%,used time 2m 25s
completed 41%,used time 2m 38s
completed 44%,used time 2m 53s
completed 47%,used time 3m 12s
completed 51%,used time 3m 30s
completed 54%,used time 3m 46s
completed 58%,used time 4m 3s
completed 61%,used time 4m 16s
completed 64%,used time 4m 33s
completed 68%,used time 4m 50s
completed 71%,used time 5m 7s
completed 75%,used time 5m 21s
completed 78%,used time 5m 36s
completed 81%,used time 5m 50s
completed 85%,used time 6m 6s
completed 88%,used time 6m 24s
completed 92%,used time 6m 39s
completed 95%,used time 6m 55s
completed 98%,used time 7m 11s


In [391]:
np.mean(corr)

0.006030078908757777

In [392]:
from BenchmarkTestTool import submit_benchmark_test

data = pred
submitter = "qiuyuan"
dataset = "HS300V2"
comment = 'LSTM'
submit_benchmark_test(submitter,data,dataset,comment,submit=False)

TEST STATUS: success
************************************************************
COMPLETENESS CHECK
************************************************************
average_daily_miss_num:  29
max_miss_day:  3763
max_miss_day_num:  57
max_miss_stock:  526
max_miss_stock_num:  362
pred_miss_num:  10779
pred_miss_ratio:  0.0996
pred_num:  97448
true_num:  108227

************************************************************
SUBMIT SIGNAL PERFORMANCE
************************************************************
autocorrelation:  0.7669149466301716
comment:  LSTM
dataset:  HS300V2
pearson:  0.008657709637721214
pearson_250:  0.012412289982877878
pearson_decay:  4.288885791481114e-05
pearson_std:  0.05906351951125461
sid:  511c2b10ec88f4ba566b6b31a5e975a1
spearman:  0.010587426046371537
submit_time:  2021-04-21 10:12:18.598496
submitter:  qiuyuan
top30_win_rate:  0.48149171270718233

************************************************************
MODEL SCORE'S EXPOSURE ON RISKS AND INDUSTRIAL FACTO

In [249]:
# from BenchmarkTestTool import check_test_rank

# check_test_rank(submitter="all",rank_by="pearson")

**************************************************
BENCHMARK测试榜排名:
**************************************************
*指标说明:
pearson:皮尔森相关系数
prs_dcy:信号相关性每日衰减系数
prs_250:最近250天（一年）的pearson均值
sprm:序相关性，斯皮尔曼相关系数
top30_win:信号每日前30名有正超额收益的胜率
ac:信号自相关性

0. 
+-------------------+-----------+---------+--------------------------+---------+
|    submit_time    | submitter | dataset |         comment          | pearson |
+-------------------+-----------+---------+--------------------------+---------+
| 21-04-18 13:23:54 |   ZhuLei  | HS300V2 | Person_1e-4_MLP_leakRelu | 0.05882 |
+-------------------+-----------+---------+--------------------------+---------+
+---------+---------+---------+--------+-----------+--------+
| prs_std | prs_dcy | prs_250 |  sprm  | top30_win |   ac   |
+---------+---------+---------+--------+-----------+--------+
|  0.0776 |  -8e-05 |  0.0502 | 0.0468 |   0.497   | 0.5959 |
+---------+---------+---------+--------+-----------+--------+
1. 
+-------------------+--------