In [1]:
"""
Sample from a trained model
"""
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from model import GPTConfig, GPT
import json

# -----------------------------------------------------------------------------
init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
out_dir = 'out-stock' # ignored if init_from is not 'resume'
start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 10 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 10 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
device = 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
compile = False # use PyTorch 2.0 to compile the model to be faster
# -----------------------------------------------------------------------------


In [2]:
import pandas as pd

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# model
if init_from == 'resume':
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig(**checkpoint['model_args'])
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    unwanted_prefix = '_orig_mod.'
    for k,v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)


model.eval()
model.to(device)


datadir = os.path.join('data', 'stock')

# meta数据
meta = {}
with open(os.path.join(datadir, 'meta.pkl'), 'r') as f:
    meta = json.load(f)
    meta_vocab_size = meta['vocab_size']
    meta_vocab_size = 4096
def decode(id):
    return meta['itos'][str(id)]
def decode_arr(ids):
    return [decode(id) for id in ids]
def encode(s):
    return [meta['stoi'][c] for c in s]

pd_train_data = pd.read_csv(os.path.join(datadir, 'train.csv')).iloc[1:,:meta_vocab_size+1]
pd_train_data



config is GPTConfig(block_size=5, vocab_size=4096, n_layer=8, n_head=8, n_embd=512, dropout=0, bias=False)
number of parameters: 27.27M


Unnamed: 0,trade_date,000001.SZ,000002.SZ,000004.SZ,000005.SZ,000006.SZ,000007.SZ,000008.SZ,000009.SZ,000010.SZ,...,603320.SH,603321.SH,603322.SH,603323.SH,603324.SH,603325.SH,603326.SH,603327.SH,603328.SH,603329.SH
1,20230104,1.0399,1.0461,1.0020,1.0000,1.0065,0.9740,1.0339,0.9943,1.0027,...,1.0483,1.0070,1.0156,1.0235,0.9767,-100.0000,1.0524,0.9876,0.9941,1.0006
2,20230105,1.0112,1.0136,0.9890,1.0214,0.9610,0.9987,0.9959,1.0091,0.9786,...,1.0079,1.0167,1.0188,0.9958,1.0061,-100.0000,0.9858,1.0094,1.0149,0.9851
3,20230106,1.0097,0.9943,0.9756,0.9895,0.9696,0.9949,0.9918,1.0016,1.0601,...,1.0079,0.9945,1.0000,0.9916,1.0052,-100.0000,1.0036,0.9926,0.9985,0.9928
4,20230109,1.0123,0.9771,1.0083,0.9947,0.9930,1.0179,0.9959,0.9927,0.9588,...,1.0211,1.0221,0.9933,1.0085,1.0104,-100.0000,0.9688,1.0050,1.0029,1.0061
5,20230110,0.9757,1.0011,0.9907,0.9894,0.9860,1.0013,0.9917,0.9951,0.9677,...,1.0031,1.0148,0.9921,0.9853,1.0054,-100.0000,0.9901,0.9900,1.0000,0.9813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,20240326,1.0192,1.0128,1.0137,-100.0000,1.0233,0.9890,0.9959,1.0122,1.0086,...,0.9984,1.0030,0.9310,1.0155,1.0040,0.9846,1.0549,1.0364,0.9986,0.9954
297,20240327,0.9934,0.9736,0.9912,-100.0000,0.9697,0.9777,0.9672,0.9656,0.9489,...,0.9724,0.9881,1.0119,0.9913,0.9530,0.9836,1.0409,0.9494,0.9560,0.9885
298,20240328,0.9962,1.0033,1.0532,-100.0000,1.0104,1.0274,1.0212,1.0106,1.0314,...,1.0242,1.0165,1.0102,0.9824,1.0197,1.0007,1.0095,1.0810,1.0534,1.0301
299,20240329,1.0029,0.9740,0.9741,-100.0000,1.0180,0.9933,1.0290,1.0086,1.0261,...,1.0285,1.0236,0.9732,1.0089,0.9934,1.0323,0.9009,1.0558,0.9873,1.0068


In [3]:
def trans_frame_to_id(dataframe):
    train_data = dataframe.iloc[:, 1:]
    # 对所有行，都取前10个最大的
    def top_n(row, n):
        # return row.nlargest(n).values
        return row.nlargest(n).index.tolist()

    n = 20
    data_top_10 = train_data.apply(top_n, axis=1, n=n)

    # 将结果转换为 [266, 10] 的形状
    data_transformed = pd.DataFrame(data_top_10.tolist(), index=train_data.index)

    def to_id(row):
        return encode(row)
    
    data_transformed = data_transformed.apply(to_id, axis=1)
    data_transformed = torch.stack([torch.tensor(row) for row in data_transformed])
    return data_transformed

train_data = trans_frame_to_id(pd_train_data)


In [4]:
torch.manual_seed(333)

block_size = 5

def get_batch(split, i):
    data = train_data

    indices = torch.randint(len(data)-1-block_size, (1, ))
    indices = torch.tensor([i])

    # (batch, block)
    x = torch.stack([data[i:i+block_size] for i in indices])
    x = x.gather(2, torch.randint(x.shape[2], (x.shape[0], x.shape[1], 1))).squeeze(-1)

    # (batch, block)
    y = torch.stack([data[i+1:i+1+block_size] for i in indices])
    y = y.gather(2, torch.randint(y.shape[2], (y.shape[0], y.shape[1], 1))).squeeze(-1)


    return x, y

get_batch('val', 2)



(tensor([[ 468,  948,  747,  620, 1057]]),
 tensor([[1140, 1405,  652,  361,  229]]))

In [5]:
from operator import itemgetter

size_data = len(pd_train_data)
index = size_data - block_size - 0
print(index)

haha = {}
count_print = 1

def predect():
    data_type = 'val'

    pd_data = pd_train_data 
    x, y = get_batch(data_type, index)

    idx = model.generate(x, 1)
    id = idx[-1][-1].item()
    haha[str(id)] = haha.get(str(id), 0) + 1

    global count_print
    if count_print == 0:
        return
    count_print -= 1
    for i in range(block_size):
        print(f'date={pd_data.iloc[index+i, 0]}, chg={pd_data.loc[index+i, decode(idx[0][i].item())]:<6}, code={decode(idx[0][i].item())}, code_id={idx[0][i].item()}')

for i in range(300):
    # print(f'{i}-----')
    predect()
print(haha)
sorted_items = sorted(haha.items(), key=itemgetter(1), reverse=True)
# 输出前5个最大值及其键
print("前5个最大的值及其键：")
for key, value in sorted_items[:5]:
    print(f"code={decode(key)}, Key: {key}, Value: {value}")
    # print(f'date={pd_train_data.iloc[index+block_size, 0]}, chg={pd_train_data.loc[index+block_size, decode(key)]:<6}, code={decode(key)}, code_id={key}')






295
date=20240326, chg=1.025 , code=300530.SZ, code_id=2005
date=20240327, chg=1.0   , code=300942.SZ, code_id=2401
date=20240328, chg=0.9388, code=300735.SZ, code_id=2203
date=20240329, chg=1.0231, code=300638.SZ, code_id=2111
date=20240401, chg=1.0397, code=300995.SZ, code_id=2451
{'2271': 1, '1701': 5, '1780': 7, '2591': 1, '1955': 2, '2598': 3, '2830': 2, '2005': 6, '2028': 13, '2837': 4, '1943': 3, '2029': 7, '2309': 1, '2142': 4, '2495': 6, '3912': 1, '2077': 3, '2566': 6, '2071': 2, '1901': 1, '2478': 3, '983': 1, '1566': 1, '2273': 1, '1966': 1, '2570': 2, '2344': 2, '2299': 2, '1062': 8, '1648': 1, '38': 2, '2234': 1, '1678': 2, '1204': 1, '2023': 5, '2750': 4, '2333': 2, '690': 2, '1534': 1, '2764': 1, '2716': 2, '2411': 1, '2782': 1, '2083': 1, '2384': 3, '2366': 6, '1805': 1, '2047': 2, '2407': 3, '1518': 5, '2169': 1, '1477': 1, '1963': 2, '2444': 1, '1613': 1, '1946': 3, '2519': 2, '2471': 2, '2797': 2, '816': 2, '2182': 1, '1915': 4, '2573': 2, '2643': 1, '1908': 3, '180

In [6]:
import easyquotation
quotation = easyquotation.use('tencent') # 新浪 ['sina'] 腾讯 ['tencent', 'qq'] 

# quotation.market_snapshot(prefix=True) # prefix 参数指定返回的行情字典中的股票代码 key 是否带 sz/sh 前缀

pd_val_data.columns

codes = []
for key, value in sorted_items[:5]:
    code = decode(key)[:-3]
    codes.append(code)

all = quotation.stocks(codes) 
for code, info in all.items():
    # info = eval(info)
    print(f"code is {info['code']}, name is {info['name']}, chg is {info['涨跌(%)']}")


NameError: name 'pd_val_data' is not defined