In [1]:
"""
Sample from a trained model
"""
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from model import GPTConfig, GPT
import json
from operator import itemgetter
import numpy as np
from collections import Counter


# -----------------------------------------------------------------------------
init_from = 'resume'  # model source: 'resume' reads the checkpoint in out_dir (the only mode the loading cell handles); a gpt2 variant name such as 'gpt2-xl' is the documented alternative
out_dir = 'out-stock'  # checkpoint directory; only consulted when init_from == 'resume'
start = "\n"  # prompt text, e.g. "<|endoftext|>"; a file can be given as "FILE:prompt.txt"
num_samples = 10  # how many samples to draw
max_new_tokens = 500  # tokens generated per sample
temperature = 0.8  # 1.0 leaves predictions unchanged; lower is less random, higher is more random
top_k = 10  # keep only the top_k most likely tokens; all others get probability 0
seed = 1337
device = 'cpu'  # e.g. 'cpu', 'cuda', 'cuda:0', 'cuda:1'
# Precision name ('float32', 'bfloat16' or 'float16'): bfloat16 when CUDA is
# available and supports it, float16 otherwise.
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    dtype = 'bfloat16'
else:
    dtype = 'float16'
compile = False  # whether to compile the model with PyTorch 2.0 for speed
# -----------------------------------------------------------------------------


In [2]:
import pandas as pd

# Seed the CPU and CUDA generators so sampling is repeatable.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
# Derive the autocast device type from `device` instead of hard-coding 'cpu',
# so switching `device` to 'cuda'/'cuda:N' also enables the autocast context.
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# On CPU no autocast is applied; on CUDA run under autocast with the chosen dtype.
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# model
if init_from == 'resume':
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    # NOTE(security): torch.load unpickles the file; only point out_dir at
    # checkpoints you trust.
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig(**checkpoint['model_args'])
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # Strip the '_orig_mod.' key prefix so the keys match the freshly built GPT
    # (presumably left behind by saving a torch.compile-wrapped model - confirm).
    unwanted_prefix = '_orig_mod.'
    for k in list(state_dict):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
else:
    # Without this branch `model` would be undefined and the next statement
    # would fail with an unrelated-looking NameError.
    raise ValueError(f"unsupported init_from={init_from!r}: this notebook only implements 'resume'")


# Inference only: switch to eval mode and move the weights to the target device.
model.eval()
model.to(device)


datadir = os.path.join('data', 'stock')

# Vocabulary metadata. Despite the '.pkl' extension the file is opened in text
# mode and parsed as JSON; the helpers below read its 'itos' and 'stoi' tables.
with open(os.path.join(datadir, 'meta.pkl'), 'r') as f:
    meta = json.load(f)
# NOTE(review): the 'vocab_size' entry stored in meta is not used; the value is
# fixed at 4096, which matches the vocab_size of the GPTConfig printed when the
# checkpoint is loaded - confirm the two are meant to agree.
meta_vocab_size = 4096
def decode(id):
    """Return the ``meta['itos']`` entry for a single token id.

    The table is keyed by the string form of the id, so ``id`` is converted
    with ``str`` before the lookup.
    """
    id_to_symbol = meta['itos']
    return id_to_symbol[str(id)]
def decode_arr(ids):
    """Decode every token id in ``ids`` and return the results as a list."""
    return list(map(decode, ids))
def encode(s):
    """Map each element of the iterable ``s`` to its id in ``meta['stoi']``.

    Returns the ids as a list, in the same order as ``s``.
    """
    ids = []
    for symbol in s:
        ids.append(meta['stoi'][symbol])
    return ids

# Base history followed by every incremental 'real_time_*' file, appended in
# sorted filename order.
frames = [pd.read_csv(os.path.join(datadir, 'train.csv'))]
frames += [
    pd.read_csv(os.path.join(datadir, append_name))
    for append_name in sorted(name for name in os.listdir(datadir) if name.startswith('real_time_'))
]
# Concatenate once instead of re-copying the growing frame for every file.
pd_train_data = pd.concat(frames, ignore_index=True)

# Drop the first row and keep the first meta_vocab_size + 1 columns (per the
# displayed output, column 0 is trade_date and the rest are stock codes), then
# renumber the rows from 0.
pd_train_data = pd_train_data.iloc[1:, :meta_vocab_size + 1].reset_index(drop=True)
pd_train_data



config is GPTConfig(block_size=5, vocab_size=4096, n_layer=8, n_head=8, n_embd=512, dropout=0, bias=False)
number of parameters: 27.27M


Unnamed: 0,trade_date,000001.SZ,000002.SZ,000004.SZ,000005.SZ,000006.SZ,000007.SZ,000008.SZ,000009.SZ,000010.SZ,...,603320.SH,603321.SH,603322.SH,603323.SH,603324.SH,603325.SH,603326.SH,603327.SH,603328.SH,603329.SH
0,20230104,1.0399,1.0461,1.0020,1.0000,1.0065,0.9740,1.0339,0.9943,1.0027,...,1.0483,1.0070,1.0156,1.0235,0.9767,-100.0000,1.0524,0.9876,0.9941,1.0006
1,20230105,1.0112,1.0136,0.9890,1.0214,0.9610,0.9987,0.9959,1.0091,0.9786,...,1.0079,1.0167,1.0188,0.9958,1.0061,-100.0000,0.9858,1.0094,1.0149,0.9851
2,20230106,1.0097,0.9943,0.9756,0.9895,0.9696,0.9949,0.9918,1.0016,1.0601,...,1.0079,0.9945,1.0000,0.9916,1.0052,-100.0000,1.0036,0.9926,0.9985,0.9928
3,20230109,1.0123,0.9771,1.0083,0.9947,0.9930,1.0179,0.9959,0.9927,0.9588,...,1.0211,1.0221,0.9933,1.0085,1.0104,-100.0000,0.9688,1.0050,1.0029,1.0061
4,20230110,0.9757,1.0011,0.9907,0.9894,0.9860,1.0013,0.9917,0.9951,0.9677,...,1.0031,1.0148,0.9921,0.9853,1.0054,-100.0000,0.9901,0.9900,1.0000,0.9813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,20240328,0.9962,1.0033,1.0532,-100.0000,1.0104,1.0274,1.0212,1.0106,1.0314,...,1.0242,1.0165,1.0102,0.9824,1.0197,1.0007,1.0095,1.0810,1.0534,1.0301
298,20240329,1.0029,0.9740,0.9741,-100.0000,1.0180,0.9933,1.0290,1.0086,1.0261,...,1.0285,1.0236,0.9732,1.0089,0.9934,1.0323,0.9009,1.0558,0.9873,1.0068
299,20240401,1.0114,0.9978,0.9953,-100.0000,1.0051,1.0000,1.0000,1.0226,1.0127,...,1.0245,1.0259,1.0144,1.0177,1.0173,1.0056,1.0288,0.9876,1.0228,1.0067
300,20240402,0.9915,0.9465,0.9733,1.0000,0.9899,0.9821,0.9798,1.0037,1.0126,...,1.0193,1.0084,1.0039,1.0022,1.0349,0.9962,0.9720,0.9600,0.9874,1.0081


In [3]:
def trans_frame_to_id(dataframe, n=20):
    """Turn each row of ``dataframe`` into the token ids of its ``n`` largest columns.

    The first column is skipped. For every row, the labels of the ``n``
    largest remaining values are collected with ``Series.nlargest`` (largest
    first) and mapped to ids with ``encode``.

    Args:
        dataframe: pandas DataFrame whose first column is ignored and whose
            remaining column labels are symbols that ``encode`` accepts.
        n: how many top columns to keep per row. Defaults to 20, the value
            that used to be hard-coded.

    Returns:
        A tensor with one row per input row holding the ``n`` encoded ids.
    """
    values = dataframe.iloc[:, 1:]
    # Series of lists: the column labels of each row's n largest values.
    top_labels = values.apply(lambda row: row.nlargest(n).index.tolist(), axis=1)
    return torch.tensor([encode(labels) for labels in top_labels])

train_data = trans_frame_to_id(pd_train_data)
train_data

tensor([[2573, 2089, 2356,  ..., 2372, 4023,  333],
        [2438, 2499, 2393,  ..., 1245, 1382, 4003],
        [2682, 1868, 1614,  ..., 1339, 1498, 3414],
        ...,
        [2451, 2477, 1745,  ..., 1573, 1896, 2386],
        [2482, 2477, 1954,  ...,  181, 3060, 3192],
        [1635, 2810, 2666,  ..., 3024, 3060,  297]])

In [4]:
# Re-seed the global torch RNG so the random picks made in get_batch are repeatable.
torch.manual_seed(333)

# Number of consecutive rows in one sample window; the GPTConfig printed when
# the checkpoint was loaded also reports block_size=5.
block_size = 5

def get_batch(split, i, batch=1, data=None, block=None):
    """Build ``batch`` random samples from the window of rows starting at ``i``.

    Each row of ``data`` holds several candidate token ids. For every sample
    and every row in the window one candidate is picked uniformly at random,
    independently for ``x`` and ``y``.

    Args:
        split: accepted for call compatibility; not used.
        i: index of the first row of the input window.
        batch: number of samples to draw from the same window.
        data: 2-D tensor of shape (rows, candidates). Defaults to the
            module-level ``train_data``.
        block: number of rows in the window. Defaults to the module-level
            ``block_size``.

    Returns:
        ``(x, y)``: ``x`` is drawn from rows ``i .. i+block-1`` and ``y`` from
        rows ``i+1 .. i+block``. Each has shape (batch, rows_in_window);
        ``y`` has fewer columns than ``x`` when its window runs past the end
        of ``data``.

    Note:
        A random start index used to be drawn and immediately discarded; that
        draw raised for data shorter than ``block + 2`` rows and has been
        removed, so the global RNG stream is consumed slightly differently.
    """
    if data is None:
        data = train_data
    if block is None:
        block = block_size

    def _pick_one_per_row(window):
        # (rows, candidates) -> (batch, rows): every sample sees the same
        # window, so broadcast it instead of stacking `batch` copies.
        stacked = window.unsqueeze(0).expand(batch, -1, -1)
        picks = torch.randint(stacked.shape[2], (batch, stacked.shape[1], 1))
        return stacked.gather(2, picks).squeeze(-1)

    # (batch, block)
    x = _pick_one_per_row(data[i:i + block])
    # (batch, block), shifted one row forward
    y = _pick_one_per_row(data[i + 1:i + 1 + block])

    return x, y

# Trial call: 1000 samples from the window starting at row 295. The first
# argument ('val') is not used by get_batch.
x, y = get_batch('val', 295, 1000)





In [38]:
from operator import itemgetter
import numpy as np
from collections import Counter

size_data = len(pd_train_data)
# First row of the window fed to the model. With the trailing offset at 0 the
# window is the last block_size rows of the frame; presumably the offset is
# raised by hand to evaluate an earlier window - confirm.
index = size_data - block_size - 0


# (token id, count) pairs ordered from most to least frequent; filled in by predect().
haha = []

def predect():
    """Generate one token for 2000 sampled windows and tally the results.

    Draws 2000 samples of the window starting at the module-level ``index``,
    calls ``model.generate(x, 1)``, counts the values in the last column of
    the result and stores the ``most_common()`` list in the global ``haha``.
    It then prints, for the first sample only, the date, change value, code
    and token id of each of the ``block_size`` input positions.
    """
    global haha
    frame = pd_train_data
    x, _ = get_batch('val', index, 2000)

    idx = model.generate(x, 1)

    # Frequency of each value in the last column, most frequent first.
    last_tokens = idx[:, -1].tolist()
    haha = Counter(last_tokens).most_common()

    first_sample = idx[0]
    for step in range(block_size):
        row = index + step
        token_id = first_sample[step].item()
        code = decode(token_id)
        date = frame.iloc[row, 0]
        chg = frame.loc[row, code]
        print(f'date={date}, chg={chg:<6}, code={code}, code_id={token_id}')

predect()

print(haha)
print(len(haha))


def _report_entries(entries):
    """Print each (token id, count) pair; when the row right after the window
    exists in pd_train_data, also print that row's date and percentage change."""
    next_row = index + block_size
    for key, value in entries:
        code = decode(key)
        print(f"code={code}, Key: {key}, Value: {value}")
        if next_row < len(pd_train_data):
            print(f'date={pd_train_data.iloc[next_row, 0]}, code={code}, code_id={key}, chg={(pd_train_data.loc[next_row, code] - 1) * 100:.2f}%')


# The five least frequent predictions (tail of the most_common ordering).
print("---前5个最小的值及其键：")
_report_entries(haha[-5:])
# The five most frequent predictions (head of the most_common ordering).
print('---最大的')
_report_entries(haha[:5])


import easyquotation
# Live quote client. Source names: Sina is ['sina'], Tencent is ['tencent', 'qq'].
quotation = easyquotation.use('tencent')

def real_time(codes):
    """Fetch live quotes for ``codes`` through ``quotation.stocks`` and print
    the code, name, current price and percentage change of each result."""
    quotes = quotation.stocks(codes)
    for _code, info in quotes.items():
        print(f"code is {info['code']}, name is {info['name']}, price is {info['now']}, chg is {info['涨跌(%)']}")

def _bare_codes(entries):
    """Decode each token id and drop its last three characters (the exchange
    suffix, e.g. '.SZ'), returning the bare codes as a list."""
    return [decode(token_id)[:-3] for token_id, _count in entries]


# Live quotes for the five most frequent predictions.
print('---最大的实时数据')
codes = _bare_codes(haha[:5])
real_time(codes)

# Live quotes for the five least frequent predictions, plus one fixed extra code.
print('---最小的实时数据')
codes = _bare_codes(haha[-5:])
codes.append('002565')
real_time(codes)


date=20240328, chg=1.1426, code=300887.SZ, code_id=2349
date=20240329, chg=1.1209, code=300430.SZ, code_id=1908
date=20240401, chg=1.2   , code=300618.SZ, code_id=2091
date=20240402, chg=1.1701, code=300890.SZ, code_id=2352
date=20240403, chg=1.1003, code=000701.SZ, code_id=229
[(2366, 58), (2028, 55), (1062, 38), (1702, 38), (1800, 37), (1889, 36), (1780, 31), (1908, 31), (1915, 31), (163, 31), (2029, 31), (1952, 30), (2566, 30), (2643, 29), (2299, 28), (1701, 28), (1943, 26), (38, 26), (2071, 26), (2180, 25), (816, 24), (2337, 24), (2142, 22), (2803, 21), (1771, 21), (2273, 20), (2169, 20), (2478, 19), (1955, 18), (2352, 18), (690, 18), (2023, 17), (1946, 17), (1518, 17), (2223, 16), (2807, 16), (3398, 16), (2573, 16), (2764, 15), (2591, 15), (2417, 14), (1700, 13), (2495, 13), (1953, 12), (2077, 12), (1808, 11), (2217, 11), (782, 11), (2394, 11), (1791, 11), (2471, 11), (1477, 11), (2523, 10), (2804, 10), (2833, 10), (2344, 10), (3060, 10), (2750, 10), (2519, 9), (2300, 9), (2005, 9