In [None]:
# prompt: 在.ipynb中安装依赖。pip install torch numpy transformers datasets tiktoken wandb tqdm

!pip install torch numpy transformers datasets tiktoken wandb tqdm


In [None]:
!pip install easyquotation

In [None]:
"""
Sample from a trained model
"""
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from model import GPTConfig, GPT
import json
from operator import itemgetter
import numpy as np
from collections import Counter


# -----------------------------------------------------------------------------
# Source of the weights: either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl').
init_from = 'resume'
# Checkpoint directory; ignored if init_from is not 'resume'.
out_dir = 'out-stock'
# Prompt: "\n" or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
start = "\n"
# Number of samples to draw.
num_samples = 10
# Number of tokens generated in each sample.
max_new_tokens = 500
# 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions.
temperature = 0.8
# Retain only the top_k most likely tokens, clamp others to have 0 probability.
top_k = 10
seed = 1337

# Examples of valid devices: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# One of 'float32', 'bfloat16' or 'float16'; bfloat16 is chosen only when CUDA
# is present and reports bf16 support.
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    dtype = 'bfloat16'
else:
    dtype = 'float16'

# Use PyTorch 2.0 to compile the model to be faster.
compile = False

device
# -----------------------------------------------------------------------------


In [None]:
import pandas as pd

# Seeding is currently disabled (the two calls below are commented out).
# torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
# NOTE(review): device_type is hard-coded to 'cpu', so `ctx` below is always a
# nullcontext even when `device` is a CUDA device — confirm this is intended.
device_type = 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# model
if init_from == 'resume':
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig(**checkpoint['model_args'])
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # Strip the '_orig_mod.' prefix from parameter names so they match the
    # keys expected by load_state_dict. Iterate over a snapshot of the keys
    # because the dict is modified inside the loop.
    unwanted_prefix = '_orig_mod.'
    for k in list(state_dict):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
elif init_from.startswith('gpt2'):
    # init from a gpt2 variant, as advertised by the `init_from` setting.
    # NOTE(review): assumes model.GPT provides a nanoGPT-style
    # from_pretrained(model_type, override_args) classmethod — TODO confirm.
    model = GPT.from_pretrained(init_from, dict(dropout=0.0))
else:
    # Previously this case fell through and failed later with a NameError on `model`.
    raise ValueError(f"unsupported init_from={init_from!r}: expected 'resume' or a gpt2 variant")


model.eval()
model.to(device)


datadir = os.path.join('data', 'stock')

# Vocabulary metadata. Despite the .pkl extension the file is parsed as JSON;
# the encoding is given explicitly so the result does not depend on the
# platform's default locale.
with open(os.path.join(datadir, 'meta.pkl'), 'r', encoding='utf-8') as f:
    meta = json.load(f)

# The value stored under meta['vocab_size'] used to be read and then
# immediately overwritten by this constant; only the effective value is kept.
# NOTE(review): 31 presumably corresponds to the ids 0..30 produced by
# transform_dataframe_id — confirm whether the meta file should drive this.
meta_vocab_size = 31
def decode(id):
    """Return the vocabulary entry for a single token id.

    The lookup table meta['itos'] is keyed by the string form of the id,
    so the id is converted with str() before the lookup.
    """
    itos = meta['itos']
    key = str(id)
    return itos[key]
def decode_arr(ids):
    """Decode every id in `ids` with decode() and return the results, in order, as a list."""
    decoded = []
    for token_id in ids:
        decoded.append(decode(token_id))
    return decoded
def encode(s):
    """Look up each element of `s` in meta['stoi'] and return the ids, in order, as a list."""
    token_ids = []
    for symbol in s:
        token_ids.append(meta['stoi'][symbol])
    return token_ids


# Base table first, then every `real_time_*` file of the data directory in
# sorted filename order.
frames = [pd.read_csv(os.path.join(datadir, 'train.csv'))]
frames.extend(
    pd.read_csv(os.path.join(datadir, append_name))
    for append_name in sorted(name for name in os.listdir(datadir) if name.startswith('real_time_'))
)
# Concatenate once: calling pd.concat inside the loop re-copies the growing
# frame on every iteration.
pd_train_data = pd.concat(frames, ignore_index=True)

# Drop the first row and keep at most the first 2049 columns, then renumber
# the rows from 0. Column 0 is printed as the date elsewhere in this notebook,
# which leaves up to 2048 value columns.
pd_train_data = pd_train_data.iloc[1:, :2049].reset_index(drop=True)
pd_train_data



In [None]:
def transform_dataframe_id(df):
  """
  Convert every cell of a dataframe into an integer id in the range 0..30.

  Each cell is mapped independently:
    * the sentinel value -100 becomes 10;
    * any other value v becomes int((v - 0.9) * 100), clamped to [0, 30].

  Args:
      df: The pandas dataframe to transform (numeric cells).

  Returns:
      A new pandas dataframe of the same shape holding the ids.

  Raises:
      ValueError: if a cell is NaN, because int() cannot convert NaN.
  """

  def transform_pri_chg_id(value):
    if value == -100:
      return 10
    # NOTE(review): int() truncates, and binary floating point makes e.g.
    # (1.0 - 0.9) * 100 evaluate slightly below 10, so 1.0 lands on id 9.
    # The arithmetic is left untouched on purpose: changing the bucketing
    # would change the ids fed to the model. Confirm against the
    # preprocessing used for training.
    new_value = int((value - 0.9) * 100)
    # Clamp to the valid id range.
    return min(max(new_value, 0), 30)

  # DataFrame.applymap is deprecated since pandas 2.1 in favour of
  # DataFrame.map; use the new name when this pandas version has it. The check
  # is done on the class so that a column named "map" cannot be mistaken for
  # the method.
  if hasattr(type(df), 'map'):
    transformed_df = df.map(transform_pri_chg_id)
  else:
    transformed_df = df.applymap(transform_pri_chg_id)

  return transformed_df

# Convert every column except the first into ids and pack the result into an
# int64 tensor; the first column is skipped here and printed as the date
# elsewhere in this notebook.
train_data = torch.from_numpy(
    transform_dataframe_id(pd_train_data.iloc[:, 1:]).to_numpy().astype(np.int64)
)
print(train_data.shape)
train_data


In [None]:
# torch.manual_seed(333)

import random

# Number of consecutive rows get_batch() takes from train_data for each sample.
# NOTE(review): presumably has to match the block size the checkpoint was
# trained with — confirm.
block_size = 32

def get_batch(split, i, index=None):
    """
    Build one (input, target) batch covering every column of `train_data`.

    For a start row s, the input holds rows s .. s+block_size-1 and the target
    holds the same window shifted forward by one row. Both are transposed so
    that each row of the result is one column of `train_data`.

    Args:
        split: Unused; kept so existing calls keep working.
        i: Start row of the window, used when `index` is None.
        index: Optional explicit start row used instead of `i`. Previously any
            non-None value failed with UnboundLocalError.
            NOTE(review): meaning inferred from the debug code that used to
            live here — confirm.

    Returns:
        (x, y): two freshly allocated tensors of shape
        (number of columns, number of rows in the window). Windows that reach
        past the end of the data are shorter, as with ordinary slicing.
    """
    data = train_data
    first = i if index is None else index
    last = first + block_size

    # Slice the rows once and transpose, instead of stacking one Python-level
    # slice per column. clone() with contiguous_format returns an independent,
    # contiguous (batch, block) tensor.
    x = data[first:last].t().clone(memory_format=torch.contiguous_format)
    y = data[first+1:last+1].t().clone(memory_format=torch.contiguous_format)

    return x, y

# Quick sanity check: build the batch that starts at row 100 and show its shapes.
print(train_data.shape)
x, y = get_batch(split='train', i=100)

print(f'x.shape is {x.shape}, y.shape is {y.shape}')






In [None]:
from operator import itemgetter
import numpy as np
from collections import Counter

# Start the window block_size + 48 rows before the end of the table, so the
# row right after the window exists and can be compared with the prediction.
size_data = len(pd_train_data)
index = size_data - block_size - 48
len_dates, len_codes = train_data.shape


pd_data = pd_train_data
x, y = get_batch('train', index)

# The batch is built on the CPU while the model was moved to `device`; move
# the input there before generating (otherwise a CUDA model raises a device
# mismatch) and bring the result back to the CPU for indexing and printing.
with torch.no_grad():
    idx = model.generate(x.to(device), 1).cpu()

print(idx.shape)
print(idx)

# Rank the columns by the id in the last position of each generated row.
last_dim = idx.shape[-1] - 1
sorted_tensor, indices = torch.sort(idx[:, last_dim], descending=True)
print(sorted_tensor)
print(indices)

# One row per column: (column position, last id), largest id first.
result = torch.stack((indices, sorted_tensor), dim=1)


def _print_predictions(rows, show_tokens=False):
    """Print the last two window rows and the predicted vs. recorded next row for each (column position, id) pair in `rows`."""
    for i_code, predict in rows:
        i_code = i_code.item()
        # Column 0 of the table is the date, hence the +1 offset. The header
        # line used to print a cell value from row 0 under the "code" label;
        # it now prints the column name like the lines below it.
        code = pd_data.columns[i_code+1]
        print(f'-----id={i_code}, code={code}, predict={predict}')
        if show_tokens:
            print(f'idx is {idx[i_code]}')
        for i in range(block_size-2, block_size):
            print(f'date={pd_data.iloc[index+i, 0]}, chg={pd_data.iloc[index+i, i_code+1]:<6}, code={code}, code_id={i_code}')
        if index + block_size < len(pd_train_data):
            token = idx[i_code][-1]
            # Map the id back to a percentage change: id / 100 + 0.9 is the
            # ratio, and (ratio - 1) * 100 is the percentage.
            predicted_pct = (0.9+float(token.item())/100 -1)*100
            actual_pct = ((pd_train_data.iloc[index+block_size, i_code+1])-1)*100
            print(f'date={pd_train_data.iloc[index+block_size, 0]}, code={code}, idx={token}, predict={predicted_pct:.2f}%, chg={actual_pct:.2f}%')


# The 5 smallest predictions (tail of the descending sort) and their columns.
print("---前5个最小的值及其键：")
_print_predictions(result[-5:])

print('-------------------')
print('-------------------')
print('-------------------')
# The 5 largest predictions (head of the descending sort).
print('---最大的')
_print_predictions(result[0:5], show_tokens=True)

import easyquotation
quotation = easyquotation.use('tencent') # Sina: ['sina']; Tencent: ['tencent', 'qq']

def real_time(codes):
    """Query the module-level `quotation` client for `codes` and print one summary line per returned entry."""
    quotes = quotation.stocks(codes)
    for _, info in quotes.items():
        print(f"code is {info['code']}, name is {info['name']}, price is {info['now']}, chg is {info['涨跌(%)']}")

# Live quotes for the 5 columns with the largest predictions. The last three
# characters of each column name are dropped before the lookup
# (presumably an exchange suffix — confirm).
print('---最大的实时数据')
codes = [pd_train_data.columns[key.item()+1][:-3] for key, _ in result[:5]]
real_time(codes)

# Same for the 5 columns with the smallest predictions, plus one code that is
# always queried.
print('---最小的实时数据')
codes = [pd_train_data.columns[key.item()+1][:-3] for key, _ in result[-5:]]
codes.append('002565')
real_time(codes)
