In [None]:
# prompt: 在.ipynb中安装依赖。pip install torch numpy transformers datasets tiktoken wandb tqdm

!pip install torch numpy transformers datasets tiktoken wandb tqdm


In [None]:
!pip install easyquotation

In [None]:
"""
Sample from a trained model
"""
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from model import GPTConfig, GPT
import json
from operator import itemgetter
import numpy as np
from collections import Counter


# -----------------------------------------------------------------------------
# Sampling / checkpoint configuration.
# NOTE(review): in the visible cells only init_from, out_dir, device and dtype
# are read later; seed is referenced only by commented-out manual_seed calls,
# and start, num_samples, max_new_tokens, temperature, top_k and compile are
# not referenced at all.
init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
out_dir = 'out-stock' # ignored if init_from is not 'resume'
start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 10 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 10 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
device = 'cuda' if torch.cuda.is_available() else 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
compile = False # use PyTorch 2.0 to compile the model to be faster

# Bare expression: as the last expression of the cell it echoes the chosen device.
device
# -----------------------------------------------------------------------------


In [None]:
import pandas as pd

# torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
# Derive the autocast device type from `device` so that both settings agree
# (a hard-coded 'cpu' would make the autocast branch below unreachable).
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# model
if init_from == 'resume':
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig(**checkpoint['model_args'])
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # Some checkpoints prefix every key with '_orig_mod.'; strip it so the
    # keys match the freshly constructed module.
    unwanted_prefix = '_orig_mod.'
    for key in list(state_dict):
        if key.startswith(unwanted_prefix):
            state_dict[key[len(unwanted_prefix):]] = state_dict.pop(key)
    model.load_state_dict(state_dict)
else:
    # Fail with a clear message instead of a NameError on `model` below.
    raise ValueError(f"unsupported init_from={init_from!r}; this notebook only handles 'resume'")


model.eval()
model.to(device)


# Directory that holds meta.pkl, train.csv and the real_time_*.csv snapshots.
datadir = os.path.join('data', 'stock')

# Meta data (vocabulary description).
# NOTE(review): the file is named meta.pkl but is opened in text mode and
# parsed as JSON — confirm that this matches how the file is written.
meta = {}
with open(os.path.join(datadir, 'meta.pkl'), 'r') as f:
    meta = json.load(f)
    meta_vocab_size = meta['vocab_size']
    # NOTE(review): the value read from the file is immediately replaced by the
    # constant 41, so meta['vocab_size'] has no effect — confirm this is intended.
    meta_vocab_size = 41
def decode(id):
    """Look up the symbol stored for token id ``id`` in ``meta['itos']`` (keys are the id as a string)."""
    id_to_symbol = meta['itos']
    return id_to_symbol[str(id)]
def decode_arr(ids):
    """Decode every token id in ``ids`` with ``decode`` and return the results as a list."""
    return list(map(decode, ids))
def encode(s):
    """Map each element of ``s`` to its entry in ``meta['stoi']`` and return the list."""
    return list(map(lambda symbol: meta['stoi'][symbol], s))


# Load the base table followed by every incremental "real_time_*" file found
# in datadir (appended in sorted filename order).
frames = [pd.read_csv(os.path.join(datadir, 'train.csv'))]
frames.extend(
    pd.read_csv(os.path.join(datadir, append_name))
    for append_name in sorted(name for name in os.listdir(datadir) if name.startswith('real_time_'))
)
# Concatenate once: calling pd.concat inside the loop re-copies the whole
# accumulated table for every appended file.
pd_train_data = pd.concat(frames, ignore_index=True)

# Drop the first row and renumber from 0. Chaining avoids an in-place
# reset_index on an iloc slice.
pd_train_data = pd_train_data.iloc[1:, :].reset_index(drop=True)
pd_train_data



In [None]:
def transform_dataframe_id(df):
  """
  Map every value of a dataframe to an integer id in the range 0..40.

  Each value is converted with

      id = clip(round((value - 1) * 100), -20, 20) + 20

  so a value of 1.0 maps to id 20, and anything at or beyond +/-20 (after
  scaling by 100) saturates at id 40 / id 0.

  Missing cells (None / NaN) and the sentinel value -100 are treated as 1.0
  and therefore map to id 20.

  Args:
      df: The pandas dataframe to transform. Every column must be numeric or
          convertible to float.

  Returns:
      A new pandas dataframe of int64 ids with the same index and columns.
  """
  values = df.astype('float64')

  # NaN never compares equal to anything (and is not `None`), so missing
  # cells must be detected with isna(); otherwise rounding them would fail.
  values = values.mask(values.isna() | values.eq(-100), 1.0)

  # Rounding is half-to-even, the same rule the builtin round() applies.
  percent_change = ((values - 1) * 100).round()

  return (percent_change.clip(lower=-20, upper=20) + 20).astype('int64')

# Column 0 of pd_train_data is skipped; the remaining columns are converted to
# ids and packed into one int64 tensor.
encoded_frame = transform_dataframe_id(pd_train_data.iloc[:, 1:])
train_data = torch.from_numpy(encoded_frame.to_numpy().astype(np.int64))
print(train_data.shape)
train_data


In [None]:
tensor = train_data
# Only the last expression of a cell is displayed, so show both shapes as one
# tuple instead of silently discarding the first.
tensor.shape, pd_train_data.shape

In [None]:

import matplotlib.pyplot as plt

# Strategy 1: pre_num < 17 and current_num = 8
# NOTE(review): the title names a filter, but no such filter is applied below;
# the cell tallies every (today, next day) id pair inside the analysed window.

# Ids come from transform_dataframe_id: id = percent change + 20, clamped to
# 0..40, so id 20 means "unchanged".
tensor = train_data


# (today_id, next_day_id) -> number of occurrences
result = {}

num = 20
diyi = tensor.shape[0]-num-10  # first row of the analysed window
print(f'date={pd_train_data.iloc[diyi, 0]}')

infos = []
all_count = 0
# Walk every column and every day of the window.
for i in range(tensor.shape[1]):
    for j in range(diyi, diyi + num-1):
        # pre_num is only needed by the optional filters below.
        pre_num = tensor[j-1, i].item()
        current_num = tensor[j, i].item()
        next_num = tensor[j+1, i].item()

        # Optional filters (disabled):
        # if pre_num == 30 or pre_num == 40:
        #     continue

        # if current_num == 6:
        #     infos.append(f'date={pd_train_data.iloc[j,0]}, code={pd_train_data.columns[i+1]}, chg={tensor[j-3:j+2,i]}')

        # Update the tally.
        key = (current_num, next_num)
        result[key] = result.get(key, 0) + 1
        all_count += 1

print(f'日期={pd_train_data.iloc[diyi,0]}, {pd_train_data.iloc[diyi+num-2,0]}, 总次数={all_count}')
newinfos = sorted(infos)
for line in newinfos:
    print(line)

# Report, per today-id, the total and average next-day change.
# Buckets that were never observed are skipped instead of being reported with
# a fabricated placeholder count.
all_range = range(-21+20, 22+20)
for first in all_range:
    row = [(sencod, result[(first, sencod)]) for sencod in all_range if (first, sencod) in result]
    if not row:
        continue
    count = sum(fre for _, fre in row)
    greatercount = sum(fre * (sencod - 20) for sencod, fre in row)
    print(f'前一天={first},总次数={count}, 总收益={greatercount}, 平均收益={greatercount/float(count):.2f}')

# Report, per today-id, the distribution of next-day outcomes.
caca = {}
for first in all_range:
    row = [(sencod, result[(first, sencod)]) for sencod in all_range if (first, sencod) in result]
    if not row:
        continue
    count = sum(fre for _, fre in row)
    caca[first] = count

    sta = [f'{fre/count:.0%}->{sencod-20}' for sencod, fre in row]
    greater_one = sum(fre for sencod, fre in row if sencod >= 3+20)   # next day >= +3
    lower_one = sum(fre for sencod, fre in row if sencod <= -1+20)    # next day <= -1
    # Guard only the ratio against division by zero; the printed counts stay exact.
    ratio = (greater_one or 0.00001) / float(lower_one or 0.00001)
    pi = '；'.join(sta)
    print(f'今天={first}, 次数，{count}，盈亏比={ratio:.1f}，回本比例，{greater_one/count:.0%}，回本次数，{greater_one}，巨亏比例，{lower_one/count:.0%}，巨亏次数，{lower_one}，明天，{pi}')


# Bar chart: how often each today-id occurred inside the window.
data = caca

numbers = list(data.keys())
frequencies = list(data.values())

plt.bar(numbers, frequencies)

plt.xlabel("num")
plt.ylabel("fre")

plt.title("ha")

plt.show()


In [None]:
# Peek at column 2: print rows 3..7, then display rows 3 and 5 (step 2).
column_two = train_data[:, 2]
print(column_two[3:8])
column_two[3:6:2]

In [None]:

# Strategy 2: trend trading
# For every (column, day) inside the analysed window, tally today's id against
# the SUM of the ids of the following `days` rows.

tensor = train_data


# (today_id, summed id of the next `days` rows) -> number of occurrences
result = {}

num = 10
diyi = tensor.shape[0]-num-3  # first row of the analysed window
print(f'date={pd_train_data.iloc[diyi, 0]}')
days = 1

# Ids come from transform_dataframe_id: id = percent change + 20, clamped to
# 0..40. A sum over `days` ids therefore lies in 0..40*days, and "flat"
# corresponds to 20*days.
baseline = 20 * days
if diyi + num - 1 + days > tensor.shape[0]:
    # Otherwise the forward slice below would be silently truncated and the
    # sums would no longer cover `days` rows.
    raise ValueError(f'days={days} reaches past the end of the data for num={num}')

infos = []
all_count = 0
# Walk every column and every day of the window.
for i in range(tensor.shape[1]):
    for j in range(diyi, diyi + num-1):
        # pre_num is only needed by the optional filters below.
        pre_num = tensor[j-1, i].item()
        current_num = tensor[j, i].item()
        next_num = torch.sum(tensor[j+1:j+1+days, i]).item()

        # Optional filters (disabled):
        # if (torch.sum(tensor[j-30:j, i]) - (20*30))>30:
        #     continue
        # if pre_num == 30 or pre_num == 40:
        #     continue

        # if current_num == 6:
        #     infos.append(f'date={pd_train_data.iloc[j,0]}, code={pd_train_data.columns[i+1]}, chg={tensor[j-3:j+2,i]}')

        # Update the tally.
        key = (current_num, next_num)
        result[key] = result.get(key, 0) + 1
        all_count += 1

print(f'日期={pd_train_data.iloc[diyi,0]}, {pd_train_data.iloc[diyi+num-2,0]}, 总次数={all_count}')
newinfos = sorted(infos)
for line in newinfos:
    print(line)

# Report, per today-id, the total and average forward change.
all_range = range(-21+20, 22+20)
# Every reachable forward sum, INCLUDING the maximum 40*days.
range_secod = range(40*days + 1)
for first in all_range:
    row = [(sencod, result[(first, sencod)]) for sencod in range_secod if (first, sencod) in result]
    if not row:
        continue
    count = sum(fre for _, fre in row)
    greatercount = sum(fre * (sencod - baseline) for sencod, fre in row)
    print(f'前一天={first},总次数={count}, 总收益={greatercount}, 平均收益={greatercount/float(count):.2f}')

# Report, per today-id, the distribution of forward outcomes. The same
# range_secod is used as above so that the percentages add up to 100%.
caca = {}
for first in all_range:
    row = [(sencod, result[(first, sencod)]) for sencod in range_secod if (first, sencod) in result]
    if not row:
        continue
    count = sum(fre for _, fre in row)

    caca[first] = count

    sta = [f'{fre/count:.0%}->{sencod-baseline}' for sencod, fre in row]
    greater_one = sum(fre for sencod, fre in row if sencod - baseline >= 3)   # forward change >= +3
    lower_one = sum(fre for sencod, fre in row if sencod - baseline <= -1)    # forward change <= -1
    # Guard only the ratio against division by zero; the printed counts stay exact.
    ratio = (greater_one or 0.00001) / float(lower_one or 0.00001)
    pi = '；'.join(sta)
    print(f'今天={first}, 次数，{count}，盈亏比={ratio:.1f}，回本比例，{greater_one/count:.0%}，回本次数，{greater_one}，巨亏比例，{lower_one/count:.0%}，巨亏次数，{lower_one}，明天，{pi}')




In [None]:
# torch.manual_seed(333)

import random

block_size = 32

def get_batch(split, i, index=None):
    """
    Build one batch that contains a window of ``block_size`` rows for every column.

    Args:
        split: Unused; kept for interface compatibility (the batch is always
            taken from ``train_data``).
        i: Row at which the input window starts.
        index: Not supported. Previously any non-None value crashed with an
            UnboundLocalError; it is now rejected explicitly.

    Returns:
        (x, y): two tensors of shape (columns, window length). Row ``c`` of
        ``x`` holds ``train_data[i:i+block_size, c]`` and row ``c`` of ``y``
        holds the same window shifted one row forward. Near the end of the
        data the windows are shorter (and ``y`` can be shorter than ``x``).

    Raises:
        NotImplementedError: if ``index`` is not None.
    """
    if index is not None:
        raise NotImplementedError('get_batch does not support the `index` argument')

    data = train_data

    # Slice all columns at once and transpose to (columns, block); this
    # replaces a Python-level loop that stacked one column at a time.
    x = data[i:i + block_size].t().contiguous()
    y = data[i + 1:i + 1 + block_size].t().contiguous()

    return x, y

# Smoke test: build the batch whose window starts at row 100 and report the shapes.
print(train_data.shape)
x, y = get_batch('train', 100)

print(f'x.shape is {x.shape}, y.shape is {y.shape}')



In [None]:
from operator import itemgetter
import numpy as np
from collections import Counter

size_data = len(pd_train_data)
# Start row of the first prediction window; predict() reads this global and
# the driver loop further down increments it after every call.
index = size_data - block_size - 43
len_dates, len_codes = train_data.shape


# Alias used by predict() when printing dates, values and column names.
pd_data = pd_train_data

def print_last_day(idx, day_ago=1):
    """
    Print how often each value occurs in one trailing column of ``idx``.

    Args:
        idx: 2-D integer tensor.
        day_ago: Which column to inspect, counted from the end
            (1 = last column, 2 = second to last, ...).
    """
    column = idx[:, idx.shape[-1] - day_ago]
    # Feed the values in descending order so that values with equal counts
    # are listed from high to low.
    descending_values = sorted(column.tolist(), reverse=True)
    print(f'last_day is {Counter(descending_values)}')


def valid_pre_chg(pre_chgs, predict_chg):
    """
    Decide whether a predicted id, together with its history, qualifies as a pick.

    All of the following must hold:

    * ``predict_chg`` is at least 23;
    * neither of the last two history entries is >= 30;
    * the history contains at least one entry equal to 30 or 40, but at most
      three of each;
    * at most one history entry is <= 10;
    * the most recent history entry is >= 19.

    (With the encoding used by transform_dataframe_id, id = percent change + 20,
    so 23 / 30 / 40 / 19 / 10 correspond to +3 / +10 / +20 / -1 / -10.)

    Args:
        pre_chgs: 1-D tensor of history ids, oldest first.
        predict_chg: Predicted id (int or 0-dim tensor).

    Returns:
        True if every condition holds, otherwise False.
    """
    if not predict_chg >= 23:
        return False
    if torch.sum(torch.ge(pre_chgs[-2:], 30)).item() >= 1:
        return False

    # Count each marker value once instead of re-reducing the tensor per test.
    count_30 = torch.sum(torch.eq(pre_chgs, 30)).item()
    count_40 = torch.sum(torch.eq(pre_chgs, 40)).item()
    if count_30 == 0 and count_40 == 0:
        return False
    if count_30 > 3 or count_40 > 3:
        return False

    if torch.sum(torch.le(pre_chgs, 10)).item() > 1:
        return False

    return bool(pre_chgs[-1] >= 19)

def predict(fre=1):
    """
    Generate one extra step for every column and report the columns that qualify.

    The window starting at the global ``index`` is fed to the model ``fre``
    times. A column is reported when ``valid_pre_chg`` accepts its generated
    step in every one of the ``fre`` runs.

    Args:
        fre: Number of generation runs (must be >= 1).

    Returns:
        Tensor of shape (k, 1) with the positions of the qualifying columns.

    Raises:
        ValueError: if ``fre`` is smaller than 1.
    """
    if fre < 1:
        raise ValueError('fre must be >= 1')

    # Run on whatever device the model lives on; the batch built from
    # train_data is on the CPU and would otherwise fail on a CUDA model.
    model_device = next(model.parameters()).device
    # One tally slot per column (instead of a hard-coded 5000).
    result = torch.zeros((train_data.shape[1],))

    for _ in range(fre):
        x, y = get_batch('train', index)
        with torch.no_grad():
            idx = model.generate(x.to(model_device), 1, temperature=0.0001)
        # Bring the result back to the CPU for the bookkeeping and printing below.
        idx = idx.cpu()

        for index_code in range(idx.shape[0]):
            # History = everything but the last entry; prediction = last entry.
            if valid_pre_chg(idx[index_code][:-1], idx[index_code][-1]):
                result[index_code] += 1
    indices = torch.nonzero(result >= fre)

    if len(indices) > 0:
        print(idx)
        print_last_day(idx, 2)
        print_last_day(idx, 1)

    for i_code in indices:
        i_code = i_code.item()
        history = idx[i_code][:-1]
        print(f'-----id={i_code}, code={pd_data.columns[i_code+1]}')
        print(f'idx is {idx[i_code]}, count={history.sum() - len(history)*3}')
        # Show the last two rows of the input window.
        for i in range(block_size-2, block_size):
            print(f'date={pd_data.iloc[index+i, 0]}, chg={pd_data.iloc[index+i, i_code+1]:<6}, code={pd_data.columns[i_code+1]}, code_id={i_code}')
        # If the row right after the window exists, print it next to the prediction.
        if index + block_size < len(pd_train_data):
            print(f'date={pd_train_data.iloc[index+block_size, 0]}, code={pd_data.columns[i_code+1]}, idx={idx[i_code][-1]}, predict={idx[i_code][-1].item()}, chg={((pd_train_data.iloc[index+block_size, i_code+1])-1)*100:.2f}%')
    return indices

# Slide the prediction window forward one row at a time for 20 steps.
# predict() reads the global `index`, which is incremented after each call.
for shift in range(20):
    print('-----------')
    print(f'-----index: {index}, date={pd_train_data.iloc[index, 0]}')
    match_codes = predict(1)
    index += 1

# match_codes holds only the result of the final iteration.
print(len(match_codes))


In [None]:
import easyquotation
quotation = easyquotation.use('tencent') # Sina: ['sina']; Tencent: ['tencent', 'qq']

def real_time(codes, prefix=False):
    """
    Fetch quotes for ``codes`` through the module-level ``quotation`` client
    and print one summary line (code, name, price, change) per returned entry.

    Args:
        codes: Stock codes passed on to ``quotation.stocks``.
        prefix: Passed on to ``quotation.stocks`` unchanged.
    """
    quotes = quotation.stocks(codes, prefix=prefix)
    for info in quotes.values():
        print(f"code is {info['code']}, name is {info['name']}, price is {info['now']}, chg is {info['涨跌(%)']}")

print('---最大的实时数据')
codes = []
# Optional: seed the watch list with the columns matched by the last predict() call.
# if True:
#     for key in match_codes:
#         key = key.item()
#         code = pd_train_data.columns[key+1][:-3]
#         codes.append(code)

# Index quotes first (codes given with/without exchange prefix, prefix=True).
real_time(['sh000001', '399001', '399006'], prefix=True)

print('----')
# Hand-picked watch list.
codes.extend([
    '300404',
    '002723',
    '300919',
    '002407',
    '603659',
    '300347',
])
real_time(codes)
