In [6]:
import pandas as pd
import efinance as ef

stock_concepts = pd.read_csv("stock_concept.csv")
concept_dic = {}
code2name = {}
for _, row in stock_concepts.iterrows():
    t, n, c = row['板块类型'], row['板块名称'], row['板块代码']
    if t not in concept_dic:
        concept_dic[t] = {}
    if n not in concept_dic[t]:
        concept_dic[t][n] = c
    code2name[c] = n

In [7]:
code2name

{'BK1013': '华为欧拉',
 'BK0519': '稀缺资源',
 'BK0979': '低碳冶金',
 'BK1074': '托育服务',
 'BK1100': '减速器',
 'BK0561': '基本金属',
 'BK1119': 'PLC概念',
 'BK0695': '小金属概念',
 'BK0547': '黄金概念',
 'BK0578': '稀土永磁',
 'BK0831': '万达概念',
 'BK0505': '中字头',
 'BK0951': '刀片电池',
 'BK0692': '在线旅游',
 'BK1121': '第四代半导体',
 'BK0919': 'RCS概念',
 'BK0897': 'IPv6',
 'BK0563': '油价相关',
 'BK0907': '转基因',
 'BK0912': '远程办公',
 'BK0998': '机器视觉',
 'BK0987': '盐湖提锂',
 'BK0996': '毛发医疗',
 'BK1090': '机器人概念',
 'BK0989': '储能',
 'BK1070': '土壤修复',
 'BK1080': '新型城镇化',
 'BK1022': '职业教育',
 'BK0566': '滨海新区',
 'BK1052': '动力电池回收',
 'BK0671': '彩票概念',
 'BK0928': '抖音小店',
 'BK1087': '超超临界发电',
 'BK1086': '粮食概念',
 'BK1094': '钙钛矿电池',
 'BK0492': '煤化工',
 'BK0943': '汽车拆解',
 'BK0712': '一带一路',
 'BK0988': '钠离子电池',
 'BK0662': '在线教育',
 'BK0834': '乡村振兴',
 'BK0922': '数据中心',
 'BK1008': '国资云概念',
 'BK0669': '生态农业',
 'BK0640': '智能机器',
 'BK0962': 'RCEP概念',
 'BK0579': '云计算',
 'BK1072': '中俄贸易概念',
 'BK0690': '氟化工',
 'BK1010': '磷化工',
 'BK0642': '手游概念',
 'BK1082': '噪声防治',
 'B

In [75]:
concept_histories = {}
for concept_type in concept_dic:
    
    codes = list(concept_dic[concept_type].values())
    concept_histories[concept_type] = ef.stock.get_quote_history(codes)


Processing => BK0524: 100%|██████████| 409/409 [00:10<00:00, 37.24it/s]
Processing => BK0473: 100%|██████████| 86/86 [00:06<00:00, 12.52it/s]


In [154]:
stock_k_line_data = {}
def get_code_k_data(stock_code):
    if stock_code not in stock_k_line_data:
        k_data = ef.stock.get_quote_history(stock_code)
        k_data['涨停'] = k_data.apply(lambda x: 1 if x['涨跌幅'] > 9 and x['成交量'] > 0 else 0, axis=1)
        k_data['跌停'] = k_data.apply(lambda x: 1 if x['涨跌幅'] < -9 and x['成交量'] > 0 else 0, axis=1)
        k_data["5日均线"] = k_data["收盘"].rolling(5).mean()
        k_data["10日均线"] = k_data["收盘"].rolling(10).mean()
        k_data["20日均线"] = k_data["收盘"].rolling(20).mean()
        k_data["30日均线"] = k_data["收盘"].rolling(30).mean()
        k_data["60日均线"] = k_data["收盘"].rolling(60).mean()
        def calc_amount_ratio(x):
            sum_amount = 0
            for _ in list(x)[:-1]:
                sum_amount += _
            amount = list(x)[-1]
            return amount/sum_amount * len(list(x))
        k_data["5日量比"] = k_data["成交量"].rolling(6).apply(calc_amount_ratio)
        k_data["1日量比"] = k_data["成交量"].rolling(2).apply(calc_amount_ratio)

        stock_k_line_data[stock_code] = k_data
    return stock_k_line_data[stock_code]

In [155]:
# stock_k_line_data
stock_infos = {}
stock2board = {}
code2board = {}
board2code = {}

# extract_k_line_feature
def get_labels(stock_code, date):
    k_data = get_code_k_data(stock_code)
    line = k_data[k_data['日期']==date]
    if line.shape[0] == 0:
        print("get_labels invalid day")
        return None
    idx = line.index.tolist()[0]
    if idx + 1 >= k_data.shape[0]:
        print("get_labels no next day")
        return None
    tmr_line = k_data.iloc[idx + 1,:]
    tmr_open, mr_amplitude, mr_ad, mr_volumn, open, amplitude, ad, volumn = tmr_line['开盘'], tmr_line['振幅'], tmr_line['涨跌幅'], tmr_line['成交量'], line['开盘'], line['振幅'], line['涨跌幅'], line['成交量']
    labels = {}
    labels["涨停"] = mr_volumn > 0 and mr_ad > 9 and mr_amplitude < 0.1
    labels["跌停"] = mr_volumn > 0 and mr_ad <-9 and mr_amplitude < 0.1
    price_diff = list((tmr_open)/open)[0]

    labels["大涨"] = price_diff >=1.06
    labels["大跌"] = price_diff <= 0.94
    labels["涨"] = price_diff >= 1.02 and price_diff < 1.06
    labels["跌"] = price_diff <= 0.98 and price_diff > 0.94
    labels["小涨"] = price_diff >=1 and price_diff < 1.02
    labels["小跌"] = price_diff <1 and price_diff > 0.98
    return labels

# 涨/跌停天数相关特征：昨日是否涨/跌停，是否连续2板，是否连续3板，是否连续4板，是否连续5板，是否超过5板，过去7天涨停次数，过去30天涨停次数，过去365天涨停次数
def get_lmt_feature(stock_code, date):
    k_data = get_code_k_data(stock_code)
    line = k_data[k_data['日期']==date]
    if line.shape[0] == 0:
        print("get_labels invalid day")
        return None
    curr_idx = line.index.tolist()[0]
    if curr_idx == line.shape[0] - 1:
        print("get_labels no next day")
        return None
    # lianbanshu
    def get_lmt_cnt(flag='涨停'):
        lianbanshu = 0
        lianban_flag = True
        day_cnt = {}
        i = 1
        lmt_cnt = 0
        while i < curr_idx:
            if k_data.iloc[curr_idx-i][flag] == 1:
                if k_data.iloc[curr_idx-i][flag] == k_data.iloc[curr_idx-1][flag] and lianban_flag:
                    lianbanshu += 1
                else:
                    lianban_flag = False
                lmt_cnt += 1 
            else:
                lianban_flag = False
            day_cnt[i] = lmt_cnt
            i += 1
        return lianbanshu, day_cnt
    features = {}
    up_lianban_cnt, up_day_cnt = get_lmt_cnt('涨停')
    down_lianban_cnt, down_day_cnt = get_lmt_cnt('跌停')
    for i in range(1, 6):
        features['连续涨停%d天'%i] = up_lianban_cnt == i
    features['连续涨停大于5天'] = up_lianban_cnt > 5
    for i in range(1, 6):
        features['连续跌停%d天'%i] = down_lianban_cnt == i
    features['连续跌停大于5天'] = down_lianban_cnt > 5
    features['次新股'] = curr_idx > 30 and curr_idx < 365
    features['新股'] = curr_idx <= 30
    max_days = curr_idx
    for d in [3, 7, 14, 30, 120, 365]:
        if d >= curr_idx:
            up_cnt =  up_day_cnt.get(max_days-1, 0)
            down_cnt =  down_day_cnt.get(max_days-1, 0)
            features['%d天内涨停天占比'%d] = up_day_cnt.get(max_days-1, 0) / min(d, max_days-1)
            features['%d天内跌停天占比'%d] = down_day_cnt.get(max_days-1, 0) / min(d, max_days-1)
        else:
            up_cnt = up_day_cnt.get(d, 0)
            down_cnt = down_day_cnt.get(d, 0)
            features['%d天内涨停天占比'%d] = up_day_cnt.get(d, 0) / min(d, max_days-1)
            features['%d天内跌停天占比'%d] = down_day_cnt.get(d, 0) / min(d, max_days-1)
    return features

# 获取行业、市值、市盈率信息
def get_basic_feature(stock_code):
    # 1. 行业
    df = ef.stock.get_belong_board(stock_code)
    # features = {'boards': []}
    boards = []
    for _, row in df.iterrows():
        code, name = row['板块代码'], row['板块名称']
        code2board[code] = name
        board2code[name] = code
        boards.append(name)
    basic_info = ef.stock.get_base_info(stock_code)
    stock2board[stock_code] = boards
    features['大盘股'] = basic_info['总市值'] > 500 * 10**8
    features['中盘股'] = basic_info['总市值'] >= 50 * 10**8 and basic_info['总市值'] < 500 * 10**8
    features['小盘股'] = basic_info['总市值'] <= 50 * 10**8

    features['负市盈率'] = basic_info['市盈率(动)'] < 0
    features['高市盈率'] = basic_info['市盈率(动)'] >= 100
    features['中市盈率'] = basic_info['市盈率(动)'] > 30
    features['低市盈率'] = basic_info['市盈率(动)'] <= 30
    return features

In [156]:

# stock_history with cache
stock_info = {}
feature_line = {}

#no cache
stock_k_line_data = {}
# get stock feature
# input stock code, date
# output stock features, label
# features: 板块，涨跌（昨日涨跌、7日涨跌、30日涨跌），价格情况（），当天开盘情况
# label: 开盘价买入，第二天的开盘相比涨跌情况（大幅收益涨停、大幅回撤跌停、大幅收益、波动收益、正向收益....）

# 获取量价因子 过去1天涨幅，过去3天涨幅，过去7天涨幅，过去1天量比，过去3天量比，过去7天量比
def get_price_volumn_feature(stock_code, date, smooth_ratio=1):
    k_data = get_code_k_data(stock_code)
    features = {}
    line = k_data[k_data['日期']==date]
    if line.shape[0] == 0:
        print("get_labels invalid day")
        return None
    curr_idx = line.index.tolist()[0]
    # 1： 均线排列 （前一天收盘 x日均价）在（y日均价） 上下
    ysd_line = k_data.iloc[curr_idx-1]
    price_name = ["收盘", "5日均线", "10日均线", "20日均线", "30日均线", "60日均线"]
    for i in range(len(price_name)):
        for j in range(i + 1, len(price_name)):
            a, b = ysd_line[price_name[i]], ysd_line[price_name[j]]
            diff = ((a - b) / b) * smooth_ratio
            features["%s %s正偏离过大"%(price_name[i], price_name[j])] = diff > 0.25
            features["%s %s正偏离偏大"%(price_name[i], price_name[j])] = diff >= 0.15 and diff < 0.25
            features["%s %s正偏离"%(price_name[i], price_name[j])] = diff >=0 and diff < 0.15
            features["%s %s负偏离过大"%(price_name[i], price_name[j])] = diff < -0.25
            features["%s %s负偏离偏大"%(price_name[i], price_name[j])] = diff < -0.15 and diff >= 0.25
            features["%s %s负偏离"%(price_name[i], price_name[j])] = diff < 0 and diff >= -0.15
    # 1、3、7、30日涨跌情况
    for d in [1, 3, 7, 30]:
        l = k_data.iloc[curr_idx-d]
        start_price = l['开盘']
        end_price = ysd_line['收盘']
        change_ratio = (end_price-start_price)/start_price * smooth_ratio
        features['%d大涨'%d] = change_ratio > 0.05 * d or change_ratio > 0.2
        features['%d涨'%d] = not features['%d大涨'%d] and (change_ratio > 0.03 * d or change_ratio > 0.1)
        features['%d微涨'%d] = not features['%d大涨'%d] and not features['%d涨'%d] and (change_ratio >= 0 or change_ratio > 0.1)
        change_ratio *= -1
        features['%d大跌'%d] = change_ratio > 0.05 * d or change_ratio > 0.2
        features['%d跌'%d] = not features['%d大涨'%d] and (change_ratio > 0.03 * d or change_ratio > 0.1)
        features['%d微跌'%d] = not features['%d大涨'%d] and not features['%d涨'%d] and (change_ratio > 0 or change_ratio > 0.1)

    # 量比关系
    volumn_ratio_5d = k_data.iloc[curr_idx-1]['5日量比']
    volumn_ratio_1d = k_data.iloc[curr_idx-1]['1日量比']
    features['5日大幅缩量'] = volumn_ratio_5d < 0.3
    features['5日缩量'] = volumn_ratio_5d < 0.6 and volumn_ratio_5d >=0.3
    features['5日小幅缩量'] = volumn_ratio_5d < 1 and volumn_ratio_5d >=0.6
    features['5日小幅放量'] = volumn_ratio_5d >= 1 and volumn_ratio_5d < 1.3
    features['5日放量'] = volumn_ratio_5d >= 1 and volumn_ratio_5d < 2
    features['5日大幅放量'] = volumn_ratio_5d >= 2
    features['5日爆量'] = volumn_ratio_5d >= 5
    features['1日放量'] = volumn_ratio_1d >= 1
    features['1日缩量'] = volumn_ratio_1d < 1
    
    return features

# 板块概念因子，1. 板块近1、3、7、30、90天涨幅，板块

# 当天开盘情况，1. 开盘点位 2. 同板块开盘点位 3. 同板块开盘

# 主力资金异动因子


def get_stock_feature_label(stock_code, date):
    # 1. get k line data
    if stock_code not in stock_k_line_data:
        k_data = ef.stock.get_quote_history(stock_code)
        k_data['涨停'] = k_data.apply(lambda x: 1 if x['涨跌幅'] > 9 and x['成交量'] > 0 else 0, axis=1)
        k_data['跌停'] = k_data.apply(lambda x: 1 if x['涨跌幅'] < -9 and x['成交量'] > 0 else 0, axis=1)
        k_data["5日均线"] = k_data["收盘"].rolling(5).mean()
        k_data["10日均线"] = k_data["收盘"].rolling(10).mean()
        k_data["20日均线"] = k_data["收盘"].rolling(20).mean()
        k_data["30日均线"] = k_data["收盘"].rolling(30).mean()
        k_data["60日均线"] = k_data["收盘"].rolling(60).mean()
        def calc_amount_ratio(x):
            sum_amount = 0
            for _ in list(x)[:-1]:
                sum_amount += _
            amount = list(x)[-1]
            return amount/sum_amount * len(list(x))
        k_data["5日量比"] = k_data["成交量"].rolling(6).apply(calc_amount_ratio)
        k_data["1日量比"] = k_data["成交量"].rolling(2).apply(calc_amount_ratio)

        stock_k_line_data[stock_code] = k_data
        print('reload data\n')
    k_data = stock_k_line_data[stock_code]

    # 2. get label
    labels = get_labels(stock_code, date)
    # 3. get feature
    all_features = {}
    lmt_fea = get_lmt_feature(stock_code, date)
    basic_fea = get_basic_feature(stock_code)
    price_volumn_fea = get_price_volumn_feature(stock_code, date)
    for feas in [lmt_fea, basic_fea, price_volumn_fea]:
        for f in feas:
            all_features[f] = feas[f]
    # return lmt_fea, basic_fea, labels
    return all_features #price_volumn_fea

get_stock_feature_label("601137", "2023-01-19")



reload data



{'连续涨停1天': False,
 '连续涨停2天': False,
 '连续涨停3天': False,
 '连续涨停4天': False,
 '连续涨停5天': False,
 '连续涨停大于5天': False,
 '连续跌停1天': False,
 '连续跌停2天': False,
 '连续跌停3天': False,
 '连续跌停4天': False,
 '连续跌停5天': False,
 '连续跌停大于5天': False,
 '次新股': False,
 '新股': False,
 '3天内涨停天占比': 0.0,
 '3天内跌停天占比': 0.0,
 '7天内涨停天占比': 0.0,
 '7天内跌停天占比': 0.0,
 '14天内涨停天占比': 0.07142857142857142,
 '14天内跌停天占比': 0.0,
 '30天内涨停天占比': 0.03333333333333333,
 '30天内跌停天占比': 0.0,
 '120天内涨停天占比': 0.025,
 '120天内跌停天占比': 0.0,
 '365天内涨停天占比': 0.0273972602739726,
 '365天内跌停天占比': 0.0,
 'boards': ['有色金属',
  '浙江板块',
  '新能源',
  '太阳能',
  '预盈预增',
  '新材料',
  '富士康',
  '5G概念',
  '航母概念',
  '融资融券',
  '无线耳机'],
 '大盘股': False,
 '中盘股': True,
 '小盘股': False,
 '负市盈率': False,
 '高市盈率': False,
 '中市盈率': False,
 '低市盈率': True,
 '收盘 5日均线正偏离过大': False,
 '收盘 5日均线正偏离偏大': False,
 '收盘 5日均线正偏离': True,
 '收盘 5日均线负偏离过大': False,
 '收盘 5日均线负偏离偏大': False,
 '收盘 5日均线负偏离': False,
 '收盘 10日均线正偏离过大': False,
 '收盘 10日均线正偏离偏大': False,
 '收盘 10日均线正偏离': True,
 '收盘 10日均线负偏离过大': False,
 '收盘 10日均线负偏离偏

0       2000-01-04
1       2000-01-05
2       2000-01-06
3       2000-01-07
4       2000-01-10
           ...    
5581    2023-01-16
5582    2023-01-17
5583    2023-01-18
5584    2023-01-19
5585    2023-01-20
Name: 日期, Length: 5586, dtype: object

In [None]:
def get_change(df, start_time, end_time):
    cut_data = df.query("(日期 >= '%s') & (日期 <= '%s')"%(start_time, end_time))
    if cut_data.size == 0:
        return 0
    open_price = cut_data.iloc[0]['开盘']
    close_price = cut_data.iloc[-1]['收盘']
    change = (close_price - open_price) / open_price
    return change


In [None]:
import datetime
curr_time = datetime.datetime.now()
curr_time + datetime.timedelta(days=-1)
curr_time.strftime("%Y-%m-%d")

from chinese_calendar import is_workday
is_workday(curr_time + datetime.timedelta(days=-6))
work_days = []
for i in range(365):
    tm = curr_time + datetime.timedelta(days=-(365-i))
    if is_workday(tm):
        work_days.append(tm.strftime("%Y-%m-%d"))
print(work_days)

In [None]:
def tm_concept(timeunit=7):
    time_ranges = []
    for i in range(len(work_days) - timeunit):
        time_ranges.append((work_days[i], work_days[i + timeunit - 1]))
    start_time_concepts={}
    for start_time, end_time in time_ranges:
        start_time_concepts[start_time] = []
        for t in concept_histories:
            for code in concept_histories[t]:
                change = get_change(concept_histories[t][code], start_time, end_time)
                start_time_concepts[start_time].append((code2name[code], change))
        start_time_concepts[start_time].sort(key=lambda x: x[1], reverse=True)
    days = list(start_time_concepts.keys())
    days.sort()
    results = []
    for d in days:
        top_concepts = [(_[0], _[1]) for _ in start_time_concepts[d][:10] if '昨日' not in _[0]]
        reverse_list =  start_time_concepts[d][-10:]

        reverse_list.sort(key=lambda x:x[1], reverse=False)
        bottom_list = [(_[0], _[1]) for _ in reverse_list]
        # print(d, [(_[0], '%.1f%%'%(_[1] * 100)) for _ in top_concepts])
        # print(d, [(_[0], '%.1f%%'%(_[1] * 100)) for _ in bottom_list])
        # print("----------------------------------")
        results.append((d, top_concepts, bottom_list))
    return results

In [None]:
tm_concept(2)

In [None]:
tm_concept(1)

In [None]:
# tm_concept(30)

In [None]:
# timeunit_results = {}
# for i in [1, 7, 30]:
#     timeunit_results[i] = tm_concept(i)
