In [269]:
import pywencai
import re
import  pandas as pd
from typing import Optional
import numpy as np

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)  # 显示所有列
pd.set_option('display.max_rows', None)     # 显示所有行
pd.set_option('display.width', 1000)        # 设置显示宽度
pd.set_option('display.colheader_justify', 'left')  # 左对齐列标题
pd.set_option('display.precision', 2)       # 设置浮点数精度

In [270]:
def extract_trade_date(df: pd.DataFrame) -> Optional[str]:
    """
    从 DataFrame 的列名中提取第一个形如 [YYYYMMDD] 的日期字符串。
    如果未找到则返回 None。
    """
    for col in df.columns:
        m1 = re.search(r'\[(\d{8})\]', col)
        if m1:
            return m1.group(1)
        m2 = re.search(r'\[(\d{8}-\d{8})\]', col)
        if m2:
            return m2.group(1).split('-')[1]
    return None

In [271]:
def remove_date_suffix(df: pd.DataFrame) -> pd.DataFrame:
    """
    删除 DataFrame 列名中形如 [YYYYMMDD]和[YYYYMMDD-YYYYMMDD] 的日期后缀。
    返回修改后的 DataFrame（不会修改原 df）。
    """
    column_mapping = {}
    for col in df.columns:
        if re.search(r'\[\d{8}\]', col):
            new_name = re.sub(r'\[\d{8}\]', '', col)
            column_mapping[col] = new_name
        elif re.search(r'\[\d{8}-\d{8}\]', col):
            new_name = re.sub(r'\[\d{8}-\d{8}\]', '', col)
            column_mapping[col] = new_name
    return df.rename(columns=column_mapping)

In [272]:
def clean_dataframe(df: pd.DataFrame, columns: list[str] = None) -> pd.DataFrame:
    """
    仅对指定列剔除正无穷/负无穷，并删除这些列中含 NaN 的行，
    然后重置索引并返回新 DataFrame。

    参数
    ----
    df : pd.DataFrame
        需要清洗的数据
    columns : list[str] or None
        需要清洗的列名列表；为 None 时对全部列处理

    返回
    ----
    pd.DataFrame
        清洗后的新 DataFrame
    """
    cleaned = df.copy()
    cols = columns if columns is not None else cleaned.columns
    # 只处理指定列
    cleaned[cols] = cleaned[cols].replace([np.inf, -np.inf], np.nan)
    # 只要指定列中出现 NaN 就删整行
    cleaned.dropna(subset=cols, inplace=True)
    cleaned.reset_index(drop=True, inplace=True)
    remove_num = df.shape[0] - cleaned.shape[0]
    print(f"剔除异常行数: {remove_num}")
    return cleaned

In [273]:
# def get_recent_d_days_top_rank(days=5, rank=5):
#     q = f"最近{days}个交易日的区间涨跌幅从大到小排序前{rank}"
#     print(f'========= question : {q} ===========')
#     df = pywencai.get(query=q, query_type='stock')
#     # display(df)
#     df['交易日期'] = extract_trade_date(df)
#     df = remove_date_suffix(df)
#     df.rename(columns={'区间涨跌幅:前复权': '区间涨幅', '区间涨跌幅:前复权排名': '区间排名'}, inplace=True)
#     df['区间长度'] = days
#     df['区间排名'] = df['区间排名'].astype(str).str.split('/').str[0].astype(int)
#     for col in ['区间涨幅']:
#         if col in df.columns:
#             df[col] = df[col].astype(float).round(2)
#     for col in ['market_code', 'code']:
#         if col in df.columns:
#             df[col] = df[col].astype(str)
#     format_name = ['交易日期','股票简称','区间长度','区间涨幅', '区间排名','market_code','code']
#     return df[format_name]

In [274]:
# import pandas as pd

# def aggregate_intervals(df: pd.DataFrame) -> pd.DataFrame:
#     # 按唯一股票分组
#     grouped = (
#         df.groupby(["交易日期", "股票简称", "market_code", "code"], as_index=False)
#           .agg({
#               # 将区间长度-区间排名拼接
#               "区间长度": lambda x: "|".join(
#                   f"{l}-{r}" for l, r in zip(x, df.loc[x.index, "区间排名"])
#               )
#           })
#     )
#     # 重命名列方便理解
#     grouped.rename(columns={"区间长度": "区间信息"}, inplace=True)
#     return grouped


# # 示例调用
# result = aggregate_intervals(df)
# print(result)


In [276]:
# import pandas as pd

# def transform_interval_data(df: pd.DataFrame) -> pd.DataFrame:
#     """
#     将原始区间数据聚合并排序：
#     1. 按 [交易日期, 股票简称, market_code, code] 分组
#     2. 聚合字段：区间信息（'区间长度-区间排名' 用 | 连接）
#     3. 统计区间个数、最小排名、最小长度
#     4. 按区间个数(降序)、最小排名(升序)、最小长度(升序)排序
#     5. 返回去掉排序辅助列的结果
#     """

#     def agg_func(sub_df):
#         return pd.Series({
#             "区间信息": "|".join(
#                 f"{l}-{r}" for l, r in zip(sub_df["区间长度"], sub_df["区间排名"])
#             ),
#             "区间个数": len(sub_df),
#             "最小排名": sub_df["区间排名"].min(),
#             "最小长度": sub_df["区间长度"].min()
#         })

#     grouped = (
#         df.groupby(["交易日期", "股票简称", "market_code", "code"], as_index=False)
#           .apply(agg_func)
#     )

#     grouped.sort_values(
#         by=["区间个数", "最小排名", "最小长度"],
#         ascending=[False, True, True],
#         inplace=True
#     )

#     grouped.reset_index(drop=True, inplace=True)
#     # 最终结果只保留需要的列
#     return grouped[["交易日期", "股票简称", "market_code", "code", "区间信息"]]


In [311]:
import pywencai
import re
import  pandas as pd
from typing import Optional
import numpy as np

def extract_trade_date(df: pd.DataFrame) -> Optional[str]:
    """
    从 DataFrame 的列名中提取第一个形如 [YYYYMMDD] 的日期字符串。
    如果未找到则返回 None。
    """
    for col in df.columns:
        m1 = re.search(r'\[(\d{8})\]', col)
        if m1:
            return m1.group(1)
        m2 = re.search(r'\[(\d{8}-\d{8})\]', col)
        if m2:
            return m2.group(1).split('-')[1]
    return None


def remove_date_suffix(df: pd.DataFrame) -> pd.DataFrame:
    """
    删除 DataFrame 列名中形如 [YYYYMMDD]和[YYYYMMDD-YYYYMMDD] 的日期后缀。
    返回修改后的 DataFrame（不会修改原 df）。
    """
    column_mapping = {}
    for col in df.columns:
        if re.search(r'\[\d{8}\]', col):
            new_name = re.sub(r'\[\d{8}\]', '', col)
            column_mapping[col] = new_name
        elif re.search(r'\[\d{8}-\d{8}\]', col):
            new_name = re.sub(r'\[\d{8}-\d{8}\]', '', col)
            column_mapping[col] = new_name
    return df.rename(columns=column_mapping)


def transform_interval_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    将原始区间数据聚合并排序：
    1. 按 [交易日期, 股票简称, market_code, code] 分组
    2. 聚合字段：区间信息（'区间长度-区间排名' 用 | 连接）
    3. 统计区间个数、最小排名、最小长度
    4. 按区间个数(降序)、最小排名(升序)、最小长度(升序)排序
    5. 返回去掉排序辅助列的结果
    """

    def agg_func(sub_df):
        return pd.Series({
            "区间信息": "|".join(
                f"{l}-{r}" for l, r in zip(sub_df["区间长度"], sub_df["区间排名"])
            ),
            "区间个数": len(sub_df),
            "最小排名": sub_df["区间排名"].min(),
            "最小长度": sub_df["区间长度"].min()
        })

    grouped = (
        df.groupby(["交易日期", "股票简称", "market_code", "code"], as_index=False)
          .apply(agg_func)
    )

    grouped.sort_values(
        by=["区间个数", "最小排名", "最小长度"],
        ascending=[False, True, True],
        inplace=True
    )

    grouped.reset_index(drop=True, inplace=True)
    # 最终结果只保留需要的列
    return grouped[["交易日期", "股票简称", "market_code", "code", "区间信息"]]


def transform_with_importance(df: pd.DataFrame, alpha=0.5, beta=0.2) -> pd.DataFrame:
    """
    基于原始数据聚合并计算重要度：
    1. 分组字段: [交易日期, 股票简称, market_code, code]
    2. 区间信息: '区间长度-区间排名' 用 | 连接
    3. 重要度: sum(1 / 区间排名)
    4. 如果想更强调排名差距: 提高alhpa, 如果想更强调短区间: 提高beta
    5. 排序: 按重要度(降序)
    """
    def agg_func(sub_df):
        return pd.Series({
            "区间信息": "|".join(
                f"{l}-{r}" for l, r in zip(sub_df["区间长度"], sub_df["区间排名"])
            ),
            # "重要度": (1 / sub_df["区间排名"]).sum()
            "重要度": float(100 * (np.exp(-alpha * sub_df["区间排名"]) * np.exp(-beta * sub_df["区间长度"])).sum())

        })

    grouped = (
        df.groupby(["交易日期", "股票简称", "market_code", "code"], as_index=False)
          .apply(agg_func)
    )

    # 按重要度从大到小排序
    grouped.sort_values(by="重要度", ascending=False, inplace=True)
    grouped.reset_index(drop=True, inplace=True)
    return grouped[["交易日期", "股票简称", "market_code", "code", "区间信息", "重要度"]]

def get_recent_d_days_top_rank(days=5, rank=5, selected = False):
    if selected:
        q = f"非ST,股票简称不包含退,上市天数大于30,流通市值大于100亿,最近{days}个交易日的区间涨跌幅从大到小排序前{rank}"
    else:
        q = f"最近{days}个交易日的区间涨跌幅从大到小排序前{rank}"
    print(f'========= question : {q} ===========')
    df = pywencai.get(query=q, query_type='stock')
    # display(df)
    df['交易日期'] = extract_trade_date(df)
    df = remove_date_suffix(df)
    df.rename(columns={'区间涨跌幅:前复权': '区间涨幅', '区间涨跌幅:前复权排名': '区间排名'}, inplace=True)
    df['区间长度'] = days
    df['区间排名'] = df['区间排名'].astype(str).str.split('/').str[0].astype(int)
    for col in ['区间涨幅']:
        if col in df.columns:
            df[col] = df[col].astype(float).round(2)
    for col in ['market_code', 'code']:
        if col in df.columns:
            df[col] = df[col].astype(str)
    format_name = ['交易日期','股票简称','区间长度','区间涨幅', '区间排名','market_code','code']
    return df[format_name]

In [319]:
all_df = []
for d,r in [(2,10),(3,10),(5,10),(10,5),(15,3),(20,2)]:
    print(f'========== {d}-{r} ===========')
    df = get_recent_d_days_top_rank(days=d, rank=r)
    print(df.shape)
    all_df.append(df)
    time.sleep(3)
df_m = pd.concat(all_df)

(10, 7)
(10, 7)
(10, 7)
(5, 7)
(3, 7)
(3, 7)


In [320]:
res1 = transform_with_importance(df_m, alpha=0.999,beta=0.001)
res1

Unnamed: 0,交易日期,股票简称,market_code,code,区间信息,重要度
0,20250924,向日葵,33,300111,2-1|3-1|5-1|10-3|15-3,120.0
1,20250924,首开股份,17,600376,15-1|20-2,49.6
2,20250924,天普股份,17,605255,5-5|20-1,36.8
3,20250924,波长光电,33,301421,5-6|10-1,36.7
4,20250924,长川科技,33,300604,2-2|3-2|5-3|10-5,32.7
5,20250924,矽电股份,33,301629,2-4|3-4|5-2,17.2
6,20250924,赛微微电,17,688325,10-2,13.4
7,20250924,香农芯创,33,300475,15-2,13.4
8,20250924,初灵信息,33,300250,2-8|3-3|5-4,6.84
9,20250924,海博思创,17,688411,2-7|3-9|20-3,5.0


In [315]:
all_df = []
for d,r in [(2,10),(3,10),(5,10),(10,5),(15,3),(20,3)]:
    print(f'========== {d}-{r} ===========')
    df = get_recent_d_days_top_rank(days=d, rank=r, selected = True)
    print(df.shape)
    all_df.append(df)
    time.sleep(3)
df_l= pd.concat(all_df)

(10, 7)
(10, 7)
(10, 7)
(5, 7)
(3, 7)
(3, 7)


In [316]:
res2 = transform_with_importance(df_l, alpha=0.999,beta=0.001)
res2

Unnamed: 0,交易日期,股票简称,market_code,code,区间信息,重要度
0,20250924,向日葵,33,300111,2-1|3-1|5-1|10-3|15-3,120.0
1,20250924,首开股份,17,600376,15-1|20-2,49.6
2,20250924,天普股份,17,605255,5-5|20-1,36.8
3,20250924,长川科技,33,300604,2-2|3-2|5-3|10-5,32.7
4,20250924,香农芯创,33,300475,15-2,13.4
5,20250924,海博思创,17,688411,2-7|3-9|20-3,5.0
6,20250924,精智达,17,688627,10-4,1.82
7,20250924,聚辰股份,17,688123,3-5|5-14,0.675
8,20250924,凯美特气,33,2549,5-7,0.0914
9,20250924,联美控股,17,600167,2-20|3-10|5-8,0.0382


In [321]:
res = pd.concat([res1,res2])
res = res.reset_index(drop=True)

In [322]:
keys = ['交易日期','股票简称','market_code','code']
idx = res.groupby(keys)['重要度'].idxmax()
df_max = res.loc[idx].sort_values(by='重要度',ascending=False).reset_index(drop=True)
df_max

Unnamed: 0,交易日期,股票简称,market_code,code,区间信息,重要度
0,20250924,向日葵,33,300111,2-1|3-1|5-1|10-3|15-3,120.0
1,20250924,首开股份,17,600376,15-1|20-2,49.6
2,20250924,天普股份,17,605255,5-5|20-1,36.8
3,20250924,波长光电,33,301421,5-6|10-1,36.7
4,20250924,长川科技,33,300604,2-2|3-2|5-3|10-5,32.7
5,20250924,矽电股份,33,301629,2-4|3-4|5-2,17.2
6,20250924,赛微微电,17,688325,10-2,13.4
7,20250924,香农芯创,33,300475,15-2,13.4
8,20250924,初灵信息,33,300250,2-8|3-3|5-4,6.84
9,20250924,海博思创,17,688411,2-7|3-9|20-3,5.0


In [337]:
def get_first_breakout_stocks():
    q = f"今日涨幅大于9.5%,前10个交易日至昨日的涨幅超过9.5%的次数等于0"
    print(f'========= question : {q} ===========')
    df = pywencai.get(query=q, query_type='stock')
    df['交易日期'] = extract_trade_date(df)
    df = remove_date_suffix(df)
    df.rename(columns={'涨跌幅:前复权': '涨跌幅'}, inplace=True)
    for col in ['涨跌幅']:
        if col in df.columns:
            df[col] = df[col].astype(float).round(2)
    for col in ['market_code', 'code']:
        if col in df.columns:
            df[col] = df[col].astype(str)
    format_name = ['交易日期','股票简称','涨跌幅','market_code','code']
    return df[format_name]

In [339]:
df = get_first_breakout_stocks()
df



AttributeError: 'NoneType' object has no attribute 'columns'

In [64]:

def get_all_stocks_hq_data():
    q = 'A股,非北交所,上市板块,上市天数,开盘价,最高价,最低价,收盘价,前复权:开盘价,前复权:最高价,前复权:最低价,前复权:收盘价,成交额,成交量,竞价涨幅,竞价金额,竞价量,dde大单净额,实际换手率,自由流通市值,自由流通股,个股热度排名'
    print(f'========= question : {q} ===========')
    df = pywencai.get(query=q, query_type='stock')
    trade_date = extract_trade_date(df)
    df = remove_date_suffix(df)
    map_col = {'开盘价:前复权':'开盘价_前',
               '收盘价:前复权':'收盘价_前',
              '最高价:前复权':'最高价_前',
              '最低价:前复权':'最低价_前',
              '开盘价:不复权':'开盘价',
               '收盘价:不复权':'收盘价',
              '最高价:不复权':'最高价',
              '最低价:不复权':'最低价',
              '最新涨跌幅':'涨跌幅',
              '自由流通市值':'市值Z',
               '实际换手率':'换手Z',
              'dde大单净额':'大单净额',
              '个股热度排名':'热度排名'}
    df.rename(columns=map_col, inplace=True)
    
    int_columns = ['市值Z', '上市天数','大单净额','热度排名','自由流通股','成交量','成交额','竞价量','竞价金额']
    float_columns = ['涨跌幅', '竞价涨幅','换手Z','开盘价', '最高价', '最低价', '收盘价','开盘价_前', '最高价_前', '最低价_前', '收盘价_前']
    str_columns = ['market_code', 'code']
    df = clean_dataframe(df, int_columns+float_columns+str_columns)
    
    for col in int_columns:
        if col in df.columns:
            df[col] = df[col].astype(float).astype(int)
    for col in float_columns:
        if col in df.columns:
            df[col] = df[col].astype(float).round(2)
    for col in ['market_code', 'code']:
        if col in df.columns:
            df[col] = df[col].astype(str)
            
    df['交易日期'] = trade_date
    df['竞换手Z'] = df.apply(lambda x : round(x['竞价量'] / x['自由流通股'] * 100, 2)  if x['自由流通股'] > 0 else -1000, axis=1)
    df['实体涨幅'] = df.apply(lambda x : round((x['收盘价'] / x['开盘价'] - 1) * 100, 2)  if x['开盘价'] > 0 else -1000, axis=1)
    
    format_name = ['交易日期','股票简称', '涨跌幅','实体涨幅', '大单净额', '热度排名', 
                   '开盘价', '最高价', '最低价', '收盘价',
                   '成交额', '成交量','市值Z','换手Z',
                   '竞价涨幅', '竞价金额', '竞价量', '竞换手Z',
                   '开盘价_前', '最高价_前', '最低价_前', '收盘价_前', 
                   '上市板块', '上市天数',
                   'market_code', 'code']
    return df[format_name]
# get_all_stocks_hq_data()

In [65]:
df = get_all_stocks_hq_data()

剔除异常行数: 0


In [66]:
df

Unnamed: 0,交易日期,股票简称,涨跌幅,实体涨幅,大单净额,热度排名,开盘价,最高价,最低价,收盘价,成交额,成交量,市值Z,换手Z,竞价涨幅,竞价金额,竞价量,竞换手Z,开盘价_前,最高价_前,最低价_前,收盘价_前,上市板块,上市天数,market_code,code
0,20250922,分众传媒,-1.92,-1.33,-45473775,1616,8.28,8.29,8.12,8.17,716873821,87706130,79520298518,0.9,-0.6,1391868,168100,0.0,8.28,8.29,8.12,8.17,主板,7720,33,2027
1,20250922,科大讯飞,0.76,-0.09,-87358560,177,53.25,53.77,52.59,53.2,3331938110,62664549,99373828279,3.35,0.85,21598200,405600,0.02,53.25,53.77,52.59,53.2,主板,6343,33,2230
2,20250922,北方华创,2.66,2.78,217404200,227,401.54,418.8,398.25,412.7,4230370240,10331046,171491607326,2.49,-0.11,15513498,38635,0.01,401.54,418.8,398.25,412.7,主板,5670,33,2371
3,20250922,立讯精密,10.0,0.0,1051477930,2,60.95,60.95,60.13,60.95,5501600820,90279351,274722011017,2.0,10.0,1481725000,24310500,0.54,60.95,60.95,60.13,60.95,主板,5487,33,2475
4,20250922,牧原股份,-1.51,-0.52,23832421,408,53.84,53.84,52.55,53.56,2219200340,41808555,128574813442,1.74,-0.99,10816456,200900,0.01,53.84,53.84,52.55,53.56,主板,4256,33,2714
5,20250922,比亚迪,-1.01,-0.81,-617938920,60,108.87,108.91,107.15,107.99,5293961580,49155923,265348024067,2.0,-0.2,42361317,389100,0.02,108.87,108.91,107.15,107.99,主板,5199,33,2594
6,20250922,京东方A,0.24,0.24,26220810,352,4.13,4.18,4.08,4.14,2079566770,503754598,135140847833,1.54,0.0,8217465,1989701,0.01,4.13,4.18,4.08,4.14,主板,9020,33,725
7,20250922,盐湖股份,-1.26,-1.31,11114159,1346,19.84,20.05,19.45,19.58,915886604,46777189,66011008955,1.39,0.05,3015680,152000,0.0,19.84,20.05,19.45,19.58,主板,10246,33,792
8,20250922,紫光股份,1.06,-0.55,-37293570,142,30.64,30.77,29.88,30.47,4389738220,144418233,62744102495,7.01,1.62,27024480,882000,0.04,30.64,30.77,29.88,30.47,主板,9455,33,938
9,20250922,中油资本,1.41,2.18,-44093845,326,10.53,10.82,10.53,10.76,1251091560,116866165,27175230513,4.63,-0.75,9453834,897800,0.04,10.53,10.82,10.53,10.76,主板,10563,33,617


In [68]:
df.sort_values(by='竞换手Z',ascending=False).to_csv('a.csv')