In [1]:
# 载入python库
import pandas as pd
import xlrd, openpyxl
import re
import os
from copy import deepcopy

In [2]:
# Filter out the noise rows of the data source.
# Compiled once: six digits, any single character, two uppercase ASCII letters
# (e.g. '000001.OF').
# NOTE(review): the '.' is left unescaped to keep the original matching
# behaviour; confirm whether only a literal dot separator is intended.
_FUND_CODE_PATTERN = re.compile(r'\d{6}.[A-Z]{2}')


def fliter_none(issue):
    """Tell whether ``issue`` contains a security code.

    Args:
        issue: A cell value. Anything that is not a ``str`` (None, NaN,
            numbers, bytes, ...) is treated as a noise row.

    Returns:
        bool: True when the code pattern is found anywhere in ``issue``,
        False otherwise. The function never raises.
    """
    # An explicit type check replaces the former bare ``except``, which also
    # swallowed unrelated errors such as KeyboardInterrupt.
    if not isinstance(issue, str):
        return False
    return _FUND_CODE_PATTERN.search(issue) is not None

In [3]:
# Build, for every fund, the list of its held stocks together with their k values
def generate_dict(dataframe, point_filepath, csv_name):
    """Group the holdings by fund and dump a per-fund node table to CSV.

    Args:
        dataframe: Frame whose rows unpack, in order, into
            (fund, stock, group, fluent_rate, value_rate, k_value).
        point_filepath: Directory prefix of the node CSV.
        csv_name: File-name stem; the file is written to
            ``point_filepath + '/' + csv_name + '_node_network.csv'``.

    Returns:
        dict: fund -> list of ``(stock, k_value)`` tuples, in row order.
    """
    holdings_by_fund = {}
    funds_in_order = []    # funds in order of first appearance
    groups_in_order = []   # group taken from the first row seen for each fund
    for _, record in dataframe.iterrows():
        fund, stock, group, _fluent_rate, _value_rate, k_value = record
        if fund not in holdings_by_fund:
            holdings_by_fund[fund] = []
            funds_in_order.append(fund)
            groups_in_order.append(group)
        holdings_by_fund[fund].append((stock, k_value))

    node_table = pd.DataFrame({
        'node': funds_in_order,
        'group': groups_in_order,
        'k_value': [holdings_by_fund[fund] for fund in funds_in_order],
    })
    target_csv = point_filepath+'/'+csv_name+'_node_network'+'.csv'
    node_table.to_csv(target_csv, encoding='utf_8_sig', index=True)
    return holdings_by_fund

In [4]:
# List the stocks held by both funds; a stock is flagged 1 when both rates
# exceed the threshold and 0 otherwise
def find_common(stock_list1, stock_list2, threshold):
    """Collect the stocks present in both holdings lists.

    Args:
        stock_list1: Iterable of ``(stock_name, rate)`` pairs of the first fund.
        stock_list2: Iterable of ``(stock_name, rate)`` pairs of the second fund.
        threshold: Value both rates must strictly exceed for flag 1.

    Returns:
        list: ``(stock_name, flag)`` tuples, one per matching pair of entries;
        flag is 1 when both rates are above ``threshold`` and 0 when at least
        one rate is less than or equal to it. A matching pair for which
        neither condition holds (e.g. a NaN rate) is left out.
    """
    shared = []
    for name_a, rate_a in stock_list1:
        for name_b, rate_b in stock_list2:
            if not (name_a == name_b):
                continue
            a_above = rate_a > threshold
            b_above = rate_b > threshold
            if a_above and b_above:
                shared.append((name_a, 1))
                continue
            if rate_a <= threshold or rate_b <= threshold:
                shared.append((name_a, 0))
    return shared

In [5]:
# Count, for every pair of funds, the stocks they both hold and how many of
# those exceed the threshold in both funds
def generate_network_dataframe(fund_network, fund_network_copy, threshold):
    """Build the fund-to-fund edge table.

    Every fund in ``fund_network`` is paired with each fund of
    ``fund_network_copy`` that has not yet been visited as a source, so each
    unordered pair is evaluated once when both mappings hold the same keys.

    Args:
        fund_network: dict fund -> list of ``(stock, k_value)`` tuples.
        fund_network_copy: Mapping with the same content. It is no longer
            modified by this function; passing ``fund_network`` itself is
            also safe.
        threshold: Forwarded to ``find_common``.

    Returns:
        pandas.DataFrame: Columns ``source``, ``target``, ``weight_stock``
        and ``weight``, restricted to the pairs whose ``weight`` is not 0.
        The row index is that of the unfiltered pair table.
    """
    columns = ['source', 'target', 'coshare_stock', 'coshare_num', 'weight_stock', 'weight']
    # Work on a private shallow copy: the original implementation deleted the
    # keys from the caller's dictionary as a side effect.
    remaining = dict(fund_network_copy)
    records = []
    for key1, stock_list1 in fund_network.items():
        # Drop the current source so a pair is never emitted twice (or with itself).
        remaining.pop(key1, None)
        for key2, stock_list2 in remaining.items():
            coshare_element = find_common(stock_list1, stock_list2, threshold)
            if not coshare_element:
                continue
            records.append({
                'source': key1,
                'target': key2,
                'coshare_stock': '*'.join(sorted(name for name, _ in coshare_element)),
                'coshare_num': len(coshare_element),
                'weight_stock': '*'.join(sorted(name for name, flag in coshare_element if flag == 1)),
                'weight': sum(flag for _, flag in coshare_element),
            })
    network_dataframe = pd.DataFrame(records, columns=columns)                     # full table, all weights
    network_dataframe_weight = network_dataframe[network_dataframe['weight'] != 0]  # pairs with weight 0 removed
    output_list = ['source', 'target', 'weight_stock', 'weight']
    return network_dataframe_weight[output_list]

In [6]:
# Generate the network-structure data
def xlsx_csv_fund(xlsx_filepath, csv_filepath, fund_filepath, point_filepath, hyper_params, invest_category = None, threshold = None):
    """Convert every workbook of a directory into holdings, node and edge CSVs.

    For each spreadsheet found in ``xlsx_filepath`` the function
    1. optionally keeps only the rows whose '投资类型' is in ``invest_category``,
    2. keeps the rows whose '代码' passes ``fliter_none``,
    3. computes ``k_value = round(a*fluent_rate + b*value_rate
       + c*fluent_rate*value_rate, 2)`` and writes the table to ``csv_filepath``,
    4. writes the node table (via ``generate_dict``) to ``point_filepath`` and
       the edge table (via ``generate_network_dataframe``) to ``fund_filepath``.

    Args:
        xlsx_filepath: Directory holding the source workbooks.
        csv_filepath: Output directory of the per-file holdings CSV.
        fund_filepath: Output directory of the per-file edge CSV.
        point_filepath: Output directory of the per-file node CSV.
        hyper_params: Sequence of three coefficients ``(a, b, c)``.
        invest_category: Optional list of '投资类型' values to keep; None
            keeps every row.
        threshold: Threshold forwarded to ``generate_network_dataframe``.
            When None, the module-level ``hyper_threshold`` is used, which is
            what the function always did before this parameter existed.

    Returns:
        None. Results are written to disk.
    """
    if threshold is None:
        # Backward-compatible fallback to the notebook-level setting.
        threshold = hyper_threshold

    # Collect the workbook paths; skip anything pd.read_excel cannot open
    # (hidden files, sub-directories, '~$' lock files left by Excel).
    excel_suffixes = {'.xls', '.xlsx', '.xlsm', '.xlsb', '.odf', '.ods', '.odt'}
    list_dir = []
    for entry in sorted(os.listdir(xlsx_filepath)):
        stem, suffix = os.path.splitext(entry)
        if entry.startswith('~$') or suffix.lower() not in excel_suffixes:
            continue
        list_dir.append((stem, suffix))

    # Make sure the output directories exist before anything is written.
    for out_dir in (csv_filepath, fund_filepath, point_filepath):
        os.makedirs(out_dir, exist_ok=True)

    # Announce which network is being generated
    if invest_category is not None:
        print('正在生成'+'+'.join(invest_category)+'投资的网络')
    else:
        print('正在生成全部投资的网络')

    a, b, c = hyper_params

    for date_name, format_name in list_dir:
        # Read the Wind data
        dataframe = pd.read_excel(os.path.join(xlsx_filepath, date_name+format_name))

        # Drop rows of other investment types; skip the file when nothing is left
        if invest_category is not None:
            dataframe = dataframe[dataframe['投资类型'].isin(invest_category)]
        if dataframe.empty:
            continue

        # Extract the required information from the raw data
        column_name = ['代码','名称','股票代码','股票简称','持股占流通股比(%)','持股市值占基金净值比(%)','管理公司']
        valid_rows = dataframe['代码'].apply(fliter_none)
        data_csv = dataframe.loc[valid_rows, column_name]
        data_csv = data_csv.rename(columns = {'代码':'code',
                                              '名称':'source',
                                              '股票代码':'stock_code',
                                              '股票简称':'target',
                                              '持股占流通股比(%)':'fluent_rate',
                                              '持股市值占基金净值比(%)':'value_rate',
                                              '管理公司':'group'})
        # Row-wise apply on an empty frame does not yield a column, so skip
        # files in which no row carries a valid code.
        if data_csv.empty:
            continue
        csv_name = date_name.replace('-', '_')
        dataframe_fund = data_csv[['source','target','group','fluent_rate','value_rate']]

        # Add the k value; columns are addressed by label because positional
        # access on a label-indexed Series is deprecated in pandas.
        dataframe_fund_copy = dataframe_fund.copy()
        dataframe_fund_copy['k_value'] = dataframe_fund.apply(
            lambda row: round(a*row['fluent_rate']
                              + b*row['value_rate']
                              + c*row['fluent_rate']*row['value_rate'], 2),
            axis=1)
        dataframe_fund_copy.to_csv(os.path.join(csv_filepath, csv_name+'.csv'), encoding='utf_8_sig', index=True)

        # Build the tables used to draw the network
        fund_network = generate_dict(dataframe_fund_copy, point_filepath, csv_name)
        fund_network_copy = deepcopy(fund_network)
        network_dataframe = generate_network_dataframe(fund_network, fund_network_copy, threshold)
        network_dataframe.to_csv(os.path.join(fund_filepath, csv_name+'.csv'), encoding='utf_8_sig', index=True)

In [7]:
# Parameter settings
# Directory whose files are read with pd.read_excel by xlsx_csv_fund
xlsx_filepath = './Dataset/StockData'
# Output directory of the per-file holdings table (with k_value)
csv_filepath = './Dataset/StockData_csv'
# Output directory of the per-file fund-to-fund edge table
fund_filepath = './Dataset/StockData_fund'
# Output directory of the per-file node table written by generate_dict
point_filepath = './Dataset/StockData_point'
# Coefficients (a, b, c) of k = a*fluent_rate + b*value_rate + c*fluent_rate*value_rate
hyper_params = [0.2, 0.8, 0.2]
# k-value threshold read as a global inside xlsx_csv_fund
hyper_threshold = 8 

In [8]:
# Run the pipeline, keeping only rows whose '投资类型' column is one of the listed values
xlsx_csv_fund(xlsx_filepath, csv_filepath, fund_filepath, point_filepath, hyper_params, invest_category=['普通股票型基金','指数增强型基金','灵活配置型基金','偏股混合型基金','平衡混合型基金','偏债混合型基金'])

正在生成普通股票型基金+指数增强型基金+灵活配置型基金+偏股混合型基金+平衡混合型基金+偏债混合型基金投资的网络


In [9]:
import networkx as nx