## 事前準備

### モジュールのインポート

In [1]:
import warnings
# Silence warnings whose message begins with "All-NaN slice encountered" or
# "All-NaN axis encountered".
# NOTE(review): presumably these are the RuntimeWarnings NumPy's nan-aware
# reductions emit on all-NaN input — confirm the source before narrowing the filter.
warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')

In [2]:
# 自作モジュール
from utils.paths import Paths
from acquisition.jquants_api_operations import StockAcquisitionFacade
from utils.jquants_api_utils import cli
from calculation.target import TargetCalculator
# 基本モジュール
from datetime import datetime
import pandas as pd
import numpy as np
# クラスタリングで使用
import umap
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from scipy.spatial.distance import cdist

## 実行

### 銘柄情報の取得
* stock_lists: 2014年10月からの銘柄一覧
* history_list: 銘柄ごとのScaleCategoryの遍歴

In [1295]:
# Securities codes, normalised to a list of strings.
filter_codes = list(map(str, (
    "4912", "4917", "7956", "8113", "4919",
    "4928", "6630", "7744", "4452", "4967",
    "8283", "4028", "4911", "4922", "4927",
)))

In [None]:
# Universe filter: rows with Listing==1 whose ScaleCategory is one of
# 'TOPIX Core30', 'TOPIX Large70', 'TOPIX Mid400' or 'TOPIX Small 1'.
# NOTE(review): the original note here described this as "the current TOPIX500",
# but the expression also admits 'TOPIX Small 1' — confirm which universe is intended.
filter_condition = "(Listing==1)&((ScaleCategory=='TOPIX Core30')|(ScaleCategory=='TOPIX Large70')|(ScaleCategory=='TOPIX Mid400')|(ScaleCategory=='TOPIX Small 1'))"
# An alternative argument, filtered_code_list=filter_codes, is left disabled here.
saf = StockAcquisitionFacade(filter=filter_condition)
stock_dfs = saf.get_stock_data_dict()
# Show the 'list' entry of the returned dict as the cell output.
stock_dfs['list']

### 価格情報の準備

In [None]:
# Target variable: intraday return, i.e. Close / Open - 1, stored on the price table.
stock_dfs['price']['Target'] = stock_dfs['price']['Close'].div(stock_dfs['price']['Open']).sub(1)

# Reshape to a wide table: one row per Date, one column per Code.
target = (
    stock_dfs['price'][['Date', 'Code', 'Target']]
    .set_index(['Date', 'Code'], drop=True)
    .unstack(-1)
    .droplevel(0, axis=1)
)
target

### PCA処理の実行

In [None]:
from sklearn.decomposition import PCA

# The fitting window ends at end_date (inclusive). n_components was chosen so
# that the cumulative explained-variance ratio is roughly 80%.
end_date = datetime(2022, 1, 1)
n_components = 115

# Keep rows up to end_date, drop every column that has a missing value, then
# transpose so that the former columns (codes) become the rows fed to PCA.
target = target.loc[target.index <= end_date]
no_missing_residuals = target.dropna(axis=1).T

pca = PCA(n_components=n_components)
pca.fit(no_missing_residuals)

# Cumulative explained-variance ratio per component, labelled PC_000, PC_001, ...
explained_ratio_df = pd.DataFrame(
    {'ExplainedRatio': pca.explained_variance_ratio_.cumsum()},
    index=[f'PC_{j:0=3}' for j in range(n_components)],
)
explained_ratio_df

In [None]:
# Project every row of the PCA input onto the fitted components.
pca_array = pca.transform(no_missing_residuals)
pca_df = pd.DataFrame(
    pca_array,
    index=no_missing_residuals.index,
    columns=[f'PC_{j:0=3}' for j in range(n_components)],
)

# Remove four components and keep the rest.
# NOTE(review): PC_001 is retained while PC_000 and PC_002-PC_004 are removed —
# confirm that this selection is intentional.
pca_df = pca_df.loc[:, ~pca_df.columns.isin(['PC_000', 'PC_002', 'PC_003', 'PC_004'])]

# Sort by index, persist to CSV, and show the result as the cell output.
extracted_df = pca_df.sort_index()
extracted_df.to_csv('pca_residue.csv')
extracted_df

### クラスタリングの関数

In [1301]:
# Apply UMAP to a DataFrame
def apply_UMAP(df:pd.DataFrame, n_components:int=15, n_neighbors:int=5, min_dist:float=0.1):
    """Embed the rows of ``df`` with UMAP.

    Args:
        df: Input table; one embedded row is produced per input row.
        n_components: Number of output dimensions, passed to ``umap.UMAP``.
        n_neighbors: Passed through to ``umap.UMAP``.
        min_dist: Passed through to ``umap.UMAP``.

    Returns:
        A DataFrame carrying ``df``'s index (sorted ascending) with columns
        ``'Feature 0'`` ... ``'Feature <n_components - 1>'``.
    """
    reducer = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist)
    embedding = reducer.fit_transform(df)
    feature_names = [f'Feature {i}' for i in range(n_components)]
    return pd.DataFrame(embedding, index=df.index, columns=feature_names).sort_index()

# Apply x-means to a DataFrame several times and average the per-row results
def apply_xmeans(df:pd.DataFrame, n_iteration:int=30, kmax=10):
    """Run x-means ``n_iteration`` times and average the assigned centres per code.

    Each run replaces every row by the centre of the cluster it was assigned
    to; the runs are stacked and averaged by the ``'Code'`` column.

    Args:
        df: Feature table. Its index must be named ``'Code'``, because each
            run's output is grouped by a column of that name. ``df`` itself
            is left unmodified.
        n_iteration: Number of independent x-means runs (at least 1).
        kmax: Upper bound on the number of clusters, forwarded to each run.

    Returns:
        A DataFrame indexed by ``'Code'`` holding the mean of the assigned
        cluster centres over all runs.

    Raises:
        ValueError: If ``n_iteration`` is smaller than 1.
    """
    if n_iteration < 1:
        raise ValueError('n_iteration must be at least 1')

    # Every run receives its own copy of the input, so no run can see or
    # alter the data used by another run (or the caller's frame).
    runs = [
        _calculate_xmeans_cluster_center(df.copy(), kmax=kmax)
        for _ in range(n_iteration)
    ]
    stacked = pd.concat(runs, axis=0).reset_index(drop=True)
    return stacked.groupby('Code').mean()

# Sub-function: run x-means once and replace each row by its cluster centre
def _calculate_xmeans_cluster_center(df, kmax):
    """Cluster the rows of ``df`` with x-means and return the assigned centres.

    Args:
        df: Feature table, one row per item. It is not modified.
        kmax: Upper bound on the number of clusters x-means may create.

    Returns:
        A copy of ``df`` in which every row has been overwritten with the
        centre of the cluster it belongs to, with the index moved into a
        regular column by ``reset_index()``.
    """
    # Start from two k-means++ seeds and let x-means split from there.
    initial_centers = kmeans_plusplus_initializer(data=df, amount_centers=2).initialize()
    model = xmeans(data=df, initial_centers=initial_centers, kmax=kmax, tolerance=0.001, ccore=True)
    model.process()

    # Use the public accessors instead of the name-mangled private attributes.
    centers = np.array(model.get_centers(), dtype=np.float32)

    # Write into a copy so the caller's frame keeps its original coordinates.
    result = df.copy()
    for cluster_num, member_positions in enumerate(model.get_clusters()):
        # member_positions are row positions, so assign positionally; the
        # centre is repeated explicitly to one row per member.
        result.iloc[member_positions] = np.tile(centers[cluster_num], (len(member_positions), 1))

    return result.reset_index()


# Decide clusters from pairwise Euclidean distances
def determine_cluster_from_euclidean(df:pd.DataFrame, theshold_column='Distance', cluster_theshold=0.03):
    """Compute pairwise distances between the rows of ``df`` and cluster on them.

    Args:
        df: Feature table, one row per item.
        theshold_column: Column name used for the distance values.
        cluster_theshold: Distance limit forwarded to the clustering step.

    Returns:
        Whatever ``_determine_cluster`` returns for the distance table.
    """
    return _determine_cluster(
        df=_calculate_euclidean(df=df),
        theshold_column=theshold_column,
        cluster_theshold=cluster_theshold,
    )

# Sub-function: pairwise Euclidean distances between rows
def _calculate_euclidean(df:pd.DataFrame):
    """Return the square table of Euclidean distances between the rows of ``df``.

    Both the index and the columns of the result are ``df.index``.
    """
    points = df.values
    labels = df.index
    distances = cdist(points, points, metric='euclidean')
    return pd.DataFrame(distances, index=labels, columns=labels)

# Sub-function: assign cluster numbers from a table of pairwise Euclidean distances
def _determine_cluster(df:pd.DataFrame, theshold_column:str, cluster_theshold:float):
    """Label items whose pairwise distance is within a threshold and attach listing metadata.

    Args:
        df: Distance table to be unstacked into (column label, row label) pairs.
            Presumably square with the same codes on both axes — verify against caller.
        theshold_column: Name given to the distance column of the unstacked table.
        cluster_theshold: Pairs with a distance less than or equal to this value are kept.

    Returns:
        A DataFrame indexed by 'Code', sorted by 'Cluster' then 'Code', with the
        columns 'CompanyName', 'MarketCodeName', 'Sector33CodeName',
        'Sector17CodeName', 'ScaleCategory', 'Listing' and 'Cluster'.

    Note:
        Reads the module-level ``stock_dfs['list']`` for the metadata join.
    """
    # Flatten the distance table to one row per pair of labels.
    df_unstacked = pd.DataFrame(df.unstack(), columns=[theshold_column])
    # Keep only the pairs within the threshold, closest pairs first.
    in_theshold = df_unstacked[theshold_column]<=cluster_theshold
    df_in_theshold = df_unstacked[in_theshold].sort_values(theshold_column, ascending=True)
    # NaN marks a pair that has not been given a cluster number yet.
    df_in_theshold['Cluster'] = np.nan

    cluster_num = 0
    # Visit each first-level label once, in order of first appearance in the sorted table.
    for i in df_in_theshold.index.get_level_values(0).unique():

        # Stop as soon as no pair is left without a cluster number.
        clustering_completed = df_in_theshold['Cluster'].isnull().any() == False
        if clustering_completed:
            break
        # True when none of the pairs whose first-level label is i has a number yet.
        cluster_is_undetermined = \
          df_in_theshold.loc[df_in_theshold.index.get_level_values(0)==i, 'Cluster'].isnull().all()
        if cluster_is_undetermined:
            # Open a new cluster and give its number to every pair that has i at either level.
            df_in_theshold.loc[df_in_theshold.index.get_level_values(0)==i, 'Cluster'] = cluster_num
            df_in_theshold.loc[df_in_theshold.index.get_level_values(1)==i, 'Cluster'] = cluster_num
            cluster_num += 1
        else:
            # Some pairs for i are already numbered: overwrite all pairs that have i at
            # either level with the truncated mean of the numbers found on the pairs
            # whose first-level label is i.
            df_in_theshold.loc[df_in_theshold.index.get_level_values(0)==i, 'Cluster'] = \
                int(df_in_theshold.loc[df_in_theshold.index.get_level_values(0)==i, 'Cluster'].mean())
            df_in_theshold.loc[df_in_theshold.index.get_level_values(1)==i, 'Cluster'] = \
                int(df_in_theshold.loc[df_in_theshold.index.get_level_values(0)==i, 'Cluster'].mean())

    # Keep only the integer label, drop the first index level, and keep the first
    # row for each remaining index value.
    df_in_theshold = df_in_theshold[['Cluster']].astype(int).reset_index(level=0, drop=True)
    df_in_theshold = df_in_theshold.loc[~df_in_theshold.index.duplicated(keep='first')]

    # Left-join the listing metadata on 'Code', then order by cluster and code.
    df_clustered = pd.merge(
                            df_in_theshold, stock_dfs['list'],
                            how='left', on='Code',
                            ).sort_values(by=['Cluster', 'Code'], ascending=True).set_index('Code', drop=True)
    return df_clustered[['CompanyName', 'MarketCodeName', 'Sector33CodeName', 'Sector17CodeName', 'ScaleCategory', 'Listing', 'Cluster']]



### クラスタリング(1段階目)


In [1302]:
# Codes to cluster in this stage; every one of them must be present in
# extracted_df's index, otherwise the label lookup below raises KeyError.
filter_codes = [
    "7164", "8473", "8698", "8604", "8609", "8613", "8616", "8628", "8706",
    "8601", "8511", "8697", "8130", "7981", "5929", "5943", "9934", "5930",
    "7943", "6651", "7148", "5938", "1808", "8897", "1419", "1928", "3291",
    "4204", "8848", "1911", "1878", "1925", "1766", "8923", "8905", "8803",
    "3289", "8801", "8802", "8804", "8830", "3231", "3003", "8934", "8871",
]

# Restrict the feature table to those codes, in the order listed above.
extracted_df = extracted_df.loc[filter_codes, :]

In [None]:
# Run the UMAP -> x-means -> distance-threshold pipeline 10 times and collect
# the labels of run i in a column named Cluster_<i>.
clustered_dfs = pd.DataFrame()
for i in range(10):
    umaped_df = apply_UMAP(extracted_df)
    df_after_xmeans = apply_xmeans(umaped_df, kmax=50)
    clustered_df = determine_cluster_from_euclidean(df_after_xmeans, cluster_theshold=0.03)
    # Once the accumulator has rows, later runs contribute their label column only;
    # the other columns come from the run that first filled the accumulator.
    if len(clustered_dfs.index) > 0:
        clustered_df = clustered_df.loc[:, ['Cluster']]
    clustered_df = clustered_df.rename(columns={'Cluster': f'Cluster_{i}'})
    clustered_dfs = clustered_dfs.merge(clustered_df, how='outer', left_index=True, right_index=True)

# Persist the combined labels and show them as the cell output.
clustered_dfs.to_csv('cluster1000_2.csv')
clustered_dfs