## 事前準備

### モジュールのインポート

In [1]:
import warnings
warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')

In [2]:
# 自作モジュール
from utils.paths import Paths
from acquisition.jquants_api_operations import StockAcquisitionFacade
from utils.jquants_api_utils import cli
from calculation.target import TargetCalculator
# 基本モジュール
from datetime import datetime
import pandas as pd
import numpy as np
# クラスタリングで使用
from clustering import UMAPReducer, HDBSCANCluster, SectorClusterer


## 実行

### 銘柄情報の取得
* stock_lists: 2014年10月からの銘柄一覧
* history_list: 銘柄ごとのScaleCategoryの遍歴

In [3]:
# Keep rows with Listing == 1 whose ScaleCategory is one of the TOPIX 500
# buckets (Core30 / Large70 / Mid400) or 'TOPIX Small 1'.
# NOTE(review): the original comment called this "current TOPIX500", but the
# expression also admits 'TOPIX Small 1' -- confirm which universe is intended.
filter_condition = (
    "(Listing==1)&("
    "(ScaleCategory=='TOPIX Core30')"
    "|(ScaleCategory=='TOPIX Large70')"
    "|(ScaleCategory=='TOPIX Mid400')"
    "|(ScaleCategory=='TOPIX Small 1')"
    ")"
)

# The facade returns a dict of DataFrames; the 'list' and 'price' entries are
# the ones used in this notebook.
saf = StockAcquisitionFacade(filter=filter_condition)
stock_dfs = saf.get_stock_data_dict()
stock_dfs['list']

Unnamed: 0,Code,CompanyName,MarketCodeName,Sector33CodeName,Sector17CodeName,ScaleCategory,Listing
18,1332,ニッスイ,プライム,水産・農林業,食品,TOPIX Mid400,1
19,1333,マルハニチロ,プライム,水産・農林業,食品,TOPIX Mid400,1
41,1377,サカタのタネ,プライム,水産・農林業,食品,TOPIX Small 1,1
57,1414,ショーボンドホールディングス,プライム,建設業,建設・資材,TOPIX Mid400,1
58,1417,ミライト・ワン,プライム,建設業,建設・資材,TOPIX Mid400,1
...,...,...,...,...,...,...,...
4397,9983,ファーストリテイリング,プライム,小売業,小売,TOPIX Core30,1
4398,9984,ソフトバンクグループ,プライム,情報･通信業,情報通信・サービスその他,TOPIX Core30,1
4400,9987,スズケン,プライム,卸売業,商社・卸売,TOPIX Mid400,1
4401,9989,サンドラッグ,プライム,小売業,小売,TOPIX Mid400,1


### 価格情報の準備

In [4]:
# Objective variable: intraday return, Close / Open - 1.
# The column is written onto the shared price frame, as before.
price_df = stock_dfs['price']
price_df['Target'] = price_df['Close'] / price_df['Open'] - 1

# Reshape the long (Date, Code) records into a wide Date x Code frame.
target = price_df.set_index(['Date', 'Code'])['Target'].unstack('Code')
target

Code,1332,1333,1377,1414,1417,1419,1514,1515,1518,1605,...,9936,9948,9956,9962,9974,9983,9984,9987,9989,9997
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-04-05,0.000000,-0.022472,0.003241,-0.031609,-0.017260,-0.041379,-0.060606,0.012605,-0.039106,-0.026786,...,,-0.000985,0.015025,-0.035254,0.006993,-0.081579,-0.016913,-0.016667,-0.005974,0.011403
2013-04-08,-0.011050,0.000000,0.008019,0.011747,0.012848,0.003497,0.016000,0.000000,0.005780,-0.013972,...,,0.007901,-0.004875,-0.002072,0.010345,-0.015449,0.003161,0.019391,0.001202,0.020925
2013-04-09,-0.011050,0.000000,-0.008744,-0.012931,-0.015756,-0.032886,0.000000,0.006186,-0.005650,0.026000,...,,-0.004455,-0.002186,0.000692,-0.010204,-0.032951,-0.003106,-0.008152,0.003563,-0.018085
2013-04-10,0.005587,0.005556,0.015237,0.040639,0.013786,0.002073,0.023810,0.012371,0.033898,-0.013592,...,,0.005967,-0.004393,-0.028105,0.004135,0.000000,-0.029167,0.013699,-0.021302,0.004278
2013-04-11,-0.005525,0.005435,0.003150,-0.015110,0.005252,-0.001407,-0.030534,0.010204,-0.016304,-0.013540,...,,-0.018500,-0.003333,0.005618,-0.021333,0.010234,0.013830,0.007968,-0.011834,0.013904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30,0.013038,0.019935,0.004360,0.001636,0.015109,-0.027248,0.013657,0.020378,0.001025,0.003858,...,0.016035,-0.000349,0.007010,0.013499,0.001416,0.009395,-0.000525,0.021731,0.012627,0.005682
2025-06-02,-0.008856,-0.010207,-0.021645,0.006144,-0.001371,-0.002817,0.036090,0.005722,-0.012308,-0.001024,...,0.011461,-0.007639,0.004918,-0.002934,-0.007123,0.003148,-0.023240,0.009869,0.000225,0.005656
2025-06-03,0.003825,-0.004531,-0.008876,0.007531,-0.012941,-0.018414,0.011782,0.036879,-0.005258,0.008418,...,0.012748,-0.004213,-0.003259,0.006361,0.004310,-0.003103,-0.007133,-0.012216,0.006076,0.007892
2025-06-04,0.006429,0.006230,0.029851,-0.007478,0.000000,0.000000,0.000000,0.020520,0.006349,-0.002519,...,0.001410,0.009859,-0.005326,-0.002100,0.001433,0.013424,0.003499,0.008759,-0.004039,-0.008909


### PCA処理の実行

In [5]:
from sklearn.decomposition import PCA

end_date = datetime(2022, 1, 1)

# Restrict the sample to dates up to end_date, drop every stock (column) that
# has any missing value in that window, then transpose so that each row is a
# stock and each column is a date.
target = target[target.index <= end_date]
no_missing_residuals = target.dropna(axis=1).T

# PCA cannot extract more components than min(n_samples, n_features); clamp
# the requested 600 so the cell does not raise ValueError when fewer stocks or
# dates survive the filters above. Later cells reuse `n_components` for the
# column labels, so the clamped value is stored under the same name.
n_components = min(600, *no_missing_residuals.shape)

# random_state pins the result when sklearn's automatic solver choice falls on
# the randomized SVD; without it the components can differ between runs.
pca = PCA(n_components=n_components, random_state=0).fit(no_missing_residuals)

# Cumulative explained-variance ratio, labelled PC_000, PC_001, ...
pc_labels = [f'PC_{j:03d}' for j in range(n_components)]
explained_ratio_df = pd.DataFrame(
    np.cumsum(pca.explained_variance_ratio_),
    index=pc_labels,
    columns=['ExplainedRatio'],
)
explained_ratio_df

Unnamed: 0,ExplainedRatio
PC_000,0.052070
PC_001,0.082363
PC_002,0.101146
PC_003,0.118865
PC_004,0.132671
...,...
PC_595,0.951465
PC_596,0.951772
PC_597,0.952077
PC_598,0.952380


In [6]:
# Project every stock onto the fitted principal components.
component_names = [f'PC_{j:03d}' for j in range(n_components)]
pca_array = pca.transform(no_missing_residuals)

pca_df = pd.DataFrame(
    data=pca_array,
    index=no_missing_residuals.index,
    columns=component_names,
)

# Order rows by the index (ascending) and persist the scores to CSV.
extracted_df = pca_df.sort_index()
extracted_df.to_csv('pca_residue.csv')
extracted_df

Unnamed: 0_level_0,PC_000,PC_001,PC_002,PC_003,PC_004,PC_005,PC_006,PC_007,PC_008,PC_009,...,PC_590,PC_591,PC_592,PC_593,PC_594,PC_595,PC_596,PC_597,PC_598,PC_599
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1332,0.090451,0.090313,-0.034657,-0.077660,0.061229,-0.036503,-0.040300,0.001044,-0.014527,0.013607,...,0.002033,0.000078,0.014088,0.007883,0.002335,0.010929,0.004746,-0.003603,0.012418,-0.007510
1333,0.150063,0.056509,-0.021346,-0.021120,0.094801,-0.046047,-0.101080,0.042065,-0.038165,-0.001628,...,0.007812,0.015494,-0.037711,0.007827,-0.016780,0.013980,0.001703,-0.009737,-0.014425,0.015567
1377,0.205484,0.002392,0.034572,0.067243,0.023296,0.000014,-0.021525,-0.004484,-0.033037,0.028025,...,-0.014330,0.012796,-0.014325,0.021601,0.006518,-0.027702,-0.009978,-0.001451,-0.007038,0.010161
1414,0.123851,-0.028660,-0.021627,0.063291,-0.006465,-0.135176,0.067006,-0.007860,0.021070,-0.041149,...,0.000827,-0.002211,0.010285,0.025622,0.012482,0.005818,0.023349,0.022752,-0.017296,-0.017762
1417,0.062327,-0.012388,-0.016964,0.086537,0.014600,-0.091703,0.011317,-0.011680,0.021163,-0.025103,...,-0.002798,-0.002109,0.011851,0.000077,-0.016419,-0.004660,0.001534,-0.024547,0.001663,-0.012291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9983,-0.044501,0.038310,0.021695,-0.280806,-0.077470,0.027765,-0.016673,-0.035471,0.012108,-0.091556,...,0.001758,-0.020683,-0.015473,-0.021217,0.001327,-0.011637,-0.023271,0.002174,-0.002921,-0.006361
9984,-0.084884,-0.102121,0.060698,-0.147377,-0.034021,-0.052601,-0.171202,-0.009191,-0.039704,-0.057369,...,0.006959,-0.014196,-0.006023,0.015263,0.009150,0.013537,-0.008025,0.011138,-0.006926,0.012176
9987,0.161280,0.103493,0.047732,-0.019590,-0.017524,-0.002896,-0.054345,-0.040273,0.050617,0.016548,...,0.013109,0.005730,-0.004365,-0.006474,0.010717,-0.006178,-0.004546,0.013273,-0.009472,0.012357
9989,0.266386,0.027852,0.061253,-0.044490,0.011612,0.005407,0.014316,-0.060570,0.026374,0.008935,...,-0.016916,0.014957,0.028894,-0.031267,0.004493,0.020574,-0.008674,-0.002681,-0.009558,-0.015877


### クラスタリングの関数

In [7]:
# Prepare the sector-clustering helper, built from the stock list table.
clusterer = SectorClusterer(stock_dfs["list"])

# Run UMAP -> (recursive) HDBSCAN.
def run_pipeline(df, umap_n_components=30, umap_n_neighbors=2, umap_min_dist=0.01, hdbscan_min_cluster_sizes=None):
    """Reduce ``df`` with UMAP and label the result with recursive HDBSCAN.

    Both steps are delegated to the notebook-level ``clusterer`` object.

    Args:
        df: Input handed unchanged to ``clusterer.apply_umap``.
        umap_n_components: Forwarded to ``apply_umap`` as ``n_components``.
        umap_n_neighbors: Forwarded to ``apply_umap`` as ``n_neighbors``.
        umap_min_dist: Forwarded to ``apply_umap`` as ``min_dist``.
        hdbscan_min_cluster_sizes: Sequence forwarded to
            ``clusterer.apply_recursive_hdbscan``. ``None`` (the default)
            means ``[2, 3, 4, 5, 6, 7, 8, 9, 10]``.

    Returns:
        Whatever ``clusterer.apply_recursive_hdbscan`` returns.
    """
    if hdbscan_min_cluster_sizes is None:
        # Build a fresh list on every call: a list literal used as the default
        # would be shared between calls and could be corrupted by a callee
        # that mutates it.
        hdbscan_min_cluster_sizes = list(range(2, 11))

    reduced = clusterer.apply_umap(
        df,
        n_components=umap_n_components,
        n_neighbors=umap_n_neighbors,
        min_dist=umap_min_dist,
    )
    labels = clusterer.apply_recursive_hdbscan(
        reduced,
        hdbscan_min_cluster_sizes,
        metric='correlation',
    )
    return labels


### クラスタリング(1段階目)


In [8]:
# Stage-1 clustering on the PCA scores; the labels are also saved to CSV.
# NOTE(review): the saved output of this cell is
# "TypeError: Unknown algorithm type brute specified". run_pipeline never
# passes an `algorithm` argument, so the value is presumably set inside the
# `clustering` module -- verify there.
result_df = run_pipeline(df=extracted_df)
result_df.to_csv('cluster_hdbscan.csv')
result_df



TypeError: Unknown algorithm type brute specified