## 事前準備

### モジュールのインポート

In [1]:
import warnings
warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')

In [2]:
# 自作モジュール
from utils.paths import Paths
from acquisition.jquants_api_operations import StockAcquisitionFacade
from utils.jquants_api_utils import cli
from calculation.target import TargetCalculator
# 基本モジュール
from datetime import datetime
import pandas as pd
import numpy as np


## 実行

### 銘柄情報の取得
* stock_lists: 2014年10月からの銘柄一覧
* history_list: 銘柄ごとのScaleCategoryの変遷

In [1295]:
# Stock codes of interest. Every entry is passed through str() so the list
# is guaranteed to contain string codes only.
filter_codes = list(map(str, [
    "4912", "4917", "7956", "8113", "4919",
    "4928", "6630", "7744", "4452", "4967",
    "8283", "4028", "4911", "4922", "4927",
]))

In [1296]:
# Listing filter: currently listed stocks (Listing==1) whose ScaleCategory is
# one of the four TOPIX size buckets listed below.
# NOTE(review): the original comment labelled this "current TOPIX500", yet
# 'TOPIX Small 1' is included as well -- confirm the intended universe.
scale_categories = ['TOPIX Core30', 'TOPIX Large70', 'TOPIX Mid400', 'TOPIX Small 1']
scale_clause = '|'.join(f"(ScaleCategory=='{category}')" for category in scale_categories)
filter_condition = f"(Listing==1)&({scale_clause})"

# Fetch the data through the facade. Restricting by an explicit code list
# (filtered_code_list=filter_codes) stays disabled, as in the original cell.
saf = StockAcquisitionFacade(filter=filter_condition)
stock_dfs = saf.get_stock_data_dict()

# Show the filtered stock list as the cell output.
stock_dfs['list']

Unnamed: 0,Code,CompanyName,MarketCodeName,Sector33CodeName,Sector17CodeName,ScaleCategory,Listing
18,1332,ニッスイ,プライム,水産・農林業,食品,TOPIX Mid400,1
19,1333,マルハニチロ,プライム,水産・農林業,食品,TOPIX Mid400,1
41,1377,サカタのタネ,プライム,水産・農林業,食品,TOPIX Small 1,1
57,1414,ショーボンドホールディングス,プライム,建設業,建設・資材,TOPIX Mid400,1
58,1417,ミライト・ワン,プライム,建設業,建設・資材,TOPIX Mid400,1
...,...,...,...,...,...,...,...
4393,9983,ファーストリテイリング,プライム,小売業,小売,TOPIX Core30,1
4394,9984,ソフトバンクグループ,プライム,情報･通信業,情報通信・サービスその他,TOPIX Core30,1
4396,9987,スズケン,プライム,卸売業,商社・卸売,TOPIX Mid400,1
4397,9989,サンドラッグ,プライム,小売業,小売,TOPIX Mid400,1


### 価格情報の準備

In [1297]:
# Target variable: intraday return (Close / Open - 1), stored as a new
# 'Target' column on the price table itself.
price_df = stock_dfs['price']
price_df['Target'] = price_df['Close'].div(price_df['Open']).sub(1)

# Reshape from long format to a wide matrix: one row per Date, one column per Code.
target = price_df.pivot(index='Date', columns='Code', values='Target')
target

Code,1332,1333,1377,1414,1417,1419,1514,1515,1518,1605,...,9936,9948,9956,9962,9974,9983,9984,9987,9989,9997
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-04-05,0.000000,-0.022472,0.003241,-0.031609,-0.017260,-0.041379,-0.060606,0.012605,-0.039106,-0.026786,...,,-0.000985,0.015025,-0.035254,0.006993,-0.081579,-0.016913,-0.016667,-0.005974,0.011403
2013-04-08,-0.011050,0.000000,0.008019,0.011747,0.012848,0.003497,0.016000,0.000000,0.005780,-0.013972,...,,0.007901,-0.004875,-0.002072,0.010345,-0.015449,0.003161,0.019391,0.001202,0.020925
2013-04-09,-0.011050,0.000000,-0.008744,-0.012931,-0.015756,-0.032886,0.000000,0.006186,-0.005650,0.026000,...,,-0.004455,-0.002186,0.000692,-0.010204,-0.032951,-0.003106,-0.008152,0.003563,-0.018085
2013-04-10,0.005587,0.005556,0.015237,0.040639,0.013786,0.002073,0.023810,0.012371,0.033898,-0.013592,...,,0.005967,-0.004393,-0.028105,0.004135,0.000000,-0.029167,0.013699,-0.021302,0.004278
2013-04-11,-0.005525,0.005435,0.003150,-0.015110,0.005252,-0.001407,-0.030534,0.010204,-0.016304,-0.013540,...,,-0.018500,-0.003333,0.005618,-0.021333,0.010234,0.013830,0.007968,-0.011834,0.013904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-14,0.012460,0.010407,-0.008608,-0.000828,0.004356,-0.001418,-0.006631,0.011182,0.000000,0.004299,...,0.004637,0.009592,0.006783,0.013615,0.005917,0.007725,0.030239,-0.004888,0.011162,0.008621
2025-03-17,0.001802,0.004375,-0.002890,0.013469,0.009977,-0.004255,-0.007968,0.015798,-0.001202,0.007489,...,-0.006144,0.000339,0.001268,0.015957,0.000000,-0.016457,-0.000125,0.000409,0.001441,-0.009554
2025-03-18,0.004889,0.010520,-0.002894,-0.004260,0.004913,0.004274,0.082557,0.015198,0.014269,0.016268,...,0.003082,0.003033,0.000840,-0.010080,-0.001466,-0.002823,-0.017397,0.008155,-0.002370,0.004237
2025-03-19,0.011169,0.013472,0.005789,0.003878,-0.002883,0.009929,0.001253,0.040119,-0.006936,0.004371,...,0.000000,-0.010753,-0.000839,0.007429,0.019062,-0.003495,-0.009767,0.004246,0.011285,0.002116


### PCA処理の実行

In [1298]:
from sklearn.decomposition import PCA

# Only data up to (and including) this date is used to fit the PCA.
end_date = datetime(2022,1,1)
# Number of principal components to keep.
# NOTE(review): the original comment said this yields roughly 80% cumulative
# explained variance, but the recorded output of this cell shows about 50% at
# PC_113 -- re-check the intended threshold.
n_components = 115
# Fixed seed for the PCA solver. With the default svd_solver='auto',
# scikit-learn may select the randomized solver for large inputs when
# n_components is well below the smallest dimension; without a seed the
# components (and everything clustered from them) can differ between runs.
pca_random_state = 0

target = target[target.index <= end_date]
# Drop every stock (column) that has any missing value in the window, then
# transpose so that rows are stocks and columns are dates.
no_missing_residuals = target.dropna(axis=1).T

pca = PCA(n_components=n_components, random_state=pca_random_state).fit(no_missing_residuals)

# Cumulative explained-variance ratio, labelled PC_000, PC_001, ...
pc_labels = [f'PC_{j:03d}' for j in range(n_components)]
explained_ratio_df = pd.DataFrame(np.cumsum(pca.explained_variance_ratio_),
                                  index=pc_labels,
                                  columns=['ExplainedRatio'])
explained_ratio_df

Unnamed: 0,ExplainedRatio
PC_000,0.052151
PC_001,0.082441
PC_002,0.101188
PC_003,0.118848
PC_004,0.132631
...,...
PC_110,0.494700
PC_111,0.496618
PC_112,0.498525
PC_113,0.500426


In [1299]:
# Project every stock (row of no_missing_residuals) onto the fitted components.
pca_array = pca.transform(no_missing_residuals)

component_labels = ['PC_{:0=3}'.format(j) for j in range(n_components)]
pca_df = pd.DataFrame(pca_array, index=no_missing_residuals.index, columns=component_labels)

# Discard the listed components and keep every other column.
excluded_components = ['PC_000', 'PC_002', 'PC_003', 'PC_004']
pca_df = pca_df.loc[:, ~pca_df.columns.isin(excluded_components)]

# Order rows by index, write them to disk, and show them as the cell output.
extracted_df = pca_df.sort_index()
extracted_df.to_csv('pca_residue.csv')
extracted_df

Unnamed: 0_level_0,PC_001,PC_005,PC_006,PC_007,PC_008,PC_009,PC_010,PC_011,PC_012,PC_013,...,PC_105,PC_106,PC_107,PC_108,PC_109,PC_110,PC_111,PC_112,PC_113,PC_114
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1332,0.090865,-0.036063,-0.040189,-0.000418,-0.015146,0.013279,-0.050475,0.066829,-0.032814,0.092481,...,-0.003830,0.055505,-0.108564,-0.006134,-0.000709,-0.044390,-0.025046,0.045767,-0.046913,0.003845
1333,0.056864,-0.045871,-0.100996,0.041135,-0.038302,0.000021,-0.011970,0.043968,-0.032267,0.033310,...,-0.003517,0.009158,-0.022241,0.005037,0.009881,0.021685,-0.010504,-0.011996,0.019117,0.016874
1377,0.002000,0.000156,-0.021307,-0.004182,-0.031280,0.030437,0.003149,0.013997,-0.033700,0.004793,...,0.021567,0.034435,-0.013349,-0.010758,0.013699,-0.005419,-0.019280,-0.016214,0.020302,0.029854
1414,-0.028515,-0.134983,0.067094,-0.008941,0.017286,-0.043247,-0.032129,-0.042443,0.038963,0.005193,...,0.041410,-0.013864,0.006072,-0.021880,0.004593,-0.000655,-0.010749,0.014302,0.009597,0.021230
1417,-0.012374,-0.091443,0.011125,-0.012697,0.017899,-0.027243,0.009090,-0.045658,0.016444,0.000963,...,0.024073,-0.034080,-0.007774,-0.013833,0.029089,-0.029100,0.012978,-0.005811,-0.024443,-0.044721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9983,0.038546,0.027496,-0.016591,-0.035070,0.008854,-0.090400,0.025024,0.054788,-0.025481,0.161463,...,0.025095,-0.024409,-0.031695,-0.017486,0.001140,-0.018949,-0.025834,-0.007786,-0.032278,0.032505
9984,-0.102381,-0.052675,-0.171017,-0.009828,-0.041423,-0.054601,0.026533,-0.030477,-0.034544,-0.041807,...,0.039115,-0.018080,-0.031147,-0.052499,0.060150,0.024164,-0.023095,-0.031361,-0.049042,0.068435
9987,0.103157,-0.003135,-0.054280,-0.038549,0.052713,0.015157,-0.010226,0.046086,0.033858,-0.040706,...,0.047149,0.030758,-0.010637,0.002644,-0.001545,0.007841,-0.026210,-0.015868,-0.018016,-0.007701
9989,0.027636,0.005350,0.014875,-0.059225,0.029046,0.009518,0.037518,0.015689,-0.153441,0.024285,...,-0.010151,0.000988,0.018183,-0.002963,-0.000362,-0.000069,0.002973,-0.002093,0.008865,-0.033241


In [1300]:
# Disabled export: join the stock list with the leading principal components
# and write the result to 'PC2.csv'.
# It used to be wrapped in a triple-quoted string, which is not a comment: the
# string was evaluated and echoed back as this cell's output.
# NOTE(review): the preceding cell removes PC_000/PC_002/PC_003/PC_004 from
# extracted_df, so this column selection would raise KeyError if re-enabled as
# is -- build it from the unfiltered frame, or delete this cell.
# df = pd.merge(stock_dfs['list'], extracted_df[['PC_000', 'PC_001', 'PC_002', 'PC_003', 'PC_004', 'PC_005']], how='right', left_on='Code', right_index=True)
# df.to_csv('PC2.csv')

"\ndf = pd.merge(stock_dfs['list'], extracted_df[['PC_000', 'PC_001', 'PC_002', 'PC_003', 'PC_004', 'PC_005']], how='right', left_on='Code', right_index=True)\ndf.to_csv('PC2.csv')\n"

### クラスタリングの関数

In [1301]:
# Prepare the clustering pipeline
from project.modules.clustering import SectorClusteringPipeline

# Construct the pipeline from the stock list table (stock_dfs['list']).
# NOTE(review): the constructor's contract lives in project.modules.clustering
# and is not visible from this notebook.
pipeline = SectorClusteringPipeline(stock_dfs['list'])


### クラスタリング(1段階目)


In [1302]:
# Codes to cluster in this first stage. This rebinds filter_codes, replacing
# the list defined earlier in the notebook.
filter_codes = [
    "7164", "8473", "8698", "8604", "8609", "8613", "8616", "8628", "8706",
    "8601", "8511", "8697", "8130", "7981", "5929", "5943", "9934", "5930",
    "7943", "6651", "7148", "5938", "1808", "8897", "1419", "1928", "3291",
    "4204", "8848", "1911", "1878", "1925", "1766", "8923", "8905", "8803",
    "3289", "8801", "8802", "8804", "8830", "3231", "3003", "8934", "8871",
]

# Keep only the rows for those codes, in list order. .loc raises KeyError if a
# code is absent from the index.
extracted_df = extracted_df.loc[filter_codes]

In [1303]:
# Run the clustering pipeline on the current extracted_df.
# NOTE(review): what execute() computes is defined in project.modules.clustering;
# presumably it returns a DataFrame, since to_csv is called on the result.
clustered_df = pipeline.execute(extracted_df)
# Write the result to 'clustered.csv' (relative path).
clustered_df.to_csv('clustered.csv')
# Show the result as the cell output.
clustered_df




Unnamed: 0_level_0,CompanyName,MarketCodeName,Sector33CodeName,Sector17CodeName,ScaleCategory,Listing,Cluster_0,Cluster_1,Cluster_2,Cluster_3,Cluster_4,Cluster_5,Cluster_6,Cluster_7,Cluster_8,Cluster_9
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1419,タマホーム,プライム,建設業,建設・資材,TOPIX Small 1,1,0,0,0,0,0,0,0,0,0,0
1766,東建コーポレーション,プライム,建設業,建設・資材,TOPIX Small 1,1,0,0,0,0,0,0,0,0,0,0
1808,長谷工コーポレーション,プライム,建設業,建設・資材,TOPIX Mid400,1,3,3,3,3,3,3,3,0,3,3
1878,大東建託,プライム,不動産業,不動産,TOPIX Mid400,1,3,3,3,3,3,3,3,0,3,3
1911,住友林業,プライム,建設業,建設・資材,TOPIX Mid400,1,3,3,3,3,3,3,3,0,3,3
1925,大和ハウス工業,プライム,建設業,建設・資材,TOPIX Large70,1,3,3,3,3,3,3,3,0,3,3
1928,積水ハウス,プライム,建設業,建設・資材,TOPIX Large70,1,3,3,3,3,3,3,3,0,3,3
3003,ヒューリック,プライム,不動産業,不動産,TOPIX Mid400,1,2,2,2,2,2,2,2,2,2,2
3231,野村不動産ホールディングス,プライム,不動産業,不動産,TOPIX Mid400,1,2,2,2,2,2,2,2,2,2,2
3289,東急不動産ホールディングス,プライム,不動産業,不動産,TOPIX Mid400,1,2,2,2,2,2,2,2,2,2,2
