## 事前準備

### モジュールのインポート

In [1]:
import warnings
warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')

In [2]:
# 自作モジュール
from utils.paths import Paths
from acquisition.jquants_api_operations import StockAcquisitionFacade
from utils.jquants_api_utils import cli
from calculation.target import TargetCalculator
# 基本モジュール
from datetime import datetime
import pandas as pd
import numpy as np
# クラスタリングで使用
from clustering import UMAPReducer, HDBSCANCluster, EuclideanClusterAssigner, SectorClusterer


## 実行

### 銘柄情報の取得
* stock_lists: 2014年10月からの銘柄一覧
* history_list: 銘柄ごとのScaleCategoryの遍歴

In [1295]:
# Hand-picked list of codes; every entry is normalised to str.
filter_codes = list(map(str, [
    "4912", "4917", "7956", "8113", "4919",
    "4928", "6630", "7744", "4452", "4967",
    "8283", "4028", "4911", "4922", "4927",
]))

In [None]:
# Query string: listed issues (Listing==1) whose ScaleCategory is one of four TOPIX buckets.
# NOTE(review): the original comment described this as "the current TOPIX500", yet the
# condition also admits 'TOPIX Small 1' -- confirm which universe is intended.
filter_condition = (
    "(Listing==1)&("
    "(ScaleCategory=='TOPIX Core30')|"
    "(ScaleCategory=='TOPIX Large70')|"
    "(ScaleCategory=='TOPIX Mid400')|"
    "(ScaleCategory=='TOPIX Small 1')"
    ")"
)

# Alternative left disabled by the author: filtered_code_list=filter_codes
saf = StockAcquisitionFacade(filter=filter_condition)
stock_dfs = saf.get_stock_data_dict()
stock_dfs['list']

### 価格情報の準備

In [None]:
# Objective variable: intraday return, Close / Open - 1, stored as a 'Target'
# column on the price frame itself (the frame held in stock_dfs is modified).
price_df = stock_dfs['price']
price_df['Target'] = price_df['Close'] / price_df['Open'] - 1

# Reshape to wide form: index by (Date, Code), move Code into the columns,
# then drop the outer 'Target' column level so columns are the codes only.
target = (
    price_df[['Date', 'Code', 'Target']]
    .set_index(['Date', 'Code'], drop=True)
    .unstack(-1)
    .droplevel(0, axis=1)
)
target

### PCA処理の実行

In [None]:
from sklearn.decomposition import PCA

end_date = datetime(2022, 1, 1)
n_components = 115  # number of components at which the cumulative explained ratio is roughly 80%
# Fixed seed so repeated runs give the same components. With svd_solver='auto'
# scikit-learn may choose the randomized solver for large inputs, and the next
# cell selects components by their ordinal names, so run-to-run drift matters.
pca_random_state = 0

# Keep only rows whose index value is on or before end_date.
target = target[target.index <= end_date]
# Drop every column that contains a missing value, then transpose so that each
# row fed to PCA corresponds to one remaining column of `target`.
no_missing_residuals = target.dropna(axis=1).T

pca = PCA(n_components=n_components, random_state=pca_random_state).fit(no_missing_residuals)

# Cumulative explained-variance ratio per component, labelled PC_000, PC_001, ...
explained_ratio_df = pd.DataFrame(
    np.cumsum(pca.explained_variance_ratio_),
    index=['PC_' + '{:0=3}'.format(j) for j in range(0, n_components)],
    columns=['ExplainedRatio'],
)
explained_ratio_df

In [None]:
# Project the PCA input onto the fitted components.
pca_array = pca.transform(no_missing_residuals)

component_labels = ['PC_' + '{:0=3}'.format(j) for j in range(0, n_components)]
pca_df = pd.DataFrame(
    pca_array,
    index=no_missing_residuals.index,
    columns=component_labels,
)

# Remove the listed components and keep all others in their original order.
# NOTE(review): PC_001 is retained while PC_000/002/003/004 are removed -- confirm this is intentional.
excluded_components = ['PC_000', 'PC_002', 'PC_003', 'PC_004']
pca_df = pca_df.loc[:, ~pca_df.columns.isin(excluded_components)]

# Sort rows by index, persist to CSV, and display.
extracted_df = pca_df.sort_index(ascending=True)
extracted_df.to_csv('pca_residue.csv')
extracted_df

### クラスタリングの関数

In [1301]:
# Instantiate the sector-clustering helper from the stock list table.
clusterer = SectorClusterer(stock_dfs['list'])


def run_pipeline(df, min_cluster_sizes=None, threshold=0.03):
    """Run UMAP -> HDBSCAN -> Euclidean assignment -> distance analysis via ``clusterer``.

    Args:
        df: Input handed to ``clusterer.apply_umap``.
        min_cluster_sizes: Forwarded unchanged to ``clusterer.apply_hdbscan``.
        threshold: Forwarded unchanged to
            ``clusterer.determine_cluster_from_euclidean``.

    Returns:
        The Euclidean-assignment result joined with the HDBSCAN labels and then
        left-joined with the cluster-distance analysis.
    """
    embedding = clusterer.apply_umap(df)
    hdbscan_labels = clusterer.apply_hdbscan(embedding, min_cluster_sizes)
    euclidean_assignment = clusterer.determine_cluster_from_euclidean(embedding, threshold)
    # The distance analysis is driven by the 'Cluster' column of the HDBSCAN output.
    distance_analysis = clusterer.analyze_cluster_distances(embedding, hdbscan_labels['Cluster'])

    combined = euclidean_assignment.join(hdbscan_labels)
    return combined.join(distance_analysis, how='left')


### クラスタリング(1段階目)


In [1302]:
# Codes to keep for the first clustering stage (this rebinds the earlier filter_codes).
filter_codes = [
    "7164", "8473", "8698", "8604", "8609", "8613", "8616", "8628", "8706",
    "8601", "8511", "8697", "8130", "7981", "5929", "5943", "9934", "5930",
    "7943", "6651", "7148", "5938", "1808", "8897", "1419", "1928", "3291",
    "4204", "8848", "1911", "1878", "1925", "1766", "8923", "8905", "8803",
    "3289", "8801", "8802", "8804", "8830", "3231", "3003", "8934", "8871",
]

# Label-based row selection in list order; a code absent from the index raises KeyError.
extracted_df = extracted_df.loc[filter_codes]

In [None]:
# Cluster the selected rows with default settings, save the result to CSV, and display it.
result_df = run_pipeline(df=extracted_df)
result_df.to_csv('cluster_hdbscan.csv')
result_df
