In [4]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from joblib import Parallel, delayed

# 데이터 불러오기 및 전처리
dfs = {}
keywords = ["auto", "construct", "capital_market", "chemicals", "equipment",
            "transport", "semi", "bank", "steel",
            "telecom", "staples", "discretionary", "kospi"]
for keyword in keywords:
    dfs[keyword] = pd.read_excel(r"total_raw_0.5ver.xlsx", sheet_name=keyword, header=0, index_col=0)
    dfs[keyword].index = pd.to_datetime(dfs[keyword].index)
    dfs[keyword].index = dfs[keyword].index.strftime('%Y-%m-%d')
    dfs[keyword].dropna(inplace=True)

# 특성 선택: 중요한 특성만 사용
selected_features = {}  # 각 키워드별 선택된 특성
for keyword, df in dfs.items():
    X = df.drop(columns=[keyword])  # 종속 변수를 제외한 나머지 특성
    y = df[keyword]  # 종속 변수
    rfc = RandomForestClassifier(random_state=0)
    rfc.fit(X, y)
    selected_feature_indices = np.argsort(rfc.feature_importances_)[::-1][:5]  # 중요한 특성 상위 5개 선택
    selected_features[keyword] = X.columns[selected_feature_indices].tolist()

# 병렬 처리를 위한 함수: 모델 학습
def train_model(keyword, X_train, y_train):
    rfc = RandomForestClassifier(random_state=0)
    param_grid = {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20]}
    grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    return keyword, grid_search.best_estimator_

# 모델 학습을 병렬 처리하여 최적 모델 찾기
best_models = {}
for keyword, df in dfs.items():
    X = df[selected_features[keyword]]  # 선택된 중요한 특성만 사용
    y = df[keyword]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    best_models[keyword] = train_model(keyword, X_train, y_train)

# 테스트 성능 평가
test_scores = {}
for keyword, model in best_models.items():
    X_test = dfs[keyword][selected_features[keyword]].iloc[-1].values.reshape(1, -1)  # 마지막 행 데이터
    y_test = dfs[keyword].iloc[-1][keyword]
    test_scores[keyword] = model.score(X_test, [y_test])

# 결과 출력
for keyword, score in test_scores.items():
    print(f"{keyword} 키워드의 테스트 성능: {score:.4f}")


AttributeError: 'tuple' object has no attribute 'score'