In [50]:
import importlib
import os

import numpy as np
import pandas as pd

In [51]:
import common

# Reload the module so that edits to it are picked up without restarting the kernel
importlib.reload(common)

# Shared project settings object; later cells read column-name constants from it
_common = common.Common()
# Bare expression: shown as this cell's output
_common.BASE_PATH

PosixPath('/Users/iwasakitakahiro/github')

In [52]:
# Load the raw training and evaluation tables (paths are relative to the notebook's working directory)
train_df = pd.read_csv('../input/学習用データ/train.csv')
test_df = pd.read_csv('../input/評価用データ/test.csv')

In [53]:
# Parse the key column into timezone-aware (UTC) timestamps on both frames ...
for frame in (train_df, test_df):
    frame[_common.UNIQUE_KEY_COLS] = pd.to_datetime(frame[_common.UNIQUE_KEY_COLS], utc=True)

# ... and use it as the index for the interpolation step that follows.
train_df = train_df.set_index(_common.UNIQUE_KEY_COLS)
test_df = test_df.set_index(_common.UNIQUE_KEY_COLS)

## 欠損値補完

In [54]:
# Fill gaps by linear interpolation between the neighbouring rows.
# Only columns that actually contain a missing value are touched.
missing_cols_train = [name for name, has_gap in train_df.isnull().any().items() if has_gap]
missing_cols_test = [name for name, has_gap in test_df.isnull().any().items() if has_gap]

train_filled = train_df[missing_cols_train].interpolate(method='linear')
test_filled = test_df[missing_cols_test].interpolate(method='linear')

train_df[missing_cols_train] = train_filled
test_df[missing_cols_test] = test_filled

## 特徴量エンジニアリング

In [55]:
# Turn the time index back into a regular column, then keep only the
# columns listed in the shared configuration for each frame.
train_df = train_df.reset_index()[_common.TRAIN_FEATURE_COLS]
test_df = test_df.reset_index()[_common.TEST_FEATURE_COLS]

### 発電・供給データ

#### 発電量合計

In [56]:
# Every column whose name contains "generation" (taken from the train frame)
gene_cols = [name for name in train_df.columns if 'generation' in name]

# Row-wise total generation, added to both frames
for frame in (train_df, test_df):
    frame['gene_sum'] = frame[gene_cols].sum(axis=1)

#### 再生可能エネルギー比率

In [57]:
# train_df['sus_amount'] = train_df[_common.SUS_GENE_COLS].sum(axis=1)
# test_df['sus_amount'] = test_df[_common.SUS_GENE_COLS].sum(axis=1)

#### 火力発電比率

In [58]:
# fossil_cols = [col for col in train_df.columns if 'generation_fossil' in col]
# train_df['fossil_amount'] = train_df[fossil_cols].sum(axis=1)
# test_df['fossil_amount'] = test_df[fossil_cols].sum(axis=1)

#### 発電量と供給量の比

In [59]:
# Ratio of total generation to actual load
for frame in (train_df, test_df):
    frame['gene_load_ratio'] = frame['gene_sum'] / frame['total_load_actual']

#### 発電コストの算出

In [60]:
# # 各発電方式のコスト辞書（EUR/MWh）
# cost_dict = {
#     'biomass': 100,
#     'brown_coal/lignite': 60,
#     'gas': 70,
#     'coal': 80,
#     'oil': 150,
#     'pump_hydro': 0,
#     'runofriver_hydro': 45,
#     'dam_hydro': 60,
#     'nuclear': 50,
#     'other': 80,
#     'other_renewables': 85,
#     'solar': 35,
#     'waste': 100,
#     'wind_onshore': 45
# }

# # 各発電量列 × コスト → 発電コスト列を作成
# train_df['gene_cost'] = (
#     train_df['generation_biomass'] * cost_dict['biomass'] +
#     train_df['generation_fossil_brown_coal/lignite'] * cost_dict['brown_coal/lignite'] +
#     train_df['generation_fossil_gas'] * cost_dict['gas'] +
#     train_df['generation_fossil_hard_coal'] * cost_dict['coal'] +
#     train_df['generation_fossil_oil'] * cost_dict['oil'] +
#     train_df['generation_hydro_pumped_storage_consumption'] * cost_dict['pump_hydro'] +
#     train_df['generation_hydro_run_of_river_and_poundage'] * cost_dict['runofriver_hydro'] +
#     train_df['generation_hydro_water_reservoir'] * cost_dict['dam_hydro'] +
#     train_df['generation_nuclear'] * cost_dict['nuclear'] +
#     train_df['generation_other'] * cost_dict['other'] +
#     train_df['generation_other_renewable'] * cost_dict['other_renewables'] +
#     train_df['generation_solar'] * cost_dict['solar'] +
#     train_df['generation_waste'] * cost_dict['waste'] +
#     train_df['generation_wind_onshore'] * cost_dict['wind_onshore']
# )

# test_df['gene_cost'] = (
#     test_df['generation_biomass'] * cost_dict['biomass'] +
#     test_df['generation_fossil_brown_coal/lignite'] * cost_dict['brown_coal/lignite'] +
#     test_df['generation_fossil_gas'] * cost_dict['gas'] +
#     test_df['generation_fossil_hard_coal'] * cost_dict['coal'] +
#     test_df['generation_fossil_oil'] * cost_dict['oil'] +
#     test_df['generation_hydro_pumped_storage_consumption'] * cost_dict['pump_hydro'] +
#     test_df['generation_hydro_run_of_river_and_poundage'] * cost_dict['runofriver_hydro'] +
#     test_df['generation_hydro_water_reservoir'] * cost_dict['dam_hydro'] +
#     test_df['generation_nuclear'] * cost_dict['nuclear'] +
#     test_df['generation_other'] * cost_dict['other'] +
#     test_df['generation_other_renewable'] * cost_dict['other_renewables'] +
#     test_df['generation_solar'] * cost_dict['solar'] +
#     test_df['generation_waste'] * cost_dict['waste'] +
#     test_df['generation_wind_onshore'] * cost_dict['wind_onshore']
# )

#### 残余需要、残余比率

In [61]:
for frame in (train_df, test_df):
    # Residual demand = total load - renewable generation
    renewable_total = frame[_common.SUS_GENE_COLS].sum(axis=1)
    frame['residual_demand'] = frame['total_load_actual'] - renewable_total

    # Residual ratio = residual demand / total load
    frame['residual_demand_ratio'] = frame['residual_demand'] / frame['total_load_actual']

#### メリットオーダー（電力市場で電力を供給する順番を、発電コストが安い順に並べたもの）を用いた特徴量

In [62]:
# # 加重平均発電コスト（€/MWh）
# train_df['w_gene_cost'] = train_df['gene_cost'] / train_df['gene_sum']
# test_df['w_gene_cost'] = test_df['gene_cost'] / test_df['gene_sum']

In [63]:
# Share of high-cost generation in total generation.
# The previous expression added 'generation_fossil_gas' twice, which double-counted gas.
# NOTE(review): hard coal is assumed to be the intended third source — TODO confirm.
high_cost_cols = ['generation_fossil_gas', 'generation_fossil_hard_coal', 'generation_fossil_oil']

for frame in (train_df, test_df):
    # skipna=False keeps the NaN propagation of the former `a + b + c` expression
    frame['high_cost_ratio'] = frame[high_cost_cols].sum(axis=1, skipna=False) / frame['gene_sum']

In [64]:
# train_df = train_df.drop(columns=gene_cols)
# test_df = test_df.drop(columns=gene_cols)

#### 需給逼迫フラグ
残余需要が学習データの90パーセンタイルを上回る場合にフラグを立てる

In [65]:
# Threshold is the 90th percentile of residual demand in the training data only;
# the same threshold is then applied to both frames.
q3_residual = train_df['residual_demand'].quantile(0.9)

for frame in (train_df, test_df):
    frame['tight_supply_flag'] = frame['residual_demand'].gt(q3_residual).astype(int)

### 時系列特徴量

#### 時間の三角関数の変換

In [66]:
# Cyclic encoding of the hour of day (period 24)
for frame in (train_df, test_df):
    hour_angle = 2 * np.pi * frame[_common.UNIQUE_KEY_COLS].dt.hour / 24
    frame['hour_sin'] = np.sin(hour_angle)
    frame['hour_cos'] = np.cos(hour_angle)

In [67]:
# Cyclic encoding of the day of week (period 7)
for frame in (train_df, test_df):
    weekday_angle = 2 * np.pi * frame[_common.UNIQUE_KEY_COLS].dt.weekday / 7
    frame['weekday_sin'] = np.sin(weekday_angle)
    frame['weekday_cos'] = np.cos(weekday_angle)

In [68]:
# Cyclic encoding of the calendar month (period 12)
for frame in (train_df, test_df):
    month_angle = 2 * np.pi * frame[_common.UNIQUE_KEY_COLS].dt.month / 12
    frame['month_sin'] = np.sin(month_angle)
    frame['month_cos'] = np.cos(month_angle)

In [69]:
# Cyclic encoding of the calendar quarter (period 4)
for frame in (train_df, test_df):
    quarter_angle = 2 * np.pi * frame[_common.UNIQUE_KEY_COLS].dt.quarter / 4
    frame['quarter_sin'] = np.sin(quarter_angle)
    frame['quarter_cos'] = np.cos(quarter_angle)

In [70]:
# TODO: dayofyearインデックスの効果を調べる（クオーターは上のセルで作成済み）

In [71]:
def _collapse_cyclic_features(df):
    """
    Collapse each sin/cos pair into one angle column and drop the pair.

    Both frames go through this single function so that the arctan2 argument
    order cannot differ between train and test. Previously the test 'hour'
    used arctan2(cos, sin) while the train 'hour' used arctan2(sin, cos),
    so the feature the model was fitted on did not match the one it was
    scored on.

    Args:
        df (pd.DataFrame): frame holding the eight *_sin / *_cos columns.

    Returns:
        pd.DataFrame: new frame with 'hour', 'weekday', 'month' and 'quarter'
        appended and the eight sin/cos columns removed.
    """
    # 'hour' follows the train definition: arctan2(sin, cos)
    df['hour'] = np.arctan2(df['hour_sin'], df['hour_cos'])
    # The other three keep the (cos, sin) order that both frames already shared
    df['weekday'] = np.arctan2(df['weekday_cos'], df['weekday_sin'])
    df['month'] = np.arctan2(df['month_cos'], df['month_sin'])
    df['quarter'] = np.arctan2(df['quarter_cos'], df['quarter_sin'])

    return df.drop(columns=['hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos', 'quarter_sin', 'quarter_cos'])


train_df = _collapse_cyclic_features(train_df)
test_df = _collapse_cyclic_features(test_df)

In [72]:
# Day-of-year index taken from the timestamp column
for frame in (train_df, test_df):
    frame['dayofyear'] = frame[_common.UNIQUE_KEY_COLS].dt.dayofyear

#### 発電量・供給量の時系列特徴量（ラグ、ローリング、エクスパンディング）

In [73]:
# Tag each frame first so the rows can be separated again after the concat
train_df['is_train'] = True
test_df['is_train'] = False

# Stack train on top of test. Row labels are kept as-is, so the combined index
# contains duplicates (reset_index(drop=True) would make it unique).
# NOTE(review): the shift/expanding statistics below run in row order, so this
# presumably relies on rows being chronological with test following train — confirm.
all_df = pd.concat([train_df, test_df], axis=0, sort=False)

# ---- Lag / rolling / expanding features are computed here in one pass ----
cols = ['gene_sum', 'total_load_actual']
lag_hours       = [1, 24, 48, 72]
rolling_windows = [168, 336, 672]     # 7, 14, 28 days (only used by the disabled code below)
scale_cols      = cols                # columns intended for z-scoring (not read below)

for col in cols:
    # # --- lags (disabled) ---
    # for lag in lag_hours:
    #     all_df[f'{col}_lag{lag}'] = all_df[col].shift(lag)

    # # --- rolling mean (disabled; other statistics could use the same windows) ---
    # for win in rolling_windows:
    #     all_df[f'{col}_rolling_mean{win}'] = (
    #         all_df[col]
    #         .shift(1)                          # exclude the current row
    #         .rolling(window=win, min_periods=1)
    #         .mean()
    #     )

    # --- expanding mean / standard deviation / z-score ---
    #   shift(1) makes these pure past statistics that exclude the value at time t
    past_series = all_df[col].shift(1)

    all_df[f'{col}_exp_mean_to_t'] = past_series.expanding().mean()
    all_df[f'{col}_exp_std_to_t']  = past_series.expanding().std(ddof=0)

    # Replace std == 0 with NaN so the z-score below never divides by zero
    all_df.loc[all_df[f'{col}_exp_std_to_t'] == 0, f'{col}_exp_std_to_t'] = np.nan

    # z-score of the current value against the statistics of all earlier rows
    all_df[f'{col}_scaled'] = (
        (all_df[col] - all_df[f'{col}_exp_mean_to_t']) /
        all_df[f'{col}_exp_std_to_t']
    )

# ---- Split back into train / test after the computation ----
train_df = (
    all_df[all_df['is_train']]
    .drop(columns=['is_train'])
    .copy()
)

test_df = (
    all_df[~all_df['is_train']]
    .drop(columns=['is_train', _common.TARGET_COL])  # the target column must not remain in test
    .copy()
)


### 気象データ

#### 湿度を露点温度へ変換

In [74]:
def calc_dew_point(temp_k, rh):
    """
    Compute the dew-point temperature in kelvin from air temperature and relative humidity.

    Uses a Magnus-type approximation with coefficients a = 17.62 and b = 243.12
    (the latter in degrees Celsius). Inputs may be scalars, NumPy arrays or
    pandas Series; the result has the same shape as the broadcast inputs.

    Args:
        temp_k: air temperature in kelvin.
        rh: relative humidity in percent.

    Returns:
        Dew-point temperature in kelvin. Entries whose humidity is zero or
        negative come out as NaN, because the logarithm is undefined there.
    """
    a = 17.62
    b = 243.12
    temp_c = temp_k - 273.15  # K -> deg C

    # log(0) and the inf/inf that follows are expected for rh == 0; silence the
    # resulting RuntimeWarnings instead of flooding the notebook output with them.
    with np.errstate(divide='ignore', invalid='ignore'):
        gamma = np.log(rh / 100) + (a * temp_c) / (b + temp_c)
        return (b * gamma) / (a - gamma) + 273.15

In [75]:
cities = ['madrid', 'barcelona', 'bilbao', 'seville', 'valencia']

# For every city, derive the dew point from temperature and humidity and
# then discard the humidity column. A frame lacking either input column
# for a city is left unchanged for that city.
for city in cities:
    temp_col, hum_col, dew_col = (f'{city}_{suffix}' for suffix in ('temp', 'humidity', 'dew_point'))
    converted = []
    for frame in (train_df, test_df):
        if {temp_col, hum_col} <= set(frame.columns):
            frame[dew_col] = calc_dew_point(frame[temp_col], frame[hum_col])
            frame = frame.drop(columns=hum_col)
        converted.append(frame)
    train_df, test_df = converted

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


#### 人口比率による加重平均

|年|マドリード|バルセロナ|ビルバオ|セビリア|バレンシア|
|-|-|-|-|-|-|
|2015|約13.4%|約11.4%|約0.76%|約1.5%|約1.8%|
|2016|約13.6%|約11.5%|約0.76%|約1.5%|約1.8%|
|2017|約13.7%|約11.6%|約0.76%|約1.5%|約1.8%|
|2018|約13.9%|約11.7%|約0.75%|約1.5%|約1.8%|
|平均|13.7%|11.6%|0.76%|1.5%|1.8%|

In [76]:
# Population share of each city, used as the weight when averaging weather variables.
# Each value is the 2015-2018 mean of the yearly shares.
# Barcelona: (11.4 + 11.5 + 11.6 + 11.7) / 4 = 11.55 % -> 0.116; the earlier 0.114 was
# the 2015 value alone rather than the mean.
pop_ratio = {
    'madrid': 0.137,
    'barcelona': 0.116,
    'bilbao': 0.0076,
    'seville': 0.015,
    'valencia': 0.018
}

In [77]:
# Weather variable suffixes; add_weighted_weather reads them as "<city>_<suffix>" columns
weather_cols = ['wind_speed', 'temp', 'pressure', 'dew_point', 'clouds_all']

def add_weighted_weather(df, pop_ratio, weather_cols):
    """
    Replace per-city weather columns with one population-weighted average per variable.

    For each suffix in ``weather_cols`` the columns ``"<city>_<suffix>"`` of all
    cities in ``pop_ratio`` are combined into ``"weighted_<suffix>"``. The weights
    are renormalised per row over the cities that actually have a value, so a
    missing city no longer pulls the average toward zero (it used to be counted
    as 0 while still contributing its full weight to the denominator).

    Args:
        df (pd.DataFrame): frame containing every ``"<city>_<suffix>"`` column.
        pop_ratio (dict): city name -> weight.
        weather_cols (list[str]): weather variable suffixes.

    Returns:
        pd.DataFrame: new frame without the per-city columns and with the
        ``weighted_*`` columns appended in ``weather_cols`` order. A row is NaN
        for a variable when no city has a value for it. ``df`` is not modified.
    """
    cities = list(pop_ratio.keys())
    weights = np.array([pop_ratio[city] for city in cities], dtype=float)

    weighted = {}
    for w_col in weather_cols:
        values = df[[f"{city}_{w_col}" for city in cities]].to_numpy(dtype=float)
        observed = ~np.isnan(values)
        numerator = np.where(observed, values, 0.0) @ weights
        # Only the weights of cities with a value count toward the denominator
        denominator = observed @ weights
        with np.errstate(divide='ignore', invalid='ignore'):
            weighted[f'weighted_{w_col}'] = numerator / denominator

    # Per-city source columns are no longer needed once aggregated
    drop_cols = [f"{city}_{w_col}" for city in cities for w_col in weather_cols]
    result = df.drop(columns=drop_cols)
    for name, column in weighted.items():
        result[name] = column
    return result

# Apply the aggregation to both frames; each name is rebound to the returned frame
train_df = add_weighted_weather(train_df, pop_ratio, weather_cols)
test_df = add_weighted_weather(test_df, pop_ratio, weather_cols)

#### 異常気象フラグ
高温・低温で一定水準を超えたらフラグを立てる
冷暖房度日（HDD/CDD）を調べたい

## 価格を特徴量に入れる

In [78]:
# === Target encoding keyed on month x weekday x hour ===
# NOTE(review): the table is built from all training rows, so each training row's
# own target contributes to its encoded value — consider an out-of-fold scheme.
group_cols = ['tmp_month', 'tmp_weekday', 'tmp_hour']
encoded_col = "encoded_price_weekday_hour"

# Temporary calendar keys on both frames
for df in [train_df, test_df]:
    stamps = pd.to_datetime(df[_common.UNIQUE_KEY_COLS])
    df['tmp_month'] = stamps.dt.month
    df['tmp_weekday'] = stamps.dt.weekday
    df['tmp_hour'] = stamps.dt.hour

# Mean target per key combination, computed from the training frame only
mean_price_table = (
    train_df.groupby(group_cols)[_common.TARGET_COL]
    .mean()
    .reset_index()
    .rename(columns={_common.TARGET_COL: encoded_col})
)

# Attach the encoded value to both frames
train_df = train_df.merge(mean_price_table, on=group_cols, how='left')
test_df = test_df.merge(mean_price_table, on=group_cols, how='left')

# The temporary key columns were only needed for the join
train_df = train_df.drop(columns=group_cols)
test_df = test_df.drop(columns=group_cols)

## エンコーディング

In [79]:
def weather_group(col):
    """
    Map a weather description to a coarse group id.

    The value is converted to a string first. Returns 1 if it contains
    'clear', otherwise 2 if it contains 'clouds', otherwise 3.
    """
    label = str(col)
    for keyword, group_id in (('clear', 1), ('clouds', 2)):
        if keyword in label:
            return group_id
    return 3

In [80]:
# for col in _common.CATEGORY_COLS:
#     new_col = col + '_LabelEn'
#     train_df[new_col] = train_df[col].apply(weather_group)
#     train_df = train_df.drop(columns=col)
#     test_df[new_col] = test_df[col].apply(weather_group)
#     test_df = test_df.drop(columns=col)

#### 祝日フラグ
ChatGPTに2015年〜2018年の休日・祝日にフラグを立てる

In [81]:
from datetime import date
import holidays

# Dict-like holiday calendar for Spain; its only use, in the next cell, is currently commented out
es_holidays = holidays.ES()  # this is a dict-like object

In [82]:
# train_df['is_holiday'] = (train_df[_common.UNIQUE_KEY_COLS].dt.date.isin(es_holidays).astype(int))
# test_df['is_holiday'] = (test_df[_common.UNIQUE_KEY_COLS].dt.date.isin(es_holidays).astype(int))

## 主成分分析

In [83]:
def one_hot_encoder(df):
  """
  Replace every object / category column with integer indicator columns.

  Args:
      df (pd.DataFrame): source dataframe.

  Returns:
      pd.DataFrame: frame in which each encoded column has been dropped and
      its indicator columns (named ``<col>__<value>``) joined on the right.
  """
  categorical_cols = [
    name for name in df.columns
    if df[name].dtype == 'object' or df[name].dtype == 'category'
  ]
  encoded = df
  for name in categorical_cols:
    dummies = pd.get_dummies(encoded[name], prefix=f'{name}_', dtype=int)
    encoded = encoded.drop(name, axis=1).join(dummies)
  return encoded

def generate_hour_one_hot(df):
  """Swap the 'hour' column for integer indicator columns named ``hour__<value>``."""
  hour_labels = df['hour'].astype(str)
  dummies = pd.get_dummies(hour_labels, prefix='hour_', dtype=int)
  return df.drop(columns='hour').join(dummies)

def generate_dayofweek_one_hot(df):
  """Swap the 'dayofweek' column for integer indicator columns named ``dayofweek__<value>``."""
  weekday_labels = df['dayofweek'].astype(str)
  dummies = pd.get_dummies(weekday_labels, prefix='dayofweek_', dtype=int)
  return df.drop(columns='dayofweek').join(dummies)

def generate_dayofyear_one_hot(df):
  """
  Swap the 'dayofyear' column for integer indicator columns named ``dayofyear__<value>``.

  Values above 365 are clipped to 365 first, so day 366 shares the column of day 365.
  """
  day_labels = df['dayofyear'].clip(upper=365).astype(str)
  dummies = pd.get_dummies(day_labels, prefix='dayofyear_', dtype=int)
  return df.drop(columns='dayofyear').join(dummies)

def setup_time(df):
  """
  Parse the 'time' column as UTC, make it the index and add calendar columns.

  The frame is modified in place and the same object is returned.

  Args:
      df (pd.DataFrame): source dataframe with a 'time' column.

  Returns:
      pd.DataFrame: the same frame, indexed by 'time', with 'hour',
      'dayofweek' and 'dayofyear' columns taken from the index.
  """
  df['time'] = pd.to_datetime(df['time'], utc=True)
  df.set_index('time', inplace=True)
  stamps = df.index
  for part in ('hour', 'dayofweek', 'dayofyear'):
    df[part] = getattr(stamps, part)
  return df

def setup_dataframe(file, base_dir=None):
  """
  Read a CSV file and return it time-indexed, one-hot encoded and NaN-free.

  Args:
      file: csv file name, resolved relative to the base directory.
      base_dir: directory containing ``file``. When omitted, the module-level
          ``DIR`` is used; this notebook does not define ``DIR``, so callers
          here need to pass ``base_dir`` explicitly.

  Returns:
      pd.DataFrame: dataframe indexed by UTC time, with object / category
      columns one-hot encoded and remaining missing values replaced by 0.

  Raises:
      NameError: if ``base_dir`` is omitted and no global ``DIR`` exists.
  """
  root = DIR if base_dir is None else base_dir
  df = pd.read_csv(os.path.join(root, file))
  df = setup_time(df)
  # Optional per-column encoders, currently disabled:
  #df = generate_hour_one_hot(df)
  #df = generate_dayofweek_one_hot(df)
  #df = generate_dayofyear_one_hot(df)
  df = one_hot_encoder(df)
  return df.fillna(0)

In [84]:
# train_df['year'] = train_df[_common.UNIQUE_KEY_COLS].dt.year
# train_df['month'] = train_df[_common.UNIQUE_KEY_COLS].dt.month

# # Select only numeric columns
# numeric_cols = train_df.select_dtypes(include=np.number).columns.tolist()
# # Exclude 'price_actual', 'year', and 'month' from the numeric columns for correlation calculation against 'price_actual'
# numeric_cols_for_corr = [col for col in numeric_cols if col not in ['price_actual', 'year', 'month', 'hour', 'dayofyear', ]]

# # Create an empty dictionary to store the correlation dataframes
# monthly_yearly_correlations = {}

# # Iterate through each numeric column (excluding 'price_actual', 'year', 'month')
# for col in numeric_cols_for_corr:
#     # Calculate the Spearman correlation for each year and month
#     corr_df = train_df.groupby(['year', 'month'])[[col, 'price_actual']].corr(method='spearman').unstack().iloc[:, 1].unstack()
#     monthly_yearly_correlations[col] = corr_df

# # Combine the correlation dataframes into a single dataframe
# # This will be a MultiIndex dataframe where the outer index is the column name
# # and the inner index is the year and month
# correlation_summary_df = pd.concat(monthly_yearly_correlations, axis=0)

# # Display the resulting dataframe
# correlation_summary_df.fillna(0, inplace=True)
# correlation_summary_df_abs = correlation_summary_df.abs()

# corr_mean_val = correlation_summary_df_abs.mean()
# corr_std_val = correlation_summary_df_abs.std(ddof=0)

# # Calculate the threshold: mean + 2 * std
# target_threshold_2std = corr_mean_val + 2.0 * corr_std_val

# # Identify data points where the absolute correlation is greater than the threshold in any month
# # We use .stack() to make it easier to filter based on values
# anomalous_correlations = correlation_summary_df_abs[correlation_summary_df_abs > target_threshold_2std].stack().reset_index()

# print("Data points where absolute correlation is greater than mean + 2*std in any month:")
# FEATURES = anomalous_correlations['level_0'].unique()
# FEATURES = [column for column in FEATURES if column not in ['price_actual', 'year', 'month', 'hour', 'dayofyear', 'dayofweek']]
# FEATURES

In [85]:
# GENERATION_FEATURES = [column for column in FEATURES if column.startswith('generation')]
# WEATHER_FEATURES = [column for column in FEATURES if column not in GENERATION_FEATURES]

In [86]:
# # prompt: GENERATION_FEATURESに定義された項目でPCAを取得

# import pandas as pd
# import numpy as np
# from sklearn.decomposition import PCA
# from sklearn.metrics.pairwise import cosine_similarity

# # PCAモデルの初期化 (コンポーネント数を指定しない場合は、特徴量の数になる)
# pca_generation = PCA(n_components=3, random_state=42)
# pca_weather = PCA(n_components=5, random_state=42)
# pca_all = PCA(n_components=8, random_state=42)

# # データを変換
# principal_components_g = pca_generation.fit_transform(train_df[GENERATION_FEATURES])
# principal_components_w = pca_weather.fit_transform(train_df[WEATHER_FEATURES])
# principal_components_a = pca_all.fit_transform(train_df[GENERATION_FEATURES + WEATHER_FEATURES])

# def calculate_vector_norm_ratio(vec1: np.ndarray, vec2: np.ndarray) -> float:
#   """
#   Calculate the ratio of the norms of two vectors.

#   Args:
#       vec1 (np.ndarray): The first input vector.
#       vec2 (np.ndarray): The second input vector.

#   Returns:
#       float: The ratio of the norm of vec1 to the norm of vec2.
#              Returns 0 if the norm of vec2 is zero to avoid division by zero.
#   """
#   norm1 = np.linalg.norm(vec1)
#   norm2 = np.linalg.norm(vec2)

#   if norm2 == 0:
#     return 0.0
#   else:
#     return norm1 / norm2

# def calculate_cossim(vec1: np.ndarray, vec2: np.ndarray) -> float:
#   """
#   Calculate the cosine similarity between two vectors.

#   Args:
#       vec1 (np.ndarray): The first input vector.
#       vec2 (np.ndarray): The second input vector.

#   Returns:
#       float: The cosine similarity between the two vectors.
#   """
#   # Reshape vectors for cosine_similarity function if they are 1D
#   vec1 = vec1.flatten()
#   vec2 = vec2.flatten()

#   return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# def generation_feature_relations(x,pca=pca_generation):
#   """
#   Calculate relations between generation features using PCA.

#   Args:
#       x (np.ndarray): vector of generation features.

#   Returns:
#       cosine_similarity (float): The cosine similarity between two vectors.
#       vector_norm_ratio (float): The ratio of the norms of two vectors.
#       principal_components_1 (first): The principal components of the input vector.
#       principal_components_2 (second): The principal components of the input vector.
#       principal_components_3 (third): The principal components of the input vector.
#   """
#   x_trans = pca.transform([x])
#   x_inv = pca.inverse_transform(x_trans)
#   cosine_similarity_value = calculate_cossim(x.values, x_inv)
#   vector_norm_ratio_value = calculate_vector_norm_ratio(x, x_inv)
#   return cosine_similarity_value, vector_norm_ratio_value,x_trans[0][0],x_trans[0][1],x_trans[0][2]

# def weather_feature_relations(x,pca=pca_weather):
#   """
#   Calculate relations between weather features using PCA.

#   Args:
#       x (np.ndarray): vector of waether features.

#   Returns:
#       cosine_similarity (float): The cosine similarity between two vectors.
#       vector_norm_ratio (float): The ratio of the norms of two vectors.
#       principal_components_1 (first): The principal components of the input vector.
#       principal_components_2 (second): The principal components of the input vector.
#       principal_components_3 (third): The principal components of the input vector.
#       principal_components_4 (4-th): The principal components of the input vector.
#       principal_components_5 (5-th): The principal components of the input vector.
#   """
#   x_trans = pca.transform([x])
#   x_inv = pca.inverse_transform(x_trans)
#   cosine_similarity_value = calculate_cossim(x.values, x_inv)
#   vector_norm_ratio_value = calculate_vector_norm_ratio(x, x_inv)
#   return cosine_similarity_value, vector_norm_ratio_value,x_trans[0][0],x_trans[0][1],x_trans[0][2],x_trans[0][3],x_trans[0][4]

# def all_feature_relations(x,pca=pca_all):
#   """
#   Calculate relations between all features using PCA.

#   Args:
#       x (np.ndarray): vector of all features.

#   Returns:
#       cosine_similarity (float): The cosine similarity between two vectors.
#       vector_norm_ratio (float): The ratio of the norms of two vectors.
#       principal_components_1 (first): The principal components of the input vector.
#       principal_components_2 (second): The principal components of the input vector.
#       principal_components_3 (third): The principal components of the input vector.
#       principal_components_4 (4-th): The principal components of the input vector.
#       principal_components_5 (5-th): The principal components of the input vector.
#       principal_components_5 (6-th): The principal components of the input vector.
#       principal_components_5 (7-th): The principal components of the input vector.
#       principal_components_5 (8-th): The principal components of the input vector.
#   """
#   x_trans = pca.transform([x])
#   x_inv = pca.inverse_transform(x_trans)
#   cosine_similarity_value = calculate_cossim(x.values, x_inv)
#   vector_norm_ratio_value = calculate_vector_norm_ratio(x, x_inv)
#   return cosine_similarity_value, vector_norm_ratio_value,x_trans[0][0],x_trans[0][1],x_trans[0][2],x_trans[0][3],x_trans[0][4],x_trans[0][5],x_trans[0][6],x_trans[0][7]


# # Principal components を DataFrame に変換
# principal_g_df = train_df[GENERATION_FEATURES].apply(lambda x: generation_feature_relations(x), axis=1)
# principal_g_df = pd.DataFrame({'g_cosine_similarity': [x[0] for x in principal_g_df],
#                              'g_vector_norm_ratio': [x[1] for x in principal_g_df],
#                               'pc_g_1': [x[2] for x in principal_g_df],
#                               'pc_g_2': [x[3] for x in principal_g_df],
#                               'pc_g_3': [x[4] for x in principal_g_df],
#                              }, index=train_df.index)
# principal_w_df = train_df[WEATHER_FEATURES].apply(lambda x: weather_feature_relations(x), axis=1)
# principal_w_df = pd.DataFrame({'w_cosine_similarity': [x[0] for x in principal_w_df],
#                              'w_vector_norm_ratio': [x[1] for x in principal_w_df],
#                               'pc_w_1': [x[2] for x in principal_w_df],
#                               'pc_w_2': [x[3] for x in principal_w_df],
#                               'pc_w_3': [x[4] for x in principal_w_df],
#                               'pc_w_4': [x[5] for x in principal_w_df],
#                               'pc_w_5': [x[6] for x in principal_w_df],
#                              }, index=train_df.index)
# principal_a_df = train_df[GENERATION_FEATURES + WEATHER_FEATURES].apply(lambda x: all_feature_relations(x), axis=1)
# principal_a_df = pd.DataFrame({'a_cosine_similarity': [x[0] for x in principal_a_df],
#                              'a_vector_norm_ratio': [x[1] for x in principal_a_df],
#                               'pc_a_1': [x[2] for x in principal_a_df],
#                               'pc_a_2': [x[3] for x in principal_a_df],
#                               'pc_a_3': [x[4] for x in principal_a_df],
#                               'pc_a_4': [x[5] for x in principal_a_df],
#                               'pc_a_5': [x[6] for x in principal_a_df],
#                               'pc_a_6': [x[7] for x in principal_a_df],
#                               'pc_a_7': [x[8] for x in principal_a_df],
#                               'pc_a_8': [x[9] for x in principal_a_df],
#                              }, index=train_df.index)

# # 元のDataFrameに結合することもできます
# train_df_pca = pd.concat([train_df, principal_g_df, principal_w_df, principal_a_df], axis=1)

# principal_g_df_test = test_df[GENERATION_FEATURES].apply(lambda x: generation_feature_relations(x), axis=1)
# principal_g_df_test = pd.DataFrame({'g_cosine_similarity': [x[0] for x in principal_g_df_test],
#                                   'g_vector_norm_ratio': [x[1] for x in principal_g_df_test],
#                                   'pc_g_1': [x[2] for x in principal_g_df_test],
#                                   'pc_g_2': [x[3] for x in principal_g_df_test],
#                                   'pc_g_3': [x[4] for x in principal_g_df_test],
#                                   }, index=test_df.index)
# principal_w_df_test = test_df[WEATHER_FEATURES].apply(lambda x: weather_feature_relations(x), axis=1)
# principal_w_df_test = pd.DataFrame({'w_cosine_similarity': [x[0] for x in principal_w_df_test],
#                                   'w_vector_norm_ratio': [x[1] for x in principal_w_df_test],
#                                   'pc_w_1': [x[2] for x in principal_w_df_test],
#                                   'pc_w_2': [x[3] for x in principal_w_df_test],
#                                   'pc_w_3': [x[4] for x in principal_w_df_test],
#                                   'pc_w_4': [x[5] for x in principal_w_df_test],
#                                   'pc_w_5': [x[6] for x in principal_w_df_test],
#                                   }, index=test_df.index)
# principal_a_df_test = test_df[GENERATION_FEATURES + WEATHER_FEATURES].apply(lambda x: all_feature_relations(x), axis=1)
# principal_a_df_test = pd.DataFrame({'a_cosine_similarity': [x[0] for x in principal_a_df_test],
#                              'a_vector_norm_ratio': [x[1] for x in principal_a_df_test],
#                               'pc_a_1': [x[2] for x in principal_a_df_test],
#                               'pc_a_2': [x[3] for x in principal_a_df_test],
#                               'pc_a_3': [x[4] for x in principal_a_df_test],
#                               'pc_a_4': [x[5] for x in principal_a_df_test],
#                               'pc_a_5': [x[6] for x in principal_a_df_test],
#                               'pc_a_6': [x[7] for x in principal_a_df_test],
#                               'pc_a_7': [x[8] for x in principal_a_df_test],
#                               'pc_a_8': [x[9] for x in principal_a_df_test],
#                              }, index=test_df.index)

# test_df_pca = pd.concat([test_df, principal_g_df_test, principal_w_df_test, principal_a_df_test], axis=1)

In [87]:
# pca_generation.explained_variance_/pca_generation.explained_variance_.sum()

In [88]:
# pca_weather.explained_variance_/pca_weather.explained_variance_.sum()

In [89]:
# pca_all.explained_variance_/pca_all.explained_variance_.sum()

## 特徴量選択

In [90]:
# train_df = train_df.dropna(axis=0, how='any')
# data_x = train_df.drop(columns=[_common.TARGET_COL, _common.UNIQUE_KEY_COLS])
# data_y = train_df[_common.TARGET_COL]

In [91]:
# from sklearn.ensemble import RandomForestRegressor

# rf_reg = RandomForestRegressor(
#     n_estimators=500,
#     n_jobs=-1,
#     max_depth=10,
#     random_state=1234 
# )
# rf_reg = rf_reg.fit(data_x, data_y.values.ravel())

# # 特徴量重要度の取得
# fti = rf_reg.feature_importances_
# feature_importance_df = pd.DataFrame({
#     'feature': data_x.columns,
#     'importance': fti
# }).sort_values('importance', ascending=False).reset_index(drop=True)

# feature_importance_df

In [92]:
# train_df_pca = train_df_pca[['pc_a_1', 'pc_a_2', 'pc_a_3']].reset_index()
# test_df_pca = test_df_pca[['pc_a_1', 'pc_a_2', 'pc_a_3']].reset_index()

In [93]:
# train_df = pd.merge(train_df, train_df_pca, how='left', on=_common.UNIQUE_KEY_COLS)
# test_df = pd.merge(test_df, test_df_pca, how='left', on=_common.UNIQUE_KEY_COLS)

## 前処理済みのファイル出力

In [94]:
# Drop every training row that still has a NaN in any column (test_df is not filtered here)
train_df = train_df.dropna()

In [95]:
# Write both preprocessed frames as CSV without the index column
train_df.to_csv('../output/中間データ/学習用データ/train_preprocessed.csv', index=False)
test_df.to_csv('../output/中間データ/評価用データ/test_preprocessed.csv', index=False)

In [96]:
# Column names of each frame as sets
train_cols, test_cols = set(train_df.columns), set(test_df.columns)

# Columns present in only one of the two frames
only_in_train = train_cols.difference(test_cols)
only_in_test = test_cols.difference(train_cols)

print("train_dfのみに存在:", only_in_train)
print("test_dfのみに存在:", only_in_test)

train_dfのみに存在: {'price_actual'}
test_dfのみに存在: set()
