In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load the test set that already carries the time-series (lag / rolling) features.
# NOTE(review): the file contains an 'Unnamed: 0' column, which looks like a saved
# index; it is kept here and flows through the later steps - confirm that is wanted.
test_df = pd.read_csv(
    './output/中間データ/test_with_時系列特徴量.csv'
)
# Display the loaded frame for a quick visual check.
test_df

Unnamed: 0,datetime,y,week,soldout,name,kcal,remarks,event,payday,weather,...,y_5lag,y_rolling_mean_3,y_rolling_max_3,y_rolling_min_3,y_rolling_mean_7,y_rolling_max_7,y_rolling_min_7,y_rolling_mean_14,y_rolling_max_14,y_rolling_min_14
0,2014-10-01,,水,1,メンチカツ,420.0,,,,雨,...,59.0,47.0,56.0,40.0,46.285714,59.0,29.0,54.285714,115.0,29.0
1,2014-10-02,,木,0,バーベキューチキン,415.0,,,,曇,...,50.0,,,,,,,,,
2,2014-10-03,,金,0,豚肉のマスタード焼き,405.0,,,,晴れ,...,45.0,,,,,,,,,
3,2014-10-06,,月,1,麻婆春雨,400.0,,,,雨,...,56.0,,,,,,,,,
4,2014-10-07,,火,0,厚揚げ肉みそ炒め,430.0,,,,晴れ,...,40.0,,,,,,,,,
5,2014-10-08,,水,0,完熟トマトのホットカレー,420.0,,,,晴れ,...,,,,,,,,,,
6,2014-10-09,,木,0,豚キムチ炒め,435.0,,キャリアアップ支援セミナー,,曇,...,,,,,,,,,,
7,2014-10-10,,金,0,ポークカレー,,お楽しみメニュー,,1.0,薄曇,...,,,,,,,,,,
8,2014-10-14,,火,0,若鶏梅肉包揚げ,408.0,,,,快晴,...,,,,,,,,,,
9,2014-10-15,,水,1,ミックスグリル,450.0,料理長のこだわりメニュー,,,雨,...,,,,,,,,,,


#### 日の抽出

In [3]:
# Take the third '-'-separated field of the date string (the day of month in the
# 'YYYY-MM-DD' values shown in the preview) and store it as an integer column.
test_df['day'] = test_df['datetime'].str.split('-').str[2].astype(int)

#### 欠損値補完

In [4]:
# Rows without a payday marker are missing in the raw data; encode them as 0.
test_df['payday'] = test_df['payday'].where(test_df['payday'].notna(), 0)

In [5]:
# '--' becomes 0; every other value is converted to float and shifted by +0.01.
# NOTE(review): the +0.01 offset presumably keeps a measured 0 distinct from '--' - confirm.
test_df['precipitation'] = test_df['precipitation'].map(
    lambda raw: float(raw) + 0.01 if raw != '--' else 0
)

In [6]:
# Step 1: fill a missing kcal with the mean kcal of rows sharing the same menu name.
kcal_mean_by_name = test_df.groupby('name')['kcal'].transform('mean')
test_df['kcal'] = test_df['kcal'].fillna(kcal_mean_by_name)

# Step 2: the flag is computed AFTER the per-menu fill, so it marks only the rows
# that could not be filled from their menu group.
kcal_still_missing = test_df['kcal'].isna()
test_df['kcal_missing_flag'] = kcal_still_missing.astype(int)

# Step 3: give the remaining gaps the sentinel value -9999.
test_df['kcal'] = test_df['kcal'].fillna(-9999)

#### week

In [7]:
# Load the saved weekday mapping (the file name suggests a target encoding).
# NOTE: pickle.load can execute arbitrary code - only load files this project produced.
with open('./output/中間データ/target_encoding_week.pkl', 'rb') as week_pkl:
    week_map = pickle.load(week_pkl)

# Replace each weekday label with its mapped value.
# NOTE(review): assuming week_map is a plain dict/Series, labels missing from it become NaN.
encoded_week = test_df['week'].map(week_map)
test_df['week'] = encoded_week

#### name

In [8]:
# 1 when the menu name contains "カレー" (curry), otherwise 0.
test_df['curry'] = test_df['name'].map(lambda menu: int("カレー" in menu))

In [9]:
# Load the saved menu groupings.
# NOTE: pickle.load can execute arbitrary code - only load files this project produced.
with open('./output/中間データ/menu_flags.pkl', 'rb') as flags_pkl:
    menu_flags = pickle.load(flags_pkl)

popular_menu = menu_flags['popular_menu']
unpopular_menu = menu_flags['unpopular_menu']

# Rebuild the 0/1 membership flags: 1 when the menu name is in the group, else 0.
for flag_col, menu_group in (('popular', popular_menu), ('unpopular', unpopular_menu)):
    # Bind the group as a default argument so each lambda tests its own collection.
    test_df[flag_col] = test_df['name'].map(
        lambda menu, group=menu_group: int(menu in group)
    )

#### weather

In [10]:
# Load the saved weather mapping (the file name suggests a target encoding).
# NOTE: pickle.load can execute arbitrary code - only load files this project produced.
with open('./output/中間データ/target_encoding_weather.pkl', 'rb') as weather_pkl:
    weather_map = pickle.load(weather_pkl)

# Replace each weather label with its mapped value.
# NOTE(review): assuming weather_map is a plain dict/Series, labels missing from it become NaN.
encoded_weather = test_df['weather'].map(weather_map)
test_df['weather'] = encoded_weather

#### エンコーディング

In [11]:
# Collapse the free-text remarks into a presence flag: 1 if any remark exists, else 0.
test_df['remarks'] = (~test_df['remarks'].isna()).astype(int)

In [12]:
# One-hot encode 'event'; the original column is replaced by one 'event_<value>' column
# per distinct value found in this frame.
# NOTE(review): the resulting columns depend on the values present in the test data -
# confirm they line up with the columns the model was trained on.
test_df = pd.get_dummies(data=test_df, columns=['event'])

#### 標準化

In [13]:
# Candidate columns: every int/float column except the target "y".
numeric_candidates = test_df.select_dtypes(include=['int', 'float']).columns.drop("y")

# Keep only columns with more than two distinct non-null values. This removes
# 0/1 flags (and any other column that has at most two distinct values).
num_cols = [
    name for name in numeric_candidates
    if test_df[name].nunique() > 2
]

In [14]:
# Sentinel value that marks "missing" entries (written by the kcal imputation step).
outlier_value = -9999

# Standardize every column in num_cols to zero mean / unit variance.
# NOTE(review): mean and std are computed from the test data itself; confirm this
# matches how the training data was scaled.
for col in num_cols:
    # Compute mean / std only from rows that do not hold the sentinel, so the
    # sentinel cannot distort the statistics. NaN rows compare as "not sentinel"
    # and are skipped by mean()/std() themselves.
    mask = test_df[col] != outlier_value
    mean = test_df.loc[mask, col].mean()
    std = test_df.loc[mask, col].std()

    # Standardize the regular rows and set sentinel rows to 0 (the standardized
    # mean). The original code announced this replacement in a comment but never
    # performed it, so sentinel rows were written out as extreme negative z-scores.
    # NOTE(review): confirm the training-side preprocessing applies the same rule.
    standardized = ((test_df[col] - mean) / std).where(mask, 0)

    # Store the result back into the frame.
    test_df[col] = standardized

#### 前処理済みのファイルをアウトプット

In [15]:
# Write the fully preprocessed test set; the row index is not saved.
test_df.to_csv(
    path_or_buf='./output/中間データ/test_preprocessed.csv',
    index=False,
)