In [1]:
import numpy as np
import pandas as pd
import datetime
from pmdarima.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import optuna
from sklearn.inspection import PartialDependenceDisplay
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error # 各評価指標


from sklearn.model_selection import TimeSeriesSplit

import warnings
warnings.simplefilter('ignore')

import matplotlib.pyplot as plt
plt.style.use('ggplot') #グラフスタイル
plt.rcParams['figure.figsize'] = [12, 9] #グラフサイズ
plt.rcParams['font.size'] = 14 #フォントサイズ

In [2]:
df=pd.read_csv('dataset.csv',                         
               index_col='cdr_date',           
               parse_dates=True)           
df.head() 

Unnamed: 0_level_0,call_num,acc_get_cnt,acc_get_cnt_lag1,acc_get_cnt_lag7,acc_get_cnt_window7,acc_get_cnt_expanding,cm_flg,cm_cumsum,dow,woy,...,cos7_3,sin7_4,cos7_4,sin7_5,cos7_5,sin7_6,cos7_6,sin7_7,cos7_7,t
cdr_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-06-09,-0.801105,-0.297227,-0.569686,-0.743068,-0.035384,-0.035384,0,8,6,23,...,1.0,-9.797174e-16,1.0,-1.224647e-15,1.0,-1.469576e-15,1.0,-1.714506e-15,1.0,0
2018-06-10,0.0,-0.421072,-0.297227,-0.049538,0.028307,-0.068115,0,8,7,23,...,-0.900969,-0.4338837,-0.900969,-0.9749279,-0.222521,-0.7818315,0.62349,-1.959435e-15,1.0,1
2018-06-11,1.468692,0.978373,-0.421072,0.854528,-0.024769,-0.107332,0,8,1,24,...,0.62349,0.7818315,0.62349,0.4338837,-0.900969,-0.9749279,-0.222521,-2.204364e-15,1.0,2
2018-06-12,-0.567449,-0.185767,0.978373,-0.210536,-0.007077,0.001238,0,8,2,24,...,-0.222521,-0.9749279,-0.222521,0.7818315,0.62349,-0.4338837,-0.900969,-2.449294e-15,1.0,3
2018-06-13,0.250345,0.35915,-0.185767,0.136229,-0.003538,-0.015762,0,8,3,24,...,-0.222521,0.9749279,-0.222521,-0.7818315,0.62349,0.4338837,-0.900969,4.411204e-15,1.0,4


In [4]:
#直近3ヶ月をテストデータにする
train, test = train_test_split(df, test_size=91)

In [5]:
y_train = train['acc_get_cnt']    #目的変数y
X_train = train.drop('acc_get_cnt', axis=1) #説明変数X

y_test = test['acc_get_cnt']              #目的変数y
X_test = test.drop('acc_get_cnt', axis=1) #説明変数X

In [6]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            pass

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

Memory usage of dataframe is 0.13 MB
Memory usage after optimization is: 0.04 MB
Decreased by 73.1%
Memory usage of dataframe is 0.02 MB
Memory usage after optimization is: 0.01 MB
Decreased by 73.6%


# Random Forest

In [7]:
regressor = RandomForestRegressor(random_state=123)
regressor.fit(X_train, y_train)

In [8]:
# 特徴量重要度（Feature Importances）
df_importance = pd.DataFrame(zip(X_train.columns, regressor.feature_importances_),
                             columns=["Features","Importance"])
df_importance = df_importance.sort_values("Importance",
                                          ascending=False)
df_importance #確認

Unnamed: 0,Features,Importance
0,call_num,0.397558
3,acc_get_cnt_window7,0.101858
4,acc_get_cnt_expanding,0.091653
1,acc_get_cnt_lag1,0.050899
23,cos7_4,0.038681
2,acc_get_cnt_lag7,0.03462
14,search_cnt,0.03427
21,cos7_3,0.033403
27,cos7_6,0.030142
25,cos7_5,0.028211


In [9]:
train_pred = regressor.predict(X_train)

In [10]:
# 学習データのコピー
y_train_new = y_train.copy()

# 説明変数Xを更新しながら予測を実施
for i in range(len(y_test)):
    
    #当期の予測の実施
    X_value =  X_test.iloc[i:(i+1),:]
    y_value_pred = regressor.predict(X_value)
    y_value_pred = pd.Series(y_value_pred,index=[X_value.index[0]])
    y_train_new = pd.concat([y_train_new,y_value_pred])
    
    #次期の説明変数Xの計算
    lag1_new = y_train_new.iloc[-1] #lag1
    lag7_new = y_train_new.iloc[-7] #lag1
    window7_new = y_train_new[-7:].mean() #window7
    expanding_new = y_train_new.mean() #expanding
    
    #次期の説明変数Xの更新
    X_test.iloc[(i+1):(i+2),1] = lag1_new
    X_test.iloc[(i+1):(i+2),2] = lag7_new
    X_test.iloc[(i+1):(i+2),3] = window7_new
    X_test.iloc[(i+1):(i+2),4] = expanding_new
    
# 予測値の代入
test_pred = y_train_new[-91:]
    
# 更新後の説明変数X
print(X_test)

            call_num  acc_get_cnt_lag1  acc_get_cnt_lag7  acc_get_cnt_window7  \
cdr_date                                                                        
2020-01-01  0.000000         -0.235352          0.173340            -0.107910   
2020-01-02  0.000000          0.223664          0.222920            -0.100739   
2020-01-03  0.216919          0.238277         -0.037153            -0.098545   
2020-01-04 -0.216919          0.247566         -0.854528            -0.057871   
2020-01-05  0.000000         -0.484976          0.086691            -0.005078   
...              ...               ...               ...                  ...   
2020-03-27  0.108459          0.131895         -0.122606             0.024274   
2020-03-28 -1.059570         -0.237039         -1.009087             0.007926   
2020-03-29  0.000000         -1.000046         -0.250042             0.009218   
2020-03-30  1.118164         -0.289549          1.338885             0.003574   
2020-03-31 -0.275391        

In [11]:
print('決定係数（R2） = ', r2_score(y_test, test_pred).round(decimals=3))
print('平均絶対誤差（MAE） = ', mean_absolute_error(y_test, test_pred).round(decimals=3))
print('平均二乗誤差（MSE） = ', mean_squared_error(y_test, test_pred).round(decimals=3))
print('平均二乗平方根誤差（RMSE） = ', np.sqrt(mean_squared_error(y_test, test_pred)).round(decimals=3))

決定係数（R2） =  0.425
平均絶対誤差（MAE） =  0.293
平均二乗誤差（MSE） =  0.135
平均二乗平方根誤差（RMSE） =  0.367


# optuna

In [12]:
# Optunaの目的関数の設定
def objective(trial):
    
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 10000),
        'max_depth': trial.suggest_int('max_depth', 2, 10, log=True),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])
    }
    
    #モデルのインスタンス生成
    regressor = RandomForestRegressor(**params)
    
    #時系列CV
    ##CVのインスタンス生成　
    tss = TimeSeriesSplit(test_size=91)
    ##CVの結果を格納するハコ
    cv_rmse = []
    ##CVの実行
    for fold, (train_index, valid_index) in enumerate(tss.split(X_train)):
        
        #データ分割
        ##訓練データ
        X_train_data = X_train.iloc[train_index].copy() #説明変数
        y_train_data = y_train.iloc[train_index].copy() #目的変数
        ##検証データ
        X_valid_data = X_train.iloc[valid_index].copy() #説明変数
        y_valid_data = y_train.iloc[valid_index].copy() #目的変数
        
        #学習
        regressor.fit(X_train_data, y_train_data)
        # 学習データのコピー
        y_train_new = y_train.copy()
        
        # 説明変数Xを更新しながら予測を実施
        for i in range(len(y_test)):
            
            #当期の予測の実施
            X_value =  X_test.iloc[i:(i+1),:]
            y_value_pred = regressor.predict(X_value)
            y_value_pred = pd.Series(y_value_pred,index=[X_value.index[0]])
            y_train_new = pd.concat([y_train_new,y_value_pred])
            
            #次期の説明変数Xの計算
            lag1_new = y_train_new.iloc[-1] #lag1
            lag7_new = y_train_new.iloc[-7] #lag1
            window7_new = y_train_new[-7:].mean() #window7
            expanding_new = y_train_new.mean() #expanding
            
            #次期の説明変数Xの更新
            X_test.iloc[(i+1):(i+2),1] = lag1_new
            X_test.iloc[(i+1):(i+2),2] = lag7_new
            X_test.iloc[(i+1):(i+2),3] = window7_new
            X_test.iloc[(i+1):(i+2),4] = expanding_new
            
        # 予測値の代入
        test_pred = y_train_new[-91:]
    
    
        #精度評価（RMSE）
        val_rmse = np.sqrt(mean_squared_error(y_valid_data, test_pred))
        cv_rmse.append(val_rmse)
    
    return np.mean(cv_rmse)

In [13]:
# ハイパーパラメータの探索の実施
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20, n_jobs=-1)

[I 2024-01-29 00:58:56,472] A new study created in memory with name: no-name-2605ee69-4eb1-408c-9073-52745873fa81
[I 2024-01-29 01:06:15,569] Trial 5 finished with value: 0.4754171424522272 and parameters: {'n_estimators': 1736, 'max_depth': 3, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 5 with value: 0.4754171424522272.
[I 2024-01-29 01:16:35,554] Trial 0 finished with value: 0.4936411142954341 and parameters: {'n_estimators': 2988, 'max_depth': 9, 'min_samples_split': 11, 'min_samples_leaf': 6, 'max_features': 'auto'}. Best is trial 5 with value: 0.4754171424522272.
[I 2024-01-29 01:22:03,887] Trial 2 finished with value: 0.4726386754081443 and parameters: {'n_estimators': 5047, 'max_depth': 8, 'min_samples_split': 18, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 2 with value: 0.4726386754081443.
[I 2024-01-29 01:23:35,860] Trial 1 finished with value: 0.4744107151671445 and parameters: {'n_estimators': 7946, 'max_depth': 3,

In [14]:
# 最適解の出力
print(study.best_params)

{'n_estimators': 8091, 'max_depth': 4, 'min_samples_split': 15, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


In [15]:
regressor = RandomForestRegressor(random_state=456, **study.best_params)
regressor.fit(X_train, y_train)

In [16]:
# 特徴量重要度（Feature Importances）
df_importance = pd.DataFrame(zip(X_train.columns, regressor.feature_importances_),
                             columns=["Features","Importance"])
df_importance = df_importance.sort_values("Importance",
                                          ascending=False)
df_importance #確認

Unnamed: 0,Features,Importance
0,call_num,0.177792
7,dow,0.086341
27,cos7_6,0.066004
17,cos7_1,0.06369
16,sin7_1,0.061382
26,sin7_6,0.059755
23,cos7_4,0.059001
21,cos7_3,0.057361
25,cos7_5,0.055393
19,cos7_2,0.050748


In [17]:
train_pred = regressor.predict(X_train)

In [18]:
# 学習データのコピー
y_train_new = y_train.copy()

# 説明変数Xを更新しながら予測を実施
for i in range(len(y_test)):
    
    #当期の予測の実施
    X_value =  X_test.iloc[i:(i+1),:]
    y_value_pred = regressor.predict(X_value)
    y_value_pred = pd.Series(y_value_pred,index=[X_value.index[0]])
    y_train_new = pd.concat([y_train_new,y_value_pred])
    
    #次期の説明変数Xの計算
    lag1_new = y_train_new.iloc[-1] #lag1
    lag7_new = y_train_new.iloc[-7] #lag1
    window7_new = y_train_new[-7:].mean() #window7
    expanding_new = y_train_new.mean() #expanding
    
    #次期の説明変数Xの更新
    X_test.iloc[(i+1):(i+2),1] = lag1_new
    X_test.iloc[(i+1):(i+2),2] = lag7_new
    X_test.iloc[(i+1):(i+2),3] = window7_new
    X_test.iloc[(i+1):(i+2),4] = expanding_new
    
# 予測値の代入
test_pred = y_train_new[-91:]
    
# 更新後の説明変数X
print(X_test)

            call_num  acc_get_cnt_lag1  acc_get_cnt_lag7  acc_get_cnt_window7  \
cdr_date                                                                        
2020-01-01  0.000000         -0.235352          0.173340            -0.107910   
2020-01-02  0.000000          0.070826          0.222920            -0.122573   
2020-01-03  0.216919          0.100679         -0.037153            -0.140036   
2020-01-04 -0.216919         -0.044353         -0.854528            -0.141064   
2020-01-05  0.000000         -0.688501          0.086691            -0.117346   
...              ...               ...               ...                  ...   
2020-03-27  0.108459          0.101056         -0.116383             0.013526   
2020-03-28 -1.059570         -0.081650         -0.763067             0.018487   
2020-03-29  0.000000         -0.786740         -0.204179             0.015105   
2020-03-30  1.118164         -0.205804          0.905858             0.014873   
2020-03-31 -0.275391        

In [19]:
print('決定係数（R2） = ', r2_score(y_test, test_pred).round(decimals=3))
print('平均絶対誤差（MAE） = ', mean_absolute_error(y_test, test_pred).round(decimals=3))
print('平均二乗誤差（MSE） = ', mean_squared_error(y_test, test_pred).round(decimals=3))
print('平均二乗平方根誤差（RMSE） = ', np.sqrt(mean_squared_error(y_test, test_pred)).round(decimals=3))


決定係数（R2） =  0.59
平均絶対誤差（MAE） =  0.243
平均二乗誤差（MSE） =  0.096
平均二乗平方根誤差（RMSE） =  0.31


In [20]:
test_pred_reset = test_pred.reset_index()  # インデックスをリセットし、データフレームに変換する

# CSVファイルに書き出す
test_pred_reset.to_csv('rf_pred.csv', index=False)  