In [None]:
# Mount Google Drive at /content/drive (Colab only); the data paths used
# below are read from this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Project data folder, given relative to the working directory.
# Named data_dir (not dir) so the builtin dir() stays usable in the notebook.
data_dir = 'drive/MyDrive/3001 Project/data/'
raw_dir = f'{data_dir}raw data/'

# Raw competition files.
cal = pd.read_csv(f'{raw_dir}calendar.csv')
sell = pd.read_csv(f'{raw_dir}sell_prices.csv.zip', compression = 'zip')
train_eval = pd.read_csv(f'{raw_dir}sales_train_evaluation.csv.zip',
                         compression = 'zip')

# Pre-built calendar features, indexed by the day label in column 'd' so they
# can be joined to the transposed sales series.
cal_dummies = pd.read_csv(f'{data_dir}calendar_w_dummies.csv').set_index('d')

# Item number within FOODS_3 that the whole notebook models.
product = '099'

In [656]:
def get_active(df):
  """Drop the leading rows that come before the first positive value.

  Args:
    df: DataFrame of numeric values, one row per time step.

  Returns:
    A copy of `df` starting at the first row whose maximum across columns is
    greater than zero. When no row is positive (or `df` is empty) every row
    is kept. The input frame is not modified.
  """
  # Kept as a separate mask rather than a temporary 'active' column, so the
  # caller's frame is left untouched.
  is_active = (df.max(axis = 1) > 0).to_numpy()
  # argmax returns the position of the first True; it would also return 0 for
  # an all-False mask and fail on an empty one, hence the any() guard.
  start = int(is_active.argmax()) if is_active.any() else 0

  return df.iloc[start:].copy()


def trans_data(product, state, store):
  """Build next-day-sales features and targets for one FOODS_3 item in one store.

  Reads the module-level `train_eval` and `cal_dummies` frames.

  Args:
    product: Item number suffix, e.g. '099' for item_id 'FOODS_3_099'.
    state: State code. For 'CA', 'TX' and 'WI' the SNAP columns of the other
      two states are removed.
    store: Store number within the state, e.g. '1' for store_id 'CA_1'.

  Returns:
    (X_train, Y_train, X_val, Y_val, X_test, Y_test). The last 28 rows form
    the test split, the 28 rows before them the validation split, and all
    earlier rows the training split. Each target is the unit sales of the row
    following its feature row.

  Raises:
    ValueError: If no row of `train_eval` matches the requested series.
  """
  item = 'FOODS_3_' + product
  s = state + '_' + store
  # Filter first: only the matching row is needed, so the full sales table is
  # never copied.
  wanted = (train_eval.state_id == state) & (train_eval.item_id == item) & (train_eval.store_id == s)
  df = train_eval[wanted].set_index('id')
  if df.empty:
    raise ValueError('no sales series found for ' + item + ' in store ' + s)

  # Skip the first five remaining columns (presumably the item/dept/cat/
  # store/state ids - confirm against the CSV) and turn the rest into one row
  # per day.
  df = df.iloc[:, 5:].T
  # Work on a private copy so adding feature columns never writes into a view.
  df = get_active(df).copy()

  # Rolling statistics of the sales column; each window includes the current row.
  sales = df.iloc[:, 0]
  for window in (42, 28, 14):
    df['roll_avg_' + str(window)] = sales.rolling(window).mean()
  for window in (42, 28, 14):
    df['roll_std_' + str(window)] = sales.rolling(window).std()

  df = df.merge(cal_dummies, left_index = True, right_index = True)

  # Keep only this state's SNAP indicator.
  snap_cols = ('snap_CA', 'snap_TX', 'snap_WI')
  own_snap = 'snap_' + state
  if own_snap in snap_cols:
    df = df.drop([col for col in snap_cols if col != own_snap], axis = 1)

  # Dropping rows with NaN (e.g. the warm-up rows of the rolling windows).
  df = df.dropna()
  # Shift by one row: features of row t predict the sales of row t + 1.
  Y = df.iloc[1:, 0]
  df = df.iloc[:-1, :]

  # Standardize features with statistics of the training rows only, so the
  # validation and test periods do not leak into the scaling. Fall back to
  # all rows when there are too few training rows to compute a deviation.
  train_rows = df.iloc[:-56, :]
  reference = train_rows if len(train_rows) > 1 else df
  center = reference.mean()
  # A zero deviation would turn a constant column into NaN; dividing by 1
  # leaves it at 0 instead.
  scale = reference.std().replace(0, 1)
  df = (df - center)/scale

  return df.iloc[:-56,:],Y.iloc[:-56],df.iloc[-56:-28,:],Y.iloc[-56:-28],df.iloc[-28:,:],Y.iloc[-28:]


def rmsse(train, val, pred):
  """Root mean squared scaled error of a forecast.

  The mean squared forecast error is divided by the mean squared one-step
  difference of the training series, measured from its first positive value
  onwards, and the square root of that ratio is returned.

  Args:
    train: Historical values (Series or array-like) used for the scale.
    val: Actual values over the forecast horizon.
    pred: Predicted values, same length as `val`.

  Returns:
    The RMSSE as a float. The result is nan/inf when the training series has
    fewer than two usable points or never changes.
  """
  # Plain arrays: indexing is then positional whatever labels the inputs carry.
  train = np.asarray(train, dtype = float)
  val = np.asarray(val, dtype = float)
  pred = np.asarray(pred, dtype = float)

  num = np.sum(np.square(pred - val))

  # Position of the first positive training value (0 when there is none).
  positive = train > 0
  start = int(positive.argmax()) if positive.any() else 0
  train_sub = train[start:]
  denom = np.sum(np.square(np.diff(train_sub)))/(train_sub.size - 1)

  # Average over the actual horizon length instead of a fixed 28.
  return np.sqrt(num/denom/val.size)


def plot_learn_curve(xlist, ylist, xtitle, store_name):
  """Show a line chart of RMSSE values against the values of one hyper-parameter.

  Args:
    xlist: Hyper-parameter values (x axis).
    ylist: RMSSE obtained for each value (y axis).
    xtitle: Hyper-parameter name; used as x-axis label and in the title.
    store_name: Store label shown in the title.
  """
  # The item number comes from the module-level `product`.
  heading = ''.join(['RMSSE for Different ', xtitle, ' for FOODS_3_', product,
                     ' Daily Unit Sales in ', store_name])
  curve = go.Scatter(x = xlist, y = ylist)
  fig = go.Figure(data = curve)
  fig.update_layout(title = heading, xaxis_title = xtitle, yaxis_title = 'RMSSE')
  fig.show()


def comb_train_val(X_train, Y_train, X_val, Y_val):
  """Append the validation split to the training split.

  Returns:
    (X, Y): features and targets with the validation rows placed after the
    training rows.
  """
  feature_parts = [X_train, X_val]
  target_parts = [Y_train, Y_val]
  return pd.concat(feature_parts), pd.concat(target_parts)


def plot_daily_truth_pred(truth, pred, store):
  """Show actual and predicted daily unit sales as two lines on one chart.

  Args:
    truth: Actual unit sales, one value per day.
    pred: Predicted unit sales, one value per day.
    store: Store label shown in the title.
  """
  fig = go.Figure()
  # One x position per observation, so the chart is not tied to a 28-day horizon.
  fig.add_trace(go.Scatter(x = list(range(len(truth))), y = truth,
                           mode = 'lines', name = 'Ground Truth'))

  fig.add_trace(go.Scatter(x = list(range(len(pred))), y = pred,
                           mode = 'lines', name = 'Prediction'))

  # The item number comes from the module-level `product`.
  title = 'Ground Truth v.s. Random Forest Predictions for FOODS_3_' + product + ' Daily Unit Sales in ' + store
  fig.update_layout(title_text = title, xaxis_title = 'Days',
                    yaxis_title = 'Unit Sales')
  fig.show()


def plot_sum_truth_pred(truth, pred, store):
  """Show a two-bar chart comparing total actual sales with total predicted sales.

  Args:
    truth: Actual unit sales.
    pred: Predicted unit sales.
    store: Store label shown in the title.
  """
  labels = ['Ground Truth', 'Prediction']
  totals = [np.sum(truth), np.sum(pred)]
  fig = go.Figure([go.Bar(x = labels, y = totals)])

  # The item number comes from the module-level `product`.
  heading = ''.join(['Ground Truth v.s. Predictions for Sum of FOODS_3_', product,
                     ' Sales in ', store])
  fig.update_layout(title_text = heading)
  fig.show()


def estimate_depth(X, Y):
  """Fit a default random forest and report how deep its trees grew.

  Returns:
    (shallowest, deepest): smallest and largest tree depth in the forest.
  """
  forest = RandomForestRegressor()
  forest.fit(X, Y)
  tree_depths = []
  for tree in forest.estimators_:
    tree_depths.append(tree.tree_.max_depth)
  return min(tree_depths), max(tree_depths)


def _sweep_rmsse(param, values, fixed, X_train, Y_train, X_val, Y_val,
                 random_state = None):
  """Fit one forest per value of hyper-parameter `param` and score it on validation data.

  Args:
    param: Name of the RandomForestRegressor keyword argument being varied.
    values: Candidate values for `param`.
    fixed: Dict of RandomForestRegressor keyword arguments shared by every fit.
    X_train, Y_train: Training features and targets.
    X_val, Y_val: Validation features and targets.
    random_state: Seed passed to every forest; None keeps the fits unseeded.

  Returns:
    List of validation RMSSE values, in the order of `values`.
  """
  scores = []
  for value in values:
    rf = RandomForestRegressor(random_state = random_state, **fixed,
                               **{param: value})
    rf.fit(X_train, Y_train)
    scores.append(rmsse(Y_train, Y_val, rf.predict(X_val)))

  return scores


def eval_depth(depth, X_train, Y_train, X_val, Y_val, random_state = None):
  """Validation RMSSE for each candidate `max_depth` in `depth`.

  Pass `random_state` to make the sweep repeatable.
  """
  return _sweep_rmsse('max_depth', depth, {}, X_train, Y_train, X_val, Y_val,
                      random_state)


def eval_leaf(leafs, md, X_train, Y_train, X_val, Y_val, random_state = None):
  """Validation RMSSE for each candidate `min_samples_leaf` in `leafs`.

  `md` is the fixed max_depth. Pass `random_state` to make the sweep repeatable.
  """
  fixed = {'max_depth': md}
  return _sweep_rmsse('min_samples_leaf', leafs, fixed, X_train, Y_train,
                      X_val, Y_val, random_state)


def eval_split(split, md, msl, X_train, Y_train, X_val, Y_val,
               random_state = None):
  """Validation RMSSE for each candidate `min_samples_split` in `split`.

  `md` and `msl` are the fixed max_depth and min_samples_leaf. Pass
  `random_state` to make the sweep repeatable.
  """
  fixed = {'max_depth': md, 'min_samples_leaf': msl}
  return _sweep_rmsse('min_samples_split', split, fixed, X_train, Y_train,
                      X_val, Y_val, random_state)


def eval_n(est, md, msl, mss, X_train, Y_train, X_val, Y_val,
           random_state = None):
  """Validation RMSSE for each candidate `n_estimators` in `est`.

  `md`, `msl` and `mss` are the fixed max_depth, min_samples_leaf and
  min_samples_split. Pass `random_state` to make the sweep repeatable.
  """
  fixed = {'max_depth': md, 'min_samples_leaf': msl, 'min_samples_split': mss}
  return _sweep_rmsse('n_estimators', est, fixed, X_train, Y_train,
                      X_val, Y_val, random_state)

In [None]:
# Step sizes and starting point shared by the per-store sweeps.
split_step = 10
leaf_start = 5
leaf_step = 5

# Default candidate grids.
depth_range = list(range(15, 36, 3))
leaf_range = list(range(leaf_start, 51, leaf_step))
est_range = list(range(50, 151, 5))

# Axis labels for the learning-curve plots.
depth_x_label = 'Maximum Depth'
leaf_x_label = 'Minimum Samples of Leaf Nodes'
split_x_label = 'Minimum Number of Samples to Split'
est_x_label = 'Number of Trees in the Forest'

# CA1

In [None]:
# Train / validation / test splits for store CA_1.
(X_train_CA1, Y_train_CA1,
 X_val_CA1, Y_val_CA1,
 X_test_CA1, Y_test_CA1) = trans_data(product, 'CA', '1')

# Train and validation rows stacked, for the final refit.
X_train_val_CA1, Y_train_val_CA1 = comb_train_val(
    X_train_CA1, Y_train_CA1, X_val_CA1, Y_val_CA1)

store = 'CA1'

In [None]:
# Validation RMSSE across the max_depth grid.
RMSSE_depth = eval_depth(
    depth_range, X_train_CA1, Y_train_CA1, X_val_CA1, Y_val_CA1)
plot_learn_curve(depth_range, RMSSE_depth, depth_x_label, store)

Optimal Maximum Depth = 21

In [None]:
depth_CA1 = 21

In [None]:
# Validation RMSSE across the min_samples_leaf grid, max_depth fixed.
RMSSE_leaf = eval_leaf(
    leaf_range, depth_CA1, X_train_CA1, Y_train_CA1, X_val_CA1, Y_val_CA1)
plot_learn_curve(leaf_range, RMSSE_leaf, leaf_x_label, store)

Optimal Minimum Samples for Leaf Nodes = 40

In [None]:
leaf_CA1 = 40

In [None]:
# Validation RMSSE across min_samples_split candidates; one grid feeds both
# the sweep and the plot.
split_grid = list(range(85, 151, split_step))
RMSSE_split = eval_split(
    split_grid, depth_CA1, leaf_CA1,
    X_train_CA1, Y_train_CA1, X_val_CA1, Y_val_CA1)
plot_learn_curve(split_grid, RMSSE_split, split_x_label, store)

Optimal Minimum Number of Samples to Split = 115

In [None]:
split_CA1 = 115

In [None]:
# Validation RMSSE across forest sizes, other settings fixed.
RMSSE_n = eval_n(
    est_range, depth_CA1, leaf_CA1, split_CA1,
    X_train_CA1, Y_train_CA1, X_val_CA1, Y_val_CA1)
plot_learn_curve(est_range, RMSSE_n, est_x_label, store)

Optimal Model:

Maximum Depth = 21

Minimum Samples for Leaf Nodes = 40

Minimum Number of Samples to Split = 115

Number of Trees in the Forest = 60

In [None]:
n_CA1 = 60

In [None]:
# Refit on train + validation with the chosen settings, then score the test split.
rf_CA1 = RandomForestRegressor(
    max_depth = depth_CA1,
    min_samples_leaf = leaf_CA1,
    min_samples_split = split_CA1,
    n_estimators = n_CA1,
)
rf_CA1.fit(X_train_val_CA1, Y_train_val_CA1)

pred_CA1 = rf_CA1.predict(X_test_CA1)
rmsse_CA1 = rmsse(Y_train_val_CA1, Y_test_CA1, pred_CA1)
rmsse_CA1

0.615175880227047

In [657]:
plot_daily_truth_pred(Y_test_CA1, pred_CA1, 'CA1')

In [None]:
plot_sum_truth_pred(Y_test_CA1, pred_CA1, 'CA1')

# CA2

In [None]:
# Train / validation / test splits for store CA_2.
(X_train_CA2, Y_train_CA2,
 X_val_CA2, Y_val_CA2,
 X_test_CA2, Y_test_CA2) = trans_data(product, 'CA', '2')

# Train and validation rows stacked, for the final refit.
X_train_val_CA2, Y_train_val_CA2 = comb_train_val(
    X_train_CA2, Y_train_CA2, X_val_CA2, Y_val_CA2)

store = 'CA2'

In [None]:
# Validation RMSSE across the max_depth grid.
RMSSE_depth = eval_depth(
    depth_range, X_train_CA2, Y_train_CA2, X_val_CA2, Y_val_CA2)
plot_learn_curve(depth_range, RMSSE_depth, depth_x_label, store)

Optimal Maximum Depth = 21

In [None]:
depth_CA2 = 21

In [None]:
# Validation RMSSE across the min_samples_leaf grid, max_depth fixed.
RMSSE_leaf = eval_leaf(
    leaf_range, depth_CA2, X_train_CA2, Y_train_CA2, X_val_CA2, Y_val_CA2)
plot_learn_curve(leaf_range, RMSSE_leaf, leaf_x_label, store)

Optimal Minimum Samples for Leaf Nodes = 10

In [None]:
leaf_CA2 = 10

In [None]:
# Validation RMSSE across min_samples_split candidates; one grid feeds both
# the sweep and the plot.
split_grid = list(range(20, 101, split_step))
RMSSE_split = eval_split(
    split_grid, depth_CA2, leaf_CA2,
    X_train_CA2, Y_train_CA2, X_val_CA2, Y_val_CA2)
plot_learn_curve(split_grid, RMSSE_split, split_x_label, store)

Optimal Minimum Number of Samples to Split = 50

In [None]:
split_CA2 = 50

In [None]:
# Validation RMSSE across forest sizes, other settings fixed.
RMSSE_n = eval_n(
    est_range, depth_CA2, leaf_CA2, split_CA2,
    X_train_CA2, Y_train_CA2, X_val_CA2, Y_val_CA2)
plot_learn_curve(est_range, RMSSE_n, est_x_label, store)

Optimal Model:

Maximum Depth = 21

Minimum Samples for Leaf Nodes = 10

Minimum Number of Samples to Split = 50

Number of Trees in the Forest = 80

In [None]:
# Refit on train + validation with the chosen settings, then score the test split.
n_CA2 = 80
rf_CA2 = RandomForestRegressor(
    max_depth = depth_CA2,
    min_samples_leaf = leaf_CA2,
    min_samples_split = split_CA2,
    n_estimators = n_CA2,
)
rf_CA2.fit(X_train_val_CA2, Y_train_val_CA2)

pred_CA2 = rf_CA2.predict(X_test_CA2)
rmsse_CA2 = rmsse(Y_train_val_CA2, Y_test_CA2, pred_CA2)
rmsse_CA2

0.8947989522380245

In [658]:
plot_daily_truth_pred(Y_test_CA2, pred_CA2, 'CA2')

In [None]:
plot_sum_truth_pred(Y_test_CA2, pred_CA2, 'CA2')

# CA3

In [None]:
# Train / validation / test splits for store CA_3.
(X_train_CA3, Y_train_CA3,
 X_val_CA3, Y_val_CA3,
 X_test_CA3, Y_test_CA3) = trans_data(product, 'CA', '3')

# Train and validation rows stacked, for the final refit.
X_train_val_CA3, Y_train_val_CA3 = comb_train_val(
    X_train_CA3, Y_train_CA3, X_val_CA3, Y_val_CA3)

store = 'CA3'


In [None]:
# Validation RMSSE across the max_depth grid.
RMSSE_depth = eval_depth(
    depth_range, X_train_CA3, Y_train_CA3, X_val_CA3, Y_val_CA3)
plot_learn_curve(depth_range, RMSSE_depth, depth_x_label, store)

Optimal Maximum Depth = 15

In [None]:
depth_CA3 = 15

In [None]:
# Validation RMSSE across the min_samples_leaf grid, max_depth fixed.
RMSSE_leaf = eval_leaf(
    leaf_range, depth_CA3, X_train_CA3, Y_train_CA3, X_val_CA3, Y_val_CA3)
plot_learn_curve(leaf_range, RMSSE_leaf, leaf_x_label, store)

Optimal Minimum Samples of Leaf Nodes = 35

In [None]:
leaf_CA3 = 35

In [None]:
# Validation RMSSE across min_samples_split candidates; one grid feeds both
# the sweep and the plot.
split_grid = list(range(70, 201, split_step))
RMSSE_split = eval_split(
    split_grid, depth_CA3, leaf_CA3,
    X_train_CA3, Y_train_CA3, X_val_CA3, Y_val_CA3)
plot_learn_curve(split_grid, RMSSE_split, split_x_label, store)

Optimal Minimum Number of Samples to Split = 100

In [None]:
split_CA3 = 100

In [None]:
# Validation RMSSE across forest sizes, other settings fixed.
RMSSE_n = eval_n(
    est_range, depth_CA3, leaf_CA3, split_CA3,
    X_train_CA3, Y_train_CA3, X_val_CA3, Y_val_CA3)
plot_learn_curve(est_range, RMSSE_n, est_x_label, store)

Optimal Model:

Maximum Depth = 15

Minimum Samples for Leaf Nodes = 35

Minimum Number of Samples to Split = 100

Number of Trees in the Forest = 60

In [None]:
# Refit on train + validation with the chosen settings, then score the test split.
n_CA3 = 60
rf_CA3 = RandomForestRegressor(
    max_depth = depth_CA3,
    min_samples_leaf = leaf_CA3,
    min_samples_split = split_CA3,
    n_estimators = n_CA3,
)
rf_CA3.fit(X_train_val_CA3, Y_train_val_CA3)

pred_CA3 = rf_CA3.predict(X_test_CA3)
rmsse_CA3 = rmsse(Y_train_val_CA3, Y_test_CA3, pred_CA3)
rmsse_CA3

0.6678187462730941

In [659]:
plot_daily_truth_pred(Y_test_CA3, pred_CA3, 'CA3')

In [None]:
plot_sum_truth_pred(Y_test_CA3, pred_CA3, 'CA3')

# CA4

In [None]:
# Train / validation / test splits for store CA_4.
(X_train_CA4, Y_train_CA4,
 X_val_CA4, Y_val_CA4,
 X_test_CA4, Y_test_CA4) = trans_data(product, 'CA', '4')

# Train and validation rows stacked, for the final refit.
X_train_val_CA4, Y_train_val_CA4 = comb_train_val(
    X_train_CA4, Y_train_CA4, X_val_CA4, Y_val_CA4)

store = 'CA4'


In [None]:
# Validation RMSSE across the max_depth grid.
RMSSE_depth = eval_depth(
    depth_range, X_train_CA4, Y_train_CA4, X_val_CA4, Y_val_CA4)
plot_learn_curve(depth_range, RMSSE_depth, depth_x_label, store)

Optimal Maximum Depth = 27

In [None]:
depth_CA4 = 27

In [None]:
# Validation RMSSE across the min_samples_leaf grid, max_depth fixed.
RMSSE_leaf = eval_leaf(
    leaf_range, depth_CA4, X_train_CA4, Y_train_CA4, X_val_CA4, Y_val_CA4)
plot_learn_curve(leaf_range, RMSSE_leaf, leaf_x_label, store)

Optimal Minimum Samples of Leaf Nodes = 5

In [None]:
leaf_CA4 = 5

In [None]:
# Validation RMSSE across min_samples_split candidates; one grid feeds both
# the sweep and the plot.
split_grid = list(range(15, 101, split_step))
RMSSE_split = eval_split(
    split_grid, depth_CA4, leaf_CA4,
    X_train_CA4, Y_train_CA4, X_val_CA4, Y_val_CA4)
plot_learn_curve(split_grid, RMSSE_split, split_x_label, store)

Optimal Minimum Number of Samples to Split = 35

In [None]:
split_CA4 = 35

In [None]:
# Validation RMSSE across forest sizes, other settings fixed.
RMSSE_n = eval_n(
    est_range, depth_CA4, leaf_CA4, split_CA4,
    X_train_CA4, Y_train_CA4, X_val_CA4, Y_val_CA4)
plot_learn_curve(est_range, RMSSE_n, est_x_label, store)

Optimal Model:

Maximum Depth = 27

Minimum Samples for Leaf Nodes = 5

Minimum Number of Samples to Split = 35

Number of Trees in the Forest = 70


In [None]:
# Refit on train + validation with the chosen settings, then score the test split.
n_CA4 = 70
rf_CA4 = RandomForestRegressor(
    max_depth = depth_CA4,
    min_samples_leaf = leaf_CA4,
    min_samples_split = split_CA4,
    n_estimators = n_CA4,
)
rf_CA4.fit(X_train_val_CA4, Y_train_val_CA4)

pred_CA4 = rf_CA4.predict(X_test_CA4)
rmsse_CA4 = rmsse(Y_train_val_CA4, Y_test_CA4, pred_CA4)
rmsse_CA4

0.6741535397951306

In [660]:
plot_daily_truth_pred(Y_test_CA4, pred_CA4, 'CA4')

In [None]:
plot_sum_truth_pred(Y_test_CA4, pred_CA4, 'CA4')

# TX1

In [None]:
# Train / validation / test splits for store TX_1.
(X_train_TX1, Y_train_TX1,
 X_val_TX1, Y_val_TX1,
 X_test_TX1, Y_test_TX1) = trans_data(product, 'TX', '1')

# Train and validation rows stacked, for the final refit.
X_train_val_TX1, Y_train_val_TX1 = comb_train_val(
    X_train_TX1, Y_train_TX1, X_val_TX1, Y_val_TX1)

store = 'TX1'


In [None]:
# Validation RMSSE across the max_depth grid.
RMSSE_depth = eval_depth(
    depth_range, X_train_TX1, Y_train_TX1, X_val_TX1, Y_val_TX1)
plot_learn_curve(depth_range, RMSSE_depth, depth_x_label, store)

Optimal Maximum Depth = 15

In [None]:
depth_TX1 = 15

In [None]:
# Validation RMSSE across a wider min_samples_leaf grid (up to 100); one grid
# feeds both the sweep and the plot.
leaf_grid = list(range(leaf_start, 101, leaf_step))
RMSSE_leaf = eval_leaf(
    leaf_grid, depth_TX1, X_train_TX1, Y_train_TX1, X_val_TX1, Y_val_TX1)
plot_learn_curve(leaf_grid, RMSSE_leaf, leaf_x_label, store)

Optimal Minimum Samples of Leaf Nodes = 50

In [None]:
leaf_TX1 = 50

In [None]:
# Validation RMSSE across min_samples_split candidates; one grid feeds both
# the sweep and the plot.
split_grid = list(range(105, 201, split_step))
RMSSE_split = eval_split(
    split_grid, depth_TX1, leaf_TX1,
    X_train_TX1, Y_train_TX1, X_val_TX1, Y_val_TX1)
plot_learn_curve(split_grid, RMSSE_split, split_x_label, store)

Optimal Minimum Number of Samples to Split = 135

In [None]:
split_TX1 = 135

In [None]:
# Validation RMSSE across forest sizes, other settings fixed.
RMSSE_n = eval_n(
    est_range, depth_TX1, leaf_TX1, split_TX1,
    X_train_TX1, Y_train_TX1, X_val_TX1, Y_val_TX1)
plot_learn_curve(est_range, RMSSE_n, est_x_label, store)

Optimal Model:

Maximum Depth = 15

Minimum Samples for Leaf Nodes = 50

Minimum Number of Samples to Split = 135

Number of Trees in the Forest = 60

In [None]:
# Refit on train + validation with the chosen settings, then score the test split.
n_TX1 = 60
rf_TX1 = RandomForestRegressor(
    max_depth = depth_TX1,
    min_samples_leaf = leaf_TX1,
    min_samples_split = split_TX1,
    n_estimators = n_TX1,
)
rf_TX1.fit(X_train_val_TX1, Y_train_val_TX1)

pred_TX1 = rf_TX1.predict(X_test_TX1)
rmsse_TX1 = rmsse(Y_train_val_TX1, Y_test_TX1, pred_TX1)
rmsse_TX1


0.6510512974428965

In [661]:
plot_daily_truth_pred(Y_test_TX1, pred_TX1, 'TX1')

In [None]:
plot_sum_truth_pred(Y_test_TX1, pred_TX1, 'TX1')

# TX2

In [None]:
# Train / validation / test splits for store TX_2.
(X_train_TX2, Y_train_TX2,
 X_val_TX2, Y_val_TX2,
 X_test_TX2, Y_test_TX2) = trans_data(product, 'TX', '2')

# Train and validation rows stacked, for the final refit.
X_train_val_TX2, Y_train_val_TX2 = comb_train_val(
    X_train_TX2, Y_train_TX2, X_val_TX2, Y_val_TX2)

store = 'TX2'


In [None]:
# Validation RMSSE across the max_depth grid.
RMSSE_depth = eval_depth(
    depth_range, X_train_TX2, Y_train_TX2, X_val_TX2, Y_val_TX2)
plot_learn_curve(depth_range, RMSSE_depth, depth_x_label, store)

Optimal Maximum Depth = 24

In [None]:
depth_TX2 = 24

In [None]:
# Validation RMSSE across the min_samples_leaf grid, max_depth fixed.
RMSSE_leaf = eval_leaf(
    leaf_range, depth_TX2, X_train_TX2, Y_train_TX2, X_val_TX2, Y_val_TX2)
plot_learn_curve(leaf_range, RMSSE_leaf, leaf_x_label, store)

Optimal Minimum Samples of Leaf Nodes = 20

In [None]:
leaf_TX2 = 20

In [None]:
# Validation RMSSE across min_samples_split candidates; one grid feeds both
# the sweep and the plot.
split_grid = list(range(45, 101, split_step))
RMSSE_split = eval_split(
    split_grid, depth_TX2, leaf_TX2,
    X_train_TX2, Y_train_TX2, X_val_TX2, Y_val_TX2)
plot_learn_curve(split_grid, RMSSE_split, split_x_label, store)

Optimal Minimum Number of Samples to Split = 45

In [None]:
split_TX2 = 45

In [None]:
# Validation RMSSE across forest sizes, other settings fixed.
# The tuned depth for this store is stored in depth_TX2; no variable named
# max_depth_TX2 is defined in this notebook.
RMSSE_n = eval_n(est_range, depth_TX2, leaf_TX2, split_TX2, X_train_TX2,
                 Y_train_TX2, X_val_TX2, Y_val_TX2)

plot_learn_curve(est_range, RMSSE_n, est_x_label, store)

Optimal Model:

Maximum Depth = 24

Minimum Samples for Leaf Nodes = 20

Minimum Number of Samples to Split = 45

Optimal Number of Trees in the Forest = 50

In [None]:
# Refit on train + validation with the chosen settings, then score the test split.
n_TX2 = 50
rf_TX2 = RandomForestRegressor(
    max_depth = depth_TX2,
    min_samples_leaf = leaf_TX2,
    min_samples_split = split_TX2,
    n_estimators = n_TX2,
)
rf_TX2.fit(X_train_val_TX2, Y_train_val_TX2)

pred_TX2 = rf_TX2.predict(X_test_TX2)
rmsse_TX2 = rmsse(Y_train_val_TX2, Y_test_TX2, pred_TX2)
rmsse_TX2


0.5418974475010089

In [662]:
plot_daily_truth_pred(Y_test_TX2, pred_TX2, 'TX2')

In [None]:
plot_sum_truth_pred(Y_test_TX2, pred_TX2, 'TX2')

# TX3

In [None]:
# Train / validation / test splits for store TX_3.
(X_train_TX3, Y_train_TX3,
 X_val_TX3, Y_val_TX3,
 X_test_TX3, Y_test_TX3) = trans_data(product, 'TX', '3')

# Train and validation rows stacked, for the final refit.
X_train_val_TX3, Y_train_val_TX3 = comb_train_val(
    X_train_TX3, Y_train_TX3, X_val_TX3, Y_val_TX3)

store = 'TX3'

In [None]:
# Validation RMSSE across the max_depth grid.
RMSSE_depth = eval_depth(
    depth_range, X_train_TX3, Y_train_TX3, X_val_TX3, Y_val_TX3)
plot_learn_curve(depth_range, RMSSE_depth, depth_x_label, store)

Optimal Maximum Depth = 21

In [None]:
depth_TX3 = 21

In [None]:
# Validation RMSSE across the min_samples_leaf grid, max_depth fixed.
RMSSE_leaf = eval_leaf(
    leaf_range, depth_TX3, X_train_TX3, Y_train_TX3, X_val_TX3, Y_val_TX3)
plot_learn_curve(leaf_range, RMSSE_leaf, leaf_x_label, store)

Minimum Samples of Leaf Nodes = 10

In [None]:
leaf_TX3 = 10

In [None]:
# Validation RMSSE across min_samples_split candidates; one grid feeds both
# the sweep and the plot.
split_grid = list(range(25, 101, split_step))
RMSSE_split = eval_split(
    split_grid, depth_TX3, leaf_TX3,
    X_train_TX3, Y_train_TX3, X_val_TX3, Y_val_TX3)
plot_learn_curve(split_grid, RMSSE_split, split_x_label, store)

Optimal Minimum Number of Samples to Split = 35

In [None]:
split_TX3 = 35

In [None]:
# Validation RMSSE across forest sizes, other settings fixed.
RMSSE_n = eval_n(
    est_range, depth_TX3, leaf_TX3, split_TX3,
    X_train_TX3, Y_train_TX3, X_val_TX3, Y_val_TX3)
plot_learn_curve(est_range, RMSSE_n, est_x_label, store)

Optimal Model:

Maximum Depth = 21

Minimum Samples for Leaf Nodes = 10

Minimum Number of Samples to Split = 35

Number of Trees in the Forest = 50

In [None]:
# Refit on train + validation with the chosen settings, then score the test split.
n_TX3 = 50
rf_TX3 = RandomForestRegressor(
    max_depth = depth_TX3,
    min_samples_leaf = leaf_TX3,
    min_samples_split = split_TX3,
    n_estimators = n_TX3,
)
rf_TX3.fit(X_train_val_TX3, Y_train_val_TX3)

pred_TX3 = rf_TX3.predict(X_test_TX3)
rmsse_TX3 = rmsse(Y_train_val_TX3, Y_test_TX3, pred_TX3)
rmsse_TX3


0.5392629890389373

In [667]:
store = 'TX3'
plot_daily_truth_pred(Y_test_TX3, pred_TX3, store)

In [None]:
plot_sum_truth_pred(Y_test_TX3, pred_TX3, store)

# WI1

In [None]:
# Train / validation / test splits for store WI_1.
(X_train_WI1, Y_train_WI1,
 X_val_WI1, Y_val_WI1,
 X_test_WI1, Y_test_WI1) = trans_data(product, 'WI', '1')

# Train and validation rows stacked, for the final refit.
X_train_val_WI1, Y_train_val_WI1 = comb_train_val(
    X_train_WI1, Y_train_WI1, X_val_WI1, Y_val_WI1)

store = 'WI1'


In [None]:
# Validation RMSSE across the max_depth grid.
RMSSE_depth = eval_depth(
    depth_range, X_train_WI1, Y_train_WI1, X_val_WI1, Y_val_WI1)
plot_learn_curve(depth_range, RMSSE_depth, depth_x_label, store)

Optimal Maximum Depth = 18

In [None]:
depth_WI1 = 18

In [None]:
# Validation RMSSE across the min_samples_leaf grid, max_depth fixed.
RMSSE_leaf = eval_leaf(leaf_range, depth_WI1, X_train_WI1, Y_train_WI1,
                       X_val_WI1, Y_val_WI1)

# The axis label for this sweep is leaf_x_label; no variable named
# range_x_label is defined in this notebook.
plot_learn_curve(leaf_range, RMSSE_leaf, leaf_x_label, store)

Optimal Minimum Samples of Leaf Nodes = 15

In [None]:
leaf_WI1 = 15

In [None]:
# Validation RMSSE across min_samples_split candidates; one grid feeds both
# the sweep and the plot.
split_grid = list(range(35, 101, split_step))
RMSSE_split = eval_split(
    split_grid, depth_WI1, leaf_WI1,
    X_train_WI1, Y_train_WI1, X_val_WI1, Y_val_WI1)
plot_learn_curve(split_grid, RMSSE_split, split_x_label, store)

Optimal Number of Samples to Split = 35

In [None]:
split_WI1 = 35

In [None]:
# Validation RMSSE across forest sizes, other settings fixed.
RMSSE_n = eval_n(
    est_range, depth_WI1, leaf_WI1, split_WI1,
    X_train_WI1, Y_train_WI1, X_val_WI1, Y_val_WI1)
plot_learn_curve(est_range, RMSSE_n, est_x_label, store)

Optimal Model:

Maximum Depth = 18

Minimum Samples for Leaf Nodes = 15

Minimum Number of Samples to Split = 35

Number of Trees in the Forest = 60

In [None]:
# Refit on train + validation with the chosen settings, then score the test split.
n_WI1 = 60
rf_WI1 = RandomForestRegressor(
    max_depth = depth_WI1,
    min_samples_leaf = leaf_WI1,
    min_samples_split = split_WI1,
    n_estimators = n_WI1,
)
rf_WI1.fit(X_train_val_WI1, Y_train_val_WI1)

pred_WI1 = rf_WI1.predict(X_test_WI1)
rmsse_WI1 = rmsse(Y_train_val_WI1, Y_test_WI1, pred_WI1)
rmsse_WI1

0.5710824112113764

In [669]:
store = 'WI1'
plot_daily_truth_pred(Y_test_WI1, pred_WI1, store)

In [None]:
plot_sum_truth_pred(Y_test_WI1, pred_WI1, store)

# WI2

In [None]:
# Train / validation / test splits for store WI_2.
(X_train_WI2, Y_train_WI2,
 X_val_WI2, Y_val_WI2,
 X_test_WI2, Y_test_WI2) = trans_data(product, 'WI', '2')

# Train and validation rows stacked, for the final refit.
X_train_val_WI2, Y_train_val_WI2 = comb_train_val(
    X_train_WI2, Y_train_WI2, X_val_WI2, Y_val_WI2)

store = 'WI2'

In [None]:
# Validation RMSSE across the max_depth grid.
RMSSE_depth = eval_depth(
    depth_range, X_train_WI2, Y_train_WI2, X_val_WI2, Y_val_WI2)
plot_learn_curve(depth_range, RMSSE_depth, depth_x_label, store)

Optimal Maximum Depth = 24

In [None]:
depth_WI2 = 24

In [None]:
# Validation RMSSE across a wider min_samples_leaf grid (up to 150); one grid
# feeds both the sweep and the plot.
leaf_grid = list(range(leaf_start, 151, leaf_step))
RMSSE_leaf = eval_leaf(
    leaf_grid, depth_WI2, X_train_WI2, Y_train_WI2, X_val_WI2, Y_val_WI2)
plot_learn_curve(leaf_grid, RMSSE_leaf, leaf_x_label, store)

Optimal Minimum Samples of Leaf Nodes = 120

In [None]:
leaf_WI2 = 120

In [None]:
# Validation RMSSE across min_samples_split candidates; one grid feeds both
# the sweep and the plot.
split_grid = list(range(245, 301, split_step))
RMSSE_split = eval_split(
    split_grid, depth_WI2, leaf_WI2,
    X_train_WI2, Y_train_WI2, X_val_WI2, Y_val_WI2)
plot_learn_curve(split_grid, RMSSE_split, split_x_label, store)

Optimal Minimum Number of Samples to Split = 255

In [None]:
split_WI2 = 255

In [None]:
# Validation RMSSE across forest sizes, other settings fixed.
RMSSE_n = eval_n(
    est_range, depth_WI2, leaf_WI2, split_WI2,
    X_train_WI2, Y_train_WI2, X_val_WI2, Y_val_WI2)
plot_learn_curve(est_range, RMSSE_n, est_x_label, store)

Optimal Model:

Maximum Depth = 24

Minimum Samples for Leaf Nodes = 120

Minimum Number of Samples to Split = 255

Number of Trees in the Forest = 70

In [None]:
# Refit on train + validation with the chosen settings, then score the test split.
n_WI2 = 70
rf_WI2 = RandomForestRegressor(
    max_depth = depth_WI2,
    min_samples_leaf = leaf_WI2,
    min_samples_split = split_WI2,
    n_estimators = n_WI2,
)
rf_WI2.fit(X_train_val_WI2, Y_train_val_WI2)

pred_WI2 = rf_WI2.predict(X_test_WI2)
rmsse_WI2 = rmsse(Y_train_val_WI2, Y_test_WI2, pred_WI2)
rmsse_WI2

1.0696080642687475

In [668]:
store = 'WI2'
plot_daily_truth_pred(Y_test_WI2, pred_WI2, store)

In [None]:
plot_sum_truth_pred(Y_test_WI2, pred_WI2, store)

# WI3

In [None]:
# Train / validation / test splits for store WI_3.
(X_train_WI3, Y_train_WI3,
 X_val_WI3, Y_val_WI3,
 X_test_WI3, Y_test_WI3) = trans_data(product, 'WI', '3')

# Train and validation rows stacked, for the final refit.
X_train_val_WI3, Y_train_val_WI3 = comb_train_val(
    X_train_WI3, Y_train_WI3, X_val_WI3, Y_val_WI3)

store = 'WI3'


In [None]:
# Validation RMSSE across the max_depth grid.
RMSSE_depth = eval_depth(
    depth_range, X_train_WI3, Y_train_WI3, X_val_WI3, Y_val_WI3)
plot_learn_curve(depth_range, RMSSE_depth, depth_x_label, store)

Optimal Maximum Depth = 15

In [None]:
depth_WI3 = 15

In [None]:
# Validation RMSSE across the min_samples_leaf grid, max_depth fixed.
RMSSE_leaf = eval_leaf(
    leaf_range, depth_WI3, X_train_WI3, Y_train_WI3, X_val_WI3, Y_val_WI3)
plot_learn_curve(leaf_range, RMSSE_leaf, leaf_x_label, store)

Optimal Minimum Samples of Leaf Nodes = 25

In [None]:
leaf_WI3 = 25

In [None]:
# Validation RMSSE across min_samples_split candidates; one grid feeds both
# the sweep and the plot.
split_grid = list(range(55, 151, split_step))
RMSSE_split = eval_split(
    split_grid, depth_WI3, leaf_WI3,
    X_train_WI3, Y_train_WI3, X_val_WI3, Y_val_WI3)
plot_learn_curve(split_grid, RMSSE_split, split_x_label, store)

Optimal Minimum Number of Samples to Split = 85

In [None]:
split_WI3 = 85

In [None]:
# Validation RMSSE across forest sizes, other settings fixed.
RMSSE_n = eval_n(
    est_range, depth_WI3, leaf_WI3, split_WI3,
    X_train_WI3, Y_train_WI3, X_val_WI3, Y_val_WI3)
plot_learn_curve(est_range, RMSSE_n, est_x_label, store)

Optimal Model:

Maximum Depth = 15

Minimum Samples for Leaf Nodes = 25

Minimum Number of Samples to Split = 85

Number of Trees in the Forest = 80

In [None]:
# Refit on train + validation with the chosen settings, then score the test split.
n_WI3 = 80
rf_WI3 = RandomForestRegressor(
    max_depth = depth_WI3,
    min_samples_leaf = leaf_WI3,
    min_samples_split = split_WI3,
    n_estimators = n_WI3,
)
rf_WI3.fit(X_train_val_WI3, Y_train_val_WI3)

pred_WI3 = rf_WI3.predict(X_test_WI3)
rmsse_WI3 = rmsse(Y_train_val_WI3, Y_test_WI3, pred_WI3)
rmsse_WI3

0.5788263023813098

In [666]:
# Set the label here so the title does not depend on which store cell last
# assigned the global `store`.
store = 'WI3'
plot_daily_truth_pred(Y_test_WI3, pred_WI3, store)

In [None]:
# Literal label: the title then does not depend on which store cell last
# assigned the global `store`.
plot_sum_truth_pred(Y_test_WI3, pred_WI3, 'WI3')