### Imports

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from glob import glob
from os.path import basename, dirname, isdir, isfile, join
from tqdm.notebook import tqdm

import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate

from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [2]:
root = "/Volumes/hd_4tb/results/training/*/*"
dfs = list()
files = sorted(glob(root))
for folder in tqdm(files):
    df = pd.read_csv(join(folder, "norm_simple.csv"))
    df["pred"] = np.load(join(folder, "pred.npy"))
    dfs.append(df)
    
df = pd.concat(dfs)
df.reset_index(drop=True).to_feather("/Volumes/hd_4tb/results/summary/norm_all.feather")

files = glob("/Volumes/hd_4tb/results/training/0110/*/norm_simple.csv")
dfs = list()
for f in files:
    df = pd.read_csv(f)
    df["pred"] = np.load(join(dirname(f), "pred.npy"))
    dfs.append(df)

df = pd.concat(dfs)
train_y = df.pop("pred")
preds = lgbm_model.predict(df)
guess = (df["next"] + df["prev"]) / 2
np.mean(np.abs(train_y - preds)), np.mean(np.abs(train_y - guess))
np.std(np.abs(train_y - preds)), np.std(np.abs(train_y - guess))

### Split data

<div hidden>
# root = "/Volumes/hd_4tb/results/training/*/*"
# dfs = list()
# for folder in tqdm(glob(root)):
#     df = pd.read_csv(join(folder, "simple.csv"))
#     df["pred"] = np.load(join(folder, "pred.npy"))
#     dfs.append(df)
    
# df = pd.concat(dfs)
</div>

In [1]:
import pandas as pd
df = pd.read_feather("/Users/pstetz/Desktop/confidential/.project/summary/norm_all.feather")
# df = df.sample(frac=1)
# df = df.reset_index(drop=True)

In [3]:
features = [c for c in df.columns if "next" not in c]
df = df[features]

In [4]:
train_x, test_x = train_test_split(df, random_state=5, train_size=0.80)
train_y = train_x.pop("pred")
test_y = test_x.pop("pred")

### Training

In [5]:
params = {
#     "num_iteration": [3000, 4000, 4500],
#     'max_depth': [5, 7, 15], 
#     'reg_alpha': [0, 0.1], 'reg_lambda': [0.1, 1], 
    "learning_rate": [0.05, 0.1, 0.15],
    "min_split_gain": [0.2, 0.25], #"min_data_in_leaf": [60], #'min_child_weight': 4,
#     "n_estimators": [125] #, "feature_fraction": 0.5
}
# mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
model = lgb.LGBMRegressor(
    boosting_type="gbdt",
    objective="mean_absolute_error",
    learning_rate=0.1,
    min_data_in_leaf=60,
    min_split_gain=0.25,
    n_estimators=125,
#     num_leaves=85,
    max_depth=-1,
)
# gs = GridSearchCV(model, param_grid=params, cv=5, verbose=10, scoring=mae_scorer)
model.fit(train_x, train_y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=60,
              min_split_gain=0.25, n_estimators=125, n_jobs=-1, num_leaves=31,
              objective='mean_absolute_error', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

### Cross validate

In [45]:
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
model = lgb.LGBMRegressor(
    boosting_type="gbdt",
    objective="mse",
    learning_rate=0.05,
    min_data_in_leaf=80,
    min_split_gain=0.3,
    n_estimators=130,
    num_leaves=90,
    max_depth=-1,
)

cross_validate(model, train_x, train_y, cv=5, scoring=mse_scorer)

{'fit_time': array([16.28457785, 21.97705793, 29.57167625, 30.89967799, 20.19947004]),
 'score_time': array([0.59632802, 0.50218415, 0.52898574, 0.85004783, 0.46883893]),
 'test_score': array([-0.69077921, -0.68144191, -0.68156026, -0.68215386, -0.67898879])}

### Save model

In [15]:
model_filepath = "/Users/pstetz/Desktop/confidential/.project/run/lgbm/4_mae.pkl"

if not isdir(dirname(model_filepath)):
    os.makedirs(dirname(model_filepath))
    
with open(model_filepath, 'wb') as f:
    pickle.dump(model, f)