In [1]:
#Standard Imports
import pandas as pd
import numpy as np
import os
import time
import joblib

#Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#Modelling
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
import xgboost as xgb

from sklearn.metrics import mean_squared_error as mse

In [2]:
#Load from preprocessing.py
from preprocessing import load_avvail_data, engineer_features, timeseries_aggregate

In [3]:
# Model persistence settings: the output directory, the version tag that is
# embedded in the saved file name, and a free-form note about that version.
MODEL_VERSION_NOTE = '-'
MODEL_VERSION = 0.1
MODEL_DIR = 'models'

In [4]:
def _model_train(X,y, random_state = 1234):
    """
    Grid-search five regressors, pick the one with the lowest hold-out RMSE,
    refit it on all the data and persist it to disk.

    Candidates: DecisionTreeRegressor, RandomForestRegressor,
    GradientBoostingRegressor, AdaBoostRegressor and XGBRegressor. Each one is
    tuned with a 5-fold GridSearchCV on an 80% training split and scored on
    the remaining 20%.

    Parameters
    ----------
    X : feature matrix accepted by the scikit-learn estimators.
    y : target values aligned with ``X``.
    random_state : int, default 1234
        Seed used for the train/test split and for every candidate regressor.

    Returns
    -------
    list of float
        Hold-out RMSE of each candidate, in the order DTR, RFR, GBR, ADA, XGB.

    Side effects
    ------------
    Prints the name of the winning model and the elapsed time, creates
    ``MODEL_DIR`` if it does not exist, and writes the refitted winner to
    ``MODEL_DIR/model_<MODEL_VERSION>`` with joblib.
    """
    start_time = time.time()

    # NOTE(review): this is a random (shuffled) split; if the rows are
    # time-ordered, later rows can end up in the training set -- confirm that
    # is acceptable for the use case.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)

    # One search space per candidate, keyed by the same short name as the
    # estimator below.
    # NOTE(review): 'mse', 'mae', 'auto', 'ls' and 'lad' are kept exactly as
    # the notebook was run; newer scikit-learn releases are believed to have
    # renamed or removed these spellings -- confirm against the installed
    # version before upgrading.
    search_spaces = {
        'DTR': {'criterion': ['mse', 'mae', 'friedman_mse'],
                'max_depth': [None, 10, 20, 50],
                'max_features': ['auto', 'sqrt', 'log2']},
        'RFR': {'criterion': ['mse', 'mae'],
                'max_features': ['auto', 'sqrt']},
        'GBR': {'loss': ['ls', 'lad', 'huber', 'quantile'],
                'learning_rate': [0.1, 0.01, 0.001]},
        'ADA': {'loss': ['linear', 'square'],
                'learning_rate': [0.1, 0.01, 0.5]},
        'XGB': {'learning_rate': [0.05, 0.1, 0.01],
                'max_depth': [1, 5, 50],
                'n_estimators': [100, 1000, 500]},
    }

    # Every candidate is seeded from ``random_state`` (the XGBoost seed used
    # to be hard-coded in its grid and ignored the argument).
    regressor_dict = {
        'DTR': DecisionTreeRegressor(random_state=random_state),
        'RFR': RandomForestRegressor(random_state=random_state),
        'GBR': GradientBoostingRegressor(random_state=random_state),
        'ADA': AdaBoostRegressor(random_state=random_state),
        'XGB': xgb.XGBRegressor(random_state=random_state),
    }

    # The search is run on the bare regressors, as before. The scaling
    # pipeline that used to be constructed here was never handed to
    # GridSearchCV, so it has been dropped rather than silently ignored.
    models = {}
    for model_name, regressor in regressor_dict.items():
        grid = GridSearchCV(regressor,
                            param_grid=search_spaces[model_name], cv=5)
        grid.fit(X_train, y_train)
        models[model_name] = grid

    # Score every tuned candidate on the hold-out split (lower RMSE is better).
    model_scores = []
    for grid in models.values():
        y_pred = grid.predict(X_test)
        model_scores.append(np.sqrt(mse(y_test, y_pred)))

    # model_scores is in the same order as ``models``, so the argmin position
    # identifies the winner by name.
    best_name = list(models)[int(np.argmin(model_scores))]
    best_model = models[best_name]

    print(f'The best model is {best_name}')

    # Refit on the full data set. best_model is the GridSearchCV object, so
    # this repeats the parameter search using every row.
    best_model.fit(X, y)

    # Persist the refitted winner; exist_ok avoids a check-then-create race.
    os.makedirs(MODEL_DIR, exist_ok=True)
    saved_model = os.path.join(MODEL_DIR, f'model_{MODEL_VERSION}')
    joblib.dump(best_model, saved_model)

    print(f'The time taken was {time.time() - start_time}')
    return model_scores
        
        
        

In [5]:
# Load data/ts-data/ts-all.csv into a DataFrame; the path is assembled with
# os.path.join so it is not tied to one path separator.
ts_all = pd.read_csv(os.path.join('data', 'ts-data', 'ts-all.csv'))

In [6]:
# engineer_features is already imported in the earlier preprocessing-import
# cell; this repeat import is redundant but harmless.
from preprocessing import engineer_features
# Build the modelling frame from the loaded data. The displayed result has a
# `target` column, rolling revenue/views/purchases columns and a `dates` column.
eng_df = engineer_features(ts_all)
# Bare last expression so the notebook renders the frame.
eng_df

Unnamed: 0,target,revenue_7d,revenue_14d,revenue_30d,revenue_60d,revenue_365d,views_30d,purchases_30d,dates
0,198307.76,14450.540,14450.540,14450.540,14450.540,14450.540,15047.0,3223.0,2017-11-28
1,183857.22,27862.500,27862.500,27862.500,27862.500,27862.500,29206.0,6500.0,2017-11-29
2,170445.26,41152.750,41152.750,41152.750,41152.750,41152.750,44972.0,9502.0,2017-11-30
3,157155.01,50840.030,50840.030,50840.030,50840.030,50840.030,57496.0,12061.0,2017-12-01
4,147467.73,52283.290,52283.290,52283.290,52283.290,52283.290,60809.0,12463.0,2017-12-02
...,...,...,...,...,...,...,...,...,...
606,28864.49,27837.311,85890.361,168756.531,369473.871,2537160.485,202176.0,38247.0,2019-07-27
607,28864.49,27801.821,85702.681,168336.191,366792.931,2540310.745,203641.0,38491.0,2019-07-28
608,25714.23,24273.641,62620.731,172710.731,364102.411,2541950.795,210666.0,39691.0,2019-07-29
609,21339.69,36481.170,73617.071,187291.511,377218.761,2551819.085,216624.0,40458.0,2019-07-30


In [7]:
# Separate the regression target from the predictors; `dates` is dropped as
# well so that only the numeric feature columns remain in X.
y = eng_df['target']
X = eng_df.drop(columns=['target', 'dates'])

In [8]:
# Run the model search with the default seed. Besides printing the winner and
# the elapsed time, this saves the refitted model under MODEL_DIR; the returned
# list of per-model hold-out RMSEs is shown as the cell output.
_model_train(X,y)

The best model is XGB
The time taken was 102.73463678359985


[31247.23772385812,
 20834.69736057729,
 23891.91945895753,
 43921.146460412,
 19584.745513780083]

In [9]:
# Scratch check: dict literals separated by a comma form a tuple of dicts.
a = ({'b': 1}, {'c': 2})

In [10]:
# Scratch check: index 1 selects the second dict of the tuple.
a[1]

{'c': 2}

In [11]:
# Scratch check: index 0 selects the first dict of the tuple.
a[0]

{'b': 1}

In [12]:
    # Scratch check: two comma-separated dict literals bind GridSearchParameters
    # to a 2-tuple of parameter grids at notebook (global) scope.
    # NOTE(review): the leading indentation looks pasted from inside
    # _model_train; it is left untouched here.
    GridSearchParameters = {'criterion': ['mse', 'mae', 'friedman_mse'],'max_depth': [None, 10,20,50],'max_features': ['auto', 'sqrt', 'log2']},{'criterion': ['mse', 'mae'],'max_features' : ['auto', 'sqrt'] }

In [13]:
# Scratch check: index 1 returns the second grid (criterion / max_features).
GridSearchParameters[1]

{'criterion': ['mse', 'mae'], 'max_features': ['auto', 'sqrt']}

In [14]:
# Repeat the 80/20 split at notebook scope with the same test_size and seed
# that _model_train uses by default, so the held-out rows can be inspected.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 1234)

In [15]:
# Display the held-out target values from the split above.
y_test

565    187187.440
288    324514.801
298    295694.360
339    321515.082
557    150539.670
          ...    
95     244756.411
146     97205.131
78     143326.244
59     140734.147
185    229099.440
Name: target, Length: 123, dtype: float64

In [16]:
# Scratch check: rebinds `a` (a tuple of dicts in an earlier scratch cell) to a
# single one-key dict.
a = dict(b=1)

In [17]:
# Scratch check: materialise the dict's keys as a list, the same way
# _model_train turns a dict of fitted models into an indexable list of names.
list(a.keys())

['b']