# AI ENTERPRISE WORKFLOW CERTIFICATION

### Capstone Project - Part 2. Model Building and Selection.

### 1. State the different modelling approaches that you will compare to address the opportunity at hand.

We need not necessarily treat this problem as a time series one. We can take the past revenue as a feature and use regression models. To ensure that we do not violate any of the assumptions required for Linear Regression, we shall primarily focus on tree-based methods.

Our analysis will work as follows:

**Preprocessing:**
* Feature Engineering
* Train_Test_Split
* Standard Scaling


**Model Selection:**

For each of the top 10 countries train a separate model:
* Decision Tree Regression
* Random Forest Regression
* Gradient Boost Regression
* Ada Boost Regression
* XGBoost Regression

Tune parameters on each of these models, and then select which is the best for each country.

In [6]:
%%writefile data_modelling.py


#Standard Imports
import pandas as pd
import numpy as np
import os
import time
import joblib

#Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#Modelling
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
import xgboost as xgb

from sklearn.metrics import mean_squared_error as mse


#Load from data_ingestion.py
from data_ingestion import load_ts, engineer_features, TS_DIR


# Directory that trained models are written to and, by default, loaded from.
MODEL_DIR = os.path.join('models')
# Version metadata for the saved models; not referenced by the functions below.
MODEL_VERSION = 0.1
MODEL_VERSION_NOTE = '-'


def _model_train(dataset,tag,test = False):
    """
    Tune five regressor families on one time series and save the best one.

    The candidates are DecisionTreeRegressor, RandomForestRegressor,
    GradientBoostingRegressor, AdaBoostRegressor and XGBRegressor. Each is
    wrapped in a pipeline behind a StandardScaler and tuned with a 5-fold
    grid search on the training split. The search with the lowest RMSE on
    the held-out split is refit on the full data and dumped with joblib.

    Parameters
    ----------
    dataset : timeseries dataset passed to ``engineer_features``; the result
        must expose a ``target`` column and a ``dates`` column.
    tag : label for the series (used in the log line and the file name).
    test : when True the file is saved with a ``test-`` prefix instead of
        ``sl-``.

    Returns
    -------
    None. The fitted GridSearchCV object is written to
    ``MODEL_DIR/<prefix>-<tag>-<model_name>.joblib``.
    """

    dataset = engineer_features(dataset)

    X = dataset.drop(['target','dates'], axis = 1)
    y = dataset.target

    #Train_Test_Split Data
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, random_state = 0)

    # Each candidate keeps its estimator and its grid side by side so the two
    # can never get out of step.
    # NOTE(review): 'mse', 'mae', 'ls', 'lad' and 'auto' are the option names
    # of older scikit-learn releases; confirm they match the installed version.
    candidates = {
        'DTR' : (DecisionTreeRegressor(random_state = 0),
                 {'criterion': ['mse', 'mae', 'friedman_mse'],
                  'max_depth': [None, 10,20,50],
                  'max_features': ['auto', 'sqrt', 'log2']}),
        'RFR' : (RandomForestRegressor(random_state = 0),
                 {'criterion': ['mse', 'mae'],
                  'max_features' : ['auto', 'sqrt']}),
        'GBR' : (GradientBoostingRegressor(random_state = 0),
                 {'loss' : ['ls', 'lad', 'huber', 'quantile'],
                  'learning_rate' : [0.1,0.01,0.001]}),
        'ADA' : (AdaBoostRegressor(random_state = 0),
                 {'loss' : ['linear', 'square',],
                  'learning_rate' : [0.05, 0.1, 0.01]}),
        'XGB' : (xgb.XGBRegressor(seed = 0),
                 {'learning_rate': [0.05, 0.1, 0.01],
                  'max_depth': [1, 5, 50],
                  'n_estimators': [100, 1000, 500]}),
    }

    ##Train Models
    models = {}

    for model_name, (regressor, param_grid) in candidates.items():

        pipe = Pipeline(steps = [('scaler', StandardScaler()),
                                ('regressor', regressor)])

        # Search over the pipeline (not the bare regressor) so the scaler is
        # actually fitted inside every CV fold; pipeline parameters are
        # addressed as '<step>__<param>'.
        pipe_grid = {f'regressor__{name}': values
                     for name, values in param_grid.items()}

        grid = GridSearchCV(pipe, param_grid = pipe_grid, cv = 5)
        grid.fit(X_train, y_train)

        models[model_name] = grid

    #Test which model is optimal (lowest RMSE on the held-out split).
    model_scores = {name: np.sqrt(mse(y_test, search.predict(X_test)))
                    for name, search in models.items()}

    model_name = min(model_scores, key = model_scores.get)
    best_model = models[model_name]

    print(f'The best model for {tag}\'s revenue is {model_name}')

    #Retrain on best model. This re-runs the winner's grid search on all data.
    best_model.fit(X,y)

    #Save model.
    os.makedirs(MODEL_DIR, exist_ok = True)

    prefix = 'test' if test else 'sl'
    saved_model = os.path.join(MODEL_DIR, f'{prefix}-{tag}-{model_name}.joblib')

    joblib.dump(best_model,saved_model)
    
    
    
def model_train(ts_dir, test = False):
    """
    Fit and save a model for every time series loaded from ``ts_dir``.

    ``load_ts`` supplies a mapping of tag -> dataset (per the original note:
    the top ten countries plus 'all'); each entry is handed to
    ``_model_train`` together with the ``test`` flag.
    """
    # The output folder has to exist before any model gets written.
    models_dir_missing = not os.path.isdir(MODEL_DIR)
    if models_dir_missing:
        os.mkdir(MODEL_DIR)

    # Train one model per loaded series.
    series_by_tag = load_ts(ts_dir)
    for tag, series in series_by_tag.items():
        _model_train(series, tag, test = test)

        
def model_load(prefix = 'sl',ts_dir = TS_DIR, model_dir = MODEL_DIR):
    """
    Load every trained model in ``model_dir`` whose file name starts with
    ``prefix``.

    Parameters
    ----------
    prefix : file-name prefix selecting which models to load ('sl' for
        regular runs, 'test' for test runs, matching ``_model_train``).
    ts_dir : currently unused; kept so existing callers keep working.
    model_dir : directory holding the ``.joblib`` files.

    Returns
    -------
    dict mapping the file name, stripped of the prefix, the separating '-'
    and the '.joblib' extension (e.g. 'sl-all-XGB.joblib' -> 'all-XGB'),
    to the loaded model object.

    Raises
    ------
    Exception if no matching model file is found.
    """

    # Sorted so the resulting dict has a stable order across platforms.
    models_list = sorted(file for file in os.listdir(model_dir)
                         if file.startswith(prefix) and file.endswith('.joblib'))

    if not models_list:
        raise Exception(f'No models found with prefix: {prefix}. Did you train them?')

    extension_length = len('.joblib')
    models = {}

    for file in models_list:
        key = file[len(prefix):-extension_length].lstrip('-')
        # joblib.load unpickles the file: only load models you trained yourself.
        models[key] = joblib.load(os.path.join(model_dir, file))

    return models
    
    
        
    
        
    

if __name__ == "__main__":

    # Train every model and measure how long the whole run takes.
    started_at = time.time()
    model_train(TS_DIR)
    elapsed = time.time() - started_at

    # Break the elapsed seconds down into hours, minutes and seconds.
    minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(minutes, 60)
    print("...running time:", "%d:%02d:%02d"%(hours, minutes, seconds))
    print('Complete')
        

Overwriting data_modelling.py


In [7]:
#%run data_modelling.py

Ingesting timeseries data from files...
The best model for all's revenue is XGB


KeyboardInterrupt: 