In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
# Enable inline figure rendering. `InteractiveShell.magic()` is deprecated in
# IPython; `run_line_magic(name, args)` is the supported equivalent.
get_ipython().run_line_magic('matplotlib', 'inline')
sns.set(style="white", color_codes=True)
import copy
import pdb
from collections import defaultdict
import operator
import time
#######################################
# Models
from .models import build_arima_model, build_sarimax_model, build_var_model, \
                    build_pyflux_model, build_prophet_model, run_ensemble_model
# Utils
from .utils import colorful, load_ts_data, convert_timeseries_dataframe_to_supervised, \
                   time_series_plot, print_static_rmse, print_dynamic_rmse

ModuleNotFoundError: No module named '__main__.models'; '__main__' is not a package

In [2]:
def _infer_timeinterval(ts_index):
    """Guess the sampling interval from the first two timestamps of ``ts_index``.

    Returns one of 'weeks', 'days', 'months', 'qtr', 'years', 'hours' or
    'minutes', or ``None`` when the gap cannot be classified (fewer than two
    timestamps, or a whole-day gap that matches none of the known buckets).
    """
    stamps = pd.to_datetime(ts_index)
    if len(stamps) < 2:
        # A single observation has no delta to measure.
        print('Time Series time delta is unknown')
        return None
    delta = (stamps[1] - stamps[0]).to_pytimedelta()

    if delta.seconds == 0 and abs(delta.days) >= 1:
        # The gap is a whole number of days: classify by its length.
        diff_in_days = abs(delta.days)
        print('Time series input in days = %s' % diff_in_days)
        if diff_in_days == 7:
            print('it is a Weekly time series.')
            return 'weeks'
        if diff_in_days == 1:
            print('it is a Daily time series.')
            return 'days'
        if 28 <= diff_in_days < 89:
            print('it is a Monthly time series.')
            return 'months'
        if 89 <= diff_in_days < 178:
            print('it is a Quarterly time series.')
            return 'qtr'
        if 178 <= diff_in_days < 360:
            # NOTE(review): there is no semi-annual bucket downstream, so this is
            # mapped to 'qtr' exactly as before -- confirm this is intended.
            print('it is a Semi Annual time series.')
            return 'qtr'
        if diff_in_days >= 360:
            print('it is an Annual time series.')
            return 'years'
        print('Time Series time delta is unknown')
        return None

    # The gap has a sub-day component (or is zero): measure it in hours.
    diff_in_hours = abs(delta.days * 24 * 3600 + delta.seconds) / 3600
    if diff_in_hours >= 1:
        print('it is an Hourly time series.')
        return 'hours'
    print('Time series input in Minutes or Seconds = %s' % diff_in_hours)
    print('it is a Minute time series.')
    return 'minutes'


def _normalize_timeinterval(timeinterval):
    """Map a user-supplied interval spelling to ``(canonical_name, seasonal_period)``.

    Unrecognised spellings fall back to ``('months', 12)``.
    """
    aliases = (
        ('months', 12, ('months', 'month', 'm')),
        ('days', 30, ('days', 'daily', 'd')),
        ('weeks', 52, ('weeks', 'weekly', 'w')),
        ('qtr', 4, ('qtr', 'quarter', 'q')),
        ('years', 1, ('years', 'year', 'annual', 'y', 'a')),
        ('hours', 24, ('hours', 'hourly', 'h')),
        ('minutes', 60, ('minutes', 'minute', 'min', 'n')),
        ('seconds', 60, ('seconds', 'second', 'sec', 's')),
    )
    key = timeinterval.strip().lower()
    for canonical, period, spellings in aliases:
        if key in spellings:
            return canonical, period
    return 'months', 12


def _report_model_failure(message, exc):
    """Print the user-facing failure ``message`` followed by the underlying error."""
    print(message)
    print('    Reason: %s: %s' % (type(exc).__name__, exc))


def make_ts_magic(trainfile, ts_column, sep=',', target=None, score_type='rmse',
                  forecast_period=2, timeinterval='', non_seasonal_pdq=None,
                  seasonality=False, seasonal_period=12, seasonal_PDQ=None,
                  conf_int=0.95, model_type="stats", verbose=0):
    """Fit one or more time-series models and report the best-scoring one.

    Parameters
    ----------
    trainfile : str or pandas.DataFrame
        Path of the data file, or an already loaded DataFrame. Both are passed
        through ``load_ts_data``.
    ts_column : str or int
        Name of the time-stamp column, or its positional index in the loaded frame.
    sep : str
        Field separator forwarded to ``load_ts_data``.
    target : str, list or None
        Target column. A list is reduced to its first element; when the loaded
        frame has a single column, that column becomes the target.
    score_type : str
        ``'rmse'`` records the RMSE of each model; any other value records the
        normalised RMSE. Also used as the key under which the score is stored.
    forecast_period : int
        Number of trailing periods to forecast / hold out.
    timeinterval : str
        Sampling interval ('months', 'days', 'weeks', ...). When empty it is
        inferred from the gap between the first two timestamps.
    non_seasonal_pdq : tuple or None
        ``(p_max, d_max, q_max)`` search bounds. Defaults to ``(3, 1, 3)``.
        NOTE(review): these bounds were undefined before; confirm the defaults.
    seasonality : bool
        Forwarded to ``build_sarimax_model``.
    seasonal_period : int
        Accepted for interface compatibility. NOTE: the value is always replaced
        by the period derived from ``timeinterval``.
    seasonal_PDQ : tuple or None
        Accepted for interface compatibility; not used by this function.
    conf_int : float
        Forwarded to ``build_prophet_model``.
    model_type : str
        One of 'stats', 'prophet', 'ml' or 'all' (case-insensitive). 'all' runs
        every group in turn.
    verbose : int
        Verbosity flag forwarded to the model builders.

    Returns
    -------
    defaultdict or None
        Nested dictionary keyed by model name with 'model', 'forecast' and
        ``score_type`` entries. An empty dictionary is returned for an
        unsupported ``model_type``. ``None`` is returned when the input cannot
        be loaded or the time interval cannot be inferred.

    Notes
    -----
    A model that raises is reported on stdout and recorded with a score of
    ``np.inf`` so the remaining models still run.
    """
    ##### Best hyper-parameters in statsmodels chosen using the best aic, bic or whatever. Select here.
    stats_scoring = 'aic'

    ##### Upper bounds for the (p, d, q) search shared by all statistical models.
    if isinstance(non_seasonal_pdq, (tuple, list)) and len(non_seasonal_pdq) == 3:
        p_max, d_max, q_max = non_seasonal_pdq
    else:
        p_max, d_max, q_max = 3, 1, 3
    ##### Predictors are lagged by at least one period so that only prior values are used.
    ##### NOTE(review): `lag` was undefined before; confirm this derivation.
    lag = max(1, forecast_period - 1)

    ########## This is where we start the loading of the data file ######################
    if isinstance(trainfile, str):
        if trainfile == '':
            # Previously an empty name fell through and crashed on an unbound `ts_df`.
            print('File name is an empty string. Please check your input and try again')
            return None
        try:
            ts_df = load_ts_data(trainfile, ts_column, sep, target)
            print('    File loaded successfully. Shape of data set = %s' % (ts_df.shape,))
        except Exception as exc:
            _report_model_failure(
                'File could not be loaded. Check the path or filename and try again', exc)
            return None
    elif isinstance(trainfile, pd.DataFrame):
        print('Input is data frame. Performing Time Series Analysis')
        ts_df = load_ts_data(trainfile, ts_column, sep, target)
    else:
        print('Input must be a file name or a pandas DataFrame. '
              'Please check your input and try again')
        return None

    if ts_df.shape[1] == 1:
        ### If there is only one column, you assume that to be the target column ####
        target = list(ts_df)[0]
    if not isinstance(ts_column, str):
        ### If ts_column is a number, then it means you need to convert it to a named variable
        ts_column = list(ts_df)[ts_column]
    if isinstance(target, list):
        target = target[0]
        print('    Taking the first column in target list as Target variable = %s' % target)
    else:
        print('    Target variable = %s' % target)
    ### Every column that is neither the time stamp nor the target is a predictor.
    preds = [x for x in list(ts_df) if x not in [ts_column, target]]

    ### A numeric index means the time stamps still live in a column: make them the index.
    if ts_df.index.dtype == 'int' or ts_df.index.dtype == 'float':
        ts_df = ts_df.set_index(ts_column)

    ################    IF TIME INTERVAL IS NOT GIVEN, INFER IT FROM THE DATA   ###########
    if timeinterval == '':
        timeinterval = _infer_timeinterval(ts_df.index)
        if timeinterval is None:
            return None
    else:
        print('Time Interval is given as %s' % timeinterval)
    timeinterval, seasonal_period = _normalize_timeinterval(timeinterval)

    ########################### This is where we store all models in a nested dictionary ##########
    def _nested_dict():
        return defaultdict(_nested_dict)
    ml_dict = _nested_dict()

    def _record(model_name, rmse_val, norm_rmse_val):
        # Store whichever score the caller asked for under the `score_type` key.
        ml_dict[model_name][score_type] = rmse_val if score_type == 'rmse' else norm_rmse_val

    if not isinstance(model_type, str):
        print('Check if your model type is a string or one of the available types of models')
        return ml_dict
    requested = model_type.lower()
    if requested not in ('prophet', 'stats', 'ml', 'all'):
        print('The model_type should be either stats, prophet, ml or all. Your input is not available.')
        return ml_dict
    if requested == 'all':
        print('Running all model types. This will take a long time. Be Patient...')

    #########                 FB Prophet              ###################################
    if requested in ('prophet', 'all'):
        name = 'FB_Prophet'
        print(colorful.BOLD + '\nRunning Facebook Prophet Model...' + colorful.END)
        rmse = norm_rmse = np.inf
        try:
            model, forecast_df, rmse, norm_rmse = build_prophet_model(
                                        ts_df, ts_column, target, forecast_period,
                                        score_type, verbose, conf_int)
            ml_dict[name]['model'] = model
            ml_dict[name]['forecast'] = forecast_df['yhat'].values
        except Exception as exc:
            _report_model_failure(
                '    FB Prophet may not be installed or Model is not running...', exc)
            rmse = norm_rmse = np.inf
        _record(name, rmse, norm_rmse)

    #########                 Statistical models      ###################################
    if requested in ('stats', 'all'):
        nsims = 100   ### this is needed only for M-H models in PyFlux
        name = 'PyFlux'
        print(colorful.BOLD + '\nRunning PyFlux Model...' + colorful.END)
        rmse = norm_rmse = np.inf
        try:
            model, forecast, rmse, norm_rmse = build_pyflux_model(
                ts_df, target, p_max, q_max, d_max, forecast_period,
                'MLE', nsims, score_type, verbose)
            ml_dict[name]['model'] = model
            ml_dict[name]['forecast'] = forecast
            if isinstance(rmse, str):
                # The builder signals a missing PyFlux install with a string score.
                print('    PyFlux not installed. Install PyFlux and run it again')
                rmse = norm_rmse = np.inf
        except Exception as exc:
            _report_model_failure('    PyFlux model error: predictions not available.', exc)
            rmse = norm_rmse = np.inf
        _record(name, rmse, norm_rmse)

        ################### Let's build an ARIMA Model and add results #################
        name = 'ARIMA'
        print(colorful.BOLD + '\nRunning Non Seasonal ARIMA Model...' + colorful.END)
        rmse = norm_rmse = np.inf
        try:
            model, forecast, rmse, norm_rmse = build_arima_model(
                ts_df[target], stats_scoring, p_max, d_max, q_max,
                forecast_period=forecast_period, method='mle', verbose=verbose)
            ml_dict[name]['model'] = model
            ml_dict[name]['forecast'] = forecast
        except Exception as exc:
            _report_model_failure('    ARIMA model error: predictions not available.', exc)
            rmse = norm_rmse = np.inf
        _record(name, rmse, norm_rmse)

        ############# Let's build a SARIMAX Model and get results ########################
        name = 'SARIMAX'
        print(colorful.BOLD + '\nRunning Seasonal SARIMAX Model...' + colorful.END)
        rmse = norm_rmse = np.inf
        try:
            model, forecast, rmse, norm_rmse = build_sarimax_model(
                ts_df[target], stats_scoring, seasonality,
                seasonal_period, p_max, d_max, q_max,
                forecast_period, verbose)
            ml_dict[name]['model'] = model
            ml_dict[name]['forecast'] = forecast
        except Exception as exc:
            _report_model_failure('    SARIMAX model error: predictions not available.', exc)
            rmse = norm_rmse = np.inf
        _record(name, rmse, norm_rmse)

        ########### Let's build a VAR Model - but first we have to shift the predictor vars ####
        name = 'VAR'
        rmse = norm_rmse = np.inf
        if len(preds) == 0:
            print('No VAR model since number of predictors is zero')
        else:
            try:
                print(colorful.BOLD + '\nRunning VAR Model...' + colorful.END)
                print('    Shifting %d predictors by 1 to align prior predictor values with current target values...'
                                        % len(preds))
                # Work on a copy so the shift does not leak into later models
                # (or into a DataFrame owned by the caller).
                var_df = ts_df[[target] + preds].copy()
                var_df[preds] = var_df[preds].shift(1)
                var_df = var_df.dropna(axis=0)
                model, forecast, rmse, norm_rmse = build_var_model(
                    var_df, stats_scoring, forecast_period, p_max, q_max)
                ml_dict[name]['model'] = model
                ml_dict[name]['forecast'] = forecast
            except Exception as exc:
                _report_model_failure('    VAR model error: predictions not available.', exc)
                rmse = norm_rmse = np.inf
        _record(name, rmse, norm_rmse)

    #########                 Machine Learning model  ###################################
    if requested in ('ml', 'all'):
        name = 'ML'
        rmse = norm_rmse = np.inf
        if len(preds) == 0:
            print('No ML model since number of predictors is zero')
        else:
            try:
                print(colorful.BOLD + '\nRunning Machine Learning Models...' + colorful.END)
                print('    Shifting %d predictors by lag=%d to align prior predictor with current target...'
                            % (len(preds), lag))
                # The helper returns its own target/predictor names for the lagged frame.
                dfxs, ml_target, ml_preds = convert_timeseries_dataframe_to_supervised(
                    ts_df[preds + [target]], preds + [target], target,
                    n_in=lag, n_out=0, dropT=False)
                train = dfxs[:-forecast_period]
                test = dfxs[-forecast_period:]
                best = run_ensemble_model(train[ml_preds], train[ml_target], 'TimeSeries',
                                          score_type, verbose)
                bestmodel = best[0]
                ml_dict[name]['model'] = bestmodel
                ### Certain models dont have random state => so it is not set here
                ml_dict[name]['forecast'] = bestmodel.fit(
                    train[ml_preds], train[ml_target]).predict(test[ml_preds])
                rmse, norm_rmse = print_dynamic_rmse(test[ml_target].values,
                                                     bestmodel.predict(test[ml_preds]),
                                                     train[ml_target].values)
                #### Plotting actual vs predicted #################
                plt.figure(figsize=(5, 5))
                # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
                plt.scatter(pd.concat([train, test])[ml_target].values,
                            np.r_[bestmodel.predict(train[ml_preds]),
                                  bestmodel.predict(test[ml_preds])])
                plt.xlabel('Actual')
                plt.ylabel('Predicted')
                plt.show()
                ############ Draw a plot of the Time Series data ######
                time_series_plot(dfxs[ml_target], chart_time=timeinterval)
            except Exception as exc:
                _report_model_failure('    For ML model, evaluation score is not available.', exc)
                rmse = norm_rmse = np.inf
        _record(name, rmse, norm_rmse)

    ######## Selecting the best model based on the lowest score ######
    scores = {model_name: results[score_type] for model_name, results in ml_dict.items()}
    best_model_name = min(scores, key=scores.get)
    print(colorful.BOLD + '\nBest Model is:' + colorful.END)
    print('    %s' % best_model_name)
    print('    Best Model Score: %0.2f' % scores[best_model_name])
    return ml_dict

In [None]:
# Example run. `trainfile` (a file path or DataFrame) and `ts_column` must be
# defined in an earlier cell before this call is executed.
make_ts_magic(trainfile, ts_column, sep=',', target=None, score_type='rmse',
              forecast_period=2, timeinterval='', non_seasonal_pdq=None,
              seasonality=False, seasonal_period=12, seasonal_PDQ=None,
              conf_int=0.95, model_type="stats", verbose=0)
