### Objective
The objective of this notebook is to identify the significant features (regressors) for each modeling granularity using lasso regression with cross-validation (`cvglmnet`).

In [0]:
import pandas as  pd
import numpy as np
import datetime as dt
import os
import logging
import scipy
from datetime import datetime
import sys
import joblib
import multiprocessing
import random
import dotsi

import glmnet_python
from cvglmnet import cvglmnet
from glmnet_python.cvglmnetCoef import cvglmnetCoef

from glmnetSet import glmnetSet
from glmnetPredict import glmnetPredict
from glmnet import glmnet
from cvelnet import cvelnet
from cvlognet import cvlognet
from cvmultnet import cvmultnet
from cvmrelnet import cvmrelnet
from cvfishnet import cvfishnet

In [0]:
# logging part
# The log is first written to a local temp directory; the last cell of the notebook
# moves it next to the other outputs.
p_dir = "/tmp/"
log_file = "feature_selection" + " (" +datetime.today().strftime('%Y-%m-%d-%H-%M-%S')+ ").log"

logger = logging.getLogger('custom_log')
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object on every call, so re-running this
# cell would otherwise stack one more FileHandler per run and duplicate every record.
# Only this notebook's own file handlers are detached; any other handler is left alone.
for stale_handler in list(logger.handlers):
    if (isinstance(stale_handler, logging.FileHandler)
            and os.path.basename(stale_handler.baseFilename).startswith("feature_selection (")):
        logger.removeHandler(stale_handler)
        stale_handler.close()

# Applying necessary formatter
fh = logging.FileHandler(p_dir+log_file)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
fh.setFormatter(formatter)
logger.addHandler(fh)

#### Processing Config file
The dependent variable, date variable, modeling granularity and other related modeling details are provided in the form of a config file. Each TS algorithm and the related hyperparameter values to be tried should be given in the config.yml file.

In [0]:
# Runs the shared config notebook inside this kernel. Later cells read `app_config`
# and call `yaml.dump` without defining/importing them in this notebook, so both are
# presumably provided by that notebook -- TODO confirm.
%run ../0_Config.ipynb

In [0]:
logger.info("Config file read")

# For exporting the config file
# NOTE(review): `.copy()` is presumably a shallow copy, so nested sections would still be
# shared with `app_config`; confirm if the exported snapshot must be isolated from later edits.
temp_config = app_config.copy()

In [0]:
# Create the output directories used by this notebook (results, logs, exported config).
# `exist_ok=True` makes each call idempotent and avoids the check-then-create race of
# `if not os.path.exists(...): os.makedirs(...)`.
output_directory = app_config['output_dir_path']
root_dir = "Feature_Selection"
algorithm = "Lasso"

# Results of the lasso feature selection
algo_path = os.path.join(output_directory,root_dir,algorithm)
os.makedirs(algo_path, exist_ok=True)
logger.info("Created algorithm directory")

# Final location of the log file written during this run
logs_path = os.path.join(output_directory,root_dir,'logs',algorithm)
os.makedirs(logs_path, exist_ok=True)
logger.info("Created logs directory")

# Location of the exported config snapshot
config_path = os.path.join(output_directory,root_dir,"config")
os.makedirs(config_path, exist_ok=True)
logger.info("Created config directory")

#### Broadcasting the required variables
Variables suffixed with "_conf" are taken from the config file

In [0]:
# Column names of the dependent variable and the date variable, as named in the config
dv_config = app_config["dependent_variable"]
ds_config = app_config["date_var"]

# Columns that together identify one modeling granularity
modeling_granularity_conf = app_config["modeling_granularity"]

# Each config section is wrapped so that it is read through `.value`
# (the feature-selection function below accesses them that way).
broadcast_granularity = dotsi.Dict({"value":modeling_granularity_conf})
broadcast_tracking = dotsi.Dict({"value":app_config['tracking']})

# Exogenous variables grouped by expected correlation sign, and the lasso settings
corr_config_broadcast = dotsi.Dict({"value":app_config['feature_selection']['exogenous_variables']})
broadcast_req_info = dotsi.Dict({"value":app_config['feature_selection']['lasso_cvglmnet']})

logger.info("Broadcasted the required variables")

In [0]:
# -*- coding: utf-8 -*-
"""
--------------------------------------------------------------------------
 cvglmnet.m: cross-validation for glmnet
--------------------------------------------------------------------------

 DESCRIPTION:
    Does k-fold cross-validation for glmnet, produces a plot, and returns
    a value for lambdau. Cross-validation is not implemented for Cox model yet.

 USAGE:

    Note that like glmnet, all arguments are keyword-only:
 
    CVerr = cvglmnet(x, y, family, options, type, nfolds, foldid,
    parallel, keep, grouped);

    Fewer input arguments(more often) are allowed in the call. Default values
    for the arguments are used unless specified by the user.
        
=======================
INPUT ARGUMENTS
 x           nobs x nvar scipy 2D array of x parameters (as in glmnet).
 y           nobs x nc scipy Response y as in glmnet.
 family      Response type as family in glmnet.
 options     Options as in glmnet.
 ptype       loss to use for cross-validation. Currently five options, not
             all available for all models. The default is ptype='deviance', which uses
             squared-error for Gaussian models (a.k.a ptype='mse' there), deviance for
             logistic and Poisson regression, and partial-likelihood for the Cox
             model (Note that CV for cox model is not implemented yet). 
             ptype='class' applies to binomial and multinomial logistic
             regression only, and gives misclassification error. ptype='auc' is for
             two-class logistic regression only, and gives area under the ROC curve.
             ptype='mse' or ptype='mae' (mean absolute error) can be used by all models
             except the 'cox'; they measure the deviation from the fitted mean to the
             response.  
 nfolds      number of folds - default is 10. Although nfolds can be as
             large as the sample size (leave-one-out CV), it is not recommended for
             large datasets. Smallest value allowable is nfolds=3.
 foldid      an optional vector of values between 0 and nfolds-1 identifying
             what fold each observation is in. If supplied, nfolds is
             inferred as max(foldid)+1 and the nfolds argument is ignored.
 parallel    If True, use parallel computation to fit each fold. 
 keep        If keep=True, a prevalidated array is returned containing
             fitted values for each observation and each value of lambda.
             This means these fits are computed with this observation and
             the rest of its fold omitted. The foldid vector is also
             returned. Default is keep=False.   
 grouped     This is an experimental argument, with default true, and can
             be ignored by most users. For all models except the 'cox',
             this refers to computing nfolds separate statistics, and then
             using their mean and estimated standard error to describe the
             CV curve. If grouped=false, an error matrix is built up at
             the observation level from the predictions from the nfold
             fits, and then summarized (does not apply to
             type='auc'). For the 'cox' family, grouped=true obtains the 
             CV partial likelihood for the Kth fold by subtraction; by
             subtracting the log partial likelihood evaluated on the full
             dataset from that evaluated on the (K-1)/K dataset.
             This makes more efficient use of risk sets. With
             grouped=FALSE the log partial likelihood is computed only on
             the Kth fold.

=======================
OUTPUT ARGUMENTS:
 A dict() is returned with the following fields.
 lambdau     the values of lambda used in the fits.
 cvm         the mean cross-validated error - a vector of length
             length(lambdau). 
 cvsd        estimate of standard error of cvm.
 cvup        upper curve = cvm+cvsd.
 cvlo        lower curve = cvm-cvsd.
 nzero       number of non-zero coefficients at each lambda.
 name        a text string indicating type of measure (for plotting
             purposes). 
 glmnet_fit  a fitted glmnet object for the full data.
 lambda_min  value of lambda that gives minimum cvm.
 lambda_1se  largest value of lambda such that error is within 1 standard
             error of the minimum. 
 class       Type of regression - internal usage.
 fit_preval  if keep=true, this is the array of prevalidated fits. Some
             entries can be NA, if that and subsequent values of lambda
             are not reached for that fold.
 foldid      if keep=true, the fold assignments used.

 DETAILS:
    The function runs glmnet nfolds+1 times; the first to get the lambda
    sequence, and then the remainder to compute the fit with each of the 
    folds omitted. The error is accumulated, and the average error and 
    standard deviation over the folds is computed. Note that cvglmnet 
    does NOT search for values for alpha. A specific value should be 
    supplied, else alpha=1 is assumed by default. If users would like to 
    cross-validate alpha as well, they should call cvglmnet with a 
    pre-computed vector foldid, and then use this same fold vector in 
    separate calls to cvglmnet with different values of alpha. 

 LICENSE: GPL-2

 AUTHORS:
    Algorithm was designed by Jerome Friedman, Trevor Hastie and Rob Tibshirani
    Fortran code was written by Jerome Friedman
    R wrapper (from which the MATLAB wrapper was adapted) was written by Trevor Hastie
    The original MATLAB wrapper was written by Hui Jiang,
    and is updated and maintained by Junyang Qian.
    This Python wrapper (adapted from the Matlab and R wrappers) is written by Balakumar B.J., 
    Department of Statistics, Stanford University, Stanford, California, USA.

 REFERENCES:
    Friedman, J., Hastie, T. and Tibshirani, R. (2008) Regularization Paths for Generalized Linear Models via Coordinate Descent, 
    http://www.jstatsoft.org/v33/i01/
    Journal of Statistical Software, Vol. 33(1), 1-22 Feb 2010
    
    Simon, N., Friedman, J., Hastie, T., Tibshirani, R. (2011) Regularization Paths for Cox's Proportional Hazards Model via Coordinate Descent,
    http://www.jstatsoft.org/v39/i05/
    Journal of Statistical Software, Vol. 39(5) 1-13

    Tibshirani, Robert., Bien, J., Friedman, J.,Hastie, T.,Simon, N.,Taylor, J. and Tibshirani, Ryan. (2010) Strong Rules for Discarding Predictors in Lasso-type Problems,
    http://www-stat.stanford.edu/~tibs/ftp/strong.pdf
    Stanford Statistics Technical Report

 SEE ALSO:
    cvglmnetPlot, cvglmnetCoef, cvglmnetPredict, and glmnet.

 EXAMPLES:
 
      # Gaussian
      x = scipy.random.rand(100, 10)
      y = scipy.random.rand(100, 1)
      cvfit = cvglmnet(x = x, y = y)
      cvglmnetPlot(cvfit)
      print( cvglmnetCoef(cvfit) )
      print( cvglmnetPredict(cvfit, x[0:5, :], 'lambda_min') )
      cvfit1 = cvglmnet(x = x, y = y, ptype = 'mae')
      cvglmnetPlot(cvfit1)
      
      # Binomial
      x = scipy.random.rand(100, 10)
      y = scipy.random.rand(100,1)
      y = (y > 0.5)*1.0
      fit = cvglmnet(x = x, y = y, family = 'binomial', ptype = 'class')    
      cvglmnetPlot(fit)
      
      # poisson
      x = scipy.random.rand(100,10)
      y = scipy.random.poisson(size = [100, 1])*1.0
      cvfit = cvglmnet(x = x, y = y, family = 'poisson')
      cvglmnetPlot(cvfit)
      
      # Multivariate Gaussian:
      x = scipy.random.rand(100, 10)
      y = scipy.random.rand(100,3)
      cvfit = cvglmnet(x = x, y = y, family = 'mgaussian')      
      cvglmnetPlot(cvfit)
       
      # Multinomial
      x = scipy.random.rand(100,10)
      y = scipy.random.rand(100,1)
      y[y < 0.3] = 1.0
      y[y < 0.6] = 2.0
      y[y < 1.0] = 3.0
      cvfit = cvglmnet(x = x, y = y, family = 'multinomial')
      cvglmnetPlot(cvfit) 
      
      #cox
      Not implemented for cvglmnet.py


    
 % Cox
    n=1000;p=30;
    nzc=p/3;
    x=randn(n,p);
    beta=randn(nzc,1);
    fx=x(:,1:nzc)*beta/3;
    hx=exp(fx);
    ty=exprnd(1./hx,n,1);
    tcens=binornd(1,0.3,n,1);
    y=cat(2,ty,1-tcens);
    foldid=randsample(10,n,true);
    fit1_cv=cvglmnet(x,y,'cox',[],[],[],foldid);
    cvglmnetPlot(fit1_cv);
    
 % Parallel
    matlabpool;
    x=randn(1e3,100);
    y=randn(1e3,1);
    tic;
    cvglmnet(x,y);
    toc;
    tic;
    cvglmnet(x,y,[],[],[],[],[],true);
    toc;

"""


def cvglmnet(*, x,
             y,
             family = 'gaussian',
             ptype = 'default',
             nfolds = 10,
             foldid = np.empty([0]),
             parallel = False,
             keep = False,
             grouped = True,
             **options):
    """Cross-validate a glmnet fit over its lambda path.

    glmnet is fitted once on the full data to obtain the lambda sequence and then
    once per fold (through ``doCV``) with that fold held out. The per-fold fits are
    summarised by the family-specific ``cv*net`` routine.

    All arguments are keyword-only.

    Parameters
    ----------
    x : 2D array, nobs x nvars
    y : 1D or 2D array; a 1D array is reshaped to nobs x 1
    family : response family forwarded to ``glmnet``
    ptype : loss identifier forwarded to the ``cv*net`` routine
    nfolds : number of folds used when ``foldid`` is empty
    foldid : optional vector of fold labels in ``0 .. nfolds-1``; when supplied,
        ``nfolds`` is recomputed as ``max(foldid) + 1``
    parallel : if True, the per-fold fits run through ``joblib.Parallel``
    keep : if True, ``fit_preval`` and ``foldid`` are added to the result
    grouped : forwarded to the ``cv*net`` routine
    **options : remaining glmnet options, normalised by ``glmnetSet``

    Returns
    -------
    dict with keys ``lambdau``, ``cvm``, ``cvsd``, ``cvup``, ``cvlo``, ``nzero``,
    ``name``, ``glmnet_fit``, ``lambda_min``, ``lambda_1se``, ``class`` and, when
    ``keep`` is true, ``fit_preval`` and ``foldid``.

    Raises
    ------
    ValueError
        If exactly one lambda value is supplied, or fewer than 3 folds are used.
    NotImplementedError
        If the fitted model class is ``coxnet``.

    Notes
    -----
    Automatic fold assignment draws from NumPy's global random state, so callers
    wanting reproducible folds must seed it with ``np.random.seed`` or pass
    ``foldid`` explicitly.
    """
    options = glmnetSet(options)

    if 0 < len(options['lambdau']) < 2:
        raise ValueError('Need more than one value of lambda for cv.glmnet')

    nobs = x.shape[0]

    # convert 1d python array of size nobs to 2d python array of size nobs x 1
    # (no transposition is attempted; the caller must supply y with nobs rows)
    if len(y.shape) == 1:
        y = np.reshape(y, [y.size, 1])

    if len(options['weights']) == 0:
        options['weights'] = np.ones([nobs, 1], dtype = np.float64)

    # main call to glmnet: full-data fit that fixes the lambda sequence for all folds
    glmfit = glmnet(x = x, y = y, family = family, **options)

    is_offset = glmfit['offset']
    options['lambdau'] = glmfit['lambdau']

    # number of non-zero coefficients at each lambda
    nz = glmnetPredict(glmfit, np.empty([0]), np.empty([0]), 'nonzero')
    if glmfit['class'] == 'multnet':
        nnz = np.zeros([len(options['lambdau']), len(nz)])
        for i in range(len(nz)):
            nnz[:, i] = np.transpose(np.sum(nz[i], axis = 0))
        nz = np.ceil(np.median(nnz, axis = 1))
    elif glmfit['class'] == 'mrelnet':
        nz = np.transpose(np.sum(nz[0], axis = 0))
    else:
        nz = np.transpose(np.sum(nz, axis = 0))

    if len(foldid) == 0:
        # Balanced fold labels 0..nfolds-1 (exactly nobs of them), then shuffled.
        # The labels are kept 1-D on purpose: with a (1, nobs) array, len() is 1 and
        # permuting over it leaves the folds in their sequential, unshuffled order.
        ma = np.tile(np.arange(nfolds), int(np.floor(nobs/nfolds)))
        mb = np.arange(np.mod(nobs, nfolds))
        population = np.append(ma, mb)
        foldid = population[np.random.permutation(population.size)]
    else:
        nfolds = np.max(foldid) + 1

    if nfolds < 3:
        raise ValueError('nfolds must be at least 3; nfolds = 10 recommended')

    cpredmat = list()
    foldid = np.reshape(foldid, [foldid.size, ])
    if parallel == True:
        num_cores = multiprocessing.cpu_count()
        sys.stderr.write("[status]\tParallel glmnet cv with " + str(num_cores) + " cores\n")
        cpredmat = joblib.Parallel(n_jobs=num_cores)(joblib.delayed(doCV)(i, x, y, family, foldid, nfolds, is_offset, **options) for i in range(nfolds))
    else:
        for i in range(nfolds):
            newFit = doCV(i, x, y, family, foldid, nfolds, is_offset, **options)
            cpredmat.append(newFit)

    # dispatch on the model class reported by the fold fits
    if cpredmat[0]['class'] == 'elnet':
        cvstuff = cvelnet(cpredmat, options['lambdau'], x, y,
                          options['weights'], options['offset'],
                          foldid, ptype, grouped, keep)
    elif cpredmat[0]['class'] == 'lognet':
        cvstuff = cvlognet(cpredmat, options['lambdau'], x, y,
                           options['weights'], options['offset'],
                           foldid, ptype, grouped, keep)
    elif cpredmat[0]['class'] == 'multnet':
        cvstuff = cvmultnet(cpredmat, options['lambdau'], x, y,
                            options['weights'], options['offset'],
                            foldid, ptype, grouped, keep)
    elif cpredmat[0]['class'] == 'mrelnet':
        cvstuff = cvmrelnet(cpredmat, options['lambdau'], x, y,
                            options['weights'], options['offset'],
                            foldid, ptype, grouped, keep)
    elif cpredmat[0]['class'] == 'fishnet':
        cvstuff = cvfishnet(cpredmat, options['lambdau'], x, y,
                            options['weights'], options['offset'],
                            foldid, ptype, grouped, keep)
    elif cpredmat[0]['class'] == 'coxnet':
        raise NotImplementedError('Cross-validation for coxnet not implemented yet.')

    cvm = cvstuff['cvm']
    cvsd = cvstuff['cvsd']
    cvname = cvstuff['name']

    CVerr = dict()
    CVerr['lambdau'] = options['lambdau']
    CVerr['cvm'] = np.transpose(cvm)
    CVerr['cvsd'] = np.transpose(cvsd)
    CVerr['cvup'] = np.transpose(cvm + cvsd)
    CVerr['cvlo'] = np.transpose(cvm - cvsd)
    CVerr['nzero'] = nz
    CVerr['name'] = cvname
    CVerr['glmnet_fit'] = glmfit
    if keep:
        CVerr['fit_preval'] = cvstuff['fit_preval']
        CVerr['foldid'] = foldid
    # the sign of cvm is flipped for 'auc' before the minimum-based lambda selection
    # below; the reported 'cvm' above keeps its original sign
    if ptype == 'auc':
        cvm = -cvm
    # lambda_min: largest lambda attaining the minimum CV error
    CVerr['lambda_min'] = np.max(options['lambdau'][cvm <= np.min(cvm)]).reshape([1])
    # lambda_1se: largest lambda whose error is within one standard error of that minimum
    idmin = options['lambdau'] == CVerr['lambda_min']
    semin = cvm[idmin] + cvsd[idmin]
    CVerr['lambda_1se'] = np.max(options['lambdau'][cvm <= semin]).reshape([1])
    CVerr['class'] = 'cvglmnet'

    return CVerr
        
# end of cvglmnet
#==========================
def doCV(i, x, y, family, foldid, nfolds, is_offset, **options):
    """Fit glmnet with fold ``i`` held out.

    Rows whose ``foldid`` equals ``i`` are removed from ``x``, ``y`` and the
    weights (and from the offset, when ``is_offset`` is truthy and an offset is
    present). Every other option, including the lambda sequence, is passed to
    ``glmnet`` unchanged. ``nfolds`` is accepted but not used here.

    Returns the object returned by ``glmnet`` for the reduced data.
    """
    held_out = foldid == i
    keep_rows = ~held_out

    # work on a copy so the caller's options keep the full-length weights/offset
    fold_opts = dict(options)
    fold_opts['weights'] = fold_opts['weights'][keep_rows, ]
    fold_opts['lambdau'] = options['lambdau']
    if is_offset and fold_opts['offset'].size > 0:
        fold_opts['offset'] = fold_opts['offset'][keep_rows, ]

    return glmnet(x = x[keep_rows, ], y = y[keep_rows, ], family = family, **fold_opts)

In [0]:
def get_forecast_UDF(df_data: pd.DataFrame)-> pd.DataFrame:
    """Select the significant regressors for one granularity slice using repeated lasso CV.

    The candidate regressors come from the broadcasted correlation config
    (positive, negative and uncertain lists). They are pruned by correlation sign
    (optional) and by a near-constant check, then passed to ``cvglmnet`` once per
    seed. A regressor is reported when its ``lambda_1se`` coefficient is non-zero
    in at least 10 of the 20 seeded runs.

    Parameters
    ----------
    df_data : pd.DataFrame
        Rows of a single modeling granularity. Must contain ``ds``, ``y``, the
        granularity columns and every candidate regressor.

    Returns
    -------
    pd.DataFrame
        One row per selected regressor with the granularity columns (as strings),
        ``IDV`` and ``status == 'success'``. If anything fails, a single row is
        returned instead with the granularity values, an empty ``IDV`` and the
        exception text in ``status``.
    """
    # One cross-validated lasso run per seed; a regressor must be selected in at
    # least `min_selection_count` of them to be reported.
    seeds = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71]
    min_selection_count = 10

    try:
        df_data = df_data.sort_values(by=['ds'],ascending=True).reset_index(drop = True)

        broadcast_gran = broadcast_granularity.value
        req_info = broadcast_req_info.value
        corr_var = corr_config_broadcast.value

        # Sorted so the column order of the design matrix does not depend on set/hash order
        regressors = sorted(set(corr_var["positive_corr"] + corr_var["negative_corr"] + corr_var['uncertain_corr']))

        # Removing regressors whose observed correlation with y contradicts the expected sign
        if corr_var["consider_correlation"]:
            wrong_sign = []
            for col in corr_var["positive_corr"]:
                if df_data[['y', col]].corr().iloc[0, 1] < 0:
                    wrong_sign.append(col)
            for col in corr_var["negative_corr"]:
                if col not in wrong_sign and df_data[['y', col]].corr().iloc[0, 1] > 0:
                    wrong_sign.append(col)
            regressors = [col for col in regressors if col not in wrong_sign]

        # Removing (near-)constant regressors: coefficient of variation <= 1%,
        # or standard deviation <= 0.001 when the mean is exactly zero
        low_variance = []
        for col in regressors:
            mean = df_data[col].mean()
            std = df_data[col].std()
            if mean == 0:
                if std <= 0.001:
                    low_variance.append(col)
            elif abs(std/mean) <= 0.01:
                low_variance.append(col)
        regressors = [col for col in regressors if col not in low_variance]
        reg_len = len(regressors)

        ### IDVs and DV
        X = df_data[regressors].to_numpy()
        y = df_data[['y']].astype(float).to_numpy()

        #################### Adding lower and upper limits #################
        # Row 0 holds the lower coefficient bounds and row 1 the upper bounds.
        cl = np.array([np.repeat(-np.inf, reg_len), np.repeat(np.inf, reg_len)], dtype = np.float64)
        if req_info['consider_limits']:
            for indx, col in enumerate(regressors):
                if col in corr_var["positive_corr"]:
                    cl[0][indx] = 0  # expected positive effect: coefficient cannot go below 0
                if col in corr_var["negative_corr"]:
                    cl[1][indx] = 0  # expected negative effect: coefficient cannot go above 0

        ########################## Adding Penalty factor ########################
        pfac = np.ones([1, reg_len])
        zero_pen_cols = [var for var in req_info['zero_penalty_vars'] if var in regressors]
        # Keep at least one penalised regressor: if every regressor is listed as
        # zero-penalty, the last listed one stays penalised.
        if reg_len <= len(zero_pen_cols):
            zero_pen_cols = zero_pen_cols[:-1]
        for col in zero_pen_cols:
            pfac[0][regressors.index(col)] = 0

        ##################### Adding weights ######################
        wts = np.ones((len(df_data),1))

        # Run the lasso model once per seed and record which terms are selected
        selected_terms = []
        for se in seeds:
            random.seed(se)
            # cvglmnet draws its CV folds from NumPy's global RNG, which
            # random.seed() does not touch; seed it too so each run is reproducible.
            np.random.seed(se)
            cvfit = cvglmnet(x = X.copy(), y = y.copy(), family = 'gaussian',ptype = 'mse', nfolds = 20,weights=wts,penalty_factor = pfac,cl=cl)
            coef_val = cvglmnetCoef(cvfit, s = 'lambda_1se')
            # coef_val[0] is the intercept; coef_val[i + 1] belongs to regressors[i]
            selected_terms.append("Intercept")
            for indx, col in enumerate(regressors):
                if abs(coef_val[indx + 1]) > 0:
                    selected_terms.append(col)

        # Number of runs in which each term was selected (sorted by IDV)
        selection_counts = (
            pd.DataFrame({"IDV": selected_terms})
            .groupby("IDV")
            .size()
            .reset_index(name="Frequency")
        )
        model_coefficients_final = selection_counts[selection_counts['Frequency'] >= min_selection_count].reset_index(drop = True)
        # .copy() so the column assignments below act on an independent frame
        model_coefficients_final = model_coefficients_final[model_coefficients_final['IDV'] != "Intercept"].copy()
        for var in broadcast_gran:
            model_coefficients_final[var] = df_data[var].iloc[0]
            model_coefficients_final[var] = model_coefficients_final[var].astype(str)
        model_coefficients_final = model_coefficients_final[broadcast_gran + ['IDV']].copy()
        model_coefficients_final['status'] = 'success'
        return model_coefficients_final

    except Exception as e:
        # Best effort by design: one failing granularity must not stop the others,
        # so the error text is reported in the 'status' column instead of being raised.
        results_pd = pd.DataFrame(columns = broadcast_granularity.value + ['IDV','status'],index = range(1))
        results_pd[broadcast_granularity.value] = df_data[broadcast_granularity.value].head(1).reset_index(drop = True)
        for x in broadcast_granularity.value:
            results_pd[x] = results_pd[x].astype(str)
        results_pd['status'] = str(e)
        return results_pd

#### Loading the latest Missing_value_treatment file
##### Please update the reading path with the required data path if "Missing value treatment" was not run

In [0]:
# Reading the latest input file based on the timestamp embedded in the file name
missing_value_dir = app_config['output_dir_path']+"/Data_Processing/Missing_value_treatment"
all_files = os.listdir(missing_value_dir)
missing_op_files = [file.replace(".csv","") for file in all_files if "Missing_value_treatment_results (" in file]
if not missing_op_files:
    # Fail with an actionable message instead of the bare "max() arg is an empty sequence"
    raise FileNotFoundError("No 'Missing_value_treatment_results (<timestamp>)' file found in " + missing_value_dir)

# File names look like "Missing_value_treatment_results (YYYY-mm-dd-HH-MM-SS)"
version_dates = [datetime.strptime(x.split('(')[1].replace(')',''), '%Y-%m-%d-%H-%M-%S') for x in missing_op_files]
max_date = max(version_dates).strftime('%Y-%m-%d-%H-%M-%S')
req_file_name = [x for x in missing_op_files if max_date in x]
missing_op_file_path = os.path.join(missing_value_dir,req_file_name[0] + ".csv")

# Reading the data
df = pd.read_csv(missing_op_file_path)

# Standardise the date and dependent-variable column names expected by get_forecast_UDF
df = df.rename(columns = {ds_config:"ds", dv_config:"y"})
df['ds'] = pd.to_datetime(df['ds'])
df[modeling_granularity_conf] = df[modeling_granularity_conf].astype(str)

logger.info("Data loaded")

# Concatenated granularity key, kept as a column on df
df['gran_tempp'] = df[modeling_granularity_conf].astype(str).sum(axis=1)
unique_pdts = df['gran_tempp'].unique()

# Run the feature selection once per granularity. Groups are formed on the granularity
# columns themselves rather than on the concatenated key, because concatenation without
# a separator can merge distinct granularities (e.g. "1"+"12" and "11"+"2").
# sort=False keeps the groups in order of first appearance.
group_results = [
    get_forecast_UDF(group_df)
    for _, group_df in df.groupby(list(modeling_granularity_conf), sort=False)
]
# Single concat instead of growing a frame inside the loop
df_f = pd.concat(group_results) if group_results else pd.DataFrame()

In [0]:
# Write the per-granularity selection results into the algorithm directory; the
# timestamp in the file name keeps the outputs of earlier runs from being overwritten.
df_f.to_csv(algo_path+"/lasso_feature_selection_results ("+datetime.today().strftime('%Y-%m-%d-%H-%M-%S')+").csv", index = False)

In [0]:
# Exporting config file
# The file name carries the MLflow experiment id from the tracking config and a
# millisecond-resolution timestamp ([:-3] trims the microseconds to milliseconds).
config_file_name = "config_for_exp_id_"+str(broadcast_tracking.value['mlflow_experiment_id']) + " (" +datetime.today().strftime('%Y-%m-%d-%H-%M-%S-%f')[:-3]+").yml"
config_path1 = os.path.join(config_path,config_file_name)
# NOTE(review): `yaml` is not imported in this notebook; it is presumably brought into
# scope by the config notebook executed earlier with %run -- confirm.
with open(config_path1, 'w') as file:
    # sort_keys=False keeps the keys in the order of the config snapshot
    yaml.dump(temp_config, file, default_flow_style=False,sort_keys=False)

In [0]:
# Move the log file from the local tmp directory to the logs directory of this run
import platform
import shutil
plat_sys = platform.system()

if(plat_sys!='Windows'):
    # Build the paths directly instead of shell-escaping the file name for `mv`:
    # this handles spaces/parentheses in both paths, leaves `log_file` unmodified
    # (so the cell can be re-run) and raises if the move fails instead of
    # silently ignoring the shell exit status.
    log_source = p_dir + log_file
    if os.path.exists(log_source):
        shutil.move(log_source, os.path.join(logs_path, log_file))