# NEM DATA CHALLENGE, 2nd Phase: 
## ML Model Development Module

* Author: Sergio Díaz
* Date: September 2017

In [7]:
%matplotlib notebook

In [8]:
import os
import math
import pickle
import numpy as np
import pandas as pd
from scipy import stats
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

In [9]:
def main():
    """Run the full pipeline: load SCADA data, preprocess, fit/predict, validate.

    All reporting/plotting switches live in the ``options`` dict built here
    (1 = enabled, 0 = disabled); ``minutal_frequency`` selects which
    historical dataset resolution is read.
    """
    options = dict(
        test_score=0,            # model error metrics w.r.t. the test split
        historical_pwr=0,        # y_pred vs y_test on the historical SCADA (time series)
        validation_score=0,      # model error metrics w.r.t. the evaluation data
        validation_pwr=0,        # y_pred vs y_val on the 24 h evaluation dates (time series)
        pwr_curve_suboptimal=0,  # power curve with sub-optimal data pairs highlighted
        pwr_curve_predicted=0,   # power curve predicted by the ML model
        pwr_curve_residuals=0,   # show outliers found in the training data
        save_model=0,            # pickle the fitted model of every asset
        minutal_frequency=10,    # resolution (minutes) of the historical dataset
    )

    # 1) Read the historical SCADA records.
    raw_scada = read_historical_scada(options)

    # 2) Preprocess the data and obtain the model definition.
    df_hist, df_train, df_test, model = data_preprocessing(raw_scada, options)

    # 3) Fit per asset and predict on the test split.
    predict(df_hist, df_train, df_test, model, options)

    # 4) Score the saved models against the evaluation SCADA files.
    model_test(options)
    
# Entry point: runs the whole pipeline when this cell is executed.
# NOTE(review): main() calls functions that are defined in the cells below
# this one, so on a fresh kernel those definition cells must be executed
# before this cell.
if __name__ == "__main__":
    main()

## PREDICTION

In [6]:
def predict(df_hist, df_train, df_test, model, options):
    """Fit ``model`` on every asset's training data and evaluate it on the test data.

    For each asset found in ``df_train`` the same ``model`` object is re-fitted
    on that asset's rows, optionally pickled, and used to predict the asset's
    test rows. Scores are printed and a time-series plot is drawn depending on
    ``options``.

    Parameters
    ----------
    df_hist : pandas.DataFrame
        Historical records; only used for the time-series plot
        (columns ``asset`` and ``WTURPower`` are read).
    df_train, df_test : pandas.DataFrame
        Train / test records with columns ``asset``, ``WNACWindSpeed``,
        ``WNACAmbTemp`` and ``WTURPower``.
    model : estimator
        Object exposing ``fit`` and ``predict``. It is re-fitted in place for
        every asset, so after the call it holds the fit of the last asset.
    options : dict
        Reads ``save_model``, ``test_score`` and ``historical_pwr``
        (a value of 1 enables the feature).

    Returns
    -------
    None
    """
    # Fitted models are stored under <cwd>/models as model_<asset>.sav
    model_path = os.path.join(os.getcwd(), 'models')

    # Features used for training, the target label, and the name given to
    # the predicted series.
    features = ['WNACWindSpeed', 'WNACAmbTemp']
    target = 'WTURPower'
    pred_col = 'energy_production'

    for i, asset in enumerate(df_train.asset.unique()):
        train_mask = df_train.asset == asset
        test_mask = df_test.asset == asset

        x_train = df_train.loc[train_mask, features]
        y_train = df_train.loc[train_mask, target]
        x_test = df_test.loc[test_mask, features]
        y_test = df_test.loc[test_mask, target]

        model.fit(x_train, y_train)

        # Optionally persist the model fitted for this asset.
        if options['save_model'] == 1:
            # The original called os.mkdir() on an undefined name, which
            # raised NameError whenever the folder did not exist yet.
            os.makedirs(model_path, exist_ok=True)
            out_file = 'model_' + asset + '.sav'
            with open(os.path.join(model_path, out_file), 'wb') as f:
                pickle.dump(model, f)

        y_pred = model.predict(x_test)

        # Print the test scores (all metrics are scaled by 100).
        if options['test_score'] == 1:
            mse = mean_squared_error(y_test, y_pred) * 100
            mae = mean_absolute_error(y_test, y_pred) * 100
            r2 = r2_score(y_test, y_pred) * 100
            if i == 0:
                print('Machine Learing model score wiht historical scada')
            print('Asset: {}'.format(asset))
            print("100x Mean squared error: {:.5f}".format(mse))
            print("100x r2_score: {:.5f}".format(r2))
            print("100x Mean absolute error: {:.5f}".format(mae))
            print("")

        # Plot the predictions on top of the historical time series.
        if options['historical_pwr'] == 1:
            mystyle()
            asset_hist = df_hist[df_hist.asset == asset]
            # Predictions share the index of the test rows they belong to.
            asset_pred = pd.Series(y_pred, index=x_test.index, name=pred_col)

            fig, ax1 = plt.subplots(figsize=(9, 4))
            asset_hist[target].plot(ax=ax1, lw=.5)
            asset_pred.plot(style='g.-', lw=0, ax=ax1)
            ax1.set_title('Historical SCADA records for asset: ' + asset)
            ax1.set_ylabel(target)
            ax1.set_xlabel('month-day')
            ax1.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
            plt.setp(ax1.xaxis.get_majorticklabels(), rotation=0)


## MODEL SCORE USING EVALUATION DATA

In [5]:
def model_test(options):
    """Score the previously saved per-asset models on the evaluation SCADA data.

    The evaluation records are read with ``read_evaluation_scada()``; for each
    asset the matching ``models/model_<asset>.sav`` pickle is loaded and used
    to predict ``WTURPower`` from ``WNACWindSpeed`` and ``WNACAmbTemp``.

    ``options['validation_score'] == 1`` prints the error metrics and
    ``options['validation_pwr'] == 1`` plots measured vs. predicted power.
    Returns None.
    """
    # Folder holding the pickled models
    models_dir = os.path.join(os.getcwd(), 'models')

    # Evaluation SCADA data provided by NEM
    df_scada = read_evaluation_scada()

    # Model inputs, target label and name of the predicted column
    features = ['WNACWindSpeed', 'WNACAmbTemp']
    target = 'WTURPower'
    predict = 'energy_production'

    df_prediction = None  # predictions of all assets processed so far
    for position, asset in enumerate(df_scada.asset.unique()):

        # Rows belonging to the current asset
        rows = df_scada.asset == asset
        x_eval = df_scada.loc[rows, features]
        y_eval = df_scada.loc[rows, target]

        # Load the model trained earlier for this asset.
        # NOTE: unpickling executes code stored in the file; only load
        # model files that come from a trusted source.
        in_file = 'model_' + asset + '.sav'
        with open('/'.join((models_dir, in_file)), 'rb') as handle:
            model = pickle.load(handle)

        # Predict and compute the error metrics (scaled by 100)
        y_pred = model.predict(x_eval)
        mse = 100 * mean_squared_error(y_eval, y_pred)
        mae = 100 * mean_absolute_error(y_eval, y_pred)
        r2 = 100 * r2_score(y_eval, y_pred)

        # Collect the predictions in a single frame (used for the plots)
        asset_pred = pd.DataFrame(data=y_pred, index=x_eval.index, columns=[predict])
        asset_pred['asset'] = asset
        if df_prediction is None:
            df_prediction = asset_pred
        else:
            df_prediction = pd.concat([df_prediction, asset_pred], axis=0)

        # Report the validation score
        if options['validation_score'] == 1:
            if position == 0:
                print('Machine Learing model score for evaluation scada')
            print('Asset: {}'.format(asset))
            print("100x Mean squared error: {:.5f}".format(mse))
            print("100x r2_score: {:.5f}".format(r2))
            print("100x Mean absolute error: {:.5f}".format(mae))
            print('')

        # Time-series plot: measured power vs. prediction
        if options['validation_pwr'] == 1:
            mystyle()
            measured = df_scada[df_scada.asset == asset]
            predicted = df_prediction[df_prediction.asset == asset]

            fig, ax1 = plt.subplots(figsize=(9.5, 6))
            measured[target].plot(ax=ax1, marker='.', lw=.3)
            predicted[predict].plot(style='g.-', lw=0, ax=ax1)
            ax1.set_title('Prediction Output for asset: ' + asset)
            ax1.set_ylabel(target)
            plt.setp(ax1.xaxis.get_majorticklabels(), rotation=0)

  

## DATA PREPROCESS

In [4]:
def data_preprocessing(df_hist, options):
    '''Split the historical SCADA data per asset and flag training outliers.

    Rows containing NaN are dropped first. Then, for every asset, ML_model()
    provides a train/test split and a model; the model is fitted on the
    training split and the scaled squared residual of its own training
    predictions is stored, together with a 'valid' flag (residual <= 1.0).

    Parameters:
        df_hist: historical SCADA frame; columns 'asset', 'WNACWindSpeed',
                 'WNACAmbTemp' and 'WTURPower' are read.
        options: dict; reads 'pwr_curve_residuals', 'pwr_curve_suboptimal'
                 and 'pwr_curve_predicted' (1 enables the matching plot).

    Returns:
        df_hist  : the input without NaN rows
        df_train : training rows of all assets, with the extra columns
                   'asset', 'pred_pwr', 'error' and 'valid'
        df_test  : test rows of all assets, with the extra column 'asset'
        model    : the model object created for the LAST asset in the loop
                   (already fitted on that asset's training split)

    Note: if df_hist contains no assets the loop body never runs and the
    return statement fails because df_train/df_test/model are unbound.
    '''
    
    df_hist = df_hist.dropna(axis=0)  # remove any row with nans
    
    # ******************************************
    # CLEAN HISTORICAL scada USING ML and residuals

    # Define features for model training and target label    
    features = ['WNACWindSpeed', 'WNACAmbTemp']
    target = 'WTURPower' 

    # Loop over assets
    assets = df_hist.asset.unique()
    for i, asset in enumerate(assets):
        asset_mask = df_hist.asset==asset

        # Data definition for ML training
        y_data = df_hist.loc[asset_mask, target]
        x_data = df_hist.loc[asset_mask, features]
        x_train, x_test, y_train, y_test, model = ML_model(x_data,y_data) #FUN Call
        model.fit(x_train, y_train)

        # Predict on the training split itself: the residuals are used to
        # flag training rows the preliminary model cannot reproduce.
        y_pred = model.predict(x_train)
        # calculate squared error (scaled by 1000)
        error= 1000*(y_pred - y_train)**2
        error_lim = 1.0 # error limit for valid values
        
        # Accumulate the training rows of every asset in df_train
        # (first asset creates the frame, later assets are appended).
        if i == 0:
            df_train = x_train.copy()
            df_train[target] = y_train
            df_train['asset'] = asset
            df_train['pred_pwr'] = y_pred
            df_train['error'] = error
            df_train['valid'] = df_train.error <= error_lim
        else:
            df_concat = x_train.copy()
            df_concat[target] = y_train
            df_concat['asset'] = asset
            df_concat['pred_pwr'] = y_pred
            df_concat['error'] = error
            df_concat['valid'] = df_concat.error <= error_lim            
            df_train = pd.concat([df_train, df_concat], axis=0)

        # Same accumulation for the test rows (no prediction columns here).
        if i == 0:
            df_test = x_test.copy()
            df_test[target] = y_test
            df_test['asset'] = asset                             
        else:
            df_concat2 = x_test.copy()
            df_concat2[target] = y_test
            df_concat2['asset'] = asset                     
            df_test = pd.concat([df_test, df_concat2], axis=0)                  
 
        # plot Power Curves and residuals

        if options['pwr_curve_residuals']==1:   
            
            dfA = df_hist[df_hist.asset==asset]
            dfB = df_train[df_train.asset==asset]
            dfB_remove = dfB[dfB.valid==False]
            # dfC is only needed by the commented-out third panel below.
            dfC = df_test[df_test.asset==asset]

            #fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(9,18) )
            fig, (ax1,ax2) = plt.subplots(2,1, figsize=(9,12) )
            s=10 # marker size
            
            # Top panel: all historical points. Bottom panel: training points
            # with the rows flagged as invalid drawn on top in red.
            dfA.plot.scatter('WNACWindSpeed', 'WTURPower', ax=ax1, marker='.', s=s, c='b', label='5-Months-Historical')
            dfB.plot.scatter('WNACWindSpeed', 'WTURPower', ax=ax2, marker='.', s=s, c='g', label='MLM_Train_Data')
            dfB_remove.plot.scatter('WNACWindSpeed', 'WTURPower', ax=ax2, marker='.', s=s, c='r', label='Outliers')  
            
            ax1.set_title('Power Curve for asset: '+ asset)
            ax2.set_title('Power Curve for asset: '+ asset)
            #dfC.plot.scatter('WNACWindSpeed', 'WTURPower', 
            #                 ax=ax3, marker='.', s=s, c='b', label ='MLM_Tests_Data')
            
            
        # SHOW Power Curves and sub_optimal datapoints
        
        if options['pwr_curve_suboptimal']==1:   
            df_A = df_hist[df_hist.asset==asset]
            
            # Select near-zero Power values at windspeed higher than 0.1
            # (only highlighted in the plot, nothing is removed here)
            mask = (df_A.WTURPower < 0.001) & (df_A.WNACWindSpeed > 0.10) 
            df_B = df_A[mask]
            # Select sub-optimal Power values (power < 0.95 at windspeed > 0.35)
            mask = (df_A.WTURPower < 0.95) & (df_A.WNACWindSpeed > 0.35) 
            df_C = df_A[mask]

            fig, (ax1) = plt.subplots(1,1, figsize=(7,5) )
            s=2 # marker size (not used by the scatter calls below)
            df_A.plot.scatter('WNACWindSpeed','WTURPower', ax=ax1, marker='.',s=1,c='g',label='OK')
            df_C.plot.scatter('WNACWindSpeed','WTURPower', ax=ax1, marker='+',s=15,c='m',label ='NOK under-rated-pwr')    
            df_B.plot.scatter('WNACWindSpeed','WTURPower', ax=ax1, marker='x',s=15,c='b',label='NOK zero-pwr')
            ax1.set_title('Power Curve for asset: '+ asset)
            
        # plot PREDICTED Power Curves 
        
        if options['pwr_curve_predicted']==1:   
            df_A = df_train[df_train.asset==asset]
            fig, (ax1) = plt.subplots(1,1, figsize=(9,6) )
            df_A.plot.scatter('WNACWindSpeed', 'pred_pwr', ax=ax1, marker='.', s=10, c='k')
            ax1.set_title('Power Curve for asset: '+ asset)
            ax1.set_ylabel('Predicted WTURPower')
            
    ## RESIDUALS INSPECTION (visual aid during development)
    # Disabled by default: set execute to 1 to print residual statistics.
    execute = 0
    if execute == 1:
        df_train.head()
        df_train.describe()
        df_train.quantile(.95)
        print ('Descriptive stats for error dataframe')
        print ( df_train.pivot( columns='asset', values='error').describe() )
        print ( ' 95% quantile for error column')
        print ( df_train.pivot( columns='asset', values='error').quantile(.95) )
        print ('')
        # visualize residuals kde
        ast='A001'
        fig, ax1 = plt.subplots(figsize=(9,3) )
        sns.kdeplot(df_train.loc[df_train.asset==ast,'error'], label=ast, ax=ax1)
    
    # DATA CLEANING 
    # set execute to 1 to remove outliers from ml model training data
    # (disabled as written, so df_train is returned with all its rows)
    execute = 0
    if execute==1: 
        # Remove residuals higher than 1.0
        #df_train =  df_train[df_train.valid==True]
        
        # Remove near-zero Power values at windspeed higher than 0.1
        mask = np.invert((df_train.WTURPower < 0.001) & (df_train.WNACWindSpeed > 0.10)) 
        df_train = df_train[mask]

        # Remove sub-optimal Power values 
        mask = np.invert((df_train.WTURPower < 0.95) & (df_train.WNACWindSpeed > 0.35)) 
        df_train = df_train[mask]
    
    # model is whatever the last loop iteration left behind (see docstring).
    return df_hist, df_train, df_test, model

## ML FUNCTIONS

In [3]:
def ML_model(x_data, y_data, selector=3):
    '''Split the data into train/test sets and build an unfitted regressor.

    Parameters
    ----------
    x_data, y_data :
        Features and target, forwarded unchanged to ``train_test_split``
        (30 % test share, ``random_state=0``).
    selector : int, optional
        Which regressor to create:
        1 = DecisionTreeRegressor, 2 = GradientBoostingRegressor,
        3 = RandomForestRegressor (default, the previously hard-coded choice),
        4 = XGBRegressor (needs the optional ``xgboost`` package).

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, model)``; ``model`` is not fitted.

    Raises
    ------
    ValueError
        If ``selector`` is not one of 1, 2, 3 or 4 (previously this ended in
        an unbound ``model`` variable).
    '''
    if selector not in (1, 2, 3, 4):
        raise ValueError(
            'selector must be 1, 2, 3 or 4, got {!r}'.format(selector))

    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.ensemble import GradientBoostingRegressor

    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=.3, random_state=0)

    if selector == 1:
        model = DecisionTreeRegressor(max_depth=7, random_state=0, splitter='best')
    elif selector == 2:
        # The explicit loss='ls' is dropped: least squares is the default
        # loss, and the 'ls' spelling is rejected by recent scikit-learn.
        model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                          max_depth=7, random_state=0)
    elif selector == 3:
        model = RandomForestRegressor(max_depth=9, random_state=0)
    else:
        # Imported only on demand: the module-level import was commented
        # out, which made this branch fail with a NameError.
        from xgboost.sklearn import XGBRegressor
        # NOTE(review): parameter names kept as originally written; confirm
        # they are still accepted by the installed xgboost version.
        params = {
            'objective': "reg:linear",
            'max_depth': 5,
            'learning_rate': 1.0,
            'silent': 1.0,
            'n_estimators': 10}
        model = XGBRegressor(**params)

    return x_train, x_test, y_train, y_test, model

## READ CSV FILES

In [2]:
def read_historical_scada(options):
    '''Read the monthly historical SCADA CSV files used to train the ML model.

    The files ``2015-04`` to ``2015-08`` of the requested resolution are read
    from below ``<cwd>/datasets/historical/raw`` and stacked in month order.
    The ``timestamp`` column (seconds since the epoch) is converted to
    datetimes and becomes the index, named ``datetime``.

    Parameters
    ----------
    options : dict
        ``options['minutal_frequency']`` selects the dataset: 10, 2 or 1.

    Returns
    -------
    pandas.DataFrame
        All monthly records, indexed by ``datetime``.

    Raises
    ------
    ValueError
        If the requested frequency is not 10, 2 or 1 (previously this ended
        in an unbound-variable error further down).
    '''
    # frequency -> (sub-folders below .../historical/raw, file name suffix)
    sources = {
        10: ((), '-10m.csv'),
        2: (('2m_results',), '-2m.csv'),
        1: (('1m_results',), '-1m.csv'),
    }

    frequency = options['minutal_frequency']
    if frequency not in sources:
        raise ValueError(
            "options['minutal_frequency'] must be 10, 2 or 1, got {!r}"
            .format(frequency))

    subdirs, file_ext = sources[frequency]
    in_path = os.path.join(os.getcwd(), 'datasets', 'historical', 'raw', *subdirs)

    months = ['04', '05', '06', '07', '08']
    # Read every month first and concatenate once, instead of growing the
    # frame with a concat per file.
    monthly = [pd.read_csv(os.path.join(in_path, '2015-' + month + file_ext))
               for month in months]
    df = pd.concat(monthly, axis=0)

    df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
    return df.set_index('datetime')


def read_evaluation_scada():
    '''Read the evaluation SCADA CSV files and return them as one frame.

    One ``<date>.csv`` file per evaluation date is read from
    ``<cwd>/datasets/evaluation/online`` and stacked in date order. The
    ``unixtime`` column (seconds since the epoch) is converted to datetimes
    and used as index (``datetime``); rows containing NaN are dropped.
    '''
    eval_dir = os.path.join(os.getcwd(), 'datasets', 'evaluation', 'online')
    dates = ['2015-09-01',
             '2015-09-09',
             '2015-09-17',
             '2015-09-25',
             '2015-10-03',
             '2015-10-10']

    # Load every daily file, then stack them in one go.
    daily = [pd.read_csv(os.path.join(eval_dir, date + '.csv')) for date in dates]
    df = pd.concat(daily, axis=0)

    df['datetime'] = pd.to_datetime(df['unixtime'], unit='s')
    df = df.set_index('datetime')

    # Data cleaning: discard incomplete records.
    df = df.dropna(axis=0)

    # Optional extra cleaning of the evaluation data (off by default).
    execute = 0
    if execute == 1:
        # Drop near-zero power readings taken at wind speed above 0.1
        zero_power = (df.WTURPower < 0.001) & (df.WNACWindSpeed > 0.1)
        df = df[np.invert(zero_power)]

    return df

In [1]:
def mystyle():
    '''Set some default parameters for visualizations.

    Applies the white seaborn style sheet and then shows only the left and
    bottom axis spines, with the grid switched off.
    '''
    # Newer matplotlib releases ship the seaborn style sheets under a
    # 'seaborn-v0_8-' prefix; use whichever name this installation provides.
    # If neither is available, no style sheet is applied.
    for style_name in ('seaborn-white', 'seaborn-v0_8-white'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    plt.rcParams.update({
        'axes.grid': False,
        'axes.spines.left': True,     # display axis spines
        'axes.spines.bottom': True,
        'axes.spines.top': False,
        'axes.spines.right': False,
    })
    
def sfig(figname='myfig',
         path=r'C:\Home00Ser\Python\NEM Challenge\Fase2\report\figs'):
    '''Save the current matplotlib figure as ``<figname>.png``.

    Parameters
    ----------
    figname : str
        File name without extension.
    path : str, optional
        Destination folder. Defaults to the author's original report folder;
        pass another folder to use the helper on a different machine.
    '''
    fname = os.path.join(path, figname + '.png')

    # The 'papertype', 'frameon' and 'format=None' arguments were dropped:
    # 'papertype' and 'frameon' are no longer accepted by current
    # matplotlib, and format=None is the default anyway.
    plt.savefig(fname, dpi=900, facecolor='w', edgecolor='w',
                orientation='portrait', transparent=False,
                bbox_inches='tight', pad_inches=0.1)
    

In [93]:
''' VERSION CTRL

V01: ok: cleaned and commented for file sharing.
     ok: pending better historical data cleaning
     ok: pending model scoring using evaluation scada data

V02: ok: model scoring on evaluation scada
     ok: check data cleaning and prediction of near zero power values
     
V03: ok: Script better organized in functions

V04: ok: Improve data cleaning using ML model and residuals
     ok: assess impact of cleaning method
     ok: remove previous data preprocessing strategy: done
     ok: write MAIN function

V05: ok: train models with 2-minutal datasets and quantify gain. Score improves, go for 1min
     
V06: pend: detect Nan values in input files and avoid prediction. improve results for 09.09
     ok:   perform data cleaning on train_data only.  
     pend: try XGB     
     
     
''';