# NEM DATA CHALLENGE, Phase 1
## Visualizations for data analysis.

Following data preprocessing (data horizontalization and resampling from 1 s to 10 min intervals), this code produces a number of visualizations to explore the data and to select the plots to be included in the technical report.

In [None]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import matplotlib.dates as mdates
import statsmodels.api as sm
from scipy.interpolate import UnivariateSpline

In [None]:
# see the pre-defined styles provided.
#plt.style.available

In [None]:
def read_csv_file():
    """Load the 10-minute SCADA csv and split it per asset.

    Reads ``data/may2015_scada_10m.csv`` relative to the current working
    directory, turns the epoch-second ``timestamp`` column into a datetime
    index and orders the columns as given by ``sensor_list()``.

    Returns a tuple ``(df, df1, df2, df3, df4, df5)``: the complete frame
    followed by the rows of assets 'A001' to 'A005'.
    """
    june_first = 1433116800  # epoch seconds of 2015-06-01 00:00:00

    csv_path = os.path.join(os.getcwd(), 'data', 'may2015_scada_10m.csv')
    frame = pd.read_csv(csv_path)

    # shift the sample stamped exactly at 1 June back by one second so that
    # it still falls in May
    frame.loc[frame['timestamp'] == june_first, 'timestamp'] = june_first - 1
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], unit='s')
    frame = frame.set_index('timestamp', drop=True)

    frame = frame[sensor_list()]  # column order follows the sensor list
    per_asset = [frame[frame['asset'] == tag]
                 for tag in ('A001', 'A002', 'A003', 'A004', 'A005')]
    return tuple([frame] + per_asset)

def sensor_list():
    """Return the ordered list of SCADA column names, ending with 'asset'."""
    groups = (
        # NAC
        ['WNACWindSpeed', 'WNACDirection', 'WNACWindDirection',
         'WNACAmbTemp', 'WNACNacelleTemp'],
        # GEN
        ['WGENPhase1Temp', 'WGENPhase2Temp', 'WGENPhase3Temp',
         'WGENGenSpeed', 'WGENBearNDETemp', 'WGENBearDETemp'],
        # ROT
        ['WROTPitchAngleSP', 'WROTSpeed', 'WROTPitchAngleAvg'],
        # GDC
        ['WGDCTrafoPhase1Temp', 'WGDCTrafoPhase2Temp', 'WGDCTrafoPhase3Temp'],
        # TUR
        ['WTURReactivePower', 'WTURReactivePowerAux', 'WTURPowerAux', 'WTURPower'],
        # HDR
        ['WHDRGroupOilPress', 'WHDRGroupOilTemp'],
        # TRM
        ['WTRMOilTemp', 'WTRMBearTemp'],
        # YAW
        ['WYAWPressure'],
        # CNV (converter)
        ['WCNVCosPhi', 'WCNVNetVoltage'],
    )
    cols = [name for group in groups for name in group]
    cols.append('asset')
    return cols

# Column-name lists used to pick groups of SCADA variables in the cells below.

# hand-picked variables for the pair scatter plots
HCOR = [
    'WNACWindSpeed',
    'WGENGenSpeed',
    'WGENPhase1Temp',
    'WROTSpeed',
    'WTURPower',
    'WTURReactivePower',
    'WCNVCosPhi',
]

# one list per sensor-name prefix
NAC = [
    'WNACWindSpeed',
    'WNACDirection',
    'WNACWindDirection',
    'WNACAmbTemp',
    'WNACNacelleTemp',
]
GEN = [
    'WGENPhase1Temp',
    'WGENPhase2Temp',
    'WGENPhase3Temp',
    'WGENGenSpeed',
    'WGENBearNDETemp',
    'WGENBearDETemp',
]
ROT = ['WROTPitchAngleSP', 'WROTSpeed', 'WROTPitchAngleAvg']
GDC = ['WGDCTrafoPhase1Temp', 'WGDCTrafoPhase2Temp', 'WGDCTrafoPhase3Temp']
TUR = ['WTURReactivePower', 'WTURReactivePowerAux', 'WTURPowerAux', 'WTURPower']
HDR = ['WHDRGroupOilPress', 'WHDRGroupOilTemp']
TRM = ['WTRMOilTemp', 'WTRMBearTemp']
YAW = ['WYAWPressure']
CNV = ['WCNVCosPhi', 'WCNVNetVoltage']

# temperature channels taken from several of the groups above
TEMPS = [
    'WNACAmbTemp',
    'WNACNacelleTemp',
    'WGENPhase1Temp',
    'WGENBearNDETemp',
    'WGENBearDETemp',
    'WGDCTrafoPhase1Temp',
    'WHDRGroupOilTemp',
    'WTRMOilTemp',
    'WTRMBearTemp',
]

# every sensor column, without (ALL2) and with (ALL) the 'asset' column
ALL2 = NAC + GEN + ROT + GDC + TUR + HDR + TRM + YAW + CNV
ALL = ALL2 + ['asset']

LIS = ['WNACWindSpeed', 'WTURPower', 'WROTSpeed']
ASS = ['asset']

In [None]:
# Read csv and assign one dataframe per asset + one global dataset
dfall, df1, df2, df3, df4, df5 = read_csv_file()

# Only the numeric sensor columns can be averaged. 'asset' holds strings and
# pandas >= 2.0 raises a TypeError instead of silently dropping such columns.
num_cols = dfall.select_dtypes(include=[np.number]).columns

# create dataframe as the mean of dataframes df1 to df5 (mean per timestamp)
df6 = dfall[num_cols].groupby(level=0).mean()

# Resample info to daily basis, separately for each asset. Grouping by the
# 'asset' Series keeps the key out of the averaged columns.
dfd = dfall[num_cols].groupby(dfall['asset']).resample('D').mean()
dfd.reset_index(level=['asset'], inplace=True)  # 'asset' back to a column
dfd1 = dfd[dfd.asset == 'A001']
dfd2 = dfd[dfd.asset == 'A002']
dfd3 = dfd[dfd.asset == 'A003']
dfd4 = dfd[dfd.asset == 'A004']
dfd5 = dfd[dfd.asset == 'A005']
# daily mean over all assets
dfd6 = dfd[num_cols].groupby(level=0).mean()

In [None]:
# Preview the first rows of the dataframe that holds all SCADA variables
# for every asset (A001 to A005)
dfall.head()

### Calculate the Correlation Matrix

In [None]:
# As example, calculate the correlation matrix for asset A003 (df3).
# Only numeric columns are correlated: 'asset' holds strings and
# DataFrame.corr raises on non-numeric columns in pandas >= 2.0.
dfc = df3.select_dtypes(include=[np.number]).corr()
x = 0.4  # Define threshold value
# keep coefficients with x <= |r| < 1; the upper bound removes the diagonal
strength = dfc.abs()
dfc = dfc[(strength >= x) & (strength < 1)]
# drop variables that have no remaining coefficient at all
dfc = dfc.dropna(axis=0, how='all')
dfc = dfc.dropna(axis=1, how='all')
dfc

In [None]:
# Summary statistics of df1, transposed so that each variable is one row
df1.describe().T

### Identify Missing Values in csv files

In [None]:
# Timestamps of the rows that contain at least one missing value.
df = dfall
# Series.nonzero() and the positional axis argument of any() were removed
# from pandas; np.flatnonzero returns the same integer row positions.
nans = np.flatnonzero(pd.isnull(df).any(axis=1).values)
nansx = df.index[nans]
print(len(nansx))
nansx

## Time Series

In [None]:
# WTURPower (top) and WNACWindSpeed (bottom) of df1 on a shared time axis
df = df1.copy()
col1, col2 = ['WTURPower'], ['WNACWindSpeed']
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(9, 6),
                               sharex=True, sharey=False)
for cols, panel in ((col1, ax1), (col2, ax2)):
    df[cols].plot(ax=panel)

### Cumulative distribution

In [None]:
# Normalised cumulative histograms of wind speed and power for df5
col1 = ['WNACWindSpeed']
col2 = ['WTURPower']
df = df5.copy()
# `density` replaces the `normed` keyword, which Matplotlib has removed
df[col1].plot.hist(alpha=0.7, bins=500, figsize=(8, 4), cumulative=True, density=True);
df[col2].plot.hist(alpha=0.7, bins=500, figsize=(8, 4), cumulative=True, density=True);

### Comparison between each asset signal to mean time series.

In [None]:
# A total of 28 SCADA variables are contained in the csv files.
# Limit the number of plots shown by setting N to a value between 1 and 28.

N = 2
for item in ALL2[:N]:
    fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(
        nrows=3, ncols=2, figsize=(9, 5), sharex=True, sharey=True)
    panels = (ax1, ax2, ax3, ax4, ax5, ax6)
    frames = (df1, df2, df3, df4, df5, df6)
    # one panel per asset: deviation of the asset signal from the mean
    # series df6 (the last panel is df6 minus itself)
    for frame, panel in zip(frames, panels):
        (frame[item] - df6[item]).plot(ax=panel)
    ax1.set_title(item)

### Time series for All assets

In [None]:
# A total of 28 SCADA variables are contained in the csv files.
# Limit the number of plots shown by setting N to a value between 1 and 28.
N = 2

for item in ALL2[:N]:
    fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(
        nrows=3, ncols=2, figsize=(9, 5), sharex=True, sharey=True)
    panels = (ax1, ax2, ax3, ax4, ax5, ax6)
    frames = (df1, df2, df3, df4, df5, df6)
    # one panel per asset, the last one shows the mean series df6
    for frame, panel in zip(frames, panels):
        frame[item].plot(ax=panel)
    ax1.set_title(item)

### BOXPLOT Group by asset

In [None]:
# A total of 28 SCADA variables are contained in the csv files.
# Limit the number of plots shown by setting N to a value between 1 and 28.
N = 2

sensors = dfall.columns.tolist()
for sensor in sensors[:N]:
    fig, ax = plt.subplots(figsize=(5, 3))
    sensor_and_asset = dfall[[sensor, 'asset']]
    # whiskers at the 1st and 99th percentile, the mean drawn as a marker
    sensor_and_asset.boxplot(by='asset', ax=ax, showmeans=True, whis=[1, 99], sym='.')
    ax.set_title(sensor)
    for axis_obj in (ax.yaxis, ax.xaxis):
        axis_obj.grid(False)
    fig.suptitle('')  # clear the automatic "grouped by" super title

### Time Series Plot showing sub-optimal data-pairs

In [None]:
# Wind speed of df2 with the samples below the power threshold at
# wind speeds above 0.35 highlighted in red
dfA = df2.copy()
item1 = 'WNACWindSpeed'
low_power = dfA.WTURPower < 0.95
enough_wind = dfA.WNACWindSpeed > 0.35
dfB = dfA[low_power & enough_wind]

fig, ax1 = plt.subplots(figsize=(9, 3))
dfA[item1].plot(ax=ax1, lw=.3, legend=item1)
dfB[item1].plot(style='r.-', lw=0, ax=ax1)  # markers only (line width 0)

# label the x axis with the day of the month
myFmt = mdates.DateFormatter('%d')
ax1.xaxis.set_major_formatter(myFmt)
ax1.set_xlabel('May 2015')

plt.setp(ax1.xaxis.get_majorticklabels(), rotation=0);

In [None]:
# Variables list: one list of column names per sensor-name prefix

NAC = [
    'WNACWindSpeed',
    'WNACDirection',
    'WNACWindDirection',
    'WNACAmbTemp',
    'WNACNacelleTemp',
]
GEN = [
    'WGENPhase1Temp',
    'WGENPhase2Temp',
    'WGENPhase3Temp',
    'WGENGenSpeed',
    'WGENBearNDETemp',
    'WGENBearDETemp',
]
ROT = ['WROTPitchAngleSP', 'WROTSpeed', 'WROTPitchAngleAvg']
GDC = ['WGDCTrafoPhase1Temp', 'WGDCTrafoPhase2Temp', 'WGDCTrafoPhase3Temp']
TUR = ['WTURReactivePower', 'WTURReactivePowerAux', 'WTURPowerAux', 'WTURPower']
HDR = ['WHDRGroupOilPress', 'WHDRGroupOilTemp']
TRM = ['WTRMOilTemp', 'WTRMBearTemp']
YAW = ['WYAWPressure']
CNV = ['WCNVCosPhi', 'WCNVNetVoltage']

# temperature channels taken from several of the groups above
TEMPS = [
    'WNACAmbTemp',
    'WNACNacelleTemp',
    'WGENPhase1Temp',
    'WGENBearNDETemp',
    'WGENBearDETemp',
    'WGDCTrafoPhase1Temp',
    'WHDRGroupOilTemp',
    'WTRMOilTemp',
    'WTRMBearTemp',
]

In [None]:
# Four signals of df1, each in its own panel, with the samples below the
# power threshold at wind speeds above 0.35 highlighted in red
dfA = df1.copy()
dfB = dfA[(dfA.WTURPower < 0.95) & (dfA.WNACWindSpeed > 0.35)]
item1, item2 = 'WTRMBearTemp', 'WTRMOilTemp'
item3, item4 = 'WGENBearNDETemp', 'WROTPitchAngleSP'

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex=True, figsize=(15, 10))
panels = ((ax1, item1), (ax2, item2), (ax3, item3), (ax4, item4))

# complete series as a thin line
for panel, name in panels:
    dfA[name].plot(ax=panel, lw=.3)

# flagged samples as red markers (line width 0)
for panel, name in panels:
    dfB[name].plot(style='r.-', lw=0, ax=panel, legend='')

for panel, name in panels:
    panel.set_title(name)

# hide the bottom spine of the upper row
for panel in (ax1, ax2):
    panel.spines['bottom'].set_color('none')

# label the lower row with the day of the month
myFmt = mdates.DateFormatter('%d')
for panel in (ax3, ax4):
    panel.xaxis.set_major_formatter(myFmt)
    panel.set_xlabel('May 2015')

plt.setp(ax3.xaxis.get_majorticklabels(), rotation=0)
plt.setp(ax4.xaxis.get_majorticklabels(), rotation=0);

In [None]:
# Four signals of df2, each in its own panel, with the samples below the
# power threshold at wind speeds above 0.35 highlighted in red
dfA = df2.copy()
dfB = dfA[(dfA.WTURPower < 0.95) & (dfA.WNACWindSpeed > 0.35)]

item1, item2 = 'WGENBearDETemp', 'WROTPitchAngleSP'
item3, item4 = 'WHDRGroupOilTemp', 'WTRMOilTemp'

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex=True, figsize=(15, 10))
panels = ((ax1, item1), (ax2, item2), (ax3, item3), (ax4, item4))

# complete series as a thin line
for panel, name in panels:
    dfA[name].plot(ax=panel, lw=.3)

# flagged samples as red markers (line width 0)
for panel, name in panels:
    dfB[name].plot(style='r.-', lw=0, ax=panel, legend='')

for panel, name in panels:
    panel.set_title(name)

# hide the bottom spine of the upper row
for panel in (ax1, ax2):
    panel.spines['bottom'].set_color('none')

# label the lower row with the day of the month
myFmt = mdates.DateFormatter('%d')
for panel in (ax3, ax4):
    panel.xaxis.set_major_formatter(myFmt)
    panel.set_xlabel('May 2015')

### PAIR SCATTER PLOTS

In [None]:
# Pairwise scatter plots (histograms on the diagonal) of selected variables
# taken from the mean series df6
HCOR = ['WNACWindSpeed', 'WGENGenSpeed', 'WROTSpeed', 'WTURPower', 'WTURReactivePower', 'WCNVCosPhi']
sns.set(style="ticks")
df = df6[HCOR]
# `height` is the current name of the facet-size argument (formerly `size`)
sns.pairplot(df, diag_kind='hist', markers='.', height=2);

## POWER CURVE ZOOM to detect sub optimal points

In [None]:
# Power against wind speed for the five assets, zoomed to the window
# 0.2-0.6 (wind speed) by 0.2-1.1 (power)
fig, ax1 = plt.subplots(sharex=True, sharey=True, figsize=(5, 5))
ax1.set_ylim([0.2, 1.1])
ax1.set_xlim([0.2, 0.6])
series = (
    (df1, 'r', 'A001'),
    (df2, 'b', 'A002'),
    (df3, 'g', 'A003'),
    (df4, 'y', 'A004'),
    (df5, 'c', 'A005'),
)
for frame, colour, tag in series:
    frame.plot.scatter('WNACWindSpeed', 'WTURPower', c=colour, ax=ax1, marker='.', label=tag)

In [None]:
# One box per day of the month for the ambient temperature of the mean
# series df6
y = 'WNACAmbTemp'
fig, ax = plt.subplots(figsize=(8, 3))
df6.boxplot(column=y, by=df6.index.day, ax=ax, rot=0, fontsize=8, showmeans=True)
ax.set_title('WNAC Ambient Temperature variation in May 2015')
fig.suptitle('')  # clear the automatic "grouped by" super title
ax.set_xlabel("Day");

## DAILY MEASUREMENTS

In [None]:
# Ambient temperature of the mean series df6, one box per day
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 3))
x = df6.index.dayofyear - 120  # day of year shifted by 120
# seaborn requires the data vectors to be passed by keyword
sns.boxplot(x=x, y=df6.WNACAmbTemp, ax=ax);

### KDE Distributions

In [None]:
# A total of 28 SCADA variables are contained in the csv files.
# Limit the number of plots shown by setting N to a value between 1 and 28.
N = 2
sensors = dfall.columns.tolist()
for sensor in sensors[:N]:
    fig, ax = plt.subplots(figsize=(3, 2))
    # one unlabelled density curve per asset
    for frame in (df1, df2, df3, df4, df5):
        sns.kdeplot(frame[sensor], label="")
    ax.set_title(sensor)

## Plot all columns in dataframe as time series

In [None]:
# Every variable of the listed groups for df1, one subplot per column
df = df1.copy()
COL = sum((NAC, GEN, ROT, GDC, TUR, HDR, TRM, YAW), [])
df[COL].plot(subplots=True, figsize=(8, 50));

In [None]:
# Ambient and nacelle temperature of df2 on the same axes
fig, ax1 = plt.subplots(figsize=(10, 5), sharex=True, sharey=True)
colors = ['b', 'c']
for idx, item in enumerate(['WNACAmbTemp', 'WNACNacelleTemp']):
    line_colour = colors[idx]
    df2[item].plot(color=line_colour, ax=ax1, label=item)
    # the title is overwritten on each pass and ends up as the last item
    ax1.set_title(item)

### Ambient and Nacelle Temperature

In [None]:
# Two stacked panels for the mean series df6: ambient and nacelle temperature
# as time series (top) and ambient temperature per day as boxes (bottom).
a = ['WNACAmbTemp']
b = ['WNACNacelleTemp']
df = df6

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 5))
plt.subplots_adjust(hspace=.001)

# Plot against the datetime index, which is the default x axis. Recent pandas
# only accepts a column label or position for `x`, not an Index object.
df[a].plot(color='c', ax=ax1);
df[b].plot(color='b', ax=ax1);
ax1.set_title('Ambient and Nacelle Temperature variation during May 2015');
ax1.set_xlabel('')
ax1.set_xticklabels([])
ax1.spines['bottom'].set_color('none')

x = df.index.day
y = df6.WNACAmbTemp
# seaborn requires the data vectors to be passed by keyword
sns.boxplot(x=x, y=y, ax=ax2, color='white', width=.5, fliersize=0)
ax2.set_title('')
ax2.set_xlabel('')

# figure styles
sns.set_style('white')
sns.set_context('paper', font_scale=1.2)
sns.despine(offset=5, trim=True)
# NOTE(review): newer Matplotlib/seaborn keep the boxes in ax2.patches rather
# than ax2.artists, so both containers are restyled - confirm with the
# installed versions.
plt.setp(list(ax2.artists) + list(ax2.patches), edgecolor='k', facecolor='w')
plt.setp(ax2.lines, color='k');

#sfig('TempVar')

### Cp vs Turbine Tip Speed Curve

In [None]:
# Scatter of the normalised power/wind-speed^3 ratio ('Cp') against the
# normalised rotor-speed/wind-speed ratio ('lambda') for df1.
df = df1.copy()
df['Cp'] = df['WTURPower'] / (df['WNACWindSpeed']**3)
df['lambda'] = df['WROTSpeed'] / df['WNACWindSpeed']
# A wind speed of zero makes the ratios +/-inf. Left in place, .max() would
# be inf and the normalisation below would collapse every finite value to 0.
df[['Cp', 'lambda']] = df[['Cp', 'lambda']].replace([np.inf, -np.inf], np.nan)
# scale both ratios by their largest finite value
df['Cp'] = df['Cp'] / df['Cp'].max()
df['lambda'] = df['lambda'] / df['lambda'].max()
L = 'lambda'
Ws = 'WNACWindSpeed'
Cp = 'Cp'
fig, ax1 = plt.subplots(1, 1, sharex=True, sharey=True, figsize=(10, 4))
df.plot.scatter(L, Cp, ax=ax1, marker='.');
ax1.set_title('Asset')
ax1.set_ylim(-.25, 1);

In [None]:
# Degree-10 polynomial fit of power against wind speed over all assets,
# followed by a scatter plot of the underlying data.
df = dfall.copy()
df = df[df.WNACWindSpeed > 0.06]
# np.polyfit cannot handle missing values: drop rows where either variable
# is NaN (the wind-speed filter above only removes NaN wind speeds)
df = df.dropna(subset=['WNACWindSpeed', 'WTURPower'])
p = np.polyfit(df['WNACWindSpeed'], df['WTURPower'], 10)
Pfit = np.poly1d(p)
df['Pfit'] = Pfit(df['WNACWindSpeed'])  # fitted power at each sample

y = 'WTURPower'
x = 'WNACWindSpeed'

fig, ax1 = plt.subplots(figsize=(10, 4))
df.plot.scatter(x, y, ax=ax1, marker='.');

In [None]:
def mystyle():
    
    
    plt.style.use('seaborn-white')
    plt.rcParams ['axes.grid']=False 
    plt.rcParams ['axes.spines.left']   = True   # display axis spines
    plt.rcParams ['axes.spines.bottom'] = True
    plt.rcParams ['axes.spines.top']    = False
    plt.rcParams ['axes.spines.right']  = False
    
    
    #plt.style.use('ggplot')
    #mpl.rcParams['font.size'] = 12
    #mpl.rcParams['legend.fontsize'] = 'large'
    #mpl.rcParams['figure.titlesize'] = 'medium'
    #mpl.rcParams['grid.color'] = 'k'
    #mpl.rcParams['grid.linestyle'] = ':'
    #mpl.rcParams['grid.linewidth'] = 0.5
    #plt.rcParams['axes.facecolor'] = 'white'
    
    ### AXES
    # default face and edge color, default tick sizes,
    # default fontsizes for ticklabels, and so on.  See
    # http://matplotlib.org/api/axes_api.html#module-matplotlib.axes
    #axes.facecolor      : white   # axes background color
    #axes.edgecolor      : black   # axes edge color
    #axes.linewidth      : 0.8     # edge linewidth
    #axes.grid           : False   # display grid or not
    #axes.titlesize      : large   # fontsize of the axes title
    #axes.titlepad       : 6.0     # pad between axes and title in points
    #axes.labelsize      : medium  # fontsize of the x any y labels
    #axes.labelpad       : 4.0     # space between label and axis
    #axes.labelweight    : normal  # weight of the x and y labels
    #axes.labelcolor     : black
    #axes.axisbelow      : 'line'  # draw axis gridlines and ticks below
                                   # patches (True); above patches but below
                                   # lines ('line'); or above all (False)

    #axes.formatter.limits : -7, 7 # use scientific notation if log10
                                   # of the axis range is smaller than the
                                   # first or larger than the second
    #axes.formatter.use_locale : False # When True, format tick labels
                                       # according to the user's locale.
                                       # For example, use ',' as a decimal
                                       # separator in the fr_FR locale.
    #axes.formatter.use_mathtext : False # When True, use mathtext for scientific
                                         # notation.
    #axes.formatter.useoffset      : True    # If True, the tick label formatter
                                             # will default to labeling ticks relative
                                             # to an offset when the data range is
                                             # small compared to the minimum absolute
                                             # value of the data.
    #axes.formatter.offset_threshold : 4     # When useoffset is True, the offset
                                             # will be used when it can remove
                                             # at least this number of significant
                                             # digits from tick labels.

    # axes.spines.left   : True   # display axis spines
    # axes.spines.bottom : True
    # axes.spines.top    : True
    # axes.spines.right  : True

    #axes.xmargin        : .05  # x margin.  See `axes.Axes.margins`
    #axes.ymargin        : .05  # y margin See `axes.Axes.margins`
