### Understanding:
dataset: (row 1_time k = row 2_time k-1 = row 3_time k-2)
#### TASK: build 2 models (the inputs listed below exclude n_k; try adding it to see its contribution):
#### 1. Black-box inverter model: (model of ideal inverter: u_x_k-1= d_x_k-2 * u_dc_k-2)
#### inputs: d_a/b/c_k-3, d_a/b/c_k-2, i_a/b/c_k-1, i_a/b/c_k, u_dc_k-1,u_dc_k,
#### targets: u_a/b/c_k-1
#### 2. Black-box inverter compensation scheme:
#### inputs: u_a/b/c_k-1, d_a/b/c_k-3, i_a/b/c_k-3, i_a/b/c_k-2, u_dc_k-3, u_dc_k-2,
#### targets: d_a/b/c_k-2

## Data Preprocessing & Visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the inverter data set from the relative input directory and show its (rows, columns) size.
csv_path = '../input/inverter-data-set/Inverter Data Set.csv'
df = pd.read_csv(csv_path)
df.shape

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# List every column with its dtype (original note: all of them are float).
df.dtypes

In [None]:
# Inspect missing values.
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, with the
        columns 'Missing Values' (count) and '% of total values' (share of
        rows, rounded to one decimal), sorted by percentage in descending
        order. Empty when nothing is missing.

    A one-line summary (total columns / columns with missing values) is
    printed as a side effect.
    """
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1).rename(
        columns={0: 'Missing Values', 1: '% of total values'})
    # Filter on the count rather than on the percentage: for an empty frame the
    # percentage is 0/0 = NaN, and `NaN != 0` would flag every column as missing.
    has_missing = mis_val_table['Missing Values'] > 0
    mis_val_table = (mis_val_table[has_missing]
                     .sort_values('% of total values', ascending=False)
                     .round(1))
    print('Your selected dataframe has ' + str(df.shape[1]) + ' columns.\n'
          'There are ' + str(mis_val_table.shape[0]) + ' columns that have missing values.')

    return mis_val_table

In [None]:
# Build the missing-value summary for the whole data set.
df_misstb = missing_values_table(df)
# --> no missing values

In [None]:
# (rows, columns) of the data set.
df.shape

In [None]:
# One histogram per column to eyeball the value distributions.
df.hist(figsize=(17,17))

In [None]:
# Summary statistics of the n_k column.
df['n_k'].describe()

### EDA -Correlation Coefficients:  between columns + between features and targets

In [None]:
# Correlation heat map of all columns, including the targets (u_a/b/c_k-1).
# Adapted from kirgson's notebook.
corr = df.corr()
# Mask the upper triangle (diagonal included) so every pair is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

plt.figure(figsize=(14,14))
_ = sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# Browse the correlation coefficient of every column with each target.
targets = ['u_a_k-1','u_b_k-1','u_c_k-1']
# The full correlation matrix does not depend on the target, so it is computed
# once instead of once per loop iteration.
corr_matrix = df.corr()
for i in targets:
    df_corr = corr_matrix[i].sort_values()

    # print the strongest correlation coefficients (positive & negative);
    # the target itself shows up with coefficient 1.0 at the end of the tail.
    print('strongest positive correlations with target: \n', df_corr.tail(8))
    print('strongest negative correlations with target: \n', df_corr.head(8))

### Visualization
#### Features in Phase a (head 5000 samples)

In [None]:
## The rows are ordered in time: plot i, u, u_dc and n_k of phase a against the row index.
# 2500 samples/switch
# Number of leading rows shown in every panel.
p_sample= 5000

# Panel 1/4: phase-a current. figsize and title are passed only here, on the first panel.
col_iak1=df['i_a_k-1'].head(p_sample)
p_iak1 = plt.subplot(4,1,1)
col_iak1.plot(use_index = True, figsize = (20,10), title='phase a')
# Hide the x tick labels on every panel except the last one.
plt.setp(p_iak1.get_xticklabels(), visible=False)
p_iak1.set_ylabel('Phase Currents in A')

# Panel 2/4: phase-a voltage.
col_uak1 = df['u_a_k-1'].head(p_sample)
p_uak1 = plt.subplot(4,1,2)
col_uak1.plot(use_index = True)
plt.setp(p_uak1.get_xticklabels(), visible=False)
p_uak1.set_ylabel('Mean Phase Voltages in V')

# Panel 3/4: DC-link voltage.
col_udck1 = df['u_dc_k-1'].head(p_sample)
p_udck1 = plt.subplot(4,1,3)
col_udck1.plot(use_index = True)
plt.setp(p_udck1.get_xticklabels(), visible=False)
p_udck1.set_ylabel('DC-link voltage in V')

# Panel 4/4: n_k; this bottom panel keeps its x tick labels.
col_nk = df['n_k'].head(p_sample)
p_nk = plt.subplot(4,1,4)
col_nk.plot(use_index = True)
p_nk.set_ylabel('Speed')

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [None]:
#三個相位之間的currents之間的關係圖：
# p_iabc = df[['i_a_k','i_b_k','i_c_k']].head(5000)
# pd.plotting.scatter_matrix(p_iabc, alpha=0.2)
#三個相位之間的voltages之間的關係圖：
# p_uabc = df[['u_a_k-1','u_b_k-1','u_c_k-1']].head(5000)
# pd.plotting.scatter_matrix(p_uabc, alpha=0.2)

In [None]:
# Pairwise scatter plots between u, u_dc, i, n and d, using the first 5000 rows.
pair_columns = ['u_a_k-1','u_dc_k-1','i_a_k-1','n_k','d_a_k-2']
p_5 = df.loc[:, pair_columns].head(5000)
pd.plotting.scatter_matrix(p_5, alpha=0.2)

### Feature Engineering
domain knowledge...

## Model 1: Inverter Model 
#### Defined: Supervised - Regression problem
#### 0. Ideal model(Baseline) 1. Neural Network; 2. Random Forest

### Standardization

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Column selection for Model 1: 14 input columns and the three phase-voltage targets.
# u_dc_k-2 is carried along with the targets; the ideal-model baseline reads it from y_test1.
input_cols1 = [
    'i_a_k', 'i_a_k-1', 'i_b_k', 'i_b_k-1', 'i_c_k', 'i_c_k-1',
    'u_dc_k', 'u_dc_k-1',
    'd_a_k-2', 'd_a_k-3', 'd_b_k-2', 'd_b_k-3', 'd_c_k-2', 'd_c_k-3',
]
target_cols1 = ['u_a_k-1', 'u_b_k-1', 'u_c_k-1', 'u_dc_k-2']
dfx1 = df[input_cols1]
dfy1 = df[target_cols1]

In [None]:
# dfx1.shape
# (rows, columns) of the Model 1 target frame.
dfy1.shape

In [None]:
# 60 % train / 20 % validation / 20 % test:
# hold out 20 % for testing first, then 25 % of the remaining 80 % for validation.
# random_state is pinned so that Restart & Run All reproduces the same split
# (and therefore the same scaler statistics and metrics).
x_train1, x_test1, y_train1, y_test1 = train_test_split(dfx1, dfy1, test_size = 0.2, random_state = 42)
x_train1, x_val1, y_train1, y_val1 = train_test_split(x_train1, y_train1, test_size = 0.25, random_state = 42)

In [None]:
# Shapes of the train / validation / test inputs, followed by the matching targets.
print(x_train1.shape, x_val1.shape, x_test1.shape, y_train1.shape, y_val1.shape, y_test1.shape)

In [None]:
# One target Series per phase (a, b, c) for each of the three splits.
phase_targets1 = ['u_a_k-1', 'u_b_k-1', 'u_c_k-1']

y_tr1a, y_tr1b, y_tr1c = (y_train1[name] for name in phase_targets1)
y_va1a, y_va1b, y_va1c = (y_val1[name] for name in phase_targets1)
y_te1a, y_te1b, y_te1c = (y_test1[name] for name in phase_targets1)

In [None]:
from sklearn import preprocessing

In [None]:
# Standardize the Model 1 inputs: the scaler statistics are fitted on the
# training split only and then applied unchanged to validation and test.
scaler = preprocessing.StandardScaler().fit(x_train1)

x_train1_std, x_val1_std, x_test1_std = (
    scaler.transform(split) for split in (x_train1, x_val1, x_test1)
)

In [None]:
# Sanity-check the standardization on one column each.
# The second value is computed on the TRAIN array, so it is labelled as such.
# Three decimals are kept because rounding to an integer would print 0 / 1 even
# for clearly imperfect scaling.
print('mean of standardized test dataset', round(x_test1_std[:,5].mean(), 3))
print('std of standardized train dataset', round(x_train1_std[:,8].std(), 3))

In [None]:
# Column order of the Model 1 inputs; it is also the column order of the standardized arrays.
x_train1.columns

### 0. The Ideal Inverter Model : u_x_k-1 = d_x_k-2 * u_dc_k-2 (baseline)
(phase a)

In [None]:
# Working frame for the ideal-model baseline: the unscaled test inputs.
# An explicit copy is taken because pd.DataFrame(frame) does not copy by default,
# and the following cells add columns to df00; x_test1 must stay untouched.
df00 = x_test1.copy()

In [None]:
# Attach u_dc_k-2 of the test rows (assignment aligns on the row index).
df00['udc'] = y_test1['u_dc_k-2']

In [None]:
# Display the baseline working frame.
df00

In [None]:
# Ideal inverter model for phase a: u_a_k-1 = d_a_k-2 * u_dc_k-2.
df00['pred00'] = df00['d_a_k-2']*df00['udc']

In [None]:
# Measured phase-a voltage of the test rows, aligned on the row index.
df00['true'] = y_te1a

In [None]:
from sklearn.metrics import mean_squared_error

# Test-set MSE of the ideal-model baseline for phase a.
mse1a = mean_squared_error(y_true=df00['true'], y_pred=df00['pred00'])
mse1a

### 1. Neural Network
#### To Be Continued: try different units (10/20/30/40/50) and more hidden layers(2/3/4)

#### Build a Neural Network: 3 Layers

In [None]:
from keras import models
from keras import layers
# nn1: two hidden layers with 30 ReLU units each, followed by one linear
# output unit; expects the 14 standardized input features.
nn1 = models.Sequential([
    layers.Dense(units=30, activation='relu', input_shape=(14,)),
    layers.Dense(units=30, activation='relu'),
    layers.Dense(units=1),
])
nn1.compile(loss='mse', optimizer='Adam', metrics=['mse'])

In [None]:
# nn2: deeper variant with six hidden layers of 30 ReLU units and one linear output unit.
# Original note: "k=5, u=3 (u=50: bad)" - it does not match the layers built here; TODO confirm.
nn2 = models.Sequential()
nn2.add(layers.Dense(units=30, activation='relu', input_shape=(14,)))
for hidden_idx in range(5):
    # remaining hidden layers are identical to each other
    nn2.add(layers.Dense(units=30, activation='relu'))
nn2.add(layers.Dense(units=1))
nn2.compile(loss='mse', optimizer='Adam', metrics=['mse'])

#### (1) 14features --> target1: u_a_k-1

In [None]:
# Train nn1 on phase a (u_a_k-1) and monitor the loss on the validation split.
nnfit1 = nn1.fit(x_train1_std, y_tr1a, 
                    epochs=30, batch_size=500,
                   validation_data=(x_val1_std, y_va1a))

In [None]:
# Training history of nn1 (phase a).
# nnfit1 was fitted with the validation split as validation_data, so the second
# curve is labelled as validation loss rather than test loss.
tra_loss = nnfit1.history['loss']
te_loss = nnfit1.history['val_loss']

epoch_count = range(1, len(tra_loss)+1)
plt.plot(epoch_count, tra_loss,'r--')
plt.plot(epoch_count, te_loss, 'b-')

plt.legend(['training loss','validation loss'])
plt.xlabel('epoch')
plt.ylabel('loss')

In [None]:
# nn2: train the deeper network on phase a (u_a_k-1), monitoring the validation split.
nn2fit1 = nn2.fit(x_train1_std, y_tr1a, 
                    epochs=30, batch_size=500,
                   validation_data=(x_val1_std, y_va1a))

In [None]:
# Training history of nn2 on phase a.
# The history object of the preceding fit is `nn2fit1`; `nnfit2` is only created
# by a later cell, so reading it here fails on a fresh kernel (or silently plots
# the wrong run after out-of-order execution).
tra_loss = nn2fit1.history['loss']
te_loss = nn2fit1.history['val_loss']

epoch_count = range(1, len(tra_loss)+1)
plt.plot(epoch_count, tra_loss,'r--')
plt.plot(epoch_count, te_loss, 'b-')

# nn2fit1 used the validation split as validation_data.
plt.legend(['training loss','validation loss'])
plt.xlabel('epoch')
plt.ylabel('loss')

#### (2) 14features --> target2: u_b_k-1

In [None]:
# Phase b (u_b_k-1): train a fresh copy of the nn2 architecture.
# Calling nn2.fit again would continue from the weights already fitted to
# u_a_k-1; clone_model creates new, freshly initialised layers, and the clone
# has to be compiled before it can be trained.
nn2_b = models.clone_model(nn2)
nn2_b.compile(loss='mse', optimizer='Adam', metrics=['mse'])
# NOTE(review): the test split is used as validation data here although
# x_val1_std / y_va1b exist - confirm this is intended.
nnfit2 = nn2_b.fit(x_train1_std, y_tr1b,
                   epochs=30, batch_size=500,
                   validation_data=(x_test1_std, y_te1b))

In [None]:
# Loss curves of the phase-b fit: dashed red = training, solid blue = validation_data loss.
history_b = nnfit2.history
tra_loss2, te_loss2 = history_b['loss'], history_b['val_loss']
epoch_count = range(1, len(tra_loss2) + 1)
for curve, fmt in ((tra_loss2, 'r--'), (te_loss2, 'b-')):
    plt.plot(epoch_count, curve, fmt)

plt.legend(['training loss', 'test loss'])
plt.xlabel('epoch')
plt.ylabel('loss')

#### (3) 14features --> target3: u_c_k-1

In [None]:
# Phase c (u_c_k-1): train a fresh copy of the nn2 architecture.
# Calling nn2.fit again would continue from weights fitted to the other phases;
# clone_model creates new, freshly initialised layers, and the clone has to be
# compiled before it can be trained.
nn2_c = models.clone_model(nn2)
nn2_c.compile(loss='mse', optimizer='Adam', metrics=['mse'])
# NOTE(review): the test split is used as validation data here although
# x_val1_std / y_va1c exist - confirm this is intended.
nnfit3 = nn2_c.fit(x_train1_std, y_tr1c,
                   epochs=30, batch_size=500,
                   validation_data=(x_test1_std, y_te1c))

In [None]:
# Loss curves of the phase-c fit: dashed red = training, solid blue = validation_data loss.
history_c = nnfit3.history
tra_loss3, te_loss3 = history_c['loss'], history_c['val_loss']
epoch_count = range(1, len(tra_loss3) + 1)
for curve, fmt in ((tra_loss3, 'r--'), (te_loss3, 'b-')):
    plt.plot(epoch_count, curve, fmt)

plt.legend(['training loss', 'test loss'])
plt.xlabel('epoch')
plt.ylabel('loss')

### 2. RandomForest
#### 14 features --> target: u_a_k-1

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest on the 14 Model 1 inputs -> u_a_k-1.
# Original note: time: 4m50s (measured with the default n_jobs).
# n_jobs=-1 builds the trees on all available cores without changing the model;
# random_state is pinned so the forest and its feature importances are reproducible.
rfr = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
rfr.fit(x_train1_std, y_tr1a)

In [None]:
# Random-forest predictions of u_a_k-1 for the test split.
rfrpred = rfr.predict(x_test1_std)

In [None]:
from sklearn.metrics import mean_squared_error
# Test-set MSE of the random forest for phase a.
mserfr = mean_squared_error(y_true=y_te1a, y_pred=rfrpred)
mserfr

In [None]:
# Measured phase-a test targets, for comparison with the predictions shown next.
y_te1a

In [None]:
# Random-forest predictions for the same test rows.
rfrpred

In [None]:
# Feature importances of the fitted forest, one value per input column.
feat_importance = rfr.feature_importances_

In [None]:
# Input column names, in the order used to build the standardized training array.
x_colname = list(x_train1.columns)

In [None]:
# Pair every input column name with its importance.
feat_impor_df = pd.DataFrame({'feature': x_colname, 'importance': feat_importance})

In [None]:
# Most important features first.
feat_impor_df.sort_values('importance', ascending=False)

## Model 2: Inverter Compensation Scheme
#### Defined: Supervised - Regression problem

### Preprocessing: Dataset cutting + Standardization

In [None]:
# Column selection for Model 2 (compensation scheme): 14 inputs -> duty cycles d_a/b/c_k-2.
dfx2 = df[['u_a_k-1','u_b_k-1','u_c_k-1','d_a_k-3','d_b_k-3','d_c_k-3','i_a_k-3','i_b_k-3','i_c_k-3','i_a_k-2','i_b_k-2','i_c_k-2','u_dc_k-3','u_dc_k-2']]
dfy2 = df[['d_a_k-2','d_b_k-2','d_c_k-2']]
# 70 % train / 30 % test; random_state is pinned so that Restart & Run All
# reproduces the same split.
x_train2, x_test2, y_train2, y_test2 = train_test_split(dfx2, dfy2, test_size = 0.3, random_state = 42)

In [None]:
# x_train2, x_test2, y_train2, y_test2 = train_test_split(dfx2, dfy2, test_size = 0.2)
# x_train2, x_val2, y_train2, y_val2 = train_test_split(x_train2, y_train2, test_size = 0.25)

In [None]:
# Shapes of the Model 2 train / test inputs, followed by the matching targets.
print(x_train2.shape, x_test2.shape, y_train2.shape, y_test2.shape)

In [None]:
# One duty-cycle target Series per phase (a, b, c) for the train and test splits.
# Model 2 has no validation split, so there are no validation targets here.
phase_targets2 = ['d_a_k-2', 'd_b_k-2', 'd_c_k-2']

y_tr2a, y_tr2b, y_tr2c = (y_train2[name] for name in phase_targets2)
y_te2a, y_te2b, y_te2c = (y_test2[name] for name in phase_targets2)

In [None]:
# Standardize the Model 2 inputs: the scaler statistics are fitted on the
# training split only and then applied unchanged to the test split.
# (There is no validation split for Model 2, hence no x_val2_std.)
scaler2 = preprocessing.StandardScaler().fit(x_train2)

x_train2_std, x_test2_std = (
    scaler2.transform(split) for split in (x_train2, x_test2)
)

In [None]:
# Sanity-check the standardization on one column each.
# The second value is computed on the TRAIN array, so it is labelled as such.
# Three decimals are kept because rounding to an integer would print 0 / 1 even
# for clearly imperfect scaling.
print('mean of standardized test dataset', round(x_test2_std[:,5].mean(), 3))
print('std of standardized train dataset', round(x_train2_std[:,8].std(), 3))

### 1. Neural Network
#### To Be Continued: try different units (10/20/30/40/50) & k (1/2/3)

#### 1. 14 features --> target 1: d_a_k-2

In [None]:
# Model 2, target d_a_k-2: train a fresh copy of the nn2 architecture.
# nn2 itself was already fitted on the Model 1 inputs/targets above; reusing it
# would warm-start from weights learned for a different feature set and target.
# clone_model creates new, freshly initialised layers; the clone must be compiled.
nn2_2a = models.clone_model(nn2)
nn2_2a.compile(loss='mse', optimizer='Adam', metrics=['mse'])
# Model 2 has no validation split, so the test split is used for monitoring.
nnfit2a = nn2_2a.fit(x_train2_std, y_tr2a,
                     epochs=30, batch_size=300,
                     validation_data=(x_test2_std, y_te2a))

In [None]:
# Loss curves of the d_a_k-2 fit: dashed red = training, solid blue = validation_data loss.
history_2a = nnfit2a.history
tra_loss2a, te_loss2a = history_2a['loss'], history_2a['val_loss']
epoch_count = range(1, len(tra_loss2a) + 1)
for curve, fmt in ((tra_loss2a, 'r--'), (te_loss2a, 'b-')):
    plt.plot(epoch_count, curve, fmt)

plt.legend(['training loss', 'test loss'])
plt.xlabel('epoch')
plt.ylabel('loss')

#### 2. 14 features --> target 2: d_b_k-2 

In [None]:
# Model 2, target d_b_k-2: train a fresh copy of the nn2 architecture.
# Calling nn2.fit directly would continue from weights fitted in earlier cells
# (other targets, and for Model 1 a different feature set). clone_model creates
# new, freshly initialised layers; the clone must be compiled before training.
nn2_2b = models.clone_model(nn2)
nn2_2b.compile(loss='mse', optimizer='Adam', metrics=['mse'])
# Model 2 has no validation split, so the test split is used for monitoring.
nnfit2b = nn2_2b.fit(x_train2_std, y_tr2b,
                     epochs=30, batch_size=300,
                     validation_data=(x_test2_std, y_te2b))
# loss curves: dashed red = training, solid blue = validation_data (test split)
tra_loss2b = nnfit2b.history['loss']
te_loss2b = nnfit2b.history['val_loss']
epoch_count = range(1, len(tra_loss2b)+1)
plt.plot(epoch_count, tra_loss2b,'r--')
plt.plot(epoch_count, te_loss2b, 'b-')

plt.legend(['training loss','test loss'])
plt.xlabel('epoch')
plt.ylabel('loss')

**3. 14 features --> target 3: d_c_k-2**

In [None]:
# Model 2, target d_c_k-2: train a fresh copy of the nn2 architecture.
# Calling nn2.fit directly would continue from weights fitted in earlier cells
# (other targets, and for Model 1 a different feature set). clone_model creates
# new, freshly initialised layers; the clone must be compiled before training.
nn2_2c = models.clone_model(nn2)
nn2_2c.compile(loss='mse', optimizer='Adam', metrics=['mse'])
# Model 2 has no validation split, so the test split is used for monitoring.
nnfit2c = nn2_2c.fit(x_train2_std, y_tr2c,
                     epochs=30, batch_size=300,
                     validation_data=(x_test2_std, y_te2c))
# loss curves: dashed red = training, solid blue = validation_data (test split)
tra_loss2c = nnfit2c.history['loss']
te_loss2c = nnfit2c.history['val_loss']
epoch_count = range(1, len(tra_loss2c)+1)
plt.plot(epoch_count, tra_loss2c,'r--')
plt.plot(epoch_count, te_loss2c, 'b-')

plt.legend(['training loss','test loss'])
plt.xlabel('epoch')
plt.ylabel('loss')