In [None]:
import os
import time

import pandas as pd
import numpy as np
from numpy import mean, std, absolute
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, LassoLarsIC, BayesianRidge, PoissonRegressor
# import xgboost as xg
import scipy as sp
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2
from sklearn.preprocessing import normalize

# Apply the 'ggplot' matplotlib style to every figure created after this point.
plt.style.use('ggplot')

In [None]:
!pip install openpyxl

In [None]:
# Read the spreadsheet (first column becomes the index, first row the header),
# keep rows 1..222 and the first 14 columns, then discard rows with missing values.
data = (
    pd.read_excel('Data/Gasification Data.xlsx', index_col=0, header=0)
    .iloc[1:223, :14]
    .dropna()
)

# Sanity check: both lines should print False once the nulls are dropped.
print(data.isnull().values.any())
print(data.isna().values.any())

# The first 9 columns are the model inputs, the remaining ones the targets.
X = data.iloc[:, :9]
y = data.iloc[:, 9:]

# Strip everything from ' [' onwards in each header to get short labels.
input_columns = [name.split(' [', 1)[0] for name in X.columns.values]
output_columns = [name.split(' [', 1)[0] for name in y.columns.values]
print(input_columns, output_columns)

# Plain NumPy views of the two frames, used by the normalisation step.
Xvals = X.values
yvals = y.values
print(Xvals.shape, yvals.shape)

def _min_max_scale(values):
    """Scale every column of a 2-D array to the [0, 1] range.

    The result is always floating point, so integer or object inputs are not
    truncated when the scaled values are stored. A column whose minimum equals
    its maximum is mapped to 0 instead of producing 0/0 = NaN.
    """
    values = np.asarray(values, dtype=float)
    col_min = values.min(axis=0)
    col_range = values.max(axis=0) - col_min
    # Replace zero ranges by 1 so constant columns divide cleanly to 0.
    safe_range = np.where(col_range == 0, 1.0, col_range)
    return (values - col_min) / safe_range


# Column-wise min-max normalisation of the inputs and the targets.
Xnorm = _min_max_scale(Xvals)
ynorm = _min_max_scale(yvals)

# Expected output: maxima of 1.0 and minima of 0.0.
print(np.max(Xnorm), np.max(ynorm))
print(np.min(Xnorm), np.min(ynorm))

In [None]:
# Distribution of the first min-max-normalized input feature.
ax = sns.histplot(Xnorm[:, 0])
ax.set(xlabel='%s (normalized)' % input_columns[0],
       title='Distribution of %s' % input_columns[0]);

## Regression Analysis

In [None]:
# Experiment configuration: base regressors and the hyper-parameter grids
# that are expanded into one label per model variant below.
methods = ['Linear', 'Ridge', 'LARS', 'RF', 'Bag', 'GradientBoost']
number_of_folds = 11
ridge_reg = [1.0, 10.0, 100.0, 1000.0, 1e6]
n_estimators = [10, 50, 100, 500, 1000]
learning_rate = [1e-3, 0.01, 0.1, 1]
max_depth = 30


def _variants(name):
    """Return the list of variant labels generated for one base method.

    Labels encode the hyper-parameters separated by underscores, e.g.
    'Ridge_Reg_10.0' or 'GradientBoost_est_50_lr_0.01'. A method name that is
    not recognised yields no labels.
    """
    if name in ('Linear', 'LARS'):
        return [name]
    if name == 'Ridge':
        return [name + '_Reg_' + str(alpha) for alpha in ridge_reg]
    if name in ('RF', 'Bag'):
        return [name + '_est_' + str(count) for count in n_estimators]
    if name == 'GradientBoost':
        return [name + '_est_' + str(count) + '_lr_' + str(rate)
                for count in n_estimators
                for rate in learning_rate]
    return []


# Flatten the per-method variants, preserving the order of `methods`.
methods_extended = [label for base in methods for label in _variants(base)]
print(methods_extended)

In [None]:
## Splitting the data into 70% training set and 30% test set
# NOTE(review): Xnorm / ynorm were min-max scaled on the full data set before
# this split, so the test-set ranges influence the training data. Consider
# fitting the scaling on the training split only.
seed = 42
X_train, X_test, y_train, y_test = train_test_split(Xnorm, ynorm, test_size=.3, random_state=seed)


def _make_regressor(label):
    """Build the unfitted estimator described by a `methods_extended` label.

    The label is split on '_'; the first token selects the estimator family
    and the numeric tokens carry its hyper-parameters.

    Returns a `(timing_tag, estimator)` tuple, where `timing_tag` is the prefix
    printed in front of the measured wall-clock time.
    Raises ValueError for an unknown family, so that a typo cannot silently
    leave an all-zero prediction that is then scored.
    """
    parts = label.split('_')
    family = parts[0]
    if family == 'Linear':
        return 'CT-Linear', LinearRegression()
    if family == 'Ridge':
        return 'CT-ridge', Ridge(alpha=float(parts[-1]))
    if family == 'LARS':
        return 'CT-lars', LassoLarsIC()
    if family == 'RF':
        # random_state=0 (not `seed`) is kept from the original experiments.
        return 'CT-rf', RandomForestRegressor(n_estimators=int(parts[-1]),
                                              max_depth=max_depth,
                                              random_state=0)
    if family == 'Bag':
        return 'CT-bag', BaggingRegressor(n_estimators=int(parts[-1]),
                                          random_state=0)
    if family == 'GradientBoost':
        # Label layout: GradientBoost_est_<n>_lr_<rate>
        return 'CT-gb', GradientBoostingRegressor(n_estimators=int(parts[-3]),
                                                  learning_rate=float(parts[-1]),
                                                  random_state=seed, loss='huber')
    raise ValueError('Unknown regression method: %r' % label)


# Array conversions do not depend on the model or the output: do them once.
training_x = np.asarray(X_train)
testing_x = np.asarray(X_test)
training_targets = np.asarray(y_train)
n_outputs = training_targets.shape[1]

prediction = {}   # label -> predictions, shape (n_test_samples, n_outputs)
error = {}        # label -> RMSE per output column
r2value = {}      # label -> R^2 per output column
for met in methods_extended:
    print(met)
    # Float buffer, so predictions are never truncated by an integer dtype.
    prediction[met] = np.zeros(y_test.shape, dtype=float)

    # One independent single-output model is fitted per target column.
    for idx in range(n_outputs):
        # Timed span covers construction, fit and predict.
        start = time.perf_counter()
        timing_tag, regressor = _make_regressor(met)
        regressor.fit(training_x, training_targets[:, idx])
        prediction[met][:, idx] = regressor.predict(testing_x)
        print(timing_tag, time.perf_counter() - start)

    error[met] = np.sqrt(mse(y_test, prediction[met], multioutput='raw_values'))
    r2value[met] = r2(y_test, prediction[met], multioutput='raw_values')

In [None]:
import openpyxl

# Collapse the per-output metrics of each model variant into one number:
# mean and standard deviation of the RMSE, and mean R^2, across outputs.
mean_error = [np.mean(error[name]) for name in methods_extended]
mean_std = [np.std(error[name]) for name in methods_extended]
mean_r2 = [np.mean(r2value[name]) for name in methods_extended]

# Persist the summaries; rows follow the order of `methods_extended`.
np.savetxt('mean_error.csv', mean_error)
np.savetxt('mean_r2.csv', mean_r2)

## Identifying the best model
# Lowest mean RMSE ...
mim = np.argmin(mean_error)
print(methods_extended[mim])
# ... and highest mean R^2.
mi2 = np.argmax(mean_r2)
print(methods_extended[mi2])

## Plotting the Results

In [None]:
### Plot
# One PDF per (model variant, output) pair comparing the true and predicted
# test-set values. The legend carries that pair's RMSE and R^2.
os.makedirs('Results', exist_ok=True)  # savefig does not create directories

for met in methods_extended:
    predicted = np.asarray(prediction[met])
    actual = np.asarray(y_test)
    for odx, out in enumerate(output_columns):
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.plot(actual[:, odx], label='True value', color='blue')
        ax.plot(predicted[:, odx], color='red',
                label='Predicted value \n (RMSE = %.4f \n R2=%.3f)' % (error[met][odx], r2value[met][odx]))
        # The x-axis is the position of the sample within the test split.
        ax.set_xlabel("Test sample index", fontsize=12)
        ax.set_ylabel("Values", fontsize=12)
        # Set tick sizes on this axes directly so every figure, including the
        # first one, gets them.
        ax.tick_params(axis='both', labelsize=12)
        ax.set_title("%s (%s)" % (met, out), fontsize=14)
        ax.legend(fontsize=12)

        fig.savefig('Results/Prediction_%s_%s.pdf' % (met, out))
        # Close (not just clear) the figure so memory is released across the
        # many iterations of this loop.
        plt.close(fig)