### Initialise

In [120]:
## Import necessary packages
import matplotlib.pyplot as plt; 
import pandas as pd; 
import numpy as np; 

from sklearn.model_selection import train_test_split,cross_val_score;
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
import xgboost as xg

import python.data as datameta

def rd(a):
    return round(a,4)
def mse(a,b):
    err = rd(mean_squared_error(a,b)); 
    return err; 
def rmse(a,b):
    err = rd(mse(a,b)**0.5); 
    return err; 
def mae(a,b):
    err = rd(mean_absolute_error(a,b)); 
    return err; 
def r2(a,b):
    err = rd(r2_score(a,b)**0.5); 
    return err; 

### Preprocess Data

In [121]:
data = datameta.processData('data/clean_data.csv'); 
data.head()

Index(['Time (h)', 'Aeration rate(Fg:L/h)', 'Sugar feed rate(Fs:L/h)',
       'Acid flow rate(Fa:L/h)', 'Base flow rate(Fb:L/h)',
       'Air head pressure(pressure:bar)', 'Substrate concentration(S:g/L)',
       'Penicillin concentration(P:g/L)', 'Vessel Volume(V:L)', 'pH(pH:pH)',
       'Temperature(T:K)', 'PAA flow(Fpaa:PAA flow (L/h))',
       'Oil flow(Foil:L/hr)', 'Ammonia shots(NH3_shots:kgs)', 'Water Flow'],
      dtype='object') 

 (113935, 15)


Unnamed: 0,Time (h),Aeration rate(Fg:L/h),Sugar feed rate(Fs:L/h),Acid flow rate(Fa:L/h),Base flow rate(Fb:L/h),Air head pressure(pressure:bar),Substrate concentration(S:g/L),Penicillin concentration(P:g/L),Vessel Volume(V:L),pH(pH:pH),Temperature(T:K),PAA flow(Fpaa:PAA flow (L/h)),Oil flow(Foil:L/hr),Ammonia shots(NH3_shots:kgs),Water Flow
0,0.0,0.181818,0.040541,0.0,0.133858,0.0,0.008306,1.791808e-27,0.049276,0.767126,0.258427,0.333333,0.0,0.0,0.00447
1,0.00069,0.181818,0.040541,0.0,0.227649,0.0,0.008715,2.763729e-05,0.04948,0.800686,0.249064,0.333333,0.0,0.0,0.008252
2,0.00138,0.181818,0.040541,0.0,0.241342,0.0,0.009107,2.761905e-05,0.049685,0.836653,0.243446,0.333333,0.0,0.0,0.004363
3,0.00207,0.181818,0.040541,0.0,0.168071,0.0,0.009492,2.760247e-05,0.049787,0.860582,0.237828,0.333333,0.0,0.0,0.001972
4,0.002761,0.181818,0.040541,0.039866,0.084036,0.0,0.009864,2.758782e-05,0.049838,0.865835,0.234082,0.333333,0.0,0.0,0.000502


In [122]:
# Split into x and y
xkeys = [
    "Time (h)", "Aeration rate(Fg:L/h)", "Sugar feed rate(Fs:L/h)","Acid flow rate(Fa:L/h)",
    "Base flow rate(Fb:L/h)","Water Flow","Substrate concentration(S:g/L)","PAA flow(Fpaa:PAA flow (L/h))",
    "Oil flow(Foil:L/hr)", "Ammonia shots(NH3_shots:kgs)",
    # Converted variables
    "Air head pressure(pressure:bar)", "Temperature(T:K)", "pH(pH:pH)", "Vessel Volume(V:L)"
]
(x,y) = datameta.xy_split(data,xkeys)
xtrain, xtest, ytrain, ytest=train_test_split(x, y, test_size=0.1)
ytest = ytest.to_numpy().flatten(); 

x:  14  keys in  14 cols
y:  1  keys in  1 cols


In [123]:
# TESTING OUT ALL TYPES
def row(name,ytest, yhat):
    return [name, r2(ytest, yhat),mae(ytest, yhat),mse(ytest, yhat)]

# BASIC MODELS
# models = [
#     LinearRegression(), 
#     DecisionTreeRegressor(), 
#     SGDRegressor(loss='squared_error')
# ]

# for index,model in enumerate(models):
#     model.fit(xtrain, ytrain.values.ravel()); 
#     models[index] = row(model.__class__.__name__, ytest, model.predict(xtest))

# models = pd.DataFrame(models, columns=["","R2","MAE", "MSE"]); 
# models.style.highlight_max(color='#4f4').highlight_min(color='#f44'); 
# models

In [125]:
models=[
    GradientBoostingRegressor(), 
    xg.XGBRFRegressor(),
    RandomForestRegressor()
]

for index,model in enumerate(models):
    mod = TransformedTargetRegressor(regressor=model, transformer=QuantileTransformer(output_distribution='normal'))
    mod.fit(xtrain, ytrain.values.ravel()); 
    yhat = mod.predict(xtest)

    models[index] = row(model.__class__.__name__.replace("Regressor", ""), ytest, mod.predict(xtest))

models = pd.DataFrame(models, columns=["","R2","MAE", "MSE"]); 
models.style.highlight_max(color='#4f4').highlight_min(color='#f44'); 
models

Unnamed: 0,Unnamed: 1,R2,MAE,MSE
0,GradientBoosting,0.9924,0.0235,0.0011
1,XGBRF,0.9894,0.0269,0.0016
2,RandomForest,0.9987,0.0076,0.0002


In [134]:
# Tuning
from sklearn.model_selection import GridSearchCV

#evaluate metrics
r2(ytest, yhat), mae(ytest, yhat), rmse(ytest, yhat)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5] END ......max_depth=2, n_estimators=40;, score=0.746 total time=   4.2s
[CV 2/5] END ......max_depth=2, n_estimators=40;, score=0.746 total time=   4.2s
[CV 4/5] END ......max_depth=2, n_estimators=40;, score=0.749 total time=   3.5s
[CV 3/5] END ......max_depth=2, n_estimators=40;, score=0.753 total time=   3.8s
[CV 5/5] END ......max_depth=2, n_estimators=40;, score=0.755 total time=   3.8s
[CV 3/5] END ......max_depth=2, n_estimators=45;, score=0.753 total time=   4.2s
[CV 2/5] END ......max_depth=2, n_estimators=45;, score=0.747 total time=   4.3s
[CV 1/5] END ......max_depth=2, n_estimators=45;, score=0.746 total time=   4.3s
[CV 4/5] END ......max_depth=2, n_estimators=45;, score=0.748 total time=   3.7s
[CV 5/5] END ......max_depth=2, n_estimators=45;, score=0.755 total time=   3.7s
[CV 1/5] END ......max_depth=2, n_estimators=50;, score=0.746 total time=   3.9s
[CV 2/5] END ......max_depth=2, n_estimators=50

(0.9908, 0.0252, 0.0374)

In [None]:
# dt = DecisionTreeRegressor(); 
# dt.fit(xtrain, ytrain); 

# ypred = dt.predict(xtest); 
# print("TEST:\n",ytest,"\nPRED:\n", ypred)

In [None]:
# def isClose(base, known):
#     return np.abs((base - known) / base) <= 0.001

# accuracy = np.sum(isClose(ypred, ytest))/len(ytest)
# print(rd(accuracy*100), "%")

In [None]:

# xax = range(len(ytest))
# plt.plot(xax, ytest, linewidth=1, label="original")
# plt.plot(xax, ypred, linewidth=1, label="predicted")
# plt.legend(loc='best')
# plt.show() 
