### Initialise

In [1]:
## Import necessary packages
import pandas as pd; 
import numpy as np; 

from sklearn.model_selection import train_test_split;
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
import xgboost as xg

import python.data as datameta

ModuleNotFoundError: No module named 'python'

In [72]:
def rd(a): return round(a,4)
def mse(a,b): return rd(mean_squared_error(a,b)); 
def rmse(a,b): return rd(mse(a,b)**0.5); 
def mae(a,b): return rd(mean_absolute_error(a,b)); 
def r2(a,b): return rd(r2_score(a,b)**0.5); 
def isClose(base, known, tol=0.001): return np.abs((base - known) / base) <= tol

def row(name,ytest, yhat, accuracy):
    accuracies = list(map(
        lambda x: str(round(x*100, 2))+"%"
        ,accuracy)) 
    return [
        name, 
        r2(ytest, yhat),mae(ytest, yhat),mse(ytest, yhat), 
        accuracies[0], accuracies[1], accuracies[2]
    ]

### Preprocess Data

In [65]:
data = datameta.processData('../data/clean_data.csv'); 
data.head()

Index(['Time (h)', 'Aeration rate(Fg:L/h)', 'Sugar feed rate(Fs:L/h)',
       'Acid flow rate(Fa:L/h)', 'Base flow rate(Fb:L/h)',
       'Air head pressure(pressure:bar)', 'Substrate concentration(S:g/L)',
       'Penicillin concentration(P:g/L)', 'Vessel Volume(V:L)', 'pH(pH:pH)',
       'Temperature(T:K)', 'PAA flow(Fpaa:PAA flow (L/h))',
       'Oil flow(Foil:L/hr)', 'Ammonia shots(NH3_shots:kgs)', 'Water Flow'],
      dtype='object') 

 (113935, 15)


Unnamed: 0,Time (h),Aeration rate(Fg:L/h),Sugar feed rate(Fs:L/h),Acid flow rate(Fa:L/h),Base flow rate(Fb:L/h),Air head pressure(pressure:bar),Substrate concentration(S:g/L),Penicillin concentration(P:g/L),Vessel Volume(V:L),pH(pH:pH),Temperature(T:K),PAA flow(Fpaa:PAA flow (L/h)),Oil flow(Foil:L/hr),Ammonia shots(NH3_shots:kgs),Water Flow
0,0.0,0.181818,0.040541,0.0,0.133858,0.0,0.008306,1.791808e-27,0.049276,0.767126,0.258427,0.333333,0.0,0.0,0.00447
1,0.00069,0.181818,0.040541,0.0,0.227649,0.0,0.008715,2.763729e-05,0.04948,0.800686,0.249064,0.333333,0.0,0.0,0.008252
2,0.00138,0.181818,0.040541,0.0,0.241342,0.0,0.009107,2.761905e-05,0.049685,0.836653,0.243446,0.333333,0.0,0.0,0.004363
3,0.00207,0.181818,0.040541,0.0,0.168071,0.0,0.009492,2.760247e-05,0.049787,0.860582,0.237828,0.333333,0.0,0.0,0.001972
4,0.002761,0.181818,0.040541,0.039866,0.084036,0.0,0.009864,2.758782e-05,0.049838,0.865835,0.234082,0.333333,0.0,0.0,0.000502


In [66]:
# Split into x and y
xkeys = [
    "Time (h)", "Aeration rate(Fg:L/h)", "Sugar feed rate(Fs:L/h)","Acid flow rate(Fa:L/h)",
    "Base flow rate(Fb:L/h)","Water Flow","Substrate concentration(S:g/L)","PAA flow(Fpaa:PAA flow (L/h))",
    "Oil flow(Foil:L/hr)", "Ammonia shots(NH3_shots:kgs)",
    # Converted variables
    "Air head pressure(pressure:bar)", "Temperature(T:K)", "pH(pH:pH)", "Vessel Volume(V:L)"
]
(x,y) = datameta.xy_split(data,xkeys)
xtrain, xtest, ytrain, ytest=train_test_split(x, y, test_size=0.1)
ytest = ytest.to_numpy().flatten(); 

x:  14  keys in  14 cols
y:  1  keys in  1 cols


In [67]:
models = [
    LinearRegression(), 
    DecisionTreeRegressor(), 
    SGDRegressor(loss='squared_error'),
    # 
    GradientBoostingRegressor(learning_rate=0.25,max_depth=7,n_estimators=80), 
    xg.XGBRFRegressor(max_depth=4,n_estimators=85,reg_lambda=0.18),
    RandomForestRegressor(max_depth=7,n_estimators=65)
]

In [68]:
for index,model in enumerate(models):
    name = model.__class__.__name__.replace("Regressor", ""); 
    if index>=3:
        model = TransformedTargetRegressor(regressor=model, transformer=QuantileTransformer(output_distribution='normal'))

    model.fit(xtrain, ytrain.values.ravel()); 
    yhat = model.predict(xtest)

    accuracy001 = np.sum(isClose(yhat, ytest, 0.001))/len(ytest)
    accuracy01 = np.sum(isClose(yhat, ytest, 0.01))/len(ytest)
    accuracy1 = np.sum(isClose(yhat, ytest, 0.1))/len(ytest)
    accuracy = [accuracy001, accuracy01, accuracy1]
    models[index] = row(name, ytest, model.predict(xtest), accuracy)

In [71]:
models = pd.DataFrame(models, columns=["","R2","MAE", "MSE","Acc@0.1%", "Acc@1%", "Acc@10%"]); 
models

Unnamed: 0,Unnamed: 1,R2,MAE,MSE,Acc@0.1%,Acc@1%,Acc@10%
0,LinearRegression,0.9862,0.0329,0.0021,58.86%,6.5%,0.66%
1,DecisionTree,0.9968,0.0082,0.0005,92.11%,69.13%,19.01%
2,SGD,0.985,0.0348,0.0022,54.83%,6.24%,0.69%
3,GradientBoosting,0.996,0.0163,0.0006,88.49%,25.26%,7.04%
4,XGBRF,0.9816,0.0355,0.0027,63.84%,11.64%,1.29%
5,RandomForest,0.9911,0.0248,0.0013,79.84%,19.4%,3.77%
