In [1]:
import os
import time

# NOTE(review): machine-specific absolute path. The relative paths used later
# in this notebook (the CSV inputs and the models folder) are resolved
# against this working directory, so it must exist before anything else runs.
os.chdir(r"F:\planetseed")

import pickle
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import random as python_random
from numpy.polynomial.polynomial import polyfit

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.inspection import permutation_importance
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [2]:
# Global matplotlib text settings: Arial, normal weight, size 7
font = dict(family='Arial', weight='normal', size=7)
matplotlib.rc('font', **font)

# Preprocessing

In [3]:
# Name of the target column to model. Names noted by the author:
# Protein, Oil, Sucrose, Fiber, Starch, Ash, Comp_Carb, Simp_Carb
y_var = 'Simp_Carb'

# Free-text label describing this run (model type followed by the target)
remarks = 'SVR ' + y_var

In [4]:
def _read_indexed_csv(path):
    """Read one CSV file, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


# "VI" feature tables, one per site/year file
H1G_2017_VI = _read_indexed_csv(r".\data\csv_V3\2017_H1G_VI.csv")
H1G_2020_VI = _read_indexed_csv(r".\data\csv_V3\2020_H1G_VI.csv")
H1G_2021_VI = _read_indexed_csv(r".\data\csv_V3\2021_H1G_VI.csv")
L2_2021_VI = _read_indexed_csv(r".\data\csv_V3\2021_L2_VI.csv")

# "TX" feature tables, one per site/year file
H1G_2017_TX = _read_indexed_csv(r".\data\csv_V3\2017_H1G_TX.csv")
H1G_2020_TX = _read_indexed_csv(r".\data\csv_V3\2020_H1G_TX.csv")
H1G_2021_TX = _read_indexed_csv(r".\data\csv_V3\2021_H1G_TX.csv")
L2_2021_TX = _read_indexed_csv(r".\data\csv_V3\2021_L2_TX.csv")

# Target tables: only the column named by `y_var` is kept from each file
H1G_2017_y = _read_indexed_csv(r".\data\csv_V3\2017_H1G_Target.csv")[y_var]
H1G_2020_y = _read_indexed_csv(r".\data\csv_V3\2020_H1G_Target.csv")[y_var]
H1G_2021_y = _read_indexed_csv(r".\data\csv_V3\2021_H1G_Target.csv")[y_var]
L2_2021_y = _read_indexed_csv(r".\data\csv_V3\2021_L2_Target.csv")[y_var]

In [5]:
# Stack the four per-file tables row-wise, one combined object per data type
VI_data = pd.concat([H1G_2017_VI, H1G_2020_VI, H1G_2021_VI, L2_2021_VI], axis=0)
TX_data = pd.concat([H1G_2017_TX, H1G_2020_TX, H1G_2021_TX, L2_2021_TX], axis=0)
y_data = pd.concat([H1G_2017_y, H1G_2020_y, H1G_2021_y, L2_2021_y], axis=0).to_frame()

# Join column-wise on the row index: target first, then VI, then TX columns
all_data = pd.concat([y_data, VI_data, TX_data], axis=1)

In [6]:
# Discard every row that has a missing value in any column
all_data = all_data.dropna(axis=0, how='any')

# Column 0 is the target; all remaining columns are used as predictors.
# NOTE: with this slice VI_data holds the TX columns as well as the VI ones.
# An earlier variant split them at column 295 (VI = 1:295, TX = 295:).
y_data = all_data.iloc[:, 0]
VI_data = all_data.iloc[:, 1:]

In [7]:
def feature_importance(data_X, data_y, n_featuers, test_size=0.25,
                       split_random_state=35, n_repeats=500,
                       perm_random_state=42, n_jobs=-1):
    """Select the top features by permutation importance of a linear model.

    The data are split into train/test partitions and only the training
    partition is used: a MinMaxScaler + LinearRegression pipeline is fitted on
    it, and permutation importance is then computed on the same training rows.
    Features are ranked by mean importance, highest first.

    Parameters
    ----------
    data_X : pandas.DataFrame
        Feature table; its column labels identify the features.
    data_y : array-like
        Target values, one per row of ``data_X``.
    n_featuers : int
        Number of top-ranked features to keep (name kept as-is for
        backward compatibility). Values larger than the number of columns
        keep every column; 0 yields a frame with no columns.
    test_size : float, default 0.25
        Fraction of rows held out (and ignored) when ranking features.
    split_random_state : int, default 35
        Seed of the train/test split.
    n_repeats : int, default 500
        Number of permutations per feature.
    perm_random_state : int, default 42
        Seed of the permutation procedure.
    n_jobs : int, default -1
        Parallel jobs passed to ``permutation_importance``.

    Returns
    -------
    pandas.DataFrame
        All rows of ``data_X`` restricted to the selected columns, ordered
        from most to least important.

    Raises
    ------
    ValueError
        If ``n_featuers`` is negative. A negative count would otherwise be
        interpreted as a slice end and silently drop the least important
        columns instead of selecting the top ones.
    """
    if n_featuers < 0:
        raise ValueError(f"n_featuers must be non-negative, got {n_featuers}")

    # Train test split; the held-out rows are deliberately not used here
    X_train, _, y_train, _ = train_test_split(
        data_X, data_y, test_size=test_size, random_state=split_random_state)

    # Define and fit a scaled linear regression model
    perm_model = Pipeline([('scaler', MinMaxScaler()), ('model', LinearRegression())])
    perm_model.fit(X_train, y_train)

    # Permutation feature importance, evaluated on the training rows
    result = permutation_importance(perm_model, X_train, y_train,
                                    n_repeats=n_repeats,
                                    random_state=perm_random_state,
                                    n_jobs=n_jobs)

    # Mean importance per feature, labelled with the original column names
    imp_feats = pd.DataFrame(result.importances_mean,
                             index=data_X.columns,
                             columns=['Importance'])

    # Sort the dataframe so the important features are on the top
    imp_feats = imp_feats.sort_values(by=['Importance'], ascending=False)

    # Take the first `n_featuers` features
    top_feats = imp_feats.iloc[:n_featuers, :]

    # Only take the important features and return the whole data
    return data_X[top_feats.index]

In [8]:
# Keep the 30 highest-ranked predictor columns
n_top_features = 30
VI_data_X_imp = feature_importance(VI_data, y_data, n_top_features)

In [9]:
# Split features and target in ONE call so both are partitioned with exactly
# the same row indices (and a length mismatch raises instead of silently
# misaligning X and y). test_size and random_state match the split used for
# feature ranking, so the ranking only ever saw these training rows.
train_X_VI, test_X_VI, train_y, test_y = train_test_split(
    VI_data_X_imp, y_data, test_size=0.25, random_state=35)

In [10]:
# Sanity check: print the shape of each partition, one per line
for partition in (train_X_VI, test_X_VI, train_y, test_y):
    print(partition.shape)

(319, 30)
(107, 30)
(319,)
(107,)


# Modeling

In [11]:
%%time

#SVR

# Candidate hyperparameter values: 2**-10, 2**-9, ..., 2**6
pow2_candidates = [2**exponent for exponent in np.arange(-10, 7, 1, dtype='float')]

# Pipeline: min-max scaling followed by a support vector regressor
pipe = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('model', SVR()),
])

# Search space: the same candidate values for both gamma and C
param = {
    'model__gamma': list(pow2_candidates),
    'model__C': list(pow2_candidates),
}

# Exhaustive 5-fold cross-validated search over the grid, 4 parallel jobs
grid = GridSearchCV(estimator=pipe, param_grid=param, cv=5, n_jobs=4)
grid.fit(train_X_VI, train_y)

Wall time: 4.81 s


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        MinMaxScaler(copy=True,
                                                     feature_range=(0, 1))),
                                       ('model',
                                        SVR(C=1.0, cache_size=200, coef0=0.0,
                                            degree=3, epsilon=0.1,
                                            gamma='scale', kernel='rbf',
                                            max_iter=-1, shrinking=True,
                                            tol=0.001, verbose=False))],
                                verbose=False),
             iid='deprecated', n_jobs=4,
             param_grid={'model__C': [0.0009765625, 0.001953125, 0.00390625,
                                      0.0078125, 0.015625, 0.03125, 0.0625,
                                      0.125, 0.25, 0.5, 1.0, 2.0, 4.0, 8.

In [12]:
# Run identifier: current Unix time truncated to whole seconds
model_id = str(int(time.time()))

# Each run gets its own sub-folder under .\models
model_dir = os.path.join(r".\models", model_id)
os.makedirs(model_dir, exist_ok=True)
model_file_name = os.path.join(model_dir, f"model_{model_id}.pkl")

# Save model; the context manager guarantees the file handle is flushed and
# closed even if pickling raises
with open(model_file_name, 'wb') as model_file:
    pickle.dump(grid, model_file)

In [13]:
# Predictions of the fitted search object for both partitions, flattened to 1-D
train_y_pred = np.array(grid.predict(train_X_VI)).reshape(-1)
test_y_pred = np.array(grid.predict(test_X_VI)).reshape(-1)

# Measured targets as plain NumPy arrays
train_y = np.array(train_y)
test_y = np.array(test_y)

# Two-column table (measured, predicted) for the test partition,
# written into this run's model folder
measured_vs_predicted = np.column_stack((test_y, test_y_pred))
y_summary = pd.DataFrame(measured_vs_predicted, columns=['Measured', 'Predicted'])
summary_path = os.path.join(model_dir, 'y_summary.csv')
y_summary.to_csv(summary_path)

In [14]:
# Error metrics computed from measured and predicted values
def reg_model_metrics(actual, pred):
    """Return ``(RMSE, RRMSE, R2)`` for a set of predictions.

    RMSE is the root of the mean squared error, RRMSE is the RMSE expressed
    as a percentage of the mean of ``actual``, and R2 is the squared Pearson
    correlation between ``actual`` and ``pred`` (not sklearn's ``r2_score``).
    """
    rmse = np.sqrt(mean_squared_error(actual, pred))
    rrmse = 100 * (rmse / np.mean(actual))
    pearson_r = np.corrcoef(actual, pred)[0, 1]
    return rmse, rrmse, np.square(pearson_r)

In [15]:
# Score the predictions on each partition, then unpack (RMSE, RRMSE, R2)
train_metrics = reg_model_metrics(train_y, train_y_pred)
test_metrics = reg_model_metrics(test_y, test_y_pred)

RMSE_train, RRMSE_train, R2_train = train_metrics
RMSE_test, RRMSE_test, R2_test = test_metrics

In [16]:
# Append this run to the shared results table

# Load the existing table
results_path = r".\models\_results.csv"
result_vault = pd.read_csv(results_path)

# One-row frame for this run, reusing the column labels of the existing table
new_row = [model_id, y_var, "NA", "NA",
           R2_test, R2_train, RRMSE_test, RRMSE_train, remarks]
result_vault_updt = pd.DataFrame(data=[new_row], columns=result_vault.columns)

# Add the new row after the existing ones
result_vault_new = pd.concat([result_vault, result_vault_updt])

# Overwrite the file with the extended table (row index is not written)
result_vault_new.to_csv(results_path, index=False)