In [1]:
!pip install mlxtend --user
!pip install lightgbm --user
!pip install xgboost --user

Collecting mlxtend
  Downloading mlxtend-0.22.0-py2.py3-none-any.whl (1.4 MB)
     |████████████████████████████████| 1.4 MB 25.9 MB/s            
Collecting scikit-learn>=1.0.2
  Downloading scikit_learn-1.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
     |████████████████████████████████| 9.8 MB 44.3 MB/s            
Collecting joblib>=0.13.2
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     |████████████████████████████████| 297 kB 59.0 MB/s            
Installing collected packages: joblib, scikit-learn, mlxtend
Successfully installed joblib-1.2.0 mlxtend-0.22.0 scikit-learn-1.2.2
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m
Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
     |████████████████████████████████| 2.0 MB 28.1 MB/s            
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5
You should consider upgrading via t

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import scipy as sp
import math
import random
import seaborn as sn
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Importing the Datasets

## Model Performance Metrics

In [2]:
def rmse(y, y_pred):
    return np.sqrt(np.mean((y - y_pred)**2))

def mae(y, y_pred):
    return np.mean(np.absolute(np.subtract(y, y_pred)))

def r2(y, y_pred):
    return 1 - np.sum((y - y_pred)**2)/np.sum((y - np.mean(y))**2)

def mape(y, y_pred):
    return (np.mean(np.divide(np.absolute(np.subtract(y,y_pred)), y))*100)

def errors(y, y_pred):
    return np.absolute(y - y_pred)

In [8]:
#dataset generated from wet_yield_feature_outlier_filtering_and_transformation_prediction
#Winter, subsetted by features to be more normal
df_winter = pd.read_csv('/mnt/Wet_yield_prediction/Organized_Work/Datasets/prescription_dataset.csv').drop(columns = ['Unnamed: 0'])
print(display(df_winter))

Unnamed: 0,Crop_Protection_Application_Doses,Soil_Organic_Matter,Yield_kg_per_Ha,Total_N,Total_P,Total_K
0,14.0,2.500000,3049.333333,205.5,48.0,48.0
1,13.0,2.500000,3116.200000,198.0,60.0,60.0
2,9.0,2.500000,7678.888889,202.4,95.2,96.0
3,9.0,2.500000,7756.363636,179.4,95.2,90.0
4,10.0,2.500000,8235.200000,179.4,95.2,102.0
...,...,...,...,...,...,...
368,19.0,3.333333,5610.500000,208.1,98.8,90.0
369,21.0,3.258333,5610.416667,208.1,98.8,90.0
370,22.0,2.893750,5610.540541,210.4,98.8,90.0
371,21.0,3.181250,8378.648649,170.3,98.8,90.0


None


## Using a Class

In [4]:
class Predictor:
    
    def __init__(self, predictor):
        self.predictor = predictor

    def predict(self, X):
        y_preds = self.predictor.predict(X).reshape(-1,1)
        return y_preds

In [5]:
def build_model(dataset, outlier_removal_lower_percentile, seed):
    
    dataset = dataset[(dataset['Yield_kg_per_Ha'] > dataset['Yield_kg_per_Ha'].quantile(outlier_removal_lower_percentile)) \
                                                                   & (dataset['Yield_kg_per_Ha'] < dataset['Yield_kg_per_Ha'].quantile(1-outlier_removal_lower_percentile))].reset_index(drop = True)

    train_Yield_kg_per_ha, test_Yield_kg_per_ha = train_test_split(dataset, test_size = 0.18, random_state = seed)

    X_train_Yield_kg_per_ha = train_Yield_kg_per_ha.drop(columns = ['Yield_kg_per_Ha'])
    X_test_Yield_kg_per_ha = test_Yield_kg_per_ha.drop(columns = ['Yield_kg_per_Ha'])

    y_train_Yield_kg_per_ha = np.array(train_Yield_kg_per_ha['Yield_kg_per_Ha']).reshape(-1, 1)
    y_test_Yield_kg_per_ha = np.array(test_Yield_kg_per_ha['Yield_kg_per_Ha']).reshape(-1, 1)
    mean_Yield_kg_per_ha = np.mean(y_train_Yield_kg_per_ha)

    xgboost = XGBRegressor(learning_rate = 0.005, n_estimators = 10000, max_depth = 3, min_child_weight = 0, gamma = 0, subsample = 0.7, \
                                              colsample_bytree = 0.7, objective = 'reg:linear', nthread = -1, scale_pos_weight = 1, seed = 27, reg_alpha = 0.00006)

    predictor = Predictor(xgboost)
    predictor.predictor = predictor.predictor.fit(X_train_Yield_kg_per_ha, y_train_Yield_kg_per_ha)
    
    return predictor

In [6]:
def eval_model(df_winter_combined_parents_final_test_set, model_build_dict):
    
    y_preds = np.zeros((len(df_winter_combined_parents_final_test_set), len(model_build_dict)))
    
    for i in range(len(model_build_dict)):

        X_df_winter_combined_parents_final_test_set = df_winter_combined_parents_final_test_set.drop(columns = ['Yield_kg_per_Ha'])
        y_df_winter_combined_parents_final_test_set = np.array(df_winter_combined_parents_final_test_set['Yield_kg_per_Ha']).reshape(-1,1)
        y_preds[:,i] = model_build_dict.get(i).predict(X_df_winter_combined_parents_final_test_set)[:,0]
        
    final_preds = np.mean(y_preds, axis = 1).reshape(-1,1)
    mean_abs_error = mean_absolute_error(y_df_winter_combined_parents_final_test_set, final_preds)
    root_mean_sq_error = np.sqrt(mean_squared_error(y_df_winter_combined_parents_final_test_set, final_preds))
    r_sq = r2_score(y_df_winter_combined_parents_final_test_set, final_preds)
    
    return mean_abs_error, root_mean_sq_error, r_sq

In [7]:
df_winter_combined_parents = pd.read_csv('/mnt/Wet_yield_prediction/Organized_Work/Datasets/prescription_dataset.csv').drop(columns = ['Unnamed: 0'])

num_seeds_list = [5, 5, 5, 5, 5]

random_state_list = random.sample(range(1000), len(num_seeds_list))

results_collection_df = pd.DataFrame(columns = ['num_models', 'mae', 'rmse', 'r_squared'])

for i,num_seeds in enumerate(num_seeds_list):
    
    df_winter_combined_parents_temp = df_winter_combined_parents.sample(frac = 1, random_state = random_state_list[i]).reset_index(drop = True)
    df_winter_combined_parents_final_test_set = df_winter_combined_parents_temp[:57]
    df_winter_combined_parents_temp = df_winter_combined_parents_temp[57:].reset_index(drop = True)
    
    seed_list = random.sample(range(1000), num_seeds)
    
    model_build_dict = {}
    
    for i,seed in enumerate(seed_list):
        model_build_dict.update({i: build_model(df_winter_combined_parents_temp, 0, seed)})
    
    eval_model_temp = eval_model(df_winter_combined_parents_final_test_set, model_build_dict)
    row = pd.DataFrame({'num_models': [num_seeds], 'mae': [eval_model_temp[0]], 'rmse': [eval_model_temp[1]], 'r_squared': [eval_model_temp[2]]})
    
    results_collection_df = pd.concat([results_collection_df, row], axis = 0)

print(display(results_collection_df))



Unnamed: 0,num_models,mae,rmse,r_squared
0,5,1576.850142,2002.600433,0.343317
0,5,1468.438606,1857.10779,0.138074
0,5,1646.917453,1988.279595,0.085779
0,5,1385.303114,1759.498856,0.144239
0,5,1691.884587,2102.933505,0.077304


None
