Wild Blueberry Yield Prediction
Wild blueberry yield prediction using machine learning 

Kaggle Link: https://www.kaggle.com/datasets/saurabhshahane/wild-blueberry-yield-prediction

# 1. Imports

In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import cut_the_tails as ct
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor
#from sklearn.model_selection import LeaveOneOut,KFold
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data_sets\\Blueberry_Yield.csv')
target = 'yield'

In [3]:
features = ['clonesize',
 'honeybee',
 'bumbles',
 'andrena',
 'osmia',
 'MaxOfUpperTRange',
 'MinOfUpperTRange',
 'AverageOfUpperTRange',
 'MaxOfLowerTRange',
 'MinOfLowerTRange',
 'AverageOfLowerTRange',
 'RainingDays',
 'AverageRainingDays',
 'fruitset',
 'fruitmass',
 'seeds']

In [4]:
fig = px.histogram(df, x=target, nbins=40)
fig.show()

# Cauda Ótima

In [5]:
df.dropna(inplace=True)
df = df.reset_index(drop=True)

model = RandomForestRegressor(max_depth=5, random_state=0) 
classifier = RandomForestClassifier(max_depth=5, random_state=0)

x,fval = ct.get_cuts_direct_optimization(df, target, features, classifier, model, 'differential-evol')

print("Final Result: ")
print(x, fval)

[0.46423483 0.56426245] 0.023976639683510995
[0.18755375 0.85411748] 0.024816236611393924
[0.0232596  0.26511471] 0.025019421504090426
[0.55254781 0.71966577] 0.025026313825429008
[0.04742547 0.5774215 ] 0.025022404299849896
[0.68156131 0.72065884] 0.027226732142531802
[0.37117972 0.91096138] 0.024608887090122922
[0.16443209 0.16593334] 0.02516288984835982
[0.49754265 0.80290305] 0.024245202600507276
[0.52368994 0.74544238] 0.024784922405352357
[0.85553803 0.87732775] 0.02632228951079279
[0.07054934 0.17973807] 0.025896610545670187
[0.35634792 0.41564057] 0.025049355620416058
[0.39294027 0.93681906] 0.025100566549631793
[0.8952678 0.9098813] 0.026248085479005576
[0.03990903 0.98018519] 0.026159444659496815
[0.4951413 0.9807369] 0.024566661497737126
[0.21432489 0.53223374] 0.025477512389660156
[0.32727291 0.35386251] 0.024964437964538348
[0.68513795 0.76226069] 0.027361948631158654
[0.2519409  0.29924807] 0.025326046511280707
[0.02945682 0.42703708] 0.02485829846149951
[0.32456182 0.615

In [6]:
cdf = ct.split_by_quantile_class(df,target,[0.4,0.85])
cdf.drop('Row#', axis=1, inplace=True)

In [7]:
X = cdf[features].to_numpy()
y_tail = cdf['tail_class'].to_numpy()
y = cdf[target].to_numpy()

###Using split###
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train_aux, X_test_aux, y_train_tail, y_test_tail = train_test_split(X, y_tail, test_size=0.2, random_state=0)



In [8]:
#baseline = RandomForestRegressor(max_depth=5, random_state=0)
baseline = XGBRegressor(n_estimators=100, max_leaves=0, random_state = 0)
#baseline = DecisionTreeRegressor(random_state=0, max_depth=5) 

In [9]:
#tail_classifier = ct.fit_tail_classifier(X,y_tail,RandomForestClassifier(max_depth=5, random_state=0))
#tail_classifier = ct.fit_tail_classifier(X,y_tail,XGBClassifier(n_estimators=100, max_leaves=0, random_state = 0))
#tail_classifier = ct.fit_tail_classifier(X,y_tail,DecisionTreeClassifier(random_state=0, max_depth=5)) 


###Using split###
#tail_classifier = ct.fit_tail_classifier(X_train,y_train_tail,RandomForestClassifier(max_depth=5, random_state=0))
tail_classifier = ct.fit_tail_classifier(X_train,y_train_tail,XGBClassifier(n_estimators=100, max_leaves=0, random_state = 0))
#tail_classifier = ct.fit_tail_classifier(X_train,y_train_tail,DecisionTreeClassifier(random_state=0, max_depth=5)) 

In [10]:
#models = ct.fit_tail_models(X,y,y_tail,baseline)

###Using split###
models = ct.fit_tail_models(X_train,y_train,y_train_tail,baseline)

In [11]:
#baseline.fit(X,y)
#y_base = baseline.predict(X)

###Using split###
baseline.fit(X_train,y_train)
y_base = baseline.predict(X_test)

In [12]:
#y_tail = ct.batch_tail_predict(X,tail_classifier,models)

###Using split###
y_tail = ct.batch_tail_predict(X_test,tail_classifier,models)

In [13]:
#print(mean_absolute_error(y_base,y))
#print(mean_absolute_error(y_tail,y))
#print(mean_absolute_error(y,y))

###Using split###
print(mean_absolute_error(y_base,y_test))
print(mean_absolute_error(y_tail,y_test))
print(mean_absolute_error(y_test,y_test))

124.54776199338939
115.15781155779244
0.0


In [14]:
#print(mean_absolute_percentage_error(y_base,y))
#print(mean_absolute_percentage_error(y_tail,y))
#print(mean_absolute_percentage_error(y,y))

###Using split###
print(mean_absolute_percentage_error(y_base,y_test))
print(mean_absolute_percentage_error(y_tail,y_test))
print(mean_absolute_percentage_error(y_test,y_test))

0.02329216119407331
0.021082707782797624
0.0
