# Import Packages

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, train_test_split

# Load Data

In [None]:
# Read the training set and discard the "id" column so it is never used as a feature.
blueberry_df = pd.read_csv("train.csv").drop(columns=["id"])

# "yield" is the column to predict; every other remaining column is a predictor.
label = "yield"
features = blueberry_df.columns.tolist()
features.remove(label)

# Preview the first rows as a quick sanity check of the loaded frame.
blueberry_df.head()

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,25.0,0.5,0.25,0.75,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.425011,0.417545,32.460887,4476.81146
1,25.0,0.5,0.25,0.5,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.444908,0.422051,33.858317,5548.12201
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.552927,0.470853,38.341781,6869.7776
3,12.5,0.25,0.25,0.63,0.5,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.565976,0.478137,39.467561,6880.7759
4,25.0,0.5,0.25,0.63,0.63,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.579677,0.494165,40.484512,7479.93417


# Model Generation

In [None]:
# Split the frame into the predictor matrix (X) and the target series (y).
X, y = blueberry_df[features], blueberry_df[label]

In [None]:
# Hold out a fraction of the rows for the final evaluation; the fixed seed
# makes the split reproducible across runs.
test_size = 0.2
seed = 25
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

In [None]:
# Tune Ridge's regularisation strength over 11 log-spaced alphas, from 1e-5
# through 1e5 (each step multiplies alpha by 10). Candidates are compared by
# 5-fold cross-validated negated MAE, so a score closer to zero is better.
param_grid = {"alpha": np.logspace(-5, 5, num=11)}

ridge = Ridge()
grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the winning alpha, the corresponding best estimator, and the full
# cross-validation table for inspection.
best_model = grid_search.best_estimator_
best_alpha = grid_search.best_params_["alpha"]
grid_search_results = pd.DataFrame(grid_search.cv_results_)

print("Best Alpha:", best_alpha)
grid_search_results[
    [
        "param_alpha",
        "mean_test_score",
        "std_test_score",
        "rank_test_score",
    ]
]

Best Alpha: 0.1


Unnamed: 0,param_alpha,mean_test_score,std_test_score,rank_test_score
0,1e-05,-374.745155,7.734414,5
1,0.0001,-374.745002,7.734283,4
2,0.001,-374.743479,7.732973,3
3,0.01,-374.729939,7.720049,2
4,0.1,-374.72217,7.608128,1
5,1.0,-378.124874,6.918821,6
6,10.0,-400.011216,5.681669,7
7,100.0,-424.416364,5.909648,8
8,1000.0,-431.279908,6.282823,9
9,10000.0,-444.420732,5.792642,10


In [None]:
# Evaluate the tuned model on the held-out test split and gather the
# regression metrics into a single-row summary table.
y_pred = best_model.predict(X_test)

# RMSE is the square root of the MSE, so compute the MSE once and reuse it.
mse = metrics.mean_squared_error(y_test, y_pred)

metrics_df = pd.DataFrame(
    [
        {
            "MAE": metrics.mean_absolute_error(y_test, y_pred),
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "MAPE": metrics.mean_absolute_percentage_error(y_test, y_pred),
            "R2 Score": metrics.r2_score(y_test, y_pred),
        }
    ]
)
metrics_df

Unnamed: 0,MAE,MSE,RMSE,MAPE,R2 Score
0,362.028708,312566.392355,559.076374,0.063777,0.823239
