In [None]:
# Clone the course repository into the current working directory;
# the next cell changes into its Lab03 sub-folder.
!git clone https://github.com/referreira-wisc/digag2022.git

In [None]:
import os
from pathlib import Path

# Work from inside the lab folder: later cells open files by relative path.
# The guard makes the cell safe to re-run in a live kernel -- once the working
# directory is already .../digag2022/Lab03, repeating the relative chdir would
# raise FileNotFoundError. A missing folder (e.g. failed clone) still raises.
lab_dir = Path('digag2022') / 'Lab03'
if Path.cwd().parts[-2:] != lab_dir.parts:
    os.chdir(lab_dir)

# <center> Advanced Digital Agriculture (DS/AS 875) </center> <center> Module 01 - Data Analysis (Lab 03) </center>
***
# Table of Contents
* [READING THE DATASET INTO PYTHON](#READING-THE-DATASET-INTO-PYTHON)
* [DATA EDITING](#DATA-EDITING)
    * [Creating dummy variables (One-Hot Encode)](#Creating-dummy-variables-(One-Hot-Encode))
    * [Creating training and testing datasets](#Creating-training-and-testing-datasets)
* [PARTIAL LEAST SQUARE](#PARTIAL-LEAST-SQUARE)
    * [Training (GridSearch)](#Training-(GridSearch))
    * [Testing](#Testing)
* [RIDGE REGRESSION](#RIDGE-REGRESSION)
    * [Training (GridSearch)](#Training-(GridSearch))
    * [Testing](#Testing)
* [NEURAL NETWORKS](#NEURAL-NETWORKS)
    * [Data pre-processing](#Data-pre-processing)
    * [Training (GridSearch)](#Training-(GridSearch))
    * [Testing](#Testing)
***

# READING THE DATASET INTO PYTHON

In [None]:
import warnings

import pandas as pd

# Silence all warnings for the rest of the session to keep cell output short
warnings.simplefilter('ignore')

# Load the data set (path is relative to the current working directory)
DATA_FILE = "CullDairyCow_Data.csv"
cowdata = pd.read_csv(DATA_FILE)
cowdata

# DATA EDITING

In [None]:
# Frequency of each lactation number, most frequent first
# (value_counts sorts in descending order by default)
cowdata.lact.value_counts()

In [None]:
# Cap the lactation number at 6: every value above 6 is recoded as 6,
# so class 6 stands for "6 or more lactations"
lact_above_cap = cowdata["lact"] > 6
cowdata.loc[lact_above_cap, "lact"] = 6

In [None]:
# Frequency of each number of lameness cases, most frequent first
cowdata.lameness.value_counts()

In [None]:
# Cap the number of lameness cases at 4: every value above 4 is recoded as 4,
# so class 4 stands for "4 or more cases"
lameness_above_cap = cowdata["lameness"] > 4
cowdata.loc[lameness_above_cap, "lameness"] = 4

In [None]:
# Frequency of each number of mastitis cases, most frequent first
cowdata.mastitis.value_counts()

In [None]:
# Cap the number of mastitis cases at 4: every value above 4 is recoded as 4,
# so class 4 stands for "4 or more cases"
mastitis_above_cap = cowdata["mastitis"] > 4
cowdata.loc[mastitis_above_cap, "mastitis"] = 4

## Creating dummy variables (One-Hot Encode)

In [None]:
# Frequency of each value in the 'reason' column, most frequent first
cowdata.reason.value_counts()

In [None]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per level (no level is dropped)
categorical_cols = ["season", "lact", "calvingEase", "lameness", "mastitis", "reason", "lactStage"]
cowdata_oh = pd.get_dummies(cowdata, columns=categorical_cols)
cowdata_oh

## Creating training and testing datasets

In [None]:
# Feature matrix: the continuous '305ME' column followed by every column from
# 'BW' to the end of the encoded frame (BW plus the dummy columns)
# NOTE(review): this relies on the column order -- 'price' must not lie at or
# after 'BW', otherwise the response leaks into x. Confirm with the column list.
me305_col = cowdata_oh.loc[:, '305ME']
bw_onwards = cowdata_oh.loc[:, 'BW':]
x = pd.concat((me305_col, bw_onwards), axis="columns")
x

In [None]:
# List the feature names to check which columns ended up in x
print(list(x.columns))

In [None]:
# Z-score every feature column: subtract the column mean and divide by the
# column standard deviation (result: mean = 0, standard deviation = 1)
# NOTE(review): means and SDs are computed on all rows, i.e. before the
# train/test split, so test rows influence the scaling -- consider estimating
# them on the training rows only.
features_raw = pd.concat((cowdata_oh.loc[:, '305ME'], cowdata_oh.loc[:, 'BW':]), axis="columns")
col_means = features_raw.mean()
col_sds = features_raw.std()
x_std = (features_raw - col_means) / col_sds
x_std

In [None]:
# Response variable: the 'price' column, kept as a one-column DataFrame
y = cowdata_oh.loc[:, ["price"]]
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing and train on the remaining 70%.
# Rows are shuffled (the default) with a fixed seed so the split is reproducible.
split_parts = train_test_split(x_std, y, test_size=0.30, random_state=40)
x_train, x_test, y_train, y_test = split_parts
print("N. samples training %s , N. samples testing %s" % (len(x_train), len(x_test)))

# PARTIAL LEAST SQUARE

##  Training (GridSearch)

In [None]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV

# Grid search over the number of PLS components with k-fold cross-validation
nfolds = 3
metrics = ('r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
parameters = {'n_components': [1, 3, 5]}
pls = GridSearchCV(
    estimator=PLSRegression(),
    param_grid=parameters,
    scoring=metrics,
    refit=metrics[2],  # final model is chosen and refitted by (negated) RMSE
    cv=nfolds,
    return_train_score=True,
)
pls.fit(x_train, y_train)

In [None]:
# Metrics - training: cross-validated mean scores, one row per candidate.
# MAE and RMSE are stored negated by the scorer, hence abs()
pls_cv = pls.cv_results_
pd.DataFrame({
    'N. Components': pls_cv["param_n_components"],
    'R2': pls_cv["mean_test_r2"],
    'MAE': abs(pls_cv['mean_test_neg_mean_absolute_error']),
    'RMSE': abs(pls_cv['mean_test_neg_root_mean_squared_error']),
})

## Testing

In [None]:
# Report the selected number of components with its cross-validated RMSE,
# then predict the held-out test set with the refitted best model
best_pls = pls.best_estimator_
print("Best N. Components: %s, RMSE: %.6f" % (pls.best_params_['n_components'], abs(pls.best_score_)))
ypred = best_pls.predict(x_test)

In [None]:
# Scatter plot - predicted and observed (PLS, test set)
import matplotlib.pyplot as plt
import numpy as np

# Flatten both vectors: the predictions may come back as an (n, 1) array and
# y_test is a one-column DataFrame. With 1-D inputs np.polyfit returns scalar
# slope/intercept instead of length-1 arrays.
ypred = np.ravel(ypred)
observed = np.ravel(y_test)
m, b = np.polyfit(ypred, observed, 1)

fig, ax = plt.subplots()
ax.plot(ypred, observed, 'o', label='Test samples')
ax.plot(ypred, m * ypred + b, label='Least-squares fit')
ax.set(xlabel='Predicted price', ylabel='Observed price',
       title='PLS: predicted vs. observed (test set)')
ax.legend();  # trailing semicolon hides the matplotlib object repr

In [None]:
# Metrics - testing
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where it raises a TypeError. The square root works on every version.
pd.DataFrame({'N. Components': [pls.best_params_['n_components']],
              'R2': [r2_score(y_test, ypred)],
              'MAE': [mean_absolute_error(y_test, ypred)],
              'RMSE': [np.sqrt(mean_squared_error(y_test, ypred))]})

# RIDGE REGRESSION

## Training (GridSearch)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Grid search over the ridge penalty (alpha) with k-fold cross-validation
nfolds = 3
metrics = ('r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
parameters = {'alpha': [0, 0.3, 0.8]}
rr = GridSearchCV(
    estimator=Ridge(),
    param_grid=parameters,
    scoring=metrics,
    refit=metrics[2],  # final model is chosen and refitted by (negated) RMSE
    cv=nfolds,
    return_train_score=True,
)
rr.fit(x_train, y_train)

In [None]:
# Metrics - training: cross-validated mean scores, one row per candidate.
# MAE and RMSE are stored negated by the scorer, hence abs()
rr_cv = rr.cv_results_
pd.DataFrame({
    'Alpha': rr_cv["param_alpha"],
    'R2': rr_cv["mean_test_r2"],
    'MAE': abs(rr_cv['mean_test_neg_mean_absolute_error']),
    'RMSE': abs(rr_cv['mean_test_neg_root_mean_squared_error']),
})

## Testing

In [None]:
# Report the selected alpha with its cross-validated RMSE, then predict the
# held-out test set with the refitted best model
best_rr = rr.best_estimator_
print("Best Alpha: %.1f, RMSE: %.6f" % (rr.best_params_['alpha'], abs(rr.best_score_)))
ypred = best_rr.predict(x_test)

In [None]:
# Scatter plot - predicted and observed (ridge regression, test set)
import matplotlib.pyplot as plt
import numpy as np

# Flatten both vectors: Ridge fitted on a one-column DataFrame predicts an
# (n, 1) array and y_test is a one-column DataFrame. With 1-D inputs
# np.polyfit returns scalar slope/intercept instead of length-1 arrays.
ypred = np.ravel(ypred)
observed = np.ravel(y_test)
m, b = np.polyfit(ypred, observed, 1)

fig, ax = plt.subplots()
ax.plot(ypred, observed, 'o', label='Test samples')
ax.plot(ypred, m * ypred + b, label='Least-squares fit')
ax.set(xlabel='Predicted price', ylabel='Observed price',
       title='Ridge regression: predicted vs. observed (test set)')
ax.legend();  # trailing semicolon hides the matplotlib object repr

In [None]:
# Metrics - testing
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where it raises a TypeError. MAE and RMSE are non-negative by definition,
# so no abs() is needed here (unlike the negated cross-validation scores).
pd.DataFrame({'Alpha': [rr.best_params_['alpha']],
              'R2': [r2_score(y_test, ypred)],
              'MAE': [mean_absolute_error(y_test, ypred)],
              'RMSE': [np.sqrt(mean_squared_error(y_test, ypred))]})

# NEURAL NETWORKS

## Data pre-processing

In [None]:
# Response variable: the 'price' column as a one-column DataFrame
# (the same selection that was made before the train/test split)
y = cowdata_oh.loc[:, ["price"]]
y

##  Training (GridSearch)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor

# Grid search over activation function and hidden-layer sizes with k-fold
# cross-validation; random_state is a single-value grid entry so every
# candidate network is initialised with the same seed
nfolds = 3
metrics = ('r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
parameters = {
    'activation': ('relu', 'tanh'),
    'hidden_layer_sizes': [(100, 80), (120, 100), (180, 120)],
    'random_state': [40],
}
nn = GridSearchCV(
    estimator=MLPRegressor(),
    param_grid=parameters,
    scoring=metrics,
    refit=metrics[2],  # final model is chosen and refitted by (negated) RMSE
    cv=nfolds,
    return_train_score=True,
)
nn.fit(x_train, y_train)

In [None]:
# Metrics - training: cross-validated mean scores, one row per candidate.
# MAE and RMSE are stored negated by the scorer, hence abs()
nn_cv = nn.cv_results_
pd.DataFrame({
    'Activation': nn_cv["param_activation"],
    'Layer Sizes': nn_cv['param_hidden_layer_sizes'],
    'R2': nn_cv["mean_test_r2"],
    'MAE': abs(nn_cv['mean_test_neg_mean_absolute_error']),
    'RMSE': abs(nn_cv['mean_test_neg_root_mean_squared_error']),
})

## Testing

In [None]:
# Report the selected activation function and layer sizes with the
# cross-validated RMSE, then predict the held-out test set with the refitted
# best model
best_nn = nn.best_estimator_
print("Best Activation: %s, Layer Size: %s, RMSE: %.6f" % (nn.best_params_['activation'], nn.best_params_['hidden_layer_sizes'], abs(nn.best_score_)))
ypred = best_nn.predict(x_test)

In [None]:
# Scatter plot - predicted and observed (neural network, test set)
import matplotlib.pyplot as plt
import numpy as np

# Flatten both vectors: y_test is a one-column DataFrame, and with 1-D inputs
# np.polyfit returns scalar slope/intercept instead of length-1 arrays.
ypred = np.ravel(ypred)
observed = np.ravel(y_test)
m, b = np.polyfit(ypred, observed, 1)

fig, ax = plt.subplots()
ax.plot(ypred, observed, 'o', label='Test samples')
ax.plot(ypred, m * ypred + b, label='Least-squares fit')
ax.set(xlabel='Predicted price', ylabel='Observed price',
       title='Neural network: predicted vs. observed (test set)')
ax.legend();  # trailing semicolon hides the matplotlib object repr

In [None]:
# Metrics - testing
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where it raises a TypeError. MAE and RMSE are non-negative by definition,
# so no abs() is needed here (unlike the negated cross-validation scores).
pd.DataFrame({'Activation': [nn.best_params_['activation']],
              'Layer Size': [nn.best_params_['hidden_layer_sizes']],
              'R2': [r2_score(y_test, ypred)],
              'MAE': [mean_absolute_error(y_test, ypred)],
              'RMSE': [np.sqrt(mean_squared_error(y_test, ypred))]})