In [141]:
import numpy as np
import pandas as pd
import altair as alt
import copy
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    plot_confusion_matrix,
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [142]:
# Load the two pre-processed inputs: the weekly sap/sugar summary and the
# weather table holding the 'cumGDD' and 'frthw' columns used below.
# NOTE(review): read_pickle executes arbitrary code on load -- only use
# pickles produced by this project's own processing step.
sap_sugar_df = pd.read_pickle('../data/processed/stinson2019/sap_sugar_weekly_summary')

# Index the weather table by its 'datetime' column so rows can be looked up
# by date label.
gdd_frthw = pd.read_pickle('../data/processed/HF_weather/gdd_frthw').set_index('datetime')

In [143]:
# Work on an independent copy so the loaded summary frame is left untouched.
full_df = sap_sugar_df.copy()


def _weather_on(dates, column):
    """Return `column` of gdd_frthw looked up row-by-row for each label in `dates`."""
    return dates.apply(lambda day: gdd_frthw.loc[day][column])


window_start = full_df['date_from']
window_end = full_df['date_to']

# cum_GDD      : 'cumGDD' value at the end of each weekly window.
# weekly_frthw : 'frthw' at the window end minus 'frthw' at the window start
#                (presumably frthw is a running total, so this is the change
#                within the window -- TODO confirm).
full_df = full_df.assign(
    cum_GDD=_weather_on(window_end, 'cumGDD'),
    weekly_frthw=_weather_on(window_end, 'frthw') - _weather_on(window_start, 'frthw'),
)

# Binary target: 1 when a non-zero weekly sap value was recorded,
# 0 when the value is missing or exactly zero.
weekly_sap = full_df['weekly_sap']
full_df['sap_binary'] = (weekly_sap.notna() & weekly_sap.ne(0)).astype('int64')

full_df

Unnamed: 0,date_from,date_to,weekly_sugarwt,weekly_sap,site,cum_GDD,weekly_frthw,sap_binary
0,2014-03-11,2014-03-17,0.000543,0.04,DARTMOUTH ORGANIC FARM,4.870715,4.5,1
1,2014-03-12,2014-03-18,0.000183,0.01,DARTMOUTH ORGANIC FARM,4.870715,2.0,1
2,2014-03-13,2014-03-19,0.000183,0.01,DARTMOUTH ORGANIC FARM,4.870715,3.0,1
3,2014-03-14,2014-03-20,0.000183,0.01,DARTMOUTH ORGANIC FARM,4.870715,4.0,1
4,2014-03-15,2014-03-21,0.000183,0.01,DARTMOUTH ORGANIC FARM,4.870715,3.0,1
...,...,...,...,...,...,...,...,...
18278,2016-02-27,2016-03-04,0.731130,34.63,SOUTHERNMOST MAPLE,10.622917,5.0,1
18279,2016-02-28,2016-03-05,0.731130,34.63,SOUTHERNMOST MAPLE,10.622917,4.5,1
18280,2016-02-29,2016-03-06,0.720000,36.53,SOUTHERNMOST MAPLE,10.622917,5.5,1
18281,2016-03-01,2016-03-07,0.556020,27.42,SOUTHERNMOST MAPLE,10.622917,4.0,1


In [146]:
# Rows with missing values are not dropped here (the dropna() call is disabled).
# Add the squared freeze-thaw term and create NaN placeholders for the three
# model-output columns.
placeholder_cols = {
    'houle_production': np.nan,
    'houle_sap_production': np.nan,
    'houle_binary': np.nan,
}
full_df = full_df.assign(
    weekly_frthw_sq=full_df['weekly_frthw'] ** 2,
    **placeholder_cols,
)

In [261]:
# Score every weekly window with the fixed "Houle" logistic coefficients.
#
# NOTE(review): the linear predictor is negated before the logistic
# transform; with this sign convention almost every row is classified as
# sap flow (see the value counts below) -- confirm the sign against the
# source of these coefficients.
houle_linear = -(
    -5.09
    + 0.733 * full_df['weekly_frthw']
    - 0.014 * full_df['weekly_frthw_sq']
    - 0.07 * full_df['cum_GDD']
)

# Logistic transform of the linear predictor -> value in (0, 1).
houle_probability = 1 / (1 + np.exp(-houle_linear))

# Replace the columns outright instead of writing through `.loc[:, col] = ...`:
# a whole-column .loc write tries to keep the dtype of the existing column,
# and the placeholders are float NaN, so storing booleans that way is
# deprecated in recent pandas. assign() swaps in new columns with their own
# dtypes and also works if the placeholders were never created.
full_df = full_df.assign(
    houle_production=houle_linear,
    houle_sap_production=houle_probability,
    houle_binary=houle_probability > 0.50,  # classify as flow above 0.5
)
full_df['houle_binary'].value_counts()

True     18224
False       59
Name: houle_binary, dtype: int64

In [262]:
# Confusion matrix of the Houle classification against the observed flag
# (rows: observed class, columns: predicted class).
print(confusion_matrix(y_true=full_df['sap_binary'], y_pred=full_df['houle_binary']))

[[    0  2240]
 [   59 15984]]


In [263]:
# Unpack the 2x2 matrix row by row: first row is (tn, fp), second is (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(
    y_true=full_df['sap_binary'],
    y_pred=full_df['houle_binary'],
)
tn, fp, fn, tp

(0, 2240, 59, 15984)

In [264]:
# Fraction of rows where the Houle classification matches the observed flag.
print(accuracy_score(y_true=full_df['sap_binary'], y_pred=full_df['houle_binary']))

0.8742547721927474


In [265]:
# Per-class precision / recall / F1 for the Houle classification.
print(classification_report(y_true=full_df['sap_binary'], y_pred=full_df['houle_binary']))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2240
           1       0.88      1.00      0.93     16043

    accuracy                           0.87     18283
   macro avg       0.44      0.50      0.47     18283
weighted avg       0.77      0.87      0.82     18283



In [266]:
# Share of rows contributed by each site.
full_df['site'].value_counts(normalize=True)

HARVARD FOREST             0.429743
DARTMOUTH ORGANIC FARM     0.246841
NORTHERN RANGE - QUEBEC    0.132363
INDU                       0.091943
SOUTHERNMOST MAPLE         0.063009
DIVIDE RIDGE               0.036099
Name: site, dtype: float64

In [283]:
# Refit a logistic regression on the same three predictors used by the
# fixed-coefficient model above.
feature_cols = ['weekly_frthw', 'weekly_frthw_sq', 'cum_GDD']
X = full_df[feature_cols]
y = full_df['sap_binary']

# 80/20 split with a fixed seed.
# NOTE(review): the split is stratified by site, not by the target class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=123,
    stratify=full_df['site'],
)

logreg = LogisticRegression().fit(X_train, y_train)

# Accuracy on the training split (the held-out split is not scored here).
logreg.score(X_train, y_train)


0.8776835771913032

In [280]:
# Show the intercept and the coefficient row held by the estimator
# (coefficient order follows the feature columns passed to fit).
intercept, coefficients = logreg.intercept_, logreg.coef_
print(intercept, coefficients)

[0.47552605] [[ 0.71497448 -0.06783927  0.00935915]]


In [275]:
# Test Houle parameters
# Score the fixed Houle coefficients on the training split using a deep copy
# of the fitted estimator. Overwriting `logreg.intercept_` / `logreg.coef_`
# directly would clobber the learned parameters, so every later cell that
# reads `logreg` would silently see the hard-coded values instead.
houle_logreg = copy.deepcopy(logreg)

# Same magnitudes as the hand-computed Houle predictor, with the sign flipped
# to match its leading negation; coefficient order is
# (weekly_frthw, weekly_frthw_sq, cum_GDD).
houle_logreg.intercept_ = np.array([5.09])
houle_logreg.coef_ = np.array([[-0.733, 0.014, 0.07]])
houle_logreg.score(X_train, y_train)

0.8748119786681252

In [281]:
# Print the intercept and coefficient array currently held by `logreg`.
current_intercept, current_coef = logreg.intercept_, logreg.coef_
print(current_intercept, current_coef)

[0.47552605] [[ 0.71497448 -0.06783927  0.00935915]]
