# Version 1.0 Model Predictions

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

## Import Data Sets
Here we will import `NetC_Expanded` which is data seperated by management style, result and slavage status, and we will import the predictors and join those datasets.

In [3]:
netc_expanded = pd.read_csv('../Data/NetC_Expanded.csv')
netc_expanded = netc_expanded.drop(['Unnamed: 0'], axis=1)
netc_expanded.head()

Unnamed: 0,TimeStep,Risk_Cat,Stand_ID,Salvage,Management,Result
0,0,4,0023200606030102900043,True,Heavy,-249.287884
1,0,4,0023200606030102900043,True,NoMgmt,-321.931519
2,0,4,0023200606030102900043,True,Moderate,-276.111511
3,0,4,0023200606030102900043,True,Comm-Ind,-250.583375
4,0,4,0023200606030102900043,True,HighGrade,-293.426896


In [4]:
predictors = pd.read_csv('../Data/Predict_SBW_wCarbon_T0to40.csv')
predictors = predictors.rename(columns={'StandID': 'Stand_ID'})
predictors = predictors.set_index('Stand_ID')
predictors = predictors[["BF_BA","OHost_BA","BF_Stock","OHost_Stock","NonHost_Stock","BF_QMD","ELEV","SLOPE","ASPECT","LAT","SiteInd"]]
predictors.head()

Unnamed: 0_level_0,BF_BA,OHost_BA,BF_Stock,OHost_Stock,NonHost_Stock,BF_QMD,ELEV,SLOPE,ASPECT,LAT,SiteInd
Stand_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0023200606030200300067,,0.498332,,0.8836,56.9255,,580,5.0,240.0,46.14358,
0023200606030200300826,3.89961,11.890484,21.8437,0.885569,18.6635,4.965398,1170,0.0,0.0,47.19684,28.0
0023200606030200300924,0.036869,7.44351,0.2679,1.166825,3.8019,2.6,990,0.0,0.0,46.64171,33.0
0023200606030301901813,0.967649,1.368845,63.7216,1.609179,12.1858,2.106063,180,0.0,0.0,45.09319,40.0
0023200606030400901513,3.352901,6.679677,73.4189,1.397641,18.2434,3.405766,250,0.0,0.0,44.73563,


## Selecting Management Style 
Gong to define a management style upfront to create rule set

In [41]:
MANAGEMENT_STYLE = 'NoMgmt'

In [42]:
mgmt_df = netc_expanded[netc_expanded['Management'] == MANAGEMENT_STYLE]

In [58]:
x = mgmt_df[(mgmt_df['Stand_ID'] == '0023200606030400901513') & 
            (mgmt_df['TimeStep'] == 40)].groupby('Salvage').head().set_index('Salvage')

In [59]:
x

Unnamed: 0_level_0,TimeStep,Risk_Cat,Stand_ID,Management,Result
Salvage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,40,1,0023200606030400901513,NoMgmt,-340.709429
False,40,1,0023200606030400901513,NoMgmt,-286.246874


Choosing a left join to drop `na` for 

In [64]:
def get_mgmt_df(target_df, pred_df):
    """
    Returns labeled DF for salvage and non salvage decisions
    """
    temp_df = pd.DataFrame(columns=['Stand_ID', 'Salvage_Good', 'Result'])
    for stand in target_df['Stand_ID'].unique():
#         group_df = target_df[
#             target_df['Stand_ID'] == stand
#         ].groupby(['Salvage']).agg({'Result': np.min})

        group_df = target_df[(target_df['Stand_ID'] == stand)
                             & (target_df['TimeStep'] == 40)]
        group_df = group_df.set_index('Salvage')
        
        # If index is True where min is acheived
        if group_df['Result'].idxmin():
            temp_df = temp_df.append(
                pd.DataFrame({
                    'Stand_ID': [stand],
                    'Salvage_Good': [True],
                    'Result': [group_df['Result'].min()]
                })
            )
        else:
            temp_df = temp_df.append(
                pd.DataFrame({
                    'Stand_ID': [stand],
                    'Salvage_Good': [False],
                    'Result': [group_df['Result'].min()]
                })
            )
            
    temp_df = temp_df.set_index('Stand_ID')
    return pd.merge(pred_df, temp_df, on="Stand_ID", right_index=True)

In [65]:
heavy_df = get_mgmt_df(mgmt_df, predictors)

In [66]:
heavy_df = heavy_df.dropna()

In [67]:
heavy_df.head()

Unnamed: 0_level_0,BF_BA,OHost_BA,BF_Stock,OHost_Stock,NonHost_Stock,BF_QMD,ELEV,SLOPE,ASPECT,LAT,SiteInd,Salvage_Good,Result
Stand_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0023200606030200300826,3.89961,11.890484,21.8437,0.885569,18.6635,4.965398,1170,0.0,0.0,47.19684,28.0,True,-378.709805
0023200606030200300924,0.036869,7.44351,0.2679,1.166825,3.8019,2.6,990,0.0,0.0,46.64171,33.0,True,-384.995331
0023200606030301901813,0.967649,1.368845,63.7216,1.609179,12.1858,2.106063,180,0.0,0.0,45.09319,40.0,True,-537.624581
0023200606030702501209,2.604667,6.283662,46.1259,1.32792,37.9627,3.545086,1360,0.0,0.0,46.33241,31.0,True,-261.114995
0023200606030702501226,17.937933,19.655671,60.2632,0.808564,11.8988,6.622135,1480,25.0,216.0,45.69559,38.0,False,-258.286555


In [69]:
heavy_df['Salvage_Good'].value_counts()

True     3538
False     699
Name: Salvage_Good, dtype: int64

Here we can see that the option to not salvage benefited more than salvaging, so in this case we won't offer salvage credit.

## Split Data

In [70]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [71]:
heavy_df = heavy_df.reset_index().drop('Stand_ID', axis=1)

In [72]:
X = heavy_df.drop(['Salvage_Good', 'Result'], axis=1)
y = heavy_df[['Salvage_Good']].astype('int')

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Decision Tree
Going to explore what the upper bound is for classification error on this data set.

In [91]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import _tree

In [93]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8077830188679245

In [94]:
accuracy_score(y_train, clf.predict(X_train))

1.0

In [95]:
y.value_counts() / len(y)

Salvage_Good
1               0.835025
0               0.164975
dtype: float64

In [98]:
def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print ("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print ("{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)

tree_to_code(clf, X_train.columns)

def tree(BF_BA, OHost_BA, BF_Stock, OHost_Stock, NonHost_Stock, BF_QMD, ELEV, SLOPE, ASPECT, LAT, SiteInd):
  if BF_QMD <= 6.892452955245972:
    if BF_Stock <= 2.941999912261963:
      if OHost_Stock <= 0.44407157599925995:
        if BF_QMD <= 6.4006288051605225:
          if BF_BA <= 0.019252620171755552:
            return [[1. 0.]]
          else:  # if BF_BA > 0.019252620171755552
            if NonHost_Stock <= 44.36865043640137:
              if NonHost_Stock <= 43.633501052856445:
                if ASPECT <= 230.5:
                  return [[ 0. 13.]]
                else:  # if ASPECT > 230.5
                  return [[1. 0.]]
              else:  # if NonHost_Stock > 43.633501052856445
                return [[2. 0.]]
            else:  # if NonHost_Stock > 44.36865043640137
              if OHost_BA <= 0.18489060178399086:
                if BF_QMD <= 1.797840178012848:
                  return [[0. 1.]]
                else:  # if BF_QMD > 1.797840178012848
              

## SVM

In [88]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [89]:
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

  return f(**kwargs)


0.8372641509433962

In [90]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

  


0.8514150943396226

In [82]:
import matplotlib.pyplot as plt