# Part 3: Unbiased Evaluation using a New Test Set

In this part, we are given a new test set (`/dsa/data/all_datasets/back_order/Kaggle_Test_Dataset_v2.csv`). We can now take advantage of the entire smart sample that we created in Part I. 

* Retrain a pipeline using the optimal parameters that the pipeline learned. We don't need to repeat GridSearch here. 

## Import modules as needed

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, sys
import itertools
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.svm import SVC
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

## Load smart sample and the best pipeline from Part II

In [2]:
import joblib

X_resampled = joblib.load('X_resampled.pkl')

y_resampled = joblib.load('y_resampled.pkl')

In [3]:
import pickle

y_pred3 = pickle.load( open( 'my_final_project_model.pkl', "rb" ) )


##  Retrain a pipeline using the full sampled training data set

Use the full sampled training data set to train the pipeline.

In [4]:
# Add code below this comment  (Question #E301)
# ----------------------------------

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size = .25)

In [5]:
from sklearn.feature_selection import RFECV

rfc = RandomForestClassifier(n_estimators=30, 
                                        random_state=42,
                                        class_weight="balanced") 

rfecv = RFECV(estimator=rfc, 
              step=1, 
              cv=5, 
              scoring = 'roc_auc')

pipeline  = Pipeline([('feature_sele',rfecv),
                      ('rfc',rfc)])

In [6]:
param_grid = RandomForestClassifier(n_estimators=10, 
                             random_state=42,
                             class_weight="balanced") 

In [7]:
clf = RandomForestClassifier(n_estimators=10, 
                             random_state=42,
                             class_weight="balanced") 

In [9]:
clf1 = GridSearchCV(clf, 
                      param_grid={'max_depth':[2,3]},
                      cv= 5, scoring = 'roc_auc')

In [10]:
clf1.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(class_weight='balanced',
                                              n_estimators=10,
                                              random_state=42),
             param_grid={'max_depth': [2, 3]}, scoring='roc_auc')

In [11]:
full_pred = clf1.predict(X_test)
print(classification_report(y_test, full_pred, zero_division = 1))

              precision    recall  f1-score   support

           0       0.81      0.86      0.84      2765
           1       0.85      0.80      0.82      2720

    accuracy                           0.83      5485
   macro avg       0.83      0.83      0.83      5485
weighted avg       0.83      0.83      0.83      5485



### Save the trained model with the pickle library.

In [12]:
# Add code below this comment  
# -----------------------------

filename = 'my_final_project_full_trained_model.pkl'

outfile = open(filename,'wb')

pickle.dump(clf1,outfile)

outfile.close()


## Load the Testing Data and evaluate your model

 * `/dsa/data/all_datasets/back_order/Kaggle_Test_Dataset_v2.csv`
 
* We need to preprocess this test data (follow the steps similar to Part I)
* If we have fitted any normalizer/standardizer in Part 2, then we have to transform this test data using the fitted normalizer/standardizer

In [13]:
# Preprocess the given test set  (Question #E302)
# ----------------------------------

DATASET = '/dsa/data/all_datasets/back_order/Kaggle_Test_Dataset_v2.csv'
assert os.path.exists(DATASET)

dataset_2 = pd.read_csv(DATASET).sample(frac = 1).reset_index(drop=True)

dataset_2.head().transpose()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,0,1,2,3,4
sku,3515671,3404784,3391030,3459196,3286519
national_inv,150.0,6.0,6.0,3.0,9.0
lead_time,9.0,4.0,8.0,8.0,2.0
in_transit_qty,0.0,0.0,0.0,0.0,0.0
forecast_3_month,0.0,0.0,2.0,0.0,0.0
forecast_6_month,0.0,0.0,5.0,0.0,0.0
forecast_9_month,0.0,0.0,8.0,0.0,0.0
sales_1_month,1.0,0.0,2.0,0.0,0.0
sales_3_month,4.0,0.0,5.0,0.0,0.0
sales_6_month,14.0,0.0,9.0,0.0,0.0


In [14]:
dataset_2 = dataset_2.drop(columns=['sku', 'min_bank'])

In [15]:
dataset_2 = dataset_2.dropna()

In [16]:
yes_no_columns = list(filter(lambda i: dataset_2[i].dtype!=np.float64, dataset_2.columns))
print(yes_no_columns)

['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder']


In [17]:
print('potential_issue', dataset_2['potential_issue'].unique())
print('deck_risk', dataset_2['deck_risk'].unique())
print('oe_constraint', dataset_2['oe_constraint'].unique())
print('ppap_risk', dataset_2['ppap_risk'].unique())
print('stop_auto_buy', dataset_2['stop_auto_buy'].unique())
print('rev_stop', dataset_2['rev_stop'].unique())
print('went_on_backorder', dataset_2['went_on_backorder'].unique())

potential_issue ['No' 'Yes']
deck_risk ['No' 'Yes']
oe_constraint ['No' 'Yes']
ppap_risk ['Yes' 'No']
stop_auto_buy ['Yes' 'No']
rev_stop ['No' 'Yes']
went_on_backorder ['No' 'Yes']


In [18]:
for column_name in yes_no_columns:
    mode = dataset_2[column_name].apply(str).mode()[0]
    print('Filling missing values of {} with {}'.format(column_name, mode))
    dataset_2[column_name].fillna(mode, inplace=True)

Filling missing values of potential_issue with No
Filling missing values of deck_risk with No
Filling missing values of oe_constraint with No
Filling missing values of ppap_risk with No
Filling missing values of stop_auto_buy with Yes
Filling missing values of rev_stop with No
Filling missing values of went_on_backorder with No


In [19]:
dataset_2['potential_issue'] = dataset_2['potential_issue'].apply(['Yes', 'No'].index)
dataset_2['deck_risk'] = dataset_2['deck_risk'].apply(['Yes', 'No'].index)
dataset_2['oe_constraint'] = dataset_2['oe_constraint'].apply(['Yes', 'No'].index)
dataset_2['ppap_risk'] = dataset_2['ppap_risk'].apply(['Yes', 'No'].index)
dataset_2['stop_auto_buy'] = dataset_2['stop_auto_buy'].apply(['Yes', 'No'].index)
dataset_2['rev_stop'] = dataset_2['rev_stop'].apply(['Yes', 'No'].index)
dataset_2['went_on_backorder'] = dataset_2['went_on_backorder'].apply(['Yes', 'No'].index)

In [20]:
dataset_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 227351 entries, 0 to 242074
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   national_inv       227351 non-null  float64
 1   lead_time          227351 non-null  float64
 2   in_transit_qty     227351 non-null  float64
 3   forecast_3_month   227351 non-null  float64
 4   forecast_6_month   227351 non-null  float64
 5   forecast_9_month   227351 non-null  float64
 6   sales_1_month      227351 non-null  float64
 7   sales_3_month      227351 non-null  float64
 8   sales_6_month      227351 non-null  float64
 9   sales_9_month      227351 non-null  float64
 10  potential_issue    227351 non-null  int64  
 11  pieces_past_due    227351 non-null  float64
 12  perf_6_month_avg   227351 non-null  float64
 13  perf_12_month_avg  227351 non-null  float64
 14  local_bo_qty       227351 non-null  float64
 15  deck_risk          227351 non-null  int64  
 16  oe

In [21]:
num_backorder = np.sum(dataset_2['went_on_backorder']==1)
print('backorder ratio:', num_backorder, '/', len(dataset_2), '=', num_backorder / len(dataset_2))

backorder ratio: 224747 / 227351 = 0.9885463446389063


In [22]:
y2 = dataset_2.went_on_backorder
X2 = dataset_2.drop('went_on_backorder', axis=1)

In [23]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X2_resampled, y2_resampled = rus.fit_resample(X2, y2)
print(sorted(Counter(y2_resampled).items()))

[(0, 2604), (1, 2604)]


In [24]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
X2_resampled = scaler.fit_transform(X2_resampled)

In [25]:
scaler2 = MinMaxScaler() 
X2_resampled = scaler2.fit_transform(X2_resampled)

In [26]:
print(X2_resampled.mean(axis=0))
print(X2_resampled.std(axis=0))

[2.11732654e-02 1.39485850e-01 4.05999139e-04 9.42117848e-04
 9.28844514e-04 8.85173819e-04 8.81836268e-04 8.63205456e-04
 8.24982976e-04 8.05569337e-04 9.99423963e-01 7.25824231e-04
 9.82009274e-01 9.86389766e-01 4.96945797e-04 8.48886329e-01
 9.99615975e-01 8.77688172e-01 2.82258065e-02 9.99231951e-01]
[0.02142387 0.11428358 0.01413135 0.0172961  0.01748448 0.01679379
 0.01652108 0.01658029 0.01646857 0.01647986 0.02399385 0.01617544
 0.12345868 0.10470889 0.01553004 0.35815964 0.01959278 0.32764561
 0.16561736 0.02770305]


In [27]:
print("X, y shape:", X2_resampled.shape, y2_resampled.shape)

X, y shape: (5208, 20) (5208,)


We can now predict and evaluate with the preprocessed test set. It would be interesting to see the performance with and without outliers removal from the test set. We can report confusion matrix, precision, recall, f1-score, accuracy, and other measures (if any). 

In [28]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_resampled, y2_resampled, test_size = .25)

In [29]:
# Add code below this comment  (Question #E303)
# ----------------------------------

# Performace without outliers

lof = LocalOutlierFactor(n_neighbors=2).fit(X2_train, y2_train)

lof_outliers = lof.fit_predict(X2_train)==-1
print(f"Num of outliers = {np.sum(lof_outliers)}")

Num of outliers = 1182


In [30]:
X_lof = X2_train[~lof_outliers]
y_lof = y2_train[~lof_outliers]

In [31]:
print("X, y shape:", X_lof.shape, y_lof.shape)

X, y shape: (2724, 20) (2724,)


In [32]:
X3_train_lof, X3_test_lof, y3_train_lof, y3_test_lof = train_test_split(X_lof, y_lof, test_size=0.2)

In [33]:
X3_train_lof.shape, X3_test_lof.shape, y3_train_lof.shape, y3_test_lof.shape

((2179, 20), (545, 20), (2179,), (545,))

In [34]:
rfc = RandomForestClassifier(n_estimators=30, 
                                        random_state=42,
                                        class_weight="balanced") 

rfecv = RFECV(estimator=rfc, 
              step=1, 
              cv=5, 
              scoring = 'roc_auc')

pipeline  = Pipeline([('feature_sele',rfecv),
                      ('rfc',rfc)])

In [35]:
param_grid = RandomForestClassifier(n_estimators=10, 
                             random_state=42,
                             class_weight="balanced") 

In [36]:
clf = RandomForestClassifier(n_estimators=10, 
                             random_state=42,
                             class_weight="balanced") 

clf2 = GridSearchCV(clf, 
                      param_grid={'max_depth':[2,3]},
                      cv= 5, scoring = 'roc_auc')

In [37]:
clf2.fit(X3_train_lof, y3_train_lof)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(class_weight='balanced',
                                              n_estimators=10,
                                              random_state=42),
             param_grid={'max_depth': [2, 3]}, scoring='roc_auc')

In [38]:
pred1 = clf2.predict(X2_test)
print(classification_report(y2_test, pred1, zero_division = 1))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       673
           1       0.83      0.79      0.81       629

    accuracy                           0.82      1302
   macro avg       0.82      0.82      0.82      1302
weighted avg       0.82      0.82      0.82      1302



In [39]:
pd.DataFrame(confusion_matrix(y2_test, pred1))

Unnamed: 0,0,1
0,569,104
1,134,495


In [40]:
# Performace with outliers

rfc = RandomForestClassifier(n_estimators=30, 
                                        random_state=42,
                                        class_weight="balanced") 

rfecv = RFECV(estimator=rfc, 
              step=1, 
              cv=5, 
              scoring = 'roc_auc')

pipeline  = Pipeline([('feature_sele',rfecv),
                      ('rfc',rfc)])

In [41]:
param_grid = RandomForestClassifier(n_estimators=10, 
                             random_state=42,
                             class_weight="balanced") 

In [42]:
clf = RandomForestClassifier(n_estimators=10, 
                             random_state=42,
                             class_weight="balanced") 

clf3 = GridSearchCV(clf, 
                      param_grid={'max_depth':[2,3]},
                      cv= 5, scoring = 'roc_auc')

In [43]:
clf3.fit(X_train, y_train) # EDITED for mistake on using outliers before.. correct accuracy score now with outliers included

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(class_weight='balanced',
                                              n_estimators=10,
                                              random_state=42),
             param_grid={'max_depth': [2, 3]}, scoring='roc_auc')

In [44]:
pred2 = clf3.predict(X2_test)
print(classification_report(y2_test, pred2, zero_division = 1))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00       673
           1       0.48      1.00      0.65       629

    accuracy                           0.48      1302
   macro avg       0.74      0.50      0.33      1302
weighted avg       0.75      0.48      0.31      1302



In [45]:
pd.DataFrame(confusion_matrix(y2_test, pred2))

Unnamed: 0,0,1
0,0,673
1,0,629


## Conclusion

## Reflect

Imagine you are data scientist that has been tasked with developing a system to save your 
company money by predicting and preventing back orders of parts in the supply cha
Write a **brief summary** for "management" that details your findings, 
your level of certainty and trust in the models, 
and recommendations for operationalizing these models for the business.

# Save your notebook!
## Then `File > Close and Halt`