<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Baseline-Model-Selection-with-Original-Data-without-Cross-Validation" data-toc-modified-id="Baseline-Model-Selection-with-Original-Data-without-Cross-Validation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Baseline Model Selection with Original Data without Cross Validation</a></span><ul class="toc-item"><li><span><a href="#Promising-models-based-on-balance-between-Recall-and-Precision-Scores-and-F0.5:" data-toc-modified-id="Promising-models-based-on-balance-between-Recall-and-Precision-Scores-and-F0.5:-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Promising models based on balance between Recall and Precision Scores and F0.5:</a></span></li></ul></li><li><span><a href="#Baseline-Model-Selection-with-Original-data-and-Cross-Validation" data-toc-modified-id="Baseline-Model-Selection-with-Original-data-and-Cross-Validation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Baseline Model Selection with Original data and Cross Validation</a></span></li></ul></div>

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
from sklearn.metrics import recall_score, precision_score, mean_squared_log_error, mean_squared_error, confusion_matrix, roc_auc_score, f1_score, plot_roc_curve, roc_curve, RocCurveDisplay, auc, precision_recall_curve, fbeta_score, make_scorer
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split, cross_val_predict, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier,GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier, XGBRFClassifier
from imblearn import over_sampling

In [3]:
# load binaries
trIeng_df=pd.read_pickle("../data/trIeng_df.pkl")

In [4]:
#assign predictor and target variables

y= trIeng_df['PotentialFraud']
X= trIeng_df.drop('PotentialFraud', axis=1)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

In [5]:
# Standard Scale Data 
scaler = StandardScaler()
X_tr_scl= pd.DataFrame(scaler.fit_transform(X_tr), columns=X.columns)
X_te_scl= pd.DataFrame(scaler.transform(X_te), columns=X.columns)

In [6]:
# Balance data by oversampling using SMOTE
oversample = over_sampling.SMOTE(random_state=0)
X_tr_scl_SMOTE, y_tr_SMOTE = oversample.fit_resample(X_tr_scl, y_tr)

In [7]:
# Min Max Scale Data ( for scaling sparse data)
MMscaler = MinMaxScaler()
X_tr_mms = pd.DataFrame(MMscaler.fit_transform(X_tr), columns=X.columns)
X_te_mms = pd.DataFrame(MMscaler.transform(X_te), columns=X.columns)

In [8]:
# Balance data by oversampling using SMOTE
oversample = over_sampling.SMOTE(random_state=0)
X_tr_mms_SMOTE, y_tr_SMOTE = oversample.fit_resample(X_tr_mms, y_tr)

In [9]:
# make custom F0.5 scorer for sklearn
scorerF0_5 = make_scorer(fbeta_score, beta=0.5)


In [10]:
# # Function to return model Recall, Accuracy and F0.5 scores
def evaluate_model(model, X_train, y_train, X_test, y_test):
    '''
    Function to return model Recall and Accuracy scores
    Arguments:
    - model
    - X_train data
    - y_train data
    - X_test data
    - y_test data
    '''
    print(model,'\n')
    print('Train Recall score: {.3f}, Test Recall score: {.3f}'.format(recall_score(y_train, model.predict(X_train)),recall_score(y_test, model.predict(X_test))))
    print('-'*60)
    print('Train Precision score: {.3f}, Test Precision score: {.3f}'.format(precision_score(y_train, model.predict(X_train)),precision_score(y_test, model.predict(X_test))))
    print('-'*60)
    print('Train F0.5 score: {.3f}, Test F0.5 score: {.3f}'.format(fbeta_score(y_train, model.predict(X_train),beta=0.5),fbeta_score(y_test, model.predict(X_test),beta=0.5)))
    print('-'*60)
    print('Train F2 score: {.3f}, Test F2 score: {.3f}'.format(fbeta_score(y_train, model.predict(X_train),beta=2),fbeta_score(y_test, model.predict(X_test),beta=2)))
    print('-'*60)
    print('Train Confusion Matrix')
    print(confusion_matrix(y_train, model.predict(X_train)))
    print('Test Confusion Matrix')
    print(confusion_matrix(y_test, model.predict(X_test)))
    print('-'*60)

#  Baseline Model Selection with Original Data without Cross Validation

In [11]:
models = [
    LinearDiscriminantAnalysis(), GaussianNB(), SVC(gamma='auto'),  LinearSVC(), # NuSVC(gamma='auto'),
    SGDClassifier(max_iter=100, tol=1e-3), KNeighborsClassifier(),
    LogisticRegression(solver='liblinear', penalty='l1'), LogisticRegressionCV(solver='liblinear', penalty='l1',cv=3),
    BaggingClassifier(), ExtraTreesClassifier(n_estimators=300),
    RandomForestClassifier(n_estimators=300), GradientBoostingClassifier(),XGBClassifier(), XGBRFClassifier()
]

In [12]:
def score_model(X_train, y_train, X_test, y_test, estimator, **kwargs):
    """
    Test various estimators.
    """
    model = Pipeline([('estimator', estimator)])

    # Instantiate the classification model and visualizer
    model.fit(X_train, y_train, **kwargs)

    expected  = y_test
    predicted = model.predict(X_test)

    # Compute and return F1 (harmonic mean of precision and recall)
    print("Test Metrics, {}: Recall: {:.3f}, Precision: {:.3f}, F0.5: {:.3f}".format(estimator.__class__.__name__,
                                                                      recall_score(expected, predicted), 
                                                                      precision_score(expected, predicted),
                                                                      fbeta_score(expected, predicted, beta=0.5)))

In [13]:
for model in models:
    score_model(X_tr, y_tr, X_te, y_te, model)
    print('-'*100)

Test Metrics, LinearDiscriminantAnalysis: Recall: 0.455, Precision: 0.741, F0.5: 0.658
----------------------------------------------------------------------------------------------------
Test Metrics, GaussianNB: Recall: 0.568, Precision: 0.667, F0.5: 0.644
----------------------------------------------------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


Test Metrics, SVC: Recall: 0.000, Precision: 0.000, F0.5: 0.000
----------------------------------------------------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


Test Metrics, LinearSVC: Recall: 0.000, Precision: 0.000, F0.5: 0.000
----------------------------------------------------------------------------------------------------
Test Metrics, SGDClassifier: Recall: 0.602, Precision: 0.515, F0.5: 0.530
----------------------------------------------------------------------------------------------------
Test Metrics, KNeighborsClassifier: Recall: 0.489, Precision: 0.623, F0.5: 0.591
----------------------------------------------------------------------------------------------------
Test Metrics, LogisticRegression: Recall: 0.500, Precision: 0.746, F0.5: 0.679
----------------------------------------------------------------------------------------------------




Test Metrics, LogisticRegressionCV: Recall: 0.500, Precision: 0.746, F0.5: 0.679
----------------------------------------------------------------------------------------------------
Test Metrics, BaggingClassifier: Recall: 0.443, Precision: 0.684, F0.5: 0.617
----------------------------------------------------------------------------------------------------
Test Metrics, ExtraTreesClassifier: Recall: 0.409, Precision: 0.679, F0.5: 0.600
----------------------------------------------------------------------------------------------------
Test Metrics, RandomForestClassifier: Recall: 0.489, Precision: 0.768, F0.5: 0.689
----------------------------------------------------------------------------------------------------
Test Metrics, GradientBoostingClassifier: Recall: 0.602, Precision: 0.736, F0.5: 0.705
----------------------------------------------------------------------------------------------------
Test Metrics, XGBClassifier: Recall: 0.557, Precision: 0.742, F0.5: 0.696
-----------

# Baseline Model Selection with Original data and Cross Validation
Baseline models with 5-fold Cross validation

In [17]:
models = [
    LinearDiscriminantAnalysis(), GaussianNB(), SVC(gamma='scale'), LinearSVC(max_iter=2000), #NuSVC(gamma='auto'), 
    SGDClassifier(max_iter=300), KNeighborsClassifier(),
    LogisticRegression(solver='liblinear', penalty='l1'), #LogisticRegressionCV(solver='liblinear', penalty='l1',cv=3),
    BaggingClassifier(), ExtraTreesClassifier(n_estimators=300),
    RandomForestClassifier(n_estimators=300), GradientBoostingClassifier(),XGBClassifier(), XGBRFClassifier()
]

In [18]:
def score_modelCV(X_train, y_train, X_test, y_test, estimator, **kwargs):
    """
    Test various estimators with 5 fold Stratified Cross Validation.
    """
    # Stratified Cross Validation 
    skf = StratifiedKFold(n_splits = 5, random_state = 0, shuffle = True)
    model = Pipeline([('estimator', estimator)])

    # Instantiate the classification model and visualizer
    cvs = GridSearchCV(estimator= model, param_grid={}, scoring='recall', cv=skf, n_jobs=-1)
    cvs_out= cvs.fit(X_train, y_train, **kwargs)
    
    expected  = y_test
    predicted = cvs_out.predict(X_test)

    # Compute and return Recall, Precision, F1 (harmonic mean of precision and recall)
    print("Test Metrics, {}: Recall: {:.3f}, Precision: {:.3f}, F0.5: {:.3f}, F2: {:.3f}".format(estimator.__class__.__name__,
                                                                      recall_score(expected, predicted), 
                                                                      precision_score(expected, predicted),
                                                                      fbeta_score(expected, predicted, beta=0.5),
                                                                      fbeta_score(expected, predicted, beta=2)))

In [19]:
for model in models:
    score_modelCV(X_tr, y_tr, X_te, y_te, model)
    print('-'*100)

Test Metrics, LinearDiscriminantAnalysis: Recall: 0.455, Precision: 0.741, F0.5: 0.658, F2: 0.493
----------------------------------------------------------------------------------------------------
Test Metrics, GaussianNB: Recall: 0.568, Precision: 0.667, F0.5: 0.644, F2: 0.585
----------------------------------------------------------------------------------------------------
Test Metrics, SVC: Recall: 0.330, Precision: 0.853, F0.5: 0.647, F2: 0.376
----------------------------------------------------------------------------------------------------




Test Metrics, LinearSVC: Recall: 0.727, Precision: 0.516, F0.5: 0.548, F2: 0.672
----------------------------------------------------------------------------------------------------
Test Metrics, SGDClassifier: Recall: 0.670, Precision: 0.500, F0.5: 0.527, F2: 0.628
----------------------------------------------------------------------------------------------------
Test Metrics, KNeighborsClassifier: Recall: 0.489, Precision: 0.623, F0.5: 0.591, F2: 0.511
----------------------------------------------------------------------------------------------------
Test Metrics, LogisticRegression: Recall: 0.500, Precision: 0.746, F0.5: 0.679, F2: 0.535
----------------------------------------------------------------------------------------------------
Test Metrics, BaggingClassifier: Recall: 0.432, Precision: 0.644, F0.5: 0.586, F2: 0.462
----------------------------------------------------------------------------------------------------
Test Metrics, ExtraTreesClassifier: Recall: 0.409, Precisi