In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

from fast_ml import eda
from fast_ml.utilities import display_all, reduce_memory_usage
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

##### Load the saved sample data - Feather Format

In [2]:
%time trans = pd.read_feather('tmp/train_transaction_sample')
df_size = trans.memory_usage().sum() / 1024**2
print(f'Memory usage of dataframe is {df_size} MB')
print (f'Shape of dataframe is {trans.shape}')

CPU times: user 1.3 s, sys: 1.12 s, total: 2.42 s
Wall time: 1.89 s
Memory usage of dataframe is 183.67779541015625 MB
Shape of dataframe is (200000, 394)


## Summary

In [4]:
# Build a per-column overview of `trans` with fast_ml's eda.df_summary. The
# rendered table below lists, for each column: data_type, num_unique_values,
# sample_unique_values, num_missing and perc_missing.
summary_df = eda.df_summary(trans)
# display_all is imported from fast_ml.utilities; presumably it renders the
# frame without pandas' row/column truncation -- confirm against fast_ml.
display_all(summary_df)

Unnamed: 0,data_type,num_unique_values,sample_unique_values,num_missing,perc_missing
TransactionID,int32,200000,"[3340057, 3369734, 3462322, 2988794, 3031584, ...",0,0.0
isFraud,int8,2,"[0, 1]",0,0.0
TransactionDT,int32,198049,"[8713400, 9583269, 12275348, 139136, 1079439, ...",0,0.0
TransactionAmt,float16,6494,"[774.0, 29.0, 105.3125, 2182.0, 97.9375, 24.5,...",0,0.0
ProductCD,object,5,"[W, C, S, H, R]",0,0.0
card1,int16,9836,"[11596, 11919, 8406, 12570, 16085, 10532, 5772...",0,0.0
card2,float16,500,"[188.0, 170.0, 264.0, 462.0, 297.0, 534.0, 298...",3009,1.5045
card3,float16,94,"[150.0, 106.0, 185.0, 147.0, 143.0, 144.0, 146...",536,0.268
card4,object,4,"[visa, mastercard, discover, american express,...",539,0.2695
card5,float16,101,"[226.0, 224.0, 166.0, 117.0, 126.0, 138.0, 195...",1428,0.714


## All models

In [7]:
# Names 'C1' through 'C14'. Presumably the C-prefixed columns of `trans`
# (they are not visible in the truncated summary output) -- confirm.
c_vars = [f'C{n}' for n in range(1, 15)]

In [13]:
# scikit-learn pieces used by the "All models" section.
import sklearn

# all_estimators is exposed from sklearn.utils; the old sklearn.utils.testing
# location was deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.utils import all_estimators

# The original import list ended in a dangling comma, which is a SyntaxError.
# recall_score and f1_score complete it so that every metric with an
# accumulator list in this notebook (accuracy, recall, precision, f1) is
# importable from here.
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

In [21]:
# Every classifier scikit-learn can discover, as (name, class) pairs.
models = list(all_estimators(type_filter='classifier'))
print(len(models))
# Trailing expression: shows the full list of pairs as the cell output.
models

42


[('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
 ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
 ('BernoulliNB', sklearn.naive_bayes.BernoulliNB),
 ('CalibratedClassifierCV', sklearn.calibration.CalibratedClassifierCV),
 ('CategoricalNB', sklearn.naive_bayes.CategoricalNB),
 ('CheckingClassifier', sklearn.utils._mocking.CheckingClassifier),
 ('ClassifierChain', sklearn.multioutput.ClassifierChain),
 ('ComplementNB', sklearn.naive_bayes.ComplementNB),
 ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
 ('DummyClassifier', sklearn.dummy.DummyClassifier),
 ('ExtraTreeClassifier', sklearn.tree._classes.ExtraTreeClassifier),
 ('ExtraTreesClassifier', sklearn.ensemble._forest.ExtraTreesClassifier),
 ('GaussianNB', sklearn.naive_bayes.GaussianNB),
 ('GaussianProcessClassifier',
  sklearn.gaussian_process._gpc.GaussianProcessClassifier),
 ('GradientBoostingClassifier',
  sklearn.ensemble._gb.GradientBoostingClassifier)

In [14]:
# Classifiers to exclude from `models`, written as (name, class) pairs -- the
# same shape all_estimators() returns -- so each entry compares equal to the
# matching element of `models`.
#
# Classes are referenced through their public packages. The module paths used
# previously (sklearn.gaussian_process.gpc, sklearn.neural_network.multilayer_perceptron,
# sklearn.linear_model.logistic, sklearn.neighbors.classification,
# sklearn.ensemble.voting) were deprecated in scikit-learn 0.22 and removed in
# 0.24, where they raise AttributeError. The public names resolve to the very
# same class objects, so matching against `models` is unaffected.
#
# NOTE: attribute access such as sklearn.multioutput.X only works once the
# submodule has been imported; calling all_estimators() earlier in the
# notebook takes care of that.
removed_classifiers = [
    ("ClassifierChain", sklearn.multioutput.ClassifierChain),
    ("ComplementNB", sklearn.naive_bayes.ComplementNB),
    ("GaussianProcessClassifier", sklearn.gaussian_process.GaussianProcessClassifier),
    # Kept on its original module path: on releases where this estimator is
    # still experimental, sklearn.ensemble.HistGradientBoostingClassifier is
    # only exposed after importing sklearn.experimental.enable_hist_gradient_boosting.
    ("HistGradientBoostingClassifier", sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier),
    ("MLPClassifier", sklearn.neural_network.MLPClassifier),
    ("LogisticRegressionCV", sklearn.linear_model.LogisticRegressionCV),
    ("MultiOutputClassifier", sklearn.multioutput.MultiOutputClassifier),
    ("MultinomialNB", sklearn.naive_bayes.MultinomialNB),
    ("OneVsOneClassifier", sklearn.multiclass.OneVsOneClassifier),
    ("OneVsRestClassifier", sklearn.multiclass.OneVsRestClassifier),
    ("OutputCodeClassifier", sklearn.multiclass.OutputCodeClassifier),
    ("RadiusNeighborsClassifier", sklearn.neighbors.RadiusNeighborsClassifier),
    ("VotingClassifier", sklearn.ensemble.VotingClassifier),
]

In [18]:
# Ad-hoc check: position of LogisticRegressionCV inside `models`.
# The class is referenced via the public sklearn.linear_model package; the
# sklearn.linear_model.logistic module path was deprecated in scikit-learn
# 0.22 and removed in 0.24. Both resolve to the same class object.
models.index(("LogisticRegressionCV", sklearn.linear_model.LogisticRegressionCV))

22

In [22]:
# Drop every (name, class) pair listed in `removed_classifiers` from `models`.
#
# The list is rebuilt by filtering instead of calling index() + pop() on it:
# the old loop raised ValueError as soon as the cell was run a second time
# (the entries were already gone), which made the notebook unsafe to re-run.
# Filtering is idempotent and also tolerates an excluded classifier that is
# absent from `models` in the installed scikit-learn version.
models = [entry for entry in models if entry not in removed_classifiers]

In [23]:
# Number of classifiers left after the exclusions; the saved output shows
# 29, i.e. the 42 discovered classifiers minus the 13 removed ones.
len(models)

29

In [None]:
# Empty result accumulators. Presumably each gets one entry per evaluated
# model (the loop that fills them is not part of this section) -- confirm.
names, time_taken = [], []
accuracy_list, precision_list = [], []
recall_list, f1_list = [], []

In [24]:
# Sanity check: for every remaining entry, print its name and then its class,
# each on its own line.
for name, model in models:
    print(name, model, sep='\n')

AdaBoostClassifier
<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>
BaggingClassifier
<class 'sklearn.ensemble._bagging.BaggingClassifier'>
BernoulliNB
<class 'sklearn.naive_bayes.BernoulliNB'>
CalibratedClassifierCV
<class 'sklearn.calibration.CalibratedClassifierCV'>
CategoricalNB
<class 'sklearn.naive_bayes.CategoricalNB'>
CheckingClassifier
<class 'sklearn.utils._mocking.CheckingClassifier'>
DecisionTreeClassifier
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
DummyClassifier
<class 'sklearn.dummy.DummyClassifier'>
ExtraTreeClassifier
<class 'sklearn.tree._classes.ExtraTreeClassifier'>
ExtraTreesClassifier
<class 'sklearn.ensemble._forest.ExtraTreesClassifier'>
GaussianNB
<class 'sklearn.naive_bayes.GaussianNB'>
GradientBoostingClassifier
<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
KNeighborsClassifier
<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
LabelPropagation
<class 'sklearn.semi_supervised._label_propagation.LabelPropagat

In [None]:
# NOTE(review): `feature_importance` is not assigned in any cell visible in
# this section and the cell has never been executed (In [None]); on a fresh
# kernel this raises NameError unless it is defined elsewhere -- confirm or remove.
feature_importance