In [1]:
# Optional setup: uncomment the line below to install evalml. `%pip` is preferred
# over `!pip` so the package lands in this kernel's environment; consider pinning a version.
# %pip install evalml

In [2]:
import warnings
warnings.filterwarnings("ignore", message="Could not infer format, so each element will be parsed individually")

### Loading The Dataset
- We can also read the dataset from a CSV file
- and then convert it to a datatable

In [3]:
import evalml

# Load the breast-cancer demo dataset bundled with evalml
# (features in X, benign/malignant target in y).
X, y = evalml.demos.load_breast_cancer()

# Split into training and hold-out sets. No test_size or random_seed is passed,
# so split_data's own defaults apply.
(X_train, X_test, y_train, y_test) = evalml.preprocessing.split_data(
    X,
    y,
    problem_type="binary",
)

         Number of Features
Numeric                  30

Number of training examples: 569
Targets
benign       62.74%
malignant    37.26%
Name: count, dtype: object


In [4]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
381,11.04,14.93,70.67,372.7,0.07987,0.07079,0.03546,0.02074,0.2003,0.06246,...,12.09,20.83,79.73,447.1,0.1095,0.1982,0.1553,0.06754,0.3202,0.07287
144,10.75,14.97,68.26,355.3,0.07793,0.05139,0.02251,0.007875,0.1399,0.05688,...,11.95,20.72,77.79,441.2,0.1076,0.1223,0.09755,0.03413,0.23,0.06769
136,11.71,16.67,74.72,423.6,0.1051,0.06095,0.03592,0.026,0.1339,0.05945,...,13.33,25.48,86.16,546.7,0.1271,0.1028,0.1046,0.06968,0.1712,0.07343
116,8.95,15.76,58.74,245.2,0.09462,0.1243,0.09263,0.02308,0.1305,0.07163,...,9.414,17.07,63.34,270.0,0.1179,0.1879,0.1544,0.03846,0.1652,0.07722
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124


### Running the AutoML to select the best algorithm

In [5]:
# `evalml` was already imported in the dataset-loading cell; re-importing here is harmless.
import evalml
# Show every problem type evalml supports (this notebook uses 'binary').
evalml.problem_types.ProblemTypes.all_problem_types

[<ProblemTypes.BINARY: 'binary'>,
 <ProblemTypes.MULTICLASS: 'multiclass'>,
 <ProblemTypes.REGRESSION: 'regression'>,
 <ProblemTypes.TIME_SERIES_REGRESSION: 'time series regression'>,
 <ProblemTypes.TIME_SERIES_BINARY: 'time series binary'>,
 <ProblemTypes.TIME_SERIES_MULTICLASS: 'time series multiclass'>,
 <ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: 'multiseries time series regression'>]

In [6]:
from evalml.automl import AutoMLSearch

# Configure a binary-classification search on the training split. No objective
# is given, so AutoMLSearch falls back to its default for this problem type.
automl = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type="binary",
)

# Run the search; the returned value (displayed below) reports per-batch timings.
automl.search()

{1: {'Random Forest Classifier w/ Label Encoder + Imputer + RF Classifier Select From Model': 3.1075472831726074,
  'Total time of batch': 3.2621190547943115},
 2: {'LightGBM Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 2.014968156814575,
  'Extra Trees Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 2.3147449493408203,
  'Elastic Net Classifier w/ Label Encoder + Imputer + Standard Scaler + Select Columns Transformer': 1.9169294834136963,
  'XGBoost Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 2.0119779109954834,
  'Logistic Regression Classifier w/ Label Encoder + Imputer + Standard Scaler + Select Columns Transformer': 6.585797548294067,
  'Total time of batch': 15.540068864822388}}

In [7]:
# Ranking table of the evaluated pipelines: one row per pipeline, including the baseline.
automl.rankings

Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,6,Logistic Regression Classifier w/ Label Encode...,6,0.111853,0.111853,0.033734,99.169406,False,"{'Label Encoder': {'positive_label': None}, 'I..."
1,4,Elastic Net Classifier w/ Label Encoder + Impu...,4,0.114153,0.114153,0.031724,99.152325,False,"{'Label Encoder': {'positive_label': None}, 'I..."
2,1,Random Forest Classifier w/ Label Encoder + Im...,1,0.12677,0.12677,0.035172,99.05864,False,"{'Label Encoder': {'positive_label': None}, 'I..."
3,3,Extra Trees Classifier w/ Label Encoder + Impu...,3,0.150551,0.150551,0.034153,98.882044,False,"{'Label Encoder': {'positive_label': None}, 'I..."
4,5,XGBoost Classifier w/ Label Encoder + Imputer ...,5,0.150885,0.150885,0.046259,98.879567,False,"{'Label Encoder': {'positive_label': None}, 'I..."
5,2,LightGBM Classifier w/ Label Encoder + Imputer...,2,0.194301,0.194301,0.052339,98.557171,False,"{'Label Encoder': {'positive_label': None}, 'I..."
6,0,Mode Baseline Binary Classification Pipeline,0,13.466641,13.466641,0.086133,0.0,False,"{'Label Encoder': {'positive_label': None}, 'B..."


### Getting The Best Pipeline

In [8]:
# Display the pipeline that the search selected as best.
automl.best_pipeline

pipeline = BinaryClassificationPipeline(component_graph={'Label Encoder': ['Label Encoder', 'X', 'y'], 'Imputer': ['Imputer', 'X', 'Label Encoder.y'], 'Standard Scaler': ['Standard Scaler', 'Imputer.x', 'Label Encoder.y'], 'Select Columns Transformer': ['Select Columns Transformer', 'Standard Scaler.x', 'Label Encoder.y'], 'Logistic Regression Classifier': ['Logistic Regression Classifier', 'Select Columns Transformer.x', 'Label Encoder.y']}, parameters={'Label Encoder':{'positive_label': None}, 'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'boolean_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None, 'boolean_fill_value': None}, 'Select Columns Transformer':{'columns': ['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points', 'radius error', 'perimeter error', 'area error', 'smoothness error', 'worst radius', 'worst perimeter', 'worst area', 'worst concave points', '

In [9]:
# Keep a handle on the best pipeline so it can be scored and saved later.
best_pipeline = automl.best_pipeline

### Let's check the detailed description

In [10]:
# Row 0 of the rankings table is the top-ranked pipeline; look up its id
# and print the full description for it.
top_pipeline_id = automl.rankings.iloc[0]["id"]
automl.describe_pipeline(top_pipeline_id)


************************************************************************************************************
* Logistic Regression Classifier w/ Label Encoder + Imputer + Standard Scaler + Select Columns Transformer *
************************************************************************************************************

Problem Type: binary
Model Family: Linear

Pipeline Steps
1. Label Encoder
	 * positive_label : None
2. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * boolean_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
	 * boolean_fill_value : None
3. Standard Scaler
4. Select Columns Transformer
	 * columns : ['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points', 'radius error', 'perimeter error', 'area error', 'smoothness error', 'worst radius', 'worst perimeter', 'worst area', 'worst concave points', 'worst symmetry', 'worst fractal dimension']


In [11]:
# Evaluate the best pipeline on the hold-out split against several objectives.
holdout_objectives = ["auc", "f1", "Precision", "Recall"]
best_pipeline.score(X_test, y_test, objectives=holdout_objectives)

OrderedDict([('AUC', 0.9828042328042328),
             ('F1', 0.9069767441860465),
             ('Precision', 0.8863636363636364),
             ('Recall', 0.9285714285714286)])

### We can also optimize for a problem-specific objective

In [12]:
# Search settings for a run that ranks pipelines by AUC. F1 and precision are
# tracked as additional objectives, and the search is limited to one batch.
auc_search_settings = {
    "problem_type": "binary",
    "objective": "auc",
    "additional_objectives": ["f1", "precision"],
    "max_batches": 1,
    "optimize_thresholds": True,
}

automl_auc = AutoMLSearch(X_train=X_train, y_train=y_train, **auc_search_settings)

# Run the search; the returned value (displayed below) reports per-batch timings.
automl_auc.search()

{1: {'Random Forest Classifier w/ Label Encoder + Imputer + RF Classifier Select From Model': 3.215963840484619,
  'Total time of batch': 3.36702036857605}}

In [13]:
# Ranking table for the AUC-driven search: one row per pipeline, including the baseline.
automl_auc.rankings

Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,1,Random Forest Classifier w/ Label Encoder + Im...,1,0.989686,0.989686,0.006053,48.968584,False,"{'Label Encoder': {'positive_label': None}, 'I..."
1,0,Mode Baseline Binary Classification Pipeline,0,0.5,0.5,0.0,0.0,False,"{'Label Encoder': {'positive_label': None}, 'B..."


In [14]:
# Row 0 of the rankings table is the top-ranked pipeline of the AUC search;
# look up its id and print the full description for it.
top_auc_pipeline_id = automl_auc.rankings.iloc[0]["id"]
automl_auc.describe_pipeline(top_auc_pipeline_id)


*****************************************************************************************
* Random Forest Classifier w/ Label Encoder + Imputer + RF Classifier Select From Model *
*****************************************************************************************

Problem Type: binary
Model Family: Random Forest

Pipeline Steps
1. Label Encoder
	 * positive_label : None
2. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * boolean_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
	 * boolean_fill_value : None
3. RF Classifier Select From Model
	 * number_features : None
	 * n_estimators : 10
	 * max_depth : None
	 * percent_features : 0.5
	 * threshold : median
	 * n_jobs : -1
4. Random Forest Classifier
	 * n_estimators : 100
	 * max_depth : 6
	 * n_jobs : -1

Training
Training for binary problems.
Total training time (including CV): 3.2 seconds

Cross Validation
----------------
         

In [15]:
# Keep a handle on the best pipeline found by the AUC-driven search.
best_pipeline_auc = automl_auc.best_pipeline

In [16]:
# Score the AUC-optimized pipeline on the hold-out split.
best_pipeline_auc.score(
    X_test,
    y_test,
    objectives=["auc"],
)

OrderedDict([('AUC', 0.9857804232804233)])

In [17]:
# Persist the best pipeline to disk. Note that this saves `best_pipeline` from the
# first search, not `best_pipeline_auc` from the AUC-driven search above.
best_pipeline.save("model.pkl")

#### Loading the Model

In [18]:
# Reload the saved pipeline with the pipeline loader. `automl.load` is
# `AutoMLSearch.load`, which is documented for loading AutoML search objects,
# so using it here was misleading even though it can unpickle the file.
# NOTE: loading unpickles the file; only load model files from a trusted source.
check_model = evalml.pipelines.PipelineBase.load("model.pkl")

In [19]:
# Sanity-check the reloaded pipeline: per-class probabilities for each hold-out row.
check_model.predict_proba(X_test)

Unnamed: 0,benign,malignant
477,9.927723e-01,0.007228
558,9.632001e-01,0.036800
537,8.978461e-01,0.102154
322,9.762130e-01,0.023787
474,9.983410e-01,0.001659
...,...,...
364,9.939219e-01,0.006078
518,7.511339e-01,0.248866
354,9.978193e-01,0.002181
23,2.534423e-07,1.000000
