In [1]:
pip install evalml

In [2]:
import evalml
from evalml import AutoMLSearch
from evalml.objectives import LeadScoring

In [3]:
lead_scoring_objective = LeadScoring(
    true_positives=1000,
    false_positives=-10)

In [4]:
from urllib.request import urlopen
import pandas as pd
import woodwork as ww

In [5]:
customers_data = urlopen('https://featurelabs-static.s3.amazonaws.com/lead_scoring_ml_apps/customers.csv')
interactions_data = urlopen('https://featurelabs-static.s3.amazonaws.com/lead_scoring_ml_apps/interactions.csv')
leads_data = urlopen('https://featurelabs-static.s3.amazonaws.com/lead_scoring_ml_apps/previous_leads.csv')

In [6]:
customers = pd.read_csv(customers_data)
interactions = pd.read_csv(interactions_data)
leads = pd.read_csv(leads_data)

In [7]:
X = customers.merge(interactions, on='customer_id').merge(leads, on='customer_id')

In [8]:
y = X['label']

In [9]:
X = X.drop(['customer_id', 'date_registered', 'birthday','phone', 'email',
            'owner', 'company', 'id', 'time_x',
            'session', 'referrer', 'time_y', 'label', 'country'],axis=1)

In [10]:
X.head()

Unnamed: 0,job,state,zip,action,amount
0,"Engineer, mining",NY,60091.0,page_view,
1,"Psychologist, forensic",CA,,purchase,135.23
2,"Psychologist, forensic",CA,,page_view,
3,Air cabin crew,,60091.0,download,
4,Air cabin crew,,60091.0,page_view,


In [11]:
features_train,features_test,target_train,target_test = evalml.preprocessing.split_data(X,y,problem_type='binary',test_size=0.2)

In [12]:
automl = AutoMLSearch(X_train=features_train,y_train=target_train,problem_type='binary',
                     objective=lead_scoring_objective,
                     additional_objectives=['auc'],
                     max_batches=1,
                     optimize_thresholds=True)
automl.search()

The following labels fall below 10% of the target: [True]
Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for Lead Scoring. 
Greater score is better.

Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: decision_tree, lightgbm, random_forest, extra_trees, catboost, linear_model, xgboost



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Lead Scoring: 0.000
Batch 1: (2/9) Logistic Regression Classifier w/ Imp... Elapsed:00:02
	Starting cross validation
			Fold 0: Encountered an error.
			Fold 0: All scores will be replaced with nan.
			Fold 0: Please check C:\Users\rohit.pratapwar\Python Learning\evalml_debug.log for the current hyperparameters and stack trace.
			Fold 0: Exception during automl search: np.nan is an invalid document, expected byte or unicode string.
			Fold 1: Encountered an error.
			Fold 1: All scores will be replaced with nan.
			Fold 1: Please check C:\Users\rohit.pratapwar\Python Learning\evalml_debug.log for the current hyperparameters and stack trace.
			Fold 1: Exception during automl search: np.nan is an invalid document, expected byte or unicode string.
			Fold 2: Encountered an error.
			Fold 2: All scores will be replaced with nan.
			Fold 2: Please check C:\Use

In [13]:
automl.rankings

Unnamed: 0,id,pipeline_name,score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,0,Mode Baseline Binary Classification Pipeline,0.0,0.0,,False,{'Baseline Classifier': {'strategy': 'mode'}}
1,1,Logistic Regression Classifier w/ Imputer + Te...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,2,Random Forest Classifier w/ Imputer + Text Fea...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
3,3,XGBoost Classifier w/ Imputer + Text Featuriza...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,4,CatBoost Classifier w/ Imputer + Text Featuriz...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
5,5,Elastic Net Classifier w/ Imputer + Text Featu...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
6,6,Extra Trees Classifier w/ Imputer + Text Featu...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,7,LightGBM Classifier w/ Imputer + Text Featuriz...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
8,8,Decision Tree Classifier w/ Imputer + Text Fea...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...


In [14]:
best_pipeline = automl.best_pipeline

In [15]:
automl.describe_pipeline(automl.rankings.iloc[0]["id"])

************************************************
* Mode Baseline Binary Classification Pipeline *
************************************************

Problem Type: binary
Model Family: Baseline

Pipeline Steps
1. Baseline Classifier
	 * strategy : mode

Training
Training for binary problems.
Objective to optimize binary classification pipeline thresholds for: <evalml.objectives.lead_scoring.LeadScoring object at 0x000001D68A5207C8>
Total training time (including CV): 2.7 seconds

Cross Validation
----------------
             Lead Scoring   AUC # Training # Validation
0                   0.000 0.500   2479.000     1550.000
1                   0.000 0.500   2479.000     1550.000
2                   0.000 0.500   2480.000     1549.000
mean                0.000 0.500          -            -
std                 0.000 0.000          -            -
coef of var           inf 0.000          -            -


In [16]:
best_pipeline.score(features_test,target_test,objectives=['auc',lead_scoring_objective])

OrderedDict([('AUC', 0.5), ('Lead Scoring', 0.0)])

In [19]:
automl_auc = evalml.AutoMLSearch(X_train=features_train,y_train=target_train,
                                problem_type='binary',
                                objective='auc',
                                additional_objectives=[],
                                max_batches=1,
                                optimize_thresholds=True
                                )
automl_auc.search()

The following labels fall below 10% of the target: [True]
Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for AUC. 
Greater score is better.

Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: decision_tree, lightgbm, random_forest, extra_trees, catboost, linear_model, xgboost



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean AUC: 0.500
Batch 1: (2/9) Logistic Regression Classifier w/ Imp... Elapsed:00:00
	Starting cross validation
			Fold 0: Encountered an error.
			Fold 0: All scores will be replaced with nan.
			Fold 0: Please check C:\Users\rohit.pratapwar\Python Learning\evalml_debug.log for the current hyperparameters and stack trace.
			Fold 0: Exception during automl search: np.nan is an invalid document, expected byte or unicode string.
			Fold 1: Encountered an error.
			Fold 1: All scores will be replaced with nan.
			Fold 1: Please check C:\Users\rohit.pratapwar\Python Learning\evalml_debug.log for the current hyperparameters and stack trace.
			Fold 1: Exception during automl search: np.nan is an invalid document, expected byte or unicode string.
			Fold 2: Encountered an error.
			Fold 2: All scores will be replaced with nan.
			Fold 2: Please check C:\Users\rohit.

In [20]:
automl_auc.rankings

Unnamed: 0,id,pipeline_name,score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,0,Mode Baseline Binary Classification Pipeline,0.5,0.5,0.0,False,{'Baseline Classifier': {'strategy': 'mode'}}
1,1,Logistic Regression Classifier w/ Imputer + Te...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,2,Random Forest Classifier w/ Imputer + Text Fea...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
3,3,XGBoost Classifier w/ Imputer + Text Featuriza...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,4,CatBoost Classifier w/ Imputer + Text Featuriz...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
5,5,Elastic Net Classifier w/ Imputer + Text Featu...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
6,6,Extra Trees Classifier w/ Imputer + Text Featu...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,7,LightGBM Classifier w/ Imputer + Text Featuriz...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...
8,8,Decision Tree Classifier w/ Imputer + Text Fea...,,,,False,{'Imputer': {'categorical_impute_strategy': 'm...


In [21]:
best_pipeline_auc = automl_auc.best_pipeline

In [22]:
best_pipeline_auc.score(features_test,target_test,objectives=['auc',lead_scoring_objective])

OrderedDict([('AUC', 0.5), ('Lead Scoring', 0.0)])