<a href="https://colab.research.google.com/github/shahpriyanka26/shahpriyanka26/blob/main/Evalml_Leadscoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!pip install evalml



In [17]:
import evalml
from evalml import AutoMLSearch
from evalml.objectives import LeadScoring

In [19]:
# Using these parameters, EvalML builds a pipeline that will maximize the amount of revenue per lead generated.
# `true_positives` is the value credited for each correctly predicted lead and
# `false_positives` the (negative) value charged for each wrongly predicted one.
lead_scoring_objective = LeadScoring(true_positives=25, false_positives=-5)

In [20]:
#We will be utilizing a dataset detailing a customer's job, country, state, zip, online action, the dollar amount of that action and whether they were a successful lead.

from urllib.request import urlopen
import pandas as pd
import woodwork as ww
# Download the three demo tables. Each HTTP response is opened in a `with`
# block so the connection is released as soon as pandas has parsed it, even if
# parsing raises.
with urlopen('https://featurelabs-static.s3.amazonaws.com/lead_scoring_ml_apps/customers.csv') as customers_data:
    customers = pd.read_csv(customers_data)
with urlopen('https://featurelabs-static.s3.amazonaws.com/lead_scoring_ml_apps/interactions.csv') as interactions_data:
    interactions = pd.read_csv(interactions_data)
with urlopen('https://featurelabs-static.s3.amazonaws.com/lead_scoring_ml_apps/previous_leads.csv') as leads_data:
    leads = pd.read_csv(leads_data)

# Join customers -> interactions -> previous leads on the shared `customer_id`
# key (pandas' default inner join).
X = customers.merge(interactions, on='customer_id').merge(leads, on='customer_id')


In [4]:
# Preview the first ten rows of the merged table.
X.head(10)

Unnamed: 0,customer_id,date_registered,birthday,job,phone,email,country,state,zip,owner,company,id,time_x,action,amount,session,referrer,time_y,label
0,460429349361,2017-08-10 11:04:45,,"Engineer, mining",+1-283-990-1507x7713,christian92@gmail.com,,NY,60091.0,Kathleen Hawkins MD,618541400000.0,499214050533,2017-12-04 05:21:05,page_view,,573943098058,,2017-10-22 05:21:05,False
1,674438580580,2017-08-11 08:35:32,,"Psychologist, forensic",(299)543-9962,wwelch@lee.com,US,CA,,John Edwards,211482700000.0,542448787918,2018-03-13 20:10:17,purchase,135.23,680991389236,www.google.com,2018-02-11 20:10:17,True
2,674438580580,2017-08-11 08:35:32,,"Psychologist, forensic",(299)543-9962,wwelch@lee.com,US,CA,,John Edwards,211482700000.0,864068700968,2018-06-30 03:20:28,page_view,,680991389236,https://www.twitter.com,2018-02-11 20:10:17,True
3,364017777045,2017-08-11 10:15:37,,Air cabin crew,+1-213-455-5314,xjones@smith.net,US,,60091.0,Erica Anderson,,632245399666,2018-04-24 19:37:30,download,,894164276572,https://medium.com/article,2018-04-08 19:37:30,False
4,364017777045,2017-08-11 10:15:37,,Air cabin crew,+1-213-455-5314,xjones@smith.net,US,,60091.0,Erica Anderson,,182735438015,2018-07-20 02:50:45,page_view,,894164276572,https://www.twitter.com,2018-04-08 19:37:30,False
5,364017777045,2017-08-11 10:15:37,,Air cabin crew,+1-213-455-5314,xjones@smith.net,US,,60091.0,Erica Anderson,,591996521790,2018-08-19 08:10:40,contact_form,,894164276572,https://www.twitter.com,2018-04-08 19:37:30,False
6,451481549424,2017-08-13 10:38:53,,Geographical information systems officer,001-782-926-0100x616,hudsonbrianna@garza-thompson.com,US,CA,2116.0,Erica Anderson,259183000000.0,605531090825,2018-05-13 14:41:53,page_view,,198124134215,https://www.twitter.com,2018-04-29 14:41:53,False
7,451481549424,2017-08-13 10:38:53,,Geographical information systems officer,001-782-926-0100x616,hudsonbrianna@garza-thompson.com,US,CA,2116.0,Erica Anderson,259183000000.0,599569806066,2018-06-05 05:07:17,page_view,,868943730848,https://medium.com/article,2018-04-29 14:41:53,False
8,769958483731,2017-08-15 04:44:03,,"Geologist, engineering",091.946.9531,qbrooks@miller.com,US,IL,2116.0,Zachary Roberts,791546400000.0,664288122683,2017-09-16 10:39:37,download,,689017934929,https://medium.com/article,2018-11-01 04:32:58,False
9,769958483731,2017-08-15 04:44:03,,"Geologist, engineering",091.946.9531,qbrooks@miller.com,US,IL,2116.0,Zachary Roberts,791546400000.0,664288122683,2017-09-16 10:39:37,download,,689017934929,https://medium.com/article,2019-06-09 04:13:38,False


In [21]:
# Keep the `label` column as the prediction target before it is removed from X.
y = X['label']

# Remove the listed columns (including the target itself) from the feature
# matrix.
# NOTE: this cell rebinds X, so re-running it without first re-running the merge
# cell raises a KeyError ('label' is already gone).
X = X.drop(
    columns=[
        'customer_id', 'date_registered', 'birthday', 'phone', 'email',
        'owner', 'company', 'id', 'time_x',
        'session', 'referrer', 'time_y', 'label', 'country',
    ]
)

# Show the first rows of the remaining feature columns.
display(X.head())

Unnamed: 0,job,state,zip,action,amount
0,"Engineer, mining",NY,60091.0,page_view,
1,"Psychologist, forensic",CA,,purchase,135.23
2,"Psychologist, forensic",CA,,page_view,
3,Air cabin crew,,60091.0,download,
4,Air cabin crew,,60091.0,page_view,


In [6]:
#We will convert our data into Woodwork data structures. Doing so enables us to have more control over the types passed to and inferred by AutoML.

In [22]:
# Attach a Woodwork schema to X, explicitly typing `job` as Categorical with
# the `category` semantic tag.
X.ww.init(
    logical_types={'job': 'Categorical'},
    semantic_tags={'job': 'category'},
)

# Initialize the target as a Woodwork series as well.
y = ww.init_series(y)

# Display the typing information now attached to X.
X.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
job,category,Categorical,['category']
state,category,Categorical,['category']
zip,float64,Double,['numeric']
action,category,Categorical,['category']
amount,float64,Double,['numeric']


In [8]:
#EvalML natively supports one-hot encoding and imputation so the above NaN and categorical values will be taken care of.

In [23]:
# Reserve 20% of the rows as a holdout set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(
    X,
    y,
    problem_type='binary',
    test_size=0.2,
    random_seed=0,
)

# Display the Woodwork typing information of the full feature matrix.
X.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
job,category,Categorical,['category']
state,category,Categorical,['category']
zip,float64,Double,['numeric']
action,category,Categorical,['category']
amount,float64,Double,['numeric']


In [10]:
#Because the lead scoring labels are binary, we will set the problem type to "binary". When we call .search(), the search for the best pipeline will begin.

In [24]:
# Configure the search: optimize the lead-scoring objective, additionally
# report AUC, limit the candidate model families, and cap the run at three
# batches with verbose logging.
# NOTE(review): in the saved output every pipeline scored nan and the search
# ended in AutoMLSearchException.
automl = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type='binary',
    objective=lead_scoring_objective,
    additional_objectives=['auc'],
    allowed_model_families=["catboost", "random_forest", "linear_model"],
    max_batches=3,
    verbose=True,
)

# Run the pipeline search.
automl.search()

Generating pipelines to search over...
4 pipelines ready for search.

*****************************
* Beginning pipeline search *
*****************************

Optimizing for Lead Scoring. 
Greater score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 3 batches for a total of 15 pipelines. 
Allowed model families: linear_model, linear_model, catboost, random_forest



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Evaluating Baseline Pipeline: Mode Baseline Binary Classification Pipeline
Mode Baseline Binary Classification Pipeline:
	Starting cross validation
	Finished cross validation - mean Lead Scoring: 0.000

*****************************
* Evaluating Batch Number 1 *
*****************************



			Elastic Net Classifier w/ Label Encoder + Imputer + One Hot Encoder + Oversampler + Standard Scaler fold 0: Encountered an error.
			Elastic Net Classifier w/ Label Encoder + Imputer + One Hot Encoder + Oversampler + Standard Scaler fold 0: All scores will be replaced with nan.
			Fold 0: Exception during automl search: 'TableSchema' object has no attribute '_get_subset_schema'
			Fold 0: Parameters:
	{'Label Encoder': {'positive_label': None}, 'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'One Hot Encoder': {'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, 'Oversampler': {'sampling_ratio': 0.25, 'k_neighbors_default': 5, 'n_jobs': -1, 'sampling_ratio_dict': None}, 'Elastic Net Classifier': {'penalty': 'elasticnet', 'C': 1.0, 'l1_ratio': 0.15, 'n_jobs': -1, 'multi_class': 'auto', 'solver

Elastic Net Classifier w/ Label Encoder + Imputer + One Hot Encoder + Oversampler + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Lead Scoring: nan


			Logistic Regression Classifier w/ Label Encoder + Imputer + One Hot Encoder + Oversampler + Standard Scaler fold 0: Encountered an error.
			Logistic Regression Classifier w/ Label Encoder + Imputer + One Hot Encoder + Oversampler + Standard Scaler fold 0: All scores will be replaced with nan.
			Fold 0: Exception during automl search: 'TableSchema' object has no attribute '_get_subset_schema'
			Fold 0: Parameters:
	{'Label Encoder': {'positive_label': None}, 'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'One Hot Encoder': {'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, 'Oversampler': {'sampling_ratio': 0.25, 'k_neighbors_default': 5, 'n_jobs': -1, 'sampling_ratio_dict': None}, 'Logistic Regression Classifier': {'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver':

Logistic Regression Classifier w/ Label Encoder + Imputer + One Hot Encoder + Oversampler + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Lead Scoring: nan


			CatBoost Classifier w/ Label Encoder + Imputer + Oversampler fold 0: Encountered an error.
			CatBoost Classifier w/ Label Encoder + Imputer + Oversampler fold 0: All scores will be replaced with nan.
			Fold 0: Exception during automl search: 'TableSchema' object has no attribute '_get_subset_schema'
			Fold 0: Parameters:
	{'Label Encoder': {'positive_label': None}, 'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Oversampler': {'sampling_ratio': 0.25, 'k_neighbors_default': 5, 'n_jobs': -1, 'sampling_ratio_dict': None}, 'CatBoost Classifier': {'n_estimators': 10, 'eta': 0.03, 'max_depth': 6, 'bootstrap_type': None, 'silent': True, 'allow_writing_files': False, 'n_jobs': -1}}
			Fold 0: Traceback:
  File "/usr/local/lib/python3.7/dist-packages/evalml/automl/engine/engine_base.py", line 224, in train_and_score_pipeline
    pipeline, X_train, y_train, automl_config, schema=Fal

CatBoost Classifier w/ Label Encoder + Imputer + Oversampler:
	Starting cross validation
	Finished cross validation - mean Lead Scoring: nan


			Random Forest Classifier w/ Label Encoder + Imputer + One Hot Encoder + Oversampler fold 0: Encountered an error.
			Random Forest Classifier w/ Label Encoder + Imputer + One Hot Encoder + Oversampler fold 0: All scores will be replaced with nan.
			Fold 0: Exception during automl search: 'TableSchema' object has no attribute '_get_subset_schema'
			Fold 0: Parameters:
	{'Label Encoder': {'positive_label': None}, 'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'One Hot Encoder': {'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, 'Oversampler': {'sampling_ratio': 0.25, 'k_neighbors_default': 5, 'n_jobs': -1, 'sampling_ratio_dict': None}, 'Random Forest Classifier': {'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}
			Fold 0: Traceback:
  File "/usr/local/lib/python3.7/dist-packages/evalml

Random Forest Classifier w/ Label Encoder + Imputer + One Hot Encoder + Oversampler:
	Starting cross validation
	Finished cross validation - mean Lead Scoring: nan


AutoMLSearchException: ignored

In [1]:
import evalml
from evalml import AutoMLSearch
from evalml.objectives import LeadScoring

Featuretools may not support Python 3.7 in next non-bugfix release.


In [2]:
# Lead-scoring objective used for this run: value credited per true positive
# and (negative) value charged per false positive.
# NOTE(review): an earlier cell in this notebook builds the same objective with
# 25 / -5 -- confirm which payoffs are intended.
lead_scoring_objective = LeadScoring(true_positives=1000, false_positives=-10)

In [3]:
from urllib.request import urlopen
import pandas as pd
import woodwork as ww

In [4]:
# Open one HTTP response per demo CSV (customers, interactions, previous leads).
# The responses are only opened here; they are parsed by the `pd.read_csv`
# calls in the following cell.
customers_data = urlopen('https://featurelabs-static.s3.amazonaws.com/lead_scoring_ml_apps/customers.csv')
interactions_data = urlopen('https://featurelabs-static.s3.amazonaws.com/lead_scoring_ml_apps/interactions.csv')
leads_data = urlopen('https://featurelabs-static.s3.amazonaws.com/lead_scoring_ml_apps/previous_leads.csv')

In [5]:
# Parse each previously opened HTTP response into a DataFrame.
# NOTE: the responses are streams, so this cell needs the `urlopen` cell to be
# re-run before it can be executed again.
customers = pd.read_csv(customers_data)
interactions = pd.read_csv(interactions_data)
leads = pd.read_csv(leads_data)

In [6]:
# Join customers -> interactions -> previous leads on the shared `customer_id`
# key (pandas' default inner join).
X = customers.merge(interactions, on='customer_id').merge(leads, on='customer_id')

In [7]:
# Keep the `label` column as the prediction target before it is dropped from X.
y = X['label']

In [8]:
# Remove the listed columns (including the target itself) from the feature
# matrix.
# NOTE: this rebinds X, so the cell cannot be re-run without re-running the
# merge cell first.
X = X.drop(
    columns=[
        'customer_id', 'date_registered', 'birthday', 'phone', 'email',
        'owner', 'company', 'id', 'time_x',
        'session', 'referrer', 'time_y', 'label', 'country',
    ]
)

In [9]:
# Reserve 20% of the rows as a test set. The seed is spelled out so the split
# is visibly reproducible and matches the `random_seed=0` used by the other
# split cell in this notebook.
features_train, features_test, target_train, target_test = evalml.preprocessing.split_data(
    X,
    y,
    problem_type='binary',
    test_size=0.2,
    random_seed=0,
)

In [10]:
# Configure the search: optimize the lead-scoring objective, additionally
# report AUC, run a single batch, and enable threshold optimization.
# NOTE(review): in the saved output this search ended in AutoMLSearchException.
automl = AutoMLSearch(
    X_train=features_train,
    y_train=target_train,
    problem_type='binary',
    objective=lead_scoring_objective,
    additional_objectives=['auc'],
    max_batches=1,
    optimize_thresholds=True,
)

# Run the pipeline search.
automl.search()

			Elastic Net Classifier w/ Label Encoder + Imputer + One Hot Encoder + Oversampler + Standard Scaler fold 0: Encountered an error.
			Elastic Net Classifier w/ Label Encoder + Imputer + One Hot Encoder + Oversampler + Standard Scaler fold 0: All scores will be replaced with nan.
			Fold 0: Exception during automl search: 'TableSchema' object has no attribute '_get_subset_schema'
			Fold 0: Parameters:
	{'Label Encoder': {'positive_label': None}, 'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'One Hot Encoder': {'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, 'Oversampler': {'sampling_ratio': 0.25, 'k_neighbors_default': 5, 'n_jobs': -1, 'sampling_ratio_dict': None}, 'Elastic Net Classifier': {'penalty': 'elasticnet', 'C': 1.0, 'l1_ratio': 0.15, 'n_jobs': -1, 'multi_class': 'auto', 'solver

AutoMLSearchException: ignored

In [12]:
import woodwork as ww

# X is the feature matrix; initialize its Woodwork schema with inferred types.
X.ww.init()

# Per-column count of missing values, taken from the `nan_count` row of the
# Woodwork describe() table.
na_count = X.ww.describe().loc['nan_count']
# Keep only the names of columns that have at least one missing value.
columns_with_missing_values = na_count[na_count > 0].index
# Display those column names.
columns_with_missing_values


Index(['job', 'state', 'zip', 'amount'], dtype='object')

In [13]:
# This tells you the columns that are natural language
nat_lang_columns = X.ww.select("NaturalLanguage").columns

nat_lang_with_nan = set(nat_lang_columns).intersection(columns_with_missing_values)
nat_lang_with_nan

set()

In [15]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(5812, 5)