# EvalML

GitHub: https://github.com/alteryx/evalml

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from evalml.automl import AutoMLSearch
from evalml.problem_types.problem_types import ProblemTypes
from evalml.pipelines import TimeSeriesBinaryClassificationPipeline
from evalml.preprocessing import split_data
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification

%matplotlib inline

## Generate a synthetic dataset

In [17]:
X, y = make_classification(n_samples=10000)

X = pd.DataFrame(X)
y = pd.Series(y)

X.ww.init()

X.shape, X.columns

((10000, 20), RangeIndex(start=0, stop=20, step=1))

In [18]:
X['date'] = pd.date_range("2010-10-01", periods=X.shape[0])
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,date
0,0.298473,-1.218092,-0.477984,1.179264,-0.264375,-0.241809,-0.838345,-0.189705,1.509926,-0.572967,...,-1.244651,1.525125,0.60592,0.69524,-0.590302,-0.665014,-0.008399,1.846055,0.349236,2010-10-01
1,-0.940313,-0.483936,1.265297,-0.371966,-1.11878,0.70607,1.91442,-0.307438,-1.088141,-1.359594,...,0.171345,1.102213,0.058327,-0.052656,-1.849278,-1.071711,0.243545,0.238361,0.963782,2010-10-02
2,0.31164,1.314532,-1.421946,0.686917,1.274491,-1.89064,0.956771,-1.654531,0.293637,-0.701893,...,0.226707,0.528948,1.852916,-1.768396,-0.143401,0.665558,-0.08468,0.383047,-0.474083,2010-10-03
3,-0.378572,0.591799,1.340881,-0.228323,-0.688304,-1.277819,0.328915,-0.229157,1.662133,0.525428,...,1.362785,0.295219,1.29965,3.123191,0.108567,-0.947054,0.222324,1.428925,-0.321512,2010-10-04
4,-0.761131,0.844613,-1.865478,1.75467,0.217115,-1.315361,0.44507,-1.91401,0.933714,-1.10473,...,-0.898258,-0.309205,0.677011,-0.482433,0.538408,0.675033,0.304502,1.502091,-0.325404,2010-10-05


In [21]:
X_train, X_test, y_train, y_test = split_data(X, y, problem_type="time series binary", test_size=0.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8000, 21), (2000, 21), (8000,), (2000,))

## Experiment 1: Synthetic data

In [22]:
automl = AutoMLSearch(
    X_train,
    y_train,
    problem_type="time series binary",
    problem_configuration={
        "max_delay": 5,
        "gap": 0,
        "forecast_horizon": 1,
        "time_index": "date",
        },
    optimize_thresholds=True,
    objective='f1',
    verbose=True,
    max_time=60*10, # time limit 
)


Time series support in evalml is still in beta, which means we are still actively building its core features. Please be mindful of that when running search().



In [23]:
%%time 

automl.search()

print('')


*****************************
* Beginning pipeline search *
*****************************

Optimizing for F1. 
Greater score is better.

Using SequentialEngine to train and score pipelines.
Will stop searching for new pipelines after 600 seconds.

Allowed model families: 



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Evaluating Baseline Pipeline: Time Series Baseline Binary Pipeline
Time Series Baseline Binary Pipeline:
	Starting cross validation
	Finished cross validation - mean F1: 0.493

*****************************
* Evaluating Batch Number 1 *
*****************************

Logistic Regression Classifier w/ Label Encoder + Imputer + Time Series Featurizer + DateTime Featurizer + Drop NaN Rows Transformer + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean F1: 0.665
Random Forest Classifier w/ Label Encoder + Imputer + Time Series Featurizer + DateTime Featurizer + Drop NaN Rows Transformer:
	Starting cross validation
	Finished cross validation - mean F1: 0.550

*****************************
* Evaluating Batch Number 2 *
*****************************

Logistic Regression Classifier w/ Label Encoder + Imputer + Time Series Featurizer + DateTime Featurizer + Drop NaN Rows Transformer + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean F1: 0

In [29]:
y_pred = automl.best_pipeline.predict(X=X_test, objective='f1', X_train=X_train, y_train=y_train) * 1
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1004
           1       0.50      1.00      0.66       996

    accuracy                           0.50      2000
   macro avg       0.25      0.50      0.33      2000
weighted avg       0.25      0.50      0.33      2000




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
automl.best_pipeline.score(X=X_test, y=y_test, objectives=['f1'], X_train=X_train, y_train=y_train)

In [None]:
from evalml.objectives import get_all_objective_names
print(get_all_objective_names())

In [None]:
automl.describe_pipeline(0)

In [None]:
pl = automl.best_pipeline

pl.fit(X_train, y_train)

best_pipeline_score = pl.score(X_test, y_test, ['f1'], X_train, y_train)

In [None]:
best_pipeline_score

## Experiment 2: Human Activity Recognition - HAR

In [None]:
# Data source: https://www.kaggle.com/datasets/uciml/human-activity-recognition-with-smartphones
train = pd.read_csv("../data/human-activity-recognition/train.csv")

# Convert labels to binary
train['Activity'] = pd.DataFrame(np.where(train['Activity']=='WALKING_DOWNSTAIRS', 1, 0))

# Create a new date column
train['date'] = pd.date_range(start='1/1/2022', periods=len(train), freq='S')
train.head(3)

In [None]:
len(train[0:1000])

In [None]:
# The max value will be incremented by 1 second and used in test dataset (see below)
train['date'].max()

In [None]:
train['Activity'].unique()

In [None]:
X = train[train.columns.difference(['subject', 'Activity'])]
X.ww.init()

X_train = X[0:800]
X_val = X[800:200]

y_train = train['Activity'][0:800]
y_val = train['Activity'][800:200]


In [None]:
plt.hist(y_train)
plt.title('Histogram of activites')
plt.xticks(rotation = 90) 
plt.show()

In [None]:
from evalml.demos import load_weather
from evalml.automl import AutoMLSearch
from evalml.utils.gen_utils import validate_holdout_datasets
from evalml.problem_types.problem_types import ProblemTypes
import woodwork as ww

In [None]:
problem_config = {'gap': 0, 
                  'max_delay': 10, 
                  'forecast_horizon': 1, 
                  'time_index': 'date'
                 }

# model_families=['xgboost', 'random_forest', 'linear_model', 'extra_trees','decision_tree']
# model_families=[ModelFamily.XGBOOST, ModelFamily.LIGHTGBM, ModelFamily.CATBOOST]
model_families=[ModelFamily.XGBOOST]

automl = AutoMLSearch(X_train, 
                      y_train, 
                      problem_type=ProblemTypes.TIME_SERIES_BINARY,
                      max_batches=1,
                      problem_configuration=problem_config,
                      max_time=60*10, # limit the pipeline search duration
                      allowed_model_families=model_families,
                      objective='f1',
                      sampler_method=None, 
                      verbose=True
                     )

In [None]:
%%time

automl.search()

print('') # Started at 13:23

In [None]:
train['date'].max()

In [None]:
test = pd.read_csv("../data/human-activity-recognition/test.csv")
test['Activity'] = pd.DataFrame(np.where(test['Activity']=='WALKING_DOWNSTAIRS', 1, 0))
test['date'] = pd.date_range(start='1/1/2022 02:02:32', periods=len(test), freq='S')
test.head()

In [None]:
# Select the first 562 features. Ignore the 'subject' column
X_test = test[test.columns.difference(['subject', 'Activity'])]
# X_test['date'] = pd.date_range(start='1/1/2022 02:02:32', periods=len(X_test), freq='S')
# Select the 'activity' column as an outcome
y_test = test['Activity'] #.iloc[:,-1]

In [None]:
pl

In [None]:
pl = automl.best_pipeline

pl.fit(X_train, y_train)

best_pipeline_score = pl.score(X_test, y_test, ['f1'], X_train, y_train)

TODO: try to debug the code and find out the reason of the following exception:


```
PipelineScoreError: F1 encountered AttributeError with message ('NoneType' object has no attribute 'iloc'):
```

In [None]:
tsc_pipeline = TimeSeriesBinaryClassificationPipeline(
    # component_graph=[], 
    component_graph = ['Logistic Regression Classifier'],
    parameters={"pipeline": {"gap": 0, "max_delay": 1, "forecast_horizon": 1, "time_index": "date"}}, 
    custom_name=None, 
    random_seed=42)

**ValueError**: no such file ../automl-time-series/venv/lib/python3.8/site-packages/prophet/stan_model/prophet_model.bin

In [None]:
import scipy.stats as stats

In [None]:


np.random.seed(12)

races =   ["asian","black","hispanic","other","white"]

# Generate random data
voter_race = np.random.choice(a= races,
                              p = [0.05, 0.15 ,0.25, 0.05, 0.5],
                              size=1000)

voter_age = stats.poisson.rvs(loc=18,
                              mu=30,
                              size=1000)

# Group age data by race
voter_frame = pd.DataFrame({"race":voter_race,"age":voter_age})
groups = voter_frame.groupby("race").groups

# Etract individual groups
asian = voter_age[groups["asian"]]
black = voter_age[groups["black"]]
hispanic = voter_age[groups["hispanic"]]
other = voter_age[groups["other"]]
white = voter_age[groups["white"]]

# Perform the ANOVA
stats.f_oneway(asian, black, hispanic, other, white)

In [None]:
np.mean(asian),\
np.mean(black),\
np.mean(hispanic),\
np.mean(other),\
np.mean(white)

In [None]:
np.random.seed(12)

# Generate random data
voter_race = np.random.choice(a= races,
                              p = [0.05, 0.15 ,0.25, 0.05, 0.5],
                              size=1000)

# Use a different distribution for white ages
white_ages = stats.poisson.rvs(loc=18, 
                              mu=32,
                              size=1000)

voter_age = stats.poisson.rvs(loc=18,
                              mu=30,
                              size=1000)

voter_age = np.where(voter_race=="white", white_ages, voter_age)

# Group age data by race
voter_frame = pd.DataFrame({"race":voter_race,"age":voter_age})
groups = voter_frame.groupby("race").groups   

# Extract individual groups
asian = voter_age[groups["asian"]]
black = voter_age[groups["black"]]
hispanic = voter_age[groups["hispanic"]]
other = voter_age[groups["other"]]
white = voter_age[groups["white"]]

# Perform the ANOVA
stats.f_oneway(asian, black, hispanic, other, white)

In [None]:
np.mean(asian),\
np.mean(black),\
np.mean(hispanic),\
np.mean(other),\
np.mean(white)

In [None]:
white_ages