# EvalML

GitHub: https://github.com/alteryx/evalml

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from evalml.automl import AutoMLSearch
from evalml.problem_types.problem_types import ProblemTypes
from evalml.pipelines import TimeSeriesBinaryClassificationPipeline

%matplotlib inline

## Generate a synthetic dataset

In [None]:
# Synthetic daily series: a date column plus one steadily increasing feature.
X = pd.DataFrame(
    {
        "date": pd.date_range("2010-10-01", periods=500),
        "feature1": range(101, 601),
    }
)

# Imbalanced binary target (50 ones, 450 zeros), shuffled with a fixed seed
# so the label order is reproducible, then re-indexed 0..499.
y = (
    pd.Series([1] * 50 + [0] * 450)
    .sample(frac=1, random_state=0, replace=False)
    .reset_index(drop=True)
)

# Initialise typing information on X through the DataFrame's `.ww` accessor.
X.ww.init()

In [None]:
# Display the `.ww` accessor of X (presumably the inferred column schema — confirm).
X.ww

In [None]:
# Preview the first three rows of the feature frame.
X.head(3)

In [None]:
# Preview the first three values of the shuffled target.
y.head(3)

In [None]:
# Line plot of the binary target against its index, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(y.index, y)
plt.show()

In [None]:
# Ordered 80/20 split: the first 80% of rows train the model, the rest is held out.
sample_size = int(len(X) * 0.8)

X_train, X_test = X.iloc[:sample_size], X.iloc[sample_size:]
y_train, y_test = y.iloc[:sample_size], y.iloc[sample_size:]

## Experiment 1: Synthetic data

In [None]:
# ModelFamily is imported here and used by the AutoMLSearch cells further down.
from evalml.pipelines.components.utils import allowed_model_families
# list(map(str, allowed_model_families("binary")))
from evalml.model_family import ModelFamily

# Print what allowed_model_families returns for the "binary" problem type.
print(allowed_model_families("binary"))

In [None]:
# Time-series settings handed to AutoMLSearch through `problem_configuration`.
problem_config = {
    "max_delay": 10,
    "gap": 0,
    "forecast_horizon": 1,
    "time_index": "date",
}

# model_families=[ModelFamily.XGBOOST, ModelFamily.LIGHTGBM, ModelFamily.CATBOOST]
model_families = [ModelFamily.CATBOOST]

automl = AutoMLSearch(
    X_train,
    y_train,  # the second positional argument is the target, not the features
    problem_type=ProblemTypes.TIME_SERIES_BINARY,
    problem_configuration=problem_config,
    max_time=60 * 10,  # Limit the pipeline search to 10 minutes
    allowed_model_families=model_families,
    objective='f1',
    sampler_method=None,
    verbose=True,
)

In [None]:
%%time 

# Run the pipeline search on the `automl` object built in the previous cell.
automl.search()

# Trailing empty print — presumably only there so the cell ends on a blank line; confirm.
print('')

In [None]:
# Print what get_all_objective_names returns (the names usable as `objective`,
# presumably — confirm against the EvalML docs).
from evalml.objectives import get_all_objective_names
print(get_all_objective_names())

In [None]:
# Describe the pipeline with id 0.
# NOTE(review): presumably id 0 is the baseline pipeline rather than the best one — confirm.
automl.describe_pipeline(0)

In [None]:
# Take the pipeline the search reports as best, refit it on the training
# split, and compute F1 on the held-out split. The training data is handed to
# score() as well.
pl = automl.best_pipeline
pl.fit(X_train, y_train)

best_pipeline_score = pl.score(
    X_test,
    y_test,
    objectives=['f1'],
    X_train=X_train,
    y_train=y_train,
)

In [None]:
# Show the value returned by pl.score() for the 'f1' objective.
best_pipeline_score

## Experiment 2: Human Activity Recognition - HAR

In [None]:
# Data source: https://www.kaggle.com/datasets/uciml/human-activity-recognition-with-smartphones
train = pd.read_csv("../data/human-activity-recognition/train.csv")

# Binarise the label: 1 for WALKING_DOWNSTAIRS, 0 for every other activity.
is_downstairs = train['Activity'] == 'WALKING_DOWNSTAIRS'
train['Activity'] = np.where(is_downstairs, 1, 0)

# Attach a synthetic timestamp column: one row per second from 2022-01-01.
train['date'] = pd.date_range(start='1/1/2022', periods=len(train), freq='S')
train.head(3)

In [None]:
# Row count of the first-1000-rows slice of the training frame.
len(train[0:1000])

In [None]:
# Latest training timestamp. The test set's date range is meant to start one
# second after this value (see the cell that loads test.csv below).
train['date'].max()

In [None]:
# Distinct label values after the binary conversion above.
train['Activity'].unique()

In [None]:
# Feature matrix: every column except the subject id and the target.
X = train[train.columns.difference(['subject', 'Activity'])]
X.ww.init()

# Work on the first 1000 rows only: rows 0-799 train the model and rows
# 800-999 form the validation set. The stop index has to be 1000, because a
# slice such as [800:200] is empty.
X_train = X[0:800]
X_val = X[800:1000]

y_train = train['Activity'][0:800]
y_val = train['Activity'][800:1000]


In [None]:
# Class balance of the binarised training target (1 = WALKING_DOWNSTAIRS).
plt.hist(y_train)
plt.title('Histogram of activities')
plt.xlabel('Activity (binary label)')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

In [None]:
from evalml.demos import load_weather
from evalml.automl import AutoMLSearch
from evalml.utils.gen_utils import validate_holdout_datasets
from evalml.problem_types.problem_types import ProblemTypes
import woodwork as ww

In [None]:
# Time-series settings for the HAR search, passed via `problem_configuration`.
problem_config = {
    'gap': 0,
    'max_delay': 10,
    'forecast_horizon': 1,
    'time_index': 'date',
}

# Other model-family selections (disabled):
# model_families=['xgboost', 'random_forest', 'linear_model', 'extra_trees','decision_tree']
# model_families=[ModelFamily.XGBOOST, ModelFamily.LIGHTGBM, ModelFamily.CATBOOST]
model_families = [ModelFamily.XGBOOST]

automl = AutoMLSearch(
    X_train,
    y_train,
    problem_type=ProblemTypes.TIME_SERIES_BINARY,
    problem_configuration=problem_config,
    allowed_model_families=model_families,
    objective='f1',
    sampler_method=None,
    max_batches=1,
    max_time=60 * 10,  # limit the pipeline search duration
    verbose=True,
)

In [None]:
%%time

# Run the pipeline search on the HAR `automl` object built in the previous cell.
automl.search()

# Trailing empty print — presumably only there so the cell ends on a blank line; confirm.
print('') # Started at 13:23

In [None]:
# Re-display the latest training timestamp.
train['date'].max()

In [None]:
test = pd.read_csv("../data/human-activity-recognition/test.csv")

# Binarise the label: 1 for WALKING_DOWNSTAIRS, 0 for every other activity.
test['Activity'] = np.where(test['Activity'] == 'WALKING_DOWNSTAIRS', 1, 0)

# Continue the synthetic timeline one second after the last training
# timestamp. Deriving the start from `train` (instead of hard-coding it) keeps
# the test dates right after the training dates even if the training row count changes.
test_start = train['date'].max() + pd.Timedelta(seconds=1)
test['date'] = pd.date_range(start=test_start, periods=len(test), freq='S')
test.head()

In [None]:
# Features: every column of `test` except 'subject' and the target.
X_test = test.loc[:, test.columns.difference(['subject', 'Activity'])]

# Outcome: the binarised 'Activity' column.
y_test = test['Activity']

In [None]:
# Display `pl`. NOTE(review): at this point `pl` is still the pipeline assigned in
# Experiment 1; it is only rebound to the HAR search's best pipeline in the next cell.
pl

In [None]:
# Take the pipeline the HAR search reports as best, refit it on the training
# rows, and compute F1 on the test set. The training data is handed to score()
# as well.
pl = automl.best_pipeline
pl.fit(X_train, y_train)

best_pipeline_score = pl.score(
    X_test,
    y_test,
    objectives=['f1'],
    X_train=X_train,
    y_train=y_train,
)

TODO: debug the code above and find the cause of the following exception:


```
PipelineScoreError: F1 encountered AttributeError with message ('NoneType' object has no attribute 'iloc'):
```

In [None]:
# Hand-built time-series binary classification pipeline with a single
# 'Logistic Regression Classifier' component.
tsc_pipeline = TimeSeriesBinaryClassificationPipeline(
    component_graph=['Logistic Regression Classifier'],
    parameters={
        "pipeline": {
            "gap": 0,
            "max_delay": 1,
            "forecast_horizon": 1,
            "time_index": "date",
        }
    },
    custom_name=None,
    random_seed=42,
)

**ValueError**: no such file ../automl-time-series/venv/lib/python3.8/site-packages/prophet/stan_model/prophet_model.bin

In [None]:
import scipy.stats as stats

In [None]:


np.random.seed(12)

races = ["asian", "black", "hispanic", "other", "white"]

# Draw 1000 voters: race comes from a fixed categorical distribution ...
voter_race = np.random.choice(a=races, size=1000, p=[0.05, 0.15, 0.25, 0.05, 0.5])

# ... and age from one shared Poisson (mu=30, shifted by 18), regardless of race.
voter_age = stats.poisson.rvs(mu=30, loc=18, size=1000)

# Row labels belonging to each race
voter_frame = pd.DataFrame({"race": voter_race, "age": voter_age})
groups = voter_frame.groupby("race").groups

# Extract the ages of each group (unpacked in the same order as `races`)
asian, black, hispanic, other, white = (voter_age[groups[race]] for race in races)

# Perform the one-way ANOVA across the five groups
stats.f_oneway(asian, black, hispanic, other, white)

In [None]:
# Mean age per group, in the order asian, black, hispanic, other, white.
tuple(np.mean(ages) for ages in (asian, black, hispanic, other, white))

In [None]:
np.random.seed(12)

# Generate random data
voter_race = np.random.choice(a=races, size=1000, p=[0.05, 0.15, 0.25, 0.05, 0.5])

# Ages for white voters are drawn from a Poisson with a larger mean (32 vs 30)
white_ages = stats.poisson.rvs(mu=32, loc=18, size=1000)

voter_age = stats.poisson.rvs(mu=30, loc=18, size=1000)

# Use the larger-mean draw only in the positions where the voter is white
voter_age = np.where(voter_race == "white", white_ages, voter_age)

# Row labels belonging to each race
voter_frame = pd.DataFrame({"race": voter_race, "age": voter_age})
groups = voter_frame.groupby("race").groups

# Extract the ages of each group (unpacked in the same order as `races`)
asian, black, hispanic, other, white = (voter_age[groups[race]] for race in races)

# Perform the one-way ANOVA across the five groups
stats.f_oneway(asian, black, hispanic, other, white)

In [None]:
# Mean age per group, in the order asian, black, hispanic, other, white.
tuple(np.mean(ages) for ages in (asian, black, hispanic, other, white))

In [None]:
# Display the 1000 ages drawn from the larger-mean Poisson for white voters.
white_ages