Machine Learning Data Pipeline Workflow
1. Data Sourcing
2. Data Exploration
3. Data Cleaning
4. Data Wrangling
5. Data Integration
6. Feature Engineering
7. Feature Selection
8. Data Splitting
9. Model Selection
10. Model Training
11. Model Evaluation
12. Hyperparameter Tuning
13. Final Testing
14. Model Deployment
15. Model Monitoring
16. Model Retraining

In [8]:
import dask.dataframe as dd
import dask.array as da
from dask_ml.preprocessing import (RobustScaler,
                                   MinMaxScaler,
                                   OneHotEncoder)
from dask_ml.model_selection import (KFold,
                                     train_test_split)
from dask_ml.datasets import make_regression
from dask_ml.linear_model import LinearRegression
from statistics import mean
from sklearn.datasets import load_iris
import numpy as np
from typing import List

DASK-ML Data Preprocessing (parallel scikit-learn)

In [9]:

iris = load_iris()

X = iris.data

df = dd.from_array(X)

# subtract the median, devide by the interquartile range 
r_scaler = (RobustScaler()
            .fit_transform(df)
            .compute())

print(r_scaler.head())

# Min-Max-Scaler [0, 1]
mm_scaler = (MinMaxScaler()
            .fit_transform(df)
            .compute())

print(mm_scaler.head()) 

# One Hot Encoder
var_1 = da.from_array(
    np.array(
        [["Apples"], ["Melons"], ["Melons"], ["Oranges"]]
    ),
    chunks=2
)

encoder = (OneHotEncoder(sparse_output=False)
          .fit_transform(var_1)
          .compute())

print(encoder)

          0    1         2         3
0 -0.538462  1.0 -0.842857 -0.733333
1 -0.692308  0.0 -0.842857 -0.733333
2 -0.846154  0.4 -0.871429 -0.733333
3 -0.923077  0.2 -0.814286 -0.733333
4 -0.615385  1.2 -0.842857 -0.733333
          0         1         2         3
0  0.222222  0.625000  0.067797  0.041667
1  0.166667  0.416667  0.067797  0.041667
2  0.111111  0.500000  0.050847  0.041667
3  0.083333  0.458333  0.084746  0.041667
4  0.194444  0.666667  0.067797  0.041667
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


Cross Validations

In [10]:
# create data
X, y = make_regression(
    n_samples=1000,
    n_features=10,
    random_state=87,
    chunks=20
)

# create model
model = LinearRegression()

# create KFold
kf = KFold(n_splits=5)

# train loop
train_scores: List[int] = []
test_scores: List[int] = []

for i, j in kf.split(X):
    X_train, X_test = X[i], X[j]
    y_train, y_test = y[i], y[j]
    
    model.fit(X_train, y_train)
    
    train_scores.append(model.score(X_train, y_train))
    test_scores.append(model.score(X_test, y_test))
    
print("Train scores: ", train_scores)
print("Test scores: ", test_scores)
print("Mean train score: ", mean(train_scores))
print("Mean test score: ", mean(test_scores))

# splitting dask arrays
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2
)
print(X_train)
print(X_train.compute()[:5])

  p = blockwise(
  p = blockwise(
  p = blockwise(
  p = blockwise(
  p = blockwise(
  p = blockwise(
  p = blockwise(
  p = blockwise(
  p = blockwise(
  p = blockwise(


Train scores:  [0.9999970941045471, 0.9999974919985399, 0.9999976779775921, 0.9999983303663906, 0.9999974034370218]
Test scores:  [0.9999972607168518, 0.9999973046581543, 0.999997626961904, 0.9999981517869371, 0.999997407355171]
Mean train score:  0.9999975995768183
Mean test score:  0.9999975502958036
dask.array<concatenate, shape=(800, 10), dtype=float64, chunksize=(16, 10), chunktype=numpy.ndarray>
[[-0.72778132  1.75358014 -1.63589511 -0.42851612  1.94106055  0.68488772
   1.62225142 -0.1354828  -0.57206439 -1.22621463]
 [ 1.79248468  1.2641595   1.02104097  0.21974283 -0.23121309 -2.34420804
   0.27449248  0.60585058 -1.16237758 -0.2244763 ]
 [ 0.55767448 -1.14652438  1.01880474  1.82038923 -0.03150098  0.02437674
   0.48671356  0.51179773  0.35363546 -1.47521035]
 [-1.25521125  0.20963487  0.70151135  0.66632827  0.17211713  1.33436612
   1.63551535 -0.23293505  0.00712698  0.23032439]
 [-0.34070806  1.06811311  0.29066025  0.35284983  0.16147256  0.02007247
   1.18881733  0.9595

Hyperparameter Tuning with DASK-ML

In [11]:
from dask.distributed import Client
from dask_ml.datasets import make_classification
from dask_ml.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from scipy.stats import uniform, loguniform
from dask_ml.model_selection import HyperbandSearchCV

In [12]:
# start client
client = Client(processes=False)

# create data
X, y = make_classification(
    chunks=20,
    random_state=87
)

# split data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2
)

# create classifier model
clf = SGDClassifier(
    tol=1e-3,
    penalty="elasticnet",
    random_state=87
)

# create parameter grid
params = {
    "alpha": loguniform(1e-2, 1e0),
    "l1_ratio": uniform(0.0, 1.0)
}

# create hyperparameter search
search = HyperbandSearchCV(
    clf,
    params,
    max_iter=80,
    random_state=87
)

# fit the search
search.fit(X_train, y_train,
           classes=[0, 1])

print(search.best_params_)
print(search.best_score_)

# perform evaluation
print(search.score(X_test, y_test))

Perhaps you already have a cluster running?
Hosting the HTTP server on port 51710 instead


{'alpha': 0.8744844070862653, 'l1_ratio': 0.09783217731538718}
0.75
0.5


Statistical Imputation with DASK-ML

In [13]:
import dask.dataframe as dd
import pandas as pd
from dask_ml.preprocessing import OneHotEncoder
from dask_ml.impute import SimpleImputer
from dask_ml.model_selection import train_test_split
from dask_ml.linear_model import LinearRegression
from dask_ml.metrics import accuracy_score

In [30]:
# create data
data = {
    'color':
        ['red', 'orange', 'green', None, 'yellow', 'green'],
    'weight':
        [150, 180, 200, 160, None, 220],
    'taste':
        ['sweet', 'sweet', 'sour', 'sweet', 'sweet', 'sour'],
    'fruit':
        ['apple', 'orange', 'apple', 'apple', 'orange', 'melon']
}

df = dd.from_pandas(
    pd.DataFrame(data),
    npartitions=2
)
print(df.compute())

ddf = df.copy()

# imputer for colors
imputer_1 = SimpleImputer(
    strategy="most_frequent"
)

df_fit = imputer_1.fit(df)
color = imputer_1.transform(df[["color"]])

# execute
print(color.compute())


# imputer for weights
imputer_2 = SimpleImputer(
    strategy="mean"
)

weight = df_fit.fit_transform(df[["weight"]])

# execute
print(weight.compute())

    color  weight  taste   fruit
0     red   150.0  sweet   apple
1  orange   180.0  sweet  orange
2   green   200.0   sour   apple
3    <NA>   160.0  sweet   apple
4  yellow     NaN  sweet  orange
5   green   220.0   sour   melon
    color
0     red
1  orange
2   green
3   green
4  yellow
5   green
   weight
0   150.0
1   180.0
2   200.0
3   160.0
4   180.0
5   220.0


Linear Regression with DASK-ML

In [None]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask_ml.linear_model import LinearRegression
from dask_ml.datasets import make_regression
from dask_ml.model_selection import train_test_split

In [31]:
num_samples = 100
num_features = 4

# create data
X, y = make_regression(
    n_samples=num_samples,
    n_features=num_features,
    noise=0.1,
    chunks=4
)

y = np.exp(y / 200).astype(int)

# split the data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2
)

# create and train model
model = LinearRegression()
model.fit(
    X_train,
    y_train)

# score model on test set
score = model.score(
    X_test,
    y_test)

print(f"Model score: {score}")

print(model.get_params())

Model score: 0.5660094720589386
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1.0, 'max_iter': 100, 'multi_class': 'ovr', 'n_jobs': 1, 'penalty': 'l2', 'random_state': None, 'solver': 'admm', 'solver_kwargs': None, 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
