In [1]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from dask_ml.linear_model import LogisticRegression
import joblib
from dask_ml.model_selection import train_test_split
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

import warnings
warnings.filterwarnings("ignore")

from dask.distributed import Client, progress

client = Client(n_workers=4, threads_per_worker=2, memory_limit='2GB')
client



0,1
Client  Scheduler: tcp://127.0.0.1:51971  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 8.00 GB


In [2]:
# Dataframes implement the Pandas API
import dask.dataframe as dd

# This loads the data into Dask dataframe
df = dd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/creditcard.csv', dtype={'Time': 'float64'})

In [3]:
# This is our feature set
X = df.drop(["Class"], axis=1)

# This is our target variable
Y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Since our data can fit into memory
# we persist them to the RAM.
X_train.persist()
X_test.persist()
y_train.persist()
y_test.persist()

Dask Series Structure:
npartitions=3
    int64
      ...
      ...
      ...
Name: Class, dtype: int64
Dask Name: split, 3 tasks

In [4]:
rf = RandomForestClassifier()

with joblib.parallel_backend('dask'):
    rf.fit(X_train.values.compute(), y_train.values.compute())
    preds_train = rf.predict(X_train.values.compute())
    preds_test = rf.predict(X_test.values.compute())

    print("R-squared of the model in training set is: {}".format(rf.score(X_train.values.compute(), y_train.values.compute())))
    print("R-squared of the model on the test set is: {}".format(rf.score(X_test.values.compute(), y_test.values.compute())))

R-squared of the model in training set is: 0.9999122683885457
R-squared of the model on the test set is: 0.9993490385122891


In [5]:
gb = GradientBoostingRegressor()

with joblib.parallel_backend('dask'):
    gb.fit(X_train.values.compute(), y_train.values.compute())
    preds_train = rf.predict(X_train.values.compute())
    preds_test = rf.predict(X_test.values.compute())

    print("R-squared of the model in training set is: {}".format(gb.score(X_train.values.compute(), y_train.values.compute())))
    print("R-squared of the model on the test set is: {}".format(gb.score(X_test.values.compute(), y_test.values.compute())))

R-squared of the model in training set is: 0.8099027775869385
R-squared of the model on the test set is: 0.7493103561735075


In [6]:
lr = LogisticRegression()

with joblib.parallel_backend('dask'):
    lr.fit(X_train.values.compute(), y_train.values.compute())

    preds_train = lr.predict(X_train.values.compute())
    preds_test = lr.predict(X_test.values.compute())

    print("R-squared of the model in training set is: {}".format(lr.score(X_train.values.compute(), y_train.values.compute())))
    print("R-squared of the model on the test set is: {}".format(lr.score(X_test.values.compute(), y_test.values.compute())))

R-squared of the model in training set is: 0.9992279618192027
R-squared of the model on the test set is: 0.9990499480990165


In [7]:
client.close()