# Mini Batch SGD classifier and regressor
Mini Batch SGD (MBSGD) models are linear models which are fitted by minimizing a regularized empirical loss with mini-batch SGD. In this notebook we compare the performance of cuML's MBSGD classifier and regressor models with their respective scikit-learn counterparts.

The model can take array-like objects, either on the host as NumPy arrays or on the device (as Numba or cuda_array_interface-compliant arrays), as well as cuDF DataFrames, as input.

For information about cuDF, refer to the cuDF documentation: https://docs.rapids.ai/api/cudf/stable/

In [None]:
import cudf as gd
import cuml
import numpy as np
import pandas as pd
import sklearn

from sklearn import linear_model
# Import the generators from the public `sklearn.datasets` namespace: the
# `sklearn.datasets.samples_generator` module path is deprecated and is not
# importable in recent scikit-learn releases, while this form works on both
# old and new versions.
from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split

## Define parameters

### Data parameters

In [None]:
# Shape of the synthetic datasets: 8192 rows, 300 columns, 270 informative.
num_samples = 1 << 13
num_features = 300
n_informative = 270

# Fraction of the rows used for training; the remainder is held out.
train_size = 0.8

# Seed reused wherever a random_state argument is accepted.
random_state = 0

# Floating-point type the generated arrays are cast to.
datatype = np.float32

### Model parameters

In [None]:
# Learning-rate schedule and the eta0 value passed along with it.
learning_rate = 'constant'
eta0 = 0.005

# Regularisation penalty and whether an intercept term is fitted.
penalty = 'elasticnet'
fit_intercept = True

# Iteration budget (given as max_iter to scikit-learn and as epochs to cuML)
# and the stopping tolerance.
max_iter = 100
tol = 0.0

# Mini-batch size; only the cuML MBSGD models receive this value.
batch_size = 2

## Generate data

### Host

In [None]:
%%time
X_class, y_class = make_classification(n_samples=num_samples, n_features=num_features,
                                       n_informative=n_informative, random_state=random_state)
# change the datatype of the input data
X_class = X_class.astype(datatype)
y_class = y_class.astype(datatype)

# convert numpy arrays to pandas dataframe
X_class = pd.DataFrame(X_class)
y_class = pd.DataFrame(y_class)

X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(X_class, y_class,
                                                                            train_size=train_size,
                                                                            random_state=random_state)
X_reg, y_reg = make_regression(n_samples=num_samples, n_features=num_features,
                               n_informative=n_informative, random_state=random_state)

# change the datatype of the input data
X_reg = X_reg.astype(datatype)
y_reg = y_reg.astype(datatype)

# convert numpy arrays to pandas dataframe
X_reg = pd.DataFrame(X_reg)
y_reg = pd.DataFrame(y_reg)

X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg,
                                                                    train_size=train_size,
                                                                    random_state=random_state)

### GPU

In [None]:
%%time
# classification dataset
X_class_cudf = gd.DataFrame.from_pandas(X_class_train)
X_class_cudf_test = gd.DataFrame.from_pandas(X_class_test)

y_class_cudf = gd.Series(y_class_train.values[:,0])

# regression dataset
X_reg_cudf = gd.DataFrame.from_pandas(X_reg_train)
X_reg_cudf_test = gd.DataFrame.from_pandas(X_reg_test)

y_reg_cudf = gd.Series(y_reg_train.values[:,0])

## Scikit-learn Model

### Classification :

#### Fit

In [None]:
%%time
skl_sgd_classifier = sklearn.linear_model.SGDClassifier(learning_rate=learning_rate,
                                                        eta0=eta0,
                                                        max_iter=max_iter,
                                                        fit_intercept=fit_intercept,
                                                        tol=tol,
                                                        penalty=penalty,
                                                        random_state=random_state)

skl_sgd_classifier.fit(X_class_train, y_class_train)

#### Predict

In [None]:
%%time
skl_class_pred = skl_sgd_classifier.predict(X_class_test)
skl_class_acc = accuracy_score(skl_class_pred, y_class_test)

## Scikit-learn Model

### Regression :

#### Fit

In [None]:
%%time
skl_sgd_regressor = sklearn.linear_model.SGDRegressor(learning_rate=learning_rate,
                                                      eta0=eta0,
                                                      max_iter=max_iter,
                                                      fit_intercept=fit_intercept,
                                                      tol=tol,
                                                      penalty=penalty,
                                                      random_state=random_state)

skl_sgd_regressor.fit(X_reg_train, y_reg_train)

#### Predict

In [None]:
%%time
skl_reg_pred = skl_sgd_regressor.predict(X_reg_test)
skl_reg_r2 = r2_score(skl_reg_pred, y_reg_test)

## cuML Model

### Classification:

#### Fit

In [None]:
%%time
cu_mbsgd_classifier = cuml.linear_model.MBSGDClassifier(learning_rate=learning_rate,
                                                        eta0=eta0,
                                                        epochs=max_iter,
                                                        fit_intercept=fit_intercept,
                                                        batch_size=batch_size,
                                                        tol=tol,
                                                        penalty=penalty)

cu_mbsgd_classifier.fit(X_class_cudf, y_class_cudf)

#### Predict

In [None]:
%%time
cu_class_pred = cu_mbsgd_classifier.predict(X_class_cudf_test).to_array()
cu_class_acc = accuracy_score(cu_class_pred, y_class_test)

### Regression:

#### Fit

In [None]:
%%time
cu_mbsgd_regressor = cuml.linear_model.MBSGDRegressor(learning_rate=learning_rate,
                                                      eta0=eta0,
                                                      epochs=max_iter,
                                                      fit_intercept=fit_intercept,
                                                      batch_size=batch_size,
                                                      tol=tol,
                                                      penalty=penalty)

cu_mbsgd_regressor.fit(X_reg_cudf, y_reg_cudf)

#### Predict

In [None]:
%%time
cu_reg_pred = cu_mbsgd_regressor.predict(X_reg_cudf_test).to_array()
cu_reg_r2 = r2_score(cu_reg_pred, y_reg_test)

## Evaluate Results

### Classification

In [None]:
# These values come from accuracy_score, so label them as accuracy
# (the original messages called them R^2 scores).
print("Sklearn's accuracy score for classification : %s" % skl_class_acc)
print("cuML's accuracy score for classification : %s" % cu_class_acc)

### Regression

In [None]:
# Report the R^2 stored for each regressor, scikit-learn first and cuML second.
print("Sklearn's R^2 score for regression : %s" % (skl_reg_r2,))
print("cuML's R^2 score for regression : %s" % (cu_reg_r2,))