# RAPIDS CuML
The RAPIDS library is now available in all Kaggle notebooks. Hooray! Simply type `import cuml` or `import cudf` to load the two most popular packages.

RAPIDS is described [here][1]. RAPIDS `cuDF` accelerates dataframe operations on the GPU and has an API similar to pandas. RAPIDS `cuML` accelerates machine learning algorithms on the GPU and has an API similar to scikit-learn. Because RAPIDS ML algorithms are so fast, we can do things that were previously impractical, such as applying genetic algorithms to ML hyperparameter searches!

[1]: https://rapids.ai/

# Load Libraries

In [None]:
# Standard-library helpers: import-path manipulation and warning control.
import sys, warnings
# Suppress all warnings for the rest of the session to keep cell output clean.
warnings.filterwarnings("ignore")
# Extend the import path with the iterative-stratification input directory,
# presumably so the `iterstrat` package imported below can be resolved.
sys.path.append('../input/iterativestratification')

import pandas as pd, numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from sklearn.metrics import log_loss

# RAPIDS cuML; print the installed version so runs can be compared.
import cuml
print('RAPIDS',cuml.__version__)

# Load Data

## First: 30% test, 70% training

In [None]:
# Load the Otto Group product-classification train and test tables from the
# Kaggle input directory. The directory is named once so both reads stay in sync.
DATA_DIR = '../input/otto-group-product-classification-challenge'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
print('train shape',train.shape)
test = pd.read_csv(f'{DATA_DIR}/test.csv')
print('test shape',test.shape)

# Number of distinct classes in the `target` column (shown as the cell output).
train['target'].nunique()

In [None]:
# Encoder that maps class labels to integer codes; it is fitted on the
# `target` column in the next cell.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
# Integer-encode the class labels in the `target` column. `y` is the label
# array used by every split and cross-validation run in this notebook.
y = le.fit_transform(train['target'])
y

In [None]:
# Display the raw training frame before the non-feature columns are removed.
train

In [None]:
# Keep only the feature columns by dropping the first column (the ID) and the
# last column (the target). The guard makes the cell safe to re-run: once
# `target` is gone the frame has already been trimmed, and slicing again would
# silently discard real feature columns.
if 'target' in train.columns:
    train = train.iloc[:, 1:-1]
train.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Shape of the training feature matrix produced by the split.
x_train.shape

In [None]:
# Shape of the held-out test feature matrix produced by the split.
x_test.shape

In [None]:
# Shape of the training label array; its length should match x_train's rows.
y_train.shape

In [None]:
# Shape of the test label array; its length should match x_test's rows.
y_test.shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# scikit-learn 3-nearest-neighbours classifier. It is not fitted here; it is
# handed to cross_val_score for the ROC AUC evaluation further down.
classifier = KNeighborsClassifier(n_neighbors=3)
classifier

In [None]:
import cudf
import cuml
from cuml.neighbors import KNeighborsClassifier as cuKNeighbors

# cuML 3-nearest-neighbours classifier, fitted on the training split.
model = cuKNeighbors(n_neighbors=3)
model.fit(x_train, y_train)

In [None]:
# Predict a class for every row of the held-out test features.
y_pred = model.predict(x_test)
y_pred

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted class equals the true class.
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
from sklearn.model_selection import cross_val_score
# Mean one-vs-rest ROC AUC over 5 folds of the training split, computed with
# the scikit-learn `classifier`.
# NOTE(review): despite its name, `roc_acc` holds a ROC AUC, not an accuracy.
roc_acc = cross_val_score(classifier, x_train, y_train, cv=5, scoring='roc_auc_ovr').mean()
roc_acc

In [None]:
# Mean negative log loss of the cuML model under cross-validation on the
# training split. `cv=5` is spelled out so the fold count matches the ROC AUC
# evaluation and does not depend on the installed scikit-learn's default.
NGL = cross_val_score(model, x_train, y_train, cv=5, scoring='neg_log_loss').mean()
NGL

   
## Now for 20% test and 80% training

In [None]:
from sklearn.model_selection import train_test_split

# Re-split with 20% of the rows held out for testing; same seed as before so
# only the split ratio changes.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Fresh, unfitted scikit-learn 3-nearest-neighbours classifier for the ROC AUC
# cross-validation of this split.
classifier = KNeighborsClassifier(n_neighbors=3)
classifier

In [None]:
from cuml.neighbors import KNeighborsClassifier as cuKNeighbors
# cuML 3-nearest-neighbours classifier, refitted on the current training split.
model = cuKNeighbors(n_neighbors=3)
model.fit(x_train, y_train)

In [None]:
# Predict a class for every row of the current held-out test features.
y_pred = model.predict(x_test)
y_pred

In [None]:

# Hold-out accuracy for the current split (accuracy_score was imported in an
# earlier cell).
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:

# Mean one-vs-rest ROC AUC over 5 folds of the current training split, using
# the scikit-learn `classifier` (cross_val_score was imported in an earlier cell).
roc_acc = cross_val_score(classifier, x_train, y_train, cv=5, scoring='roc_auc_ovr').mean()
roc_acc

In [None]:
# Mean negative log loss of the cuML model under cross-validation on the
# current training split. `cv=5` is explicit so the fold count matches the
# ROC AUC evaluation instead of relying on the library default.
NGL = cross_val_score(model, x_train, y_train, cv=5, scoring='neg_log_loss').mean()
NGL

## Finally, 10% test and 90% training

In [None]:
from sklearn.model_selection import train_test_split

# Re-split with only 10% of the rows held out for testing; same seed as before
# so only the split ratio changes.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.1,
    random_state=0,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Fresh, unfitted scikit-learn 3-nearest-neighbours classifier for the ROC AUC
# cross-validation of this split.
classifier = KNeighborsClassifier(n_neighbors=3)
classifier

In [None]:
from cuml.neighbors import KNeighborsClassifier as cuKNeighbors
# cuML 3-nearest-neighbours classifier, refitted on the current training split.
model = cuKNeighbors(n_neighbors=3)
model.fit(x_train, y_train)

In [None]:
# Predict a class for every row of the current held-out test features.
y_pred = model.predict(x_test)
y_pred

In [None]:

# Hold-out accuracy for the current split (accuracy_score was imported in an
# earlier cell).
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:

# Mean one-vs-rest ROC AUC over 5 folds of the current training split, using
# the scikit-learn `classifier` (cross_val_score was imported in an earlier cell).
roc_acc = cross_val_score(classifier, x_train, y_train, cv=5, scoring='roc_auc_ovr').mean()
roc_acc

In [None]:
# Mean negative log loss of the cuML model under cross-validation on the
# current training split. `cv=5` is explicit so the fold count matches the
# ROC AUC evaluation instead of relying on the library default.
NGL = cross_val_score(model, x_train, y_train, cv=5, scoring='neg_log_loss').mean()
NGL

### As we can see, the increase in accuracy is small, but accuracy does rise as we put more data into training and less into testing.

### The ROC AUC (`roc_acc`) also increases, but only very slightly. The same holds for the negative log loss.

### So the 10% test / 90% training split is the best.

# K-fold validation:

In [None]:
# cuML 3-nearest-neighbours classifier fitted on the full feature set and
# labels (no hold-out split).
kmodel = cuKNeighbors(n_neighbors=3)
kmodel.fit(train, y)

In [None]:
from sklearn.model_selection import cross_val_score
# Per-fold accuracy of 5-fold cross-validation over the full data set. The
# estimator passed is this section's `kmodel`, not the `model` variable left
# over from the 90/10 hold-out section.
cross_val_score(kmodel, train, y, cv=5, scoring='accuracy')

In [None]:
# Mean accuracy across the 5 folds over the full data set, evaluated with this
# section's `kmodel` rather than the leftover `model` from the hold-out section.
avrg = cross_val_score(kmodel, train, y, cv=5, scoring='accuracy').mean()
avrg

In [None]:
# Mean one-vs-rest ROC AUC across 5 folds over the full data set, evaluated
# with this section's `kmodel` rather than the leftover `model` from the
# hold-out section. Despite its name, `roc_acc` holds a ROC AUC value.
roc_acc = cross_val_score(kmodel, train, y, cv=5, scoring='roc_auc_ovr').mean()
roc_acc

### From this output it is safe to say that the k-fold method gives essentially the same results as the previous train/test splits. The scores are very slightly higher than with the previous methods, but the difference is not large.

## Finding the optimal number of neighbors K:

In [None]:
# Candidate neighbor counts to evaluate: every integer from 2 through 199.
k_list = [*range(2, 200)]
k_list

In [None]:
# Mean 5-fold cross-validated accuracy for each candidate K, in the same order
# as k_list so the two lists can be indexed in parallel.
cv_scores = []

for k in k_list:
    # A new cuML KNN estimator per candidate neighbor count.
    knn = cuKNeighbors(n_neighbors=k)
    # NOTE: `scores` is already the mean over the 5 folds, i.e. a single number.
    scores = cross_val_score(knn, train, y, cv=5, scoring='accuracy').mean()
    cv_scores.append(scores)

In [None]:
# Display the mean cross-validated accuracy recorded for each candidate K.
cv_scores

In [None]:
import seaborn as sns

# Apply the seaborn style before any figure or axes exist: matplotlib reads
# the style settings when the axes are created, so setting the style after the
# title call would leave this plot unstyled.
sns.set_style("whitegrid")

# Mean 5-fold cross-validated accuracy as a function of the neighbor count K.
plt.figure()
plt.title("The optimal number of neighbors")
plt.xlabel("Number of Neighbors K")
plt.ylabel("Accuracy")
plt.plot(k_list, cv_scores)

plt.show()

In [None]:
# Optimal number of neighbors: the K whose mean cross-validated accuracy is
# highest (the earliest such K when several candidates tie).
best_k = max(zip(k_list, cv_scores), key=lambda pair: pair[1])[0]
best_k