In [1]:
import cudf
from cuml.cluster import DBSCAN

In [3]:
gdf_float = cudf.DataFrame()
gdf_float['0'] = [1.0, 2.0, 5.0]
gdf_float['1'] = [4.0, 2.0, 1.0]
gdf_float['2'] = [4.0, 2.0, 1.0]

In [4]:
gdf_float

Unnamed: 0,0,1,2
0,1.0,4.0,4.0
1,2.0,2.0,2.0
2,5.0,1.0,1.0


In [5]:
dbscan_float = DBSCAN(eps=1.0, min_samples=1)
dbscan_float.fit(gdf_float)

DBSCAN()

In [6]:
print(dbscan_float.labels_)

0    0
1    1
2    2
dtype: int32


# Run on CPU-

In [1]:
import os

# Disable all GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [1]:
import cuml
from cuml.cluster import DBSCAN

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import cudf
gdf_float = cudf.DataFrame()
gdf_float['0'] = [1.0, 2.0, 5.0]
gdf_float['1'] = [4.0, 2.0, 1.0]
gdf_float['2'] = [4.0, 2.0, 1.0]

In [4]:
import pandas as pd
df_float = pd.DataFrame()
df_float['0'] = [1.0, 2.0, 5.0]
df_float['1'] = [4.0, 2.0, 1.0]
df_float['2'] = [4.0, 2.0, 1.0]

In [5]:
dbscan_float = DBSCAN(eps=1.0, min_samples=1)
dbscan_float.fit(df_float)

In [2]:
import cuml # no change is needed for even the importing!
import pandas as pd

from cuml.manifold.umap import UMAP
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.manifold import trustworthiness

# load the iris dataset from sklearn and extract the required information
iris = datasets.load_iris()
dataset = iris.data

iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

# define the cuml UMAP model and use fit_transform function to obtain the low dimensional output of the input dataset
embedding = UMAP(
    n_neighbors=10, min_dist=0.01,  init="random"
).fit_transform(iris_df)

# calculate the trust worthiness of the results obtaind from the cuml UMAP
trust = trustworthiness(iris_df, embedding)
print(trust)

0.9832676056338028


In [2]:
import cuml # no change is needed for even the importing!
import pandas as pd
from cuml.manifold.umap import UMAP
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.manifold import trustworthiness
from cuml.common.device_selection import using_device_type

# load the iris dataset from sklearn and extract the required information
iris = datasets.load_iris()
dataset = iris.data

iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [3]:
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [4]:
with using_device_type('gpu'):
    # define the cuml UMAP model and use fit_transform function to obtain the low dimensional output of the input dataset
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01,  init="random"
    ).fit_transform(iris_df)

    # calculate the trust worthiness of the results obtaind from the cuml UMAP
    trust = trustworthiness(iris_df, embedding)
    print(trust)

0.982037558685446


# CPU / GPU experience is broken

In [5]:
with using_device_type('cpu'):
    # define the cuml UMAP model and use fit_transform function to obtain the low dimensional output of the input dataset
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01,  init="random"
    ).fit_transform(iris_df)

    # calculate the trust worthiness of the results obtaind from the cuml UMAP
    trust = trustworthiness(iris_df, embedding)
    print(trust)

ModuleNotFoundError: No module named 'umap'

# umap with a large dataset-

In [5]:
# Load the transaction dataset and perform ==

In [1]:
import cudf

taxi_df = cudf.read_csv('/nvme/1/manass/notebooks/cuml_benchmarks/Data/yellow_tripdata_2015-01.csv')

In [2]:
taxi_df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 12748986 entries, 0 to 12748985
Data columns (total 19 columns):
 #   Column                 Dtype
---  ------                 -----
 0   VendorID               int64
 1   tpep_pickup_datetime   object
 2   tpep_dropoff_datetime  object
 3   passenger_count        int64
 4   trip_distance          float64
 5   pickup_longitude       float64
 6   pickup_latitude        float64
 7   RateCodeID             int64
 8   store_and_fwd_flag     object
 9   dropoff_longitude      float64
 10  dropoff_latitude       float64
 11  payment_type           int64
 12  fare_amount            float64
 13  extra                  float64
 14  mta_tax                float64
 15  tip_amount             float64
 16  tolls_amount           float64
 17  improvement_surcharge  float64
 18  total_amount           float64
dtypes: float64(12), int64(4), object(3)
memory usage: 2.1+ GB


In [3]:
taxi_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-73.993896,40.750111,1,N,-73.974785,40.750618,1,12.0,1.0,0.5,3.25,0.0,0.3,17.05
1,1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.3,-74.001648,40.724243,1,N,-73.994415,40.759109,1,14.5,0.5,0.5,2.0,0.0,0.3,17.8
2,1,2015-01-10 20:33:38,2015-01-10 20:43:41,1,1.8,-73.963341,40.802788,1,N,-73.95182,40.824413,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8
3,1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,0.5,-74.009087,40.713818,1,N,-74.004326,40.719986,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8
4,1,2015-01-10 20:33:39,2015-01-10 20:52:58,1,3.0,-73.971176,40.762428,1,N,-74.004181,40.742653,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3


In [4]:
duration =cudf.to_datetime(taxi_df['tpep_dropoff_datetime']) - cudf.to_datetime(taxi_df['tpep_pickup_datetime'])

In [5]:
taxi_df['duration_seconds']=duration.astype('int64') // 10**9

In [6]:
taxi_df['duration_seconds']

0           1083
1           1190
2            603
3            112
4           1159
            ... 
12748981     236
12748982     342
12748983     797
12748984     919
12748985     348
Name: duration_seconds, Length: 12748986, dtype: int64

In [7]:
taxi_df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 12748986 entries, 0 to 12748985
Data columns (total 20 columns):
 #   Column                 Dtype
---  ------                 -----
 0   VendorID               int64
 1   tpep_pickup_datetime   object
 2   tpep_dropoff_datetime  object
 3   passenger_count        int64
 4   trip_distance          float64
 5   pickup_longitude       float64
 6   pickup_latitude        float64
 7   RateCodeID             int64
 8   store_and_fwd_flag     object
 9   dropoff_longitude      float64
 10  dropoff_latitude       float64
 11  payment_type           int64
 12  fare_amount            float64
 13  extra                  float64
 14  mta_tax                float64
 15  tip_amount             float64
 16  tolls_amount           float64
 17  improvement_surcharge  float64
 18  total_amount           float64
 19  duration_seconds       int64
dtypes: float64(12), int64(5), object(3)
memory usage: 2.2+ GB


In [8]:
clustering_df=taxi_df[list(set(list(taxi_df.columns)) - set(['tpep_pickup_datetime','tpep_dropoff_datetime','VendorID','store_and_fwd_flag','RateCodeID']))]

In [9]:
clustering_df.fillna(0, inplace=True)

In [None]:
# Memory usage of each column in bytes
memory_usage = clustering_df.memory_usage(deep=True)
print(memory_usage)

# Perform UMAP-

In [10]:
import cuml # no change is needed for even the importing!
import pandas as pd

from cuml.manifold.umap import UMAP
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.manifold import trustworthiness

In [11]:
%%time
# define the cuml UMAP model and use fit_transform function to obtain the low dimensional output of the input dataset
embedding = UMAP(
    build_algo="nn_descent"
).fit_transform(clustering_df)

MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /nvme/1/manass/miniconda3/envs/rapids_cuml_testing_2408/include/rmm/mr/device/cuda_memory_resource.hpp

: 

In [None]:
# calculate the trust worthiness of the results obtaind from the cuml UMAP
trust = trustworthiness(clustering_df, embedding)
print(trust)

# let's try with a little smaller datasets-

In [1]:
%%time
import cudf

taxi_df = cudf.read_csv('/nvme/1/manass/notebooks/cuml_benchmarks/Data/yellow_tripdata_2015-01_reduced.csv')

CPU times: user 11.1 s, sys: 1.37 s, total: 12.5 s
Wall time: 5.28 s


In [2]:
taxi_df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 6374493 entries, 0 to 6374492
Data columns (total 19 columns):
 #   Column                 Dtype
---  ------                 -----
 0   VendorID               int64
 1   tpep_pickup_datetime   object
 2   tpep_dropoff_datetime  object
 3   passenger_count        int64
 4   trip_distance          float64
 5   pickup_longitude       float64
 6   pickup_latitude        float64
 7   RateCodeID             int64
 8   store_and_fwd_flag     object
 9   dropoff_longitude      float64
 10  dropoff_latitude       float64
 11  payment_type           int64
 12  fare_amount            float64
 13  extra                  float64
 14  mta_tax                float64
 15  tip_amount             float64
 16  tolls_amount           float64
 17  improvement_surcharge  float64
 18  total_amount           float64
dtypes: float64(12), int64(4), object(3)
memory usage: 1.1+ GB


In [3]:
taxi_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.3,-74.001648,40.724243,1,N,-73.994415,40.759109,1,14.5,0.5,0.5,2.0,0.0,0.3,17.8
1,1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,0.5,-74.009087,40.713818,1,N,-74.004326,40.719986,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8
2,1,2015-01-10 20:33:39,2015-01-10 20:53:52,1,9.0,-73.874374,40.774048,1,N,-73.986977,40.758194,1,27.0,0.5,0.5,6.7,5.33,0.3,40.33
3,1,2015-01-10 20:33:39,2015-01-10 20:42:20,3,0.8,-74.002663,40.734142,1,N,-73.99501,40.726326,1,7.0,0.5,0.5,1.66,0.0,0.3,9.96
4,1,2015-01-10 20:33:40,2015-01-10 20:40:44,2,0.9,-73.985588,40.767948,1,N,-73.985916,40.759365,1,6.5,0.5,0.5,1.55,0.0,0.3,9.35


In [4]:
clustering_df=taxi_df[list(set(list(taxi_df.columns)) - set(['tpep_pickup_datetime','tpep_dropoff_datetime','VendorID','store_and_fwd_flag','RateCodeID']))]

In [5]:
clustering_df.fillna(0, inplace=True)

In [6]:
# Memory usage of each column in bytes
memory_usage = clustering_df.memory_usage(deep=True)
print(memory_usage)

total_amount             50995944
dropoff_longitude        50995944
fare_amount              50995944
dropoff_latitude         50995944
pickup_latitude          50995944
passenger_count          50995944
improvement_surcharge    50995944
payment_type             50995944
trip_distance            50995944
tip_amount               50995944
mta_tax                  50995944
pickup_longitude         50995944
extra                    50995944
tolls_amount             50995944
Index                           0
dtype: int64


In [1]:
import cuml # no change is needed for even the importing!
import pandas as pd

from cuml.manifold.umap import UMAP
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.manifold import trustworthiness

In [8]:
%%time
# define the cuml UMAP model and use fit_transform function to obtain the low dimensional output of the input dataset
embedding = UMAP(
    build_algo="nn_descent"
).fit_transform(clustering_df)

MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /nvme/1/manass/miniconda3/envs/rapids_cuml_testing_2408/include/rmm/mr/device/cuda_memory_resource.hpp

# Reducing the data size to 600 MB

In [2]:
%%time
import cudf

taxi_df = cudf.read_csv('/nvme/1/manass/notebooks/cuml_benchmarks/Data/yellow_tripdata_2015-01_fourth.csv')

CPU times: user 1.05 s, sys: 998 ms, total: 2.05 s
Wall time: 2.16 s


In [3]:
taxi_df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 3187246 entries, 0 to 3187245
Data columns (total 19 columns):
 #   Column                 Dtype
---  ------                 -----
 0   VendorID               int64
 1   tpep_pickup_datetime   object
 2   tpep_dropoff_datetime  object
 3   passenger_count        int64
 4   trip_distance          float64
 5   pickup_longitude       float64
 6   pickup_latitude        float64
 7   RateCodeID             int64
 8   store_and_fwd_flag     object
 9   dropoff_longitude      float64
 10  dropoff_latitude       float64
 11  payment_type           int64
 12  fare_amount            float64
 13  extra                  float64
 14  mta_tax                float64
 15  tip_amount             float64
 16  tolls_amount           float64
 17  improvement_surcharge  float64
 18  total_amount           float64
dtypes: float64(12), int64(4), object(3)
memory usage: 544.5+ MB


In [4]:
clustering_df=taxi_df[list(set(list(taxi_df.columns)) - set(['tpep_pickup_datetime','tpep_dropoff_datetime','VendorID','store_and_fwd_flag','RateCodeID']))]

In [5]:
clustering_df.fillna(0, inplace=True)

In [13]:
%%time
# define the cuml UMAP model and use fit_transform function to obtain the low dimensional output of the input dataset
embedding = UMAP(
    build_algo="nn_descent", n_neighbors=4
).fit_transform(clustering_df)

CPU times: user 7min 10s, sys: 36.3 s, total: 7min 47s
Wall time: 2min 1s


In [6]:
%%time
# define the cuml UMAP model and use fit_transform function to obtain the low dimensional output of the input dataset
embedding = UMAP(build_algo = 'brute_force_knn'
).fit_transform(clustering_df)

CPU times: user 5min 22s, sys: 685 ms, total: 5min 23s
Wall time: 5min 12s


: 

In [14]:
%%time
# define the cuml UMAP model and use fit_transform function to obtain the low dimensional output of the input dataset
embedding = UMAP(
    build_algo="nn_descent", n_neighbors=4
).fit_transform(clustering_df)

CPU times: user 6min 55s, sys: 53.9 s, total: 7min 49s
Wall time: 1min 42s


# Seamlessly go from CPU to GPU-

## Managing Execution Platform with GPU package

### using_device_type context manager

In [1]:
import cuml # no change is needed for even the importing!
import pandas as pd
from cuml.manifold.umap import UMAP
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.manifold import trustworthiness
from cuml.common.device_selection import using_device_type

In [2]:
import cudf

taxi_df = cudf.read_csv('/nvme/1/manass/notebooks/cuml_benchmarks/Data/yellow_tripdata_2015-01.csv')

In [3]:
clustering_df=taxi_df[list(set(list(taxi_df.columns)) - set(['tpep_pickup_datetime','tpep_dropoff_datetime','VendorID','store_and_fwd_flag','RateCodeID']))]

In [4]:
clustering_df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 12748986 entries, 0 to 12748985
Data columns (total 14 columns):
 #   Column                 Dtype
---  ------                 -----
 0   fare_amount            float64
 1   tolls_amount           float64
 2   dropoff_latitude       float64
 3   improvement_surcharge  float64
 4   pickup_longitude       float64
 5   payment_type           int64
 6   passenger_count        int64
 7   mta_tax                float64
 8   trip_distance          float64
 9   tip_amount             float64
 10  extra                  float64
 11  dropoff_longitude      float64
 12  pickup_latitude        float64
 13  total_amount           float64
dtypes: float64(12), int64(2)
memory usage: 1.3 GB


In [5]:
clustering_df.fillna(0, inplace=True)

In [6]:
%%time
with using_device_type('gpu'):
# define the cuml UMAP model and use fit_transform function to obtain the low dimensional output of the input dataset
    embedding = UMAP(
        build_algo="nn_descent", n_neighbors=4
    ).fit_transform(clustering_df)

# Doesn't allow partial execution between CPU and GPU-

In [8]:
%%time
with using_device_type('cpu'):
# define the cuml UMAP model and use fit_transform function to obtain the low dimensional output of the input dataset
    embedding = UMAP(
        build_algo="nn_descent", n_neighbors=4
    ).fit_transform(clustering_df)

ModuleNotFoundError: No module named 'umap'

In [5]:
clustering_df.shape

(12748986, 14)

In [6]:
from cuml import PCA

# Initialize PCA model
pca = PCA(n_components=10)

In [9]:
%%time
with using_device_type('cpu'):
    transformed_df=pca.fit_transform(clustering_df)

CPU times: user 14.6 s, sys: 2.64 s, total: 17.2 s
Wall time: 2.09 s


In [10]:
type(transformed_df)

numpy.ndarray

In [11]:
%%time
with using_device_type('gpu'):
    transformed_df=pca.fit_transform(clustering_df)

CPU times: user 17.3 s, sys: 70.2 ms, total: 17.3 s
Wall time: 17.1 s


In [13]:
type(transformed_df)

cudf.core.dataframe.DataFrame

# Global configuration with `set_global_device_type`

In [7]:
from cuml.common.device_selection import set_global_device_type, get_global_device_type

initial_device_type = get_global_device_type()
print('default execution device:', initial_device_type)

default execution device: DeviceType.device


In [11]:
clustering_df.fillna(0, inplace=True)

#### CPU execution-

In [None]:
set_global_device_type('cpu')
print('new device type:', get_global_device_type())

In [12]:
%%time
transformed_df=pca.fit_transform(clustering_df)

CPU times: user 13.2 s, sys: 2.76 s, total: 16 s
Wall time: 2 s


In [13]:
type(transformed_df)

numpy.ndarray

# GPU execution-

In [14]:
set_global_device_type('gpu')
print('new device type:', get_global_device_type())

new device type: DeviceType.device


In [8]:
%%time
transformed_df=pca.fit_transform(clustering_df)

CPU times: user 17.8 s, sys: 73.8 ms, total: 17.8 s
Wall time: 17.3 s


In [9]:
type(transformed_df)

cudf.core.dataframe.DataFrame

### using cuml-cpu library-

In [11]:
import cuml

In [12]:
cuml

<module 'cuml' from '/nvme/1/manass/miniconda3/envs/rapids_cuml_testing_2408/lib/python3.11/site-packages/cuml/__init__.py'>

In [1]:
import cuml # no change is needed for even the importing!
import pandas as pd
from cuml.manifold.umap import UMAP
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.manifold import trustworthiness
from cuml.common.device_selection import using_device_type

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

taxi_df = pd.read_csv('/nvme/1/manass/notebooks/cuml_benchmarks/Data/yellow_tripdata_2015-01_fourth.csv')

In [3]:
type(taxi_df)

pandas.core.frame.DataFrame

In [5]:
clustering_df=taxi_df[list(set(list(taxi_df.columns)) - set(['tpep_pickup_datetime','tpep_dropoff_datetime','VendorID','store_and_fwd_flag','RateCodeID']))]

In [6]:
clustering_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3187246 entries, 0 to 3187245
Data columns (total 14 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trip_distance          float64
 1   tolls_amount           float64
 2   passenger_count        int64  
 3   tip_amount             float64
 4   dropoff_longitude      float64
 5   fare_amount            float64
 6   improvement_surcharge  float64
 7   pickup_latitude        float64
 8   dropoff_latitude       float64
 9   total_amount           float64
 10  extra                  float64
 11  mta_tax                float64
 12  payment_type           int64  
 13  pickup_longitude       float64
dtypes: float64(12), int64(2)
memory usage: 340.4 MB


In [7]:
clustering_df.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clustering_df.fillna(0, inplace=True)


In [8]:
embedding = UMAP(
    build_algo="nn_descent", n_neighbors=4
).fit_transform(clustering_df)

TypeError: 'NoneType' object is not iterable

# Uninstalling the cuml package-

In [10]:
!pip uninstall cuml -y 

Found existing installation: cuml 24.8.0
Uninstalling cuml-24.8.0:
  Successfully uninstalled cuml-24.8.0


In [12]:
!conda install -c rapidsai -c nvidia -c conda-forge cuml-cpu=24.08

Channels:
 - rapidsai
 - nvidia
 - conda-forge
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



#### I had to uninstall the gpu package, install cuml-cpu because it was default to the GPU package if I was only doing import cpu.

#### I had to remove it from conda and pip, I also had to remove the cuml namespace header which was lingering by manual deleting it. 

### Then reinstalled cuml-cpu

In [4]:
import cuml # no change is needed for even the importing!
import pandas as pd

from cuml.manifold.umap import UMAP
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.manifold import trustworthiness

In [5]:
cuml

<module 'cuml' from '/nvme/1/manass/miniconda3/envs/rapids_cuml_testing_2408/lib/python3.11/site-packages/cuml/__init__.py'>

In [7]:
import pandas as pd

taxi_df = pd.read_csv('/nvme/1/manass/notebooks/cuml_benchmarks/Data/yellow_tripdata_2015-01_fourth.csv')

clustering_df=taxi_df[list(set(list(taxi_df.columns)) - set(['tpep_pickup_datetime','tpep_dropoff_datetime','VendorID','store_and_fwd_flag','RateCodeID']))]

clustering_df.fillna(0, inplace=True)

embedding = UMAP(
    build_algo="nn_descent", n_neighbors=4
).fit_transform(clustering_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clustering_df.fillna(0, inplace=True)


TypeError: 'NoneType' object is not iterable

In [9]:
import pandas as pd

taxi_df = pd.read_csv('/nvme/1/manass/notebooks/cuml_benchmarks/Data/yellow_tripdata_2015-01_fourth.csv')

clustering_df=taxi_df[list(set(list(taxi_df.columns)) - set(['tpep_pickup_datetime','tpep_dropoff_datetime','VendorID','store_and_fwd_flag','RateCodeID']))]

clustering_df.fillna(0, inplace=True)

embedding = UMAP(
    n_neighbors=4
).fit_transform(clustering_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clustering_df.fillna(0, inplace=True)


TypeError: 'NoneType' object is not iterable

In [10]:
import cuml # no change is needed for even the importing!
import pandas as pd

from cuml.manifold.umap import UMAP
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.manifold import trustworthiness

# load the iris dataset from sklearn and extract the required information
iris = datasets.load_iris()
dataset = iris.data

iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

# define the cuml UMAP model and use fit_transform function to obtain the low dimensional output of the input dataset
embedding = UMAP(
    n_neighbors=10, min_dist=0.01,  init="random"
).fit_transform(iris_df)

# calculate the trust worthiness of the results obtaind from the cuml UMAP
trust = trustworthiness(iris_df, embedding)
print(trust)

TypeError: 'NoneType' object is not iterable

In [11]:
UMAP

cuml.manifold.umap.UMAP