<a href="https://colab.research.google.com/github/srivatsan88/Mastering-Apache-Spark/blob/master/Spark_ML_on_GPU_in_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Print the NVIDIA driver / GPU status for this runtime (sanity check that a
# GPU is visible before installing anything).
!nvidia-smi

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
!wget https://repo1.maven.org/maven2/ai/rapids/cudf/0.9.2/cudf-0.9.2-cuda10-1.jar
!wget https://repo1.maven.org/maven2/ai/rapids/xgboost4j_2.x/1.0.0-Beta5/xgboost4j_2.x-1.0.0-Beta5.jar
!wget https://repo1.maven.org/maven2/ai/rapids/xgboost4j-spark_2.x/1.0.0-Beta5/xgboost4j-spark_2.x-1.0.0-Beta5.jar

In [0]:
# List the working directory and print its path, to check that the Spark
# folder and the downloaded jars are where the later cells expect them
# (those cells use absolute /content/... paths).
!ls
!pwd

In [0]:
import os

# Point the environment at the JDK installed by apt and at the Spark
# distribution unpacked from the tarball.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
})

In [0]:
# Jars handed to spark-submit via --jars: cuDF plus the XGBoost4J core and
# Spark bindings downloaded earlier. The trailing `pyspark-shell` token is
# part of the PYSPARK_SUBMIT_ARGS value and must stay last.
gpu_jars = [
    '/content/cudf-0.9.2-cuda10-1.jar',
    '/content/xgboost4j_2.x-1.0.0-Beta5.jar',
    '/content/xgboost4j-spark_2.x-1.0.0-Beta5.jar',
]
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars ' + ','.join(gpu_jars) + ' pyspark-shell'

In [0]:
import findspark
findspark.init()  # must run before importing pyspark; relies on SPARK_HOME set above
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session using all available cores.
# The memory settings are given to the builder rather than applied with
# spark.conf.set() afterwards: once getOrCreate() has started the JVM,
# spark.driver.memory / spark.executor.memory can no longer change its heap,
# so setting them post-hoc has no effect on the running session.
# NOTE: if a session already exists in this kernel, getOrCreate() returns it
# and these values are not re-applied; restart the runtime to change them.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Also register the XGBoost4J-Spark jar as a Python file on the SparkContext.
# Presumably this is what makes the `ml.dmlc.xgboost4j...` Python package
# importable in the next cell -- confirm against the jar's contents.
spark.sparkContext.addPyFile('/content/xgboost4j-spark_2.x-1.0.0-Beta5.jar')

In [0]:
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier
from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import numpy as np
import pandas as pd

In [0]:
# Download the 'covertype' dataset (version 4) from OpenML via scikit-learn.
# The returned object is used below through its `data`, `target` and
# `feature_names` entries.
from sklearn.datasets import fetch_openml
covtyp = fetch_openml(name='covertype', version=4)

In [0]:
# Shape of the downloaded feature data.
covtyp.data.shape

In [0]:
# Distinct class labels present in the target.
np.unique(covtyp.target)

In [0]:
# Assemble one pandas frame: the feature columns followed by a final
# 'target' column. np.c_ glues the target on as an extra column, so the
# resulting array (and hence every DataFrame column) shares a single dtype.
column_labels = covtyp.feature_names + ['target']
cov_df = pd.DataFrame(
    data=np.c_[covtyp.data, covtyp.target],
    columns=column_labels,
)

In [0]:
# Total bytes reported by pandas for all columns plus the index.
# NOTE(review): `deep` is not set, so object-dtype columns are presumably
# under-counted here -- confirm if the number matters.
cov_df.memory_usage(index=True).sum()

In [0]:
# Preview the first rows of the assembled frame.
cov_df.head()

In [0]:
# Report how many rows and columns the assembled frame has.
row_count, column_count = cov_df.shape
print("Rows     : ", row_count)
print("Columns  : ", column_count)

In [0]:
# Number of rows per class label.
cov_df.target.value_counts()

In [0]:
# Column dtypes before the numeric conversion performed in the next cell.
cov_df.dtypes

In [0]:
# Convert every column (features and target) to a numeric dtype, one column
# at a time, overwriting the column in place.
for column_name in cov_df.columns:
    numeric_values = pd.to_numeric(cov_df[column_name])
    cov_df[column_name] = numeric_values

In [0]:
# Shift the class labels down by one so they start at 0.
# The guard makes the cell safe to re-run: without it every extra execution
# would subtract 1 again and silently corrupt the labels. On the first run
# (smallest label still >= 1) the behaviour is exactly the original shift.
if cov_df['target'].min() >= 1:
    cov_df['target'] = cov_df['target'] - 1

In [0]:
# 80/20 split: draw a reproducible 80% random sample for training
# (random_state fixed at 10) and keep every remaining row for testing.
train_df = cov_df.sample(frac=0.8, random_state=10)
held_out_mask = ~cov_df.index.isin(train_df.index)
test_df = cov_df[held_out_mask]

In [0]:
# Persist both splits as snappy-compressed Parquet files (without the pandas
# index) so they can be read back through the GPU data reader.
# The target path is passed positionally: the keyword was called `fname` in
# old pandas and was renamed to `path` in pandas 1.0, so `fname=...` raises a
# TypeError on current versions while the positional form works on both.
train_df.to_parquet('covtype_train.parquet', compression='snappy', index=False)
test_df.to_parquet('covtype_test.parquet', compression='snappy', index=False)

In [0]:
# List the working directory to confirm the two Parquet files were written.
!ls 

In [0]:
import pyarrow.parquet as pq

In [0]:
# Read the training file back with pyarrow and display the resulting table
# (the result is not kept; a later cell reads the file again into `pq_file`).
pq.read_table('covtype_train.parquet')

In [0]:
# Load both Parquet files through the XGBoost4J-Spark GpuDataReader bound to
# the active Spark session. A separate reader is constructed for each file.
# NOTE(review): the paths are relative, so they resolve against the current
# working directory -- presumably /content on Colab; confirm.
train_data = GpuDataReader(spark).format('parquet').load('covtype_train.parquet')
test_data = GpuDataReader(spark).format('parquet').load('covtype_test.parquet')

In [0]:
# Schema of the training dataset as seen by the GPU reader.
train_data.schema

In [0]:
# Read the training file with pyarrow; `pq_file` is used below only for its
# column names.
pq_file=pq.read_table('covtype_train.parquet')

In [0]:
# Column names stored in the training Parquet file.
pq_file.column_names

In [0]:
# Name of the label column; every other column in the Parquet file is
# treated as a feature, in file order.
label = "target"
features = [
    column_name
    for column_name in pq_file.column_names
    if column_name != label
]

In [0]:
# Display the feature column names that will be passed to the classifier.
features

In [0]:
import time

# Hyper-parameters forwarded as keyword arguments to XGBoostClassifier.
# Key order and values are unchanged; only the construction style differs.
params = dict(
    eta=0.1,
    gamma=0.1,
    missing=0.0,
    treeMethod='gpu_hist',   # GPU tree construction
    maxDepth=8,
    growPolicy='depthwise',
    lambda_=1.0,
    subsample=1.0,
    numRound=1000,
    numWorkers=1,
    verbosity=1,
)

# Configure the estimator step by step: label column first, then the list
# of feature columns. Each setter's return value is kept, as in a chain.
classifier = XGBoostClassifier(**params)
classifier = classifier.setLabelCol(label)
classifier = classifier.setFeaturesCols(features)

In [0]:
# Train on the GPU-loaded training set and report the wall-clock duration.
start_time = time.time()
model = classifier.fit(train_data)
elapsed_seconds = time.time() - start_time

print("GPU Training Time: %s seconds" % (str(elapsed_seconds)))

In [0]:
# Print the GPU status again, this time after training has run.
!nvidia-smi

In [0]:
# Save the trained model under /content/model/, replacing any model already
# stored at that path (overwrite()).
model.write().overwrite().save('/content/model/')

In [0]:
# List the working directory to confirm the model directory now exists.
!ls

In [0]:
# Load the saved model back from disk. Note that load() is invoked on a
# freshly constructed XGBoostClassificationModel instance.
loaded_model = XGBoostClassificationModel().load('/content/model/')

In [0]:
# Apply the reloaded model to the held-out test data.
result=loaded_model.transform(test_data)

In [0]:
# Print the first rows of the prediction output.
result.show()

In [0]:
# Score the predictions against the label column.
# NOTE(review): no metricName is set, so the evaluator's default metric is
# used -- presumably F1 rather than accuracy for pyspark.ml; confirm before
# quoting this number.
MulticlassClassificationEvaluator().setLabelCol(label).evaluate(result)